CUB
 All Classes Namespaces Files Functions Variables Typedefs Enumerations Enumerator Friends Groups
thread_store.cuh
Go to the documentation of this file.
1 /******************************************************************************
2  * Copyright (c) 2011, Duane Merrill. All rights reserved.
3  * Copyright (c) 2011-2015, NVIDIA CORPORATION. All rights reserved.
4  *
5  * Redistribution and use in source and binary forms, with or without
6  * modification, are permitted provided that the following conditions are met:
7  * * Redistributions of source code must retain the above copyright
8  * notice, this list of conditions and the following disclaimer.
9  * * Redistributions in binary form must reproduce the above copyright
10  * notice, this list of conditions and the following disclaimer in the
11  * documentation and/or other materials provided with the distribution.
12  * * Neither the name of the NVIDIA CORPORATION nor the
13  * names of its contributors may be used to endorse or promote products
14  * derived from this software without specific prior written permission.
15  *
16  * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
17  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
18  * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
19  * DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY
20  * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
21  * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
22  * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
23  * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
24  * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
25  * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
26  *
27  ******************************************************************************/
28 
34 #pragma once
35 
36 #include <cuda.h>
37 
38 #include "../util_ptx.cuh"
39 #include "../util_type.cuh"
40 #include "../util_namespace.cuh"
41 
43 CUB_NS_PREFIX
44 
46 namespace cub {
47 
54 //-----------------------------------------------------------------------------
55 // Tags and constants
56 //-----------------------------------------------------------------------------
57 
62 {
69 };
70 
71 
110 template <
111  CacheStoreModifier MODIFIER,
112  typename OutputIteratorT,
113  typename T>
114 __device__ __forceinline__ void ThreadStore(OutputIteratorT itr, T val);
115 
116 
118 
119 
120 #ifndef DOXYGEN_SHOULD_SKIP_THIS // Do not document
121 
122 
124 template <int COUNT, int MAX>
125 struct IterateThreadStore
126 {
127  template <CacheStoreModifier MODIFIER, typename T>
128  static __device__ __forceinline__ void Store(T *ptr, T *vals)
129  {
130  ThreadStore<MODIFIER>(ptr + COUNT, vals[COUNT]);
131  IterateThreadStore<COUNT + 1, MAX>::template Store<MODIFIER>(ptr, vals);
132  }
133 
134  template <typename OutputIteratorT, typename T>
135  static __device__ __forceinline__ void Dereference(OutputIteratorT ptr, T *vals)
136  {
137  ptr[COUNT] = vals[COUNT];
138  IterateThreadStore<COUNT + 1, MAX>::Dereference(ptr, vals);
139  }
140 
141 };
142 
144 template <int MAX>
145 struct IterateThreadStore<MAX, MAX>
146 {
147  template <CacheStoreModifier MODIFIER, typename T>
148  static __device__ __forceinline__ void Store(T *ptr, T *vals) {}
149 
150  template <typename OutputIteratorT, typename T>
151  static __device__ __forceinline__ void Dereference(OutputIteratorT ptr, T *vals) {}
152 };
153 
154 
158 #define CUB_STORE_16(cub_modifier, ptx_modifier) \
159  template<> \
160  __device__ __forceinline__ void ThreadStore<cub_modifier, uint4*, uint4>(uint4* ptr, uint4 val) \
161  { \
162  asm volatile ("st."#ptx_modifier".v4.u32 [%0], {%1, %2, %3, %4};" : : \
163  _CUB_ASM_PTR_(ptr), \
164  "r"(val.x), \
165  "r"(val.y), \
166  "r"(val.z), \
167  "r"(val.w)); \
168  } \
169  template<> \
170  __device__ __forceinline__ void ThreadStore<cub_modifier, ulonglong2*, ulonglong2>(ulonglong2* ptr, ulonglong2 val) \
171  { \
172  asm volatile ("st."#ptx_modifier".v2.u64 [%0], {%1, %2};" : : \
173  _CUB_ASM_PTR_(ptr), \
174  "l"(val.x), \
175  "l"(val.y)); \
176  }
177 
178 
182 #define CUB_STORE_8(cub_modifier, ptx_modifier) \
183  template<> \
184  __device__ __forceinline__ void ThreadStore<cub_modifier, ushort4*, ushort4>(ushort4* ptr, ushort4 val) \
185  { \
186  asm volatile ("st."#ptx_modifier".v4.u16 [%0], {%1, %2, %3, %4};" : : \
187  _CUB_ASM_PTR_(ptr), \
188  "h"(val.x), \
189  "h"(val.y), \
190  "h"(val.z), \
191  "h"(val.w)); \
192  } \
193  template<> \
194  __device__ __forceinline__ void ThreadStore<cub_modifier, uint2*, uint2>(uint2* ptr, uint2 val) \
195  { \
196  asm volatile ("st."#ptx_modifier".v2.u32 [%0], {%1, %2};" : : \
197  _CUB_ASM_PTR_(ptr), \
198  "r"(val.x), \
199  "r"(val.y)); \
200  } \
201  template<> \
202  __device__ __forceinline__ void ThreadStore<cub_modifier, unsigned long long*, unsigned long long>(unsigned long long* ptr, unsigned long long val) \
203  { \
204  asm volatile ("st."#ptx_modifier".u64 [%0], %1;" : : \
205  _CUB_ASM_PTR_(ptr), \
206  "l"(val)); \
207  }
208 
212 #define CUB_STORE_4(cub_modifier, ptx_modifier) \
213  template<> \
214  __device__ __forceinline__ void ThreadStore<cub_modifier, unsigned int*, unsigned int>(unsigned int* ptr, unsigned int val) \
215  { \
216  asm volatile ("st."#ptx_modifier".u32 [%0], %1;" : : \
217  _CUB_ASM_PTR_(ptr), \
218  "r"(val)); \
219  }
220 
221 
225 #define CUB_STORE_2(cub_modifier, ptx_modifier) \
226  template<> \
227  __device__ __forceinline__ void ThreadStore<cub_modifier, unsigned short*, unsigned short>(unsigned short* ptr, unsigned short val) \
228  { \
229  asm volatile ("st."#ptx_modifier".u16 [%0], %1;" : : \
230  _CUB_ASM_PTR_(ptr), \
231  "h"(val)); \
232  }
233 
234 
238 #define CUB_STORE_1(cub_modifier, ptx_modifier) \
239  template<> \
240  __device__ __forceinline__ void ThreadStore<cub_modifier, unsigned char*, unsigned char>(unsigned char* ptr, unsigned char val) \
241  { \
242  asm volatile ( \
243  "{" \
244  " .reg .u8 datum;" \
245  " cvt.u8.u16 datum, %1;" \
246  " st."#ptx_modifier".u8 [%0], datum;" \
247  "}" : : \
248  _CUB_ASM_PTR_(ptr), \
249  "h"((unsigned short) val)); \
250  }
251 
255 #define CUB_STORE_ALL(cub_modifier, ptx_modifier) \
256  CUB_STORE_16(cub_modifier, ptx_modifier) \
257  CUB_STORE_8(cub_modifier, ptx_modifier) \
258  CUB_STORE_4(cub_modifier, ptx_modifier) \
259  CUB_STORE_2(cub_modifier, ptx_modifier) \
260  CUB_STORE_1(cub_modifier, ptx_modifier) \
261 
262 
266 #if CUB_PTX_ARCH >= 200
267  CUB_STORE_ALL(STORE_WB, ca)
268  CUB_STORE_ALL(STORE_CG, cg)
269  CUB_STORE_ALL(STORE_CS, cs)
270  CUB_STORE_ALL(STORE_WT, wt)
271 #else
272  CUB_STORE_ALL(STORE_WB, global)
273  CUB_STORE_ALL(STORE_CG, global)
274  CUB_STORE_ALL(STORE_CS, global)
275  CUB_STORE_ALL(STORE_WT, volatile.global)
276 #endif
277 
278 
279 // Macro cleanup
280 #undef CUB_STORE_ALL
281 #undef CUB_STORE_1
282 #undef CUB_STORE_2
283 #undef CUB_STORE_4
284 #undef CUB_STORE_8
285 #undef CUB_STORE_16
286 
287 
291 template <typename OutputIteratorT, typename T>
292 __device__ __forceinline__ void ThreadStore(
293  OutputIteratorT itr,
294  T val,
295  Int2Type<STORE_DEFAULT> modifier,
296  Int2Type<false> is_pointer)
297 {
298  *itr = val;
299 }
300 
301 
305 template <typename T>
306 __device__ __forceinline__ void ThreadStore(
307  T *ptr,
308  T val,
309  Int2Type<STORE_DEFAULT> modifier,
310  Int2Type<true> is_pointer)
311 {
312  *ptr = val;
313 }
314 
315 
319 template <typename T>
320 __device__ __forceinline__ void ThreadStoreVolatilePtr(
321  T *ptr,
322  T val,
323  Int2Type<true> is_primitive)
324 {
325  *reinterpret_cast<volatile T*>(ptr) = val;
326 }
327 
328 
332 template <typename T>
333 __device__ __forceinline__ void ThreadStoreVolatilePtr(
334  T *ptr,
335  T val,
336  Int2Type<false> is_primitive)
337 {
338 #if CUB_PTX_ARCH <= 130
339 
340  *ptr = val;
341  __threadfence_block();
342 
343 #else
344 
345  // Create a temporary using shuffle-words, then store using volatile-words
346  typedef typename UnitWord<T>::VolatileWord VolatileWord;
347  typedef typename UnitWord<T>::ShuffleWord ShuffleWord;
348 
349  const int VOLATILE_MULTIPLE = sizeof(T) / sizeof(VolatileWord);
350  const int SHUFFLE_MULTIPLE = sizeof(T) / sizeof(ShuffleWord);
351 
352  VolatileWord words[VOLATILE_MULTIPLE];
353 
354  #pragma unroll
355  for (int i = 0; i < SHUFFLE_MULTIPLE; ++i)
356  reinterpret_cast<ShuffleWord*>(words)[i] = reinterpret_cast<ShuffleWord*>(&val)[i];
357 
358  IterateThreadStore<0, VOLATILE_MULTIPLE>::template Dereference(
359  reinterpret_cast<volatile VolatileWord*>(ptr),
360  words);
361 
362 #endif // CUB_PTX_ARCH <= 130
363 
364 }
365 
366 
370 template <typename T>
371 __device__ __forceinline__ void ThreadStore(
372  T *ptr,
373  T val,
374  Int2Type<STORE_VOLATILE> modifier,
375  Int2Type<true> is_pointer)
376 {
377  ThreadStoreVolatilePtr(ptr, val, Int2Type<Traits<T>::PRIMITIVE>());
378 }
379 
380 
384 template <typename T, int MODIFIER>
385 __device__ __forceinline__ void ThreadStore(
386  T *ptr,
387  T val,
388  Int2Type<MODIFIER> modifier,
389  Int2Type<true> is_pointer)
390 {
391  // Create a temporary using shuffle-words, then store using device-words
392  typedef typename UnitWord<T>::DeviceWord DeviceWord;
393  typedef typename UnitWord<T>::ShuffleWord ShuffleWord;
394 
395  const int DEVICE_MULTIPLE = sizeof(T) / sizeof(DeviceWord);
396  const int SHUFFLE_MULTIPLE = sizeof(T) / sizeof(ShuffleWord);
397 
398  DeviceWord words[DEVICE_MULTIPLE];
399 
400  #pragma unroll
401  for (int i = 0; i < SHUFFLE_MULTIPLE; ++i)
402  reinterpret_cast<ShuffleWord*>(words)[i] = reinterpret_cast<ShuffleWord*>(&val)[i];
403 
404  IterateThreadStore<0, DEVICE_MULTIPLE>::template Store<CacheStoreModifier(MODIFIER)>(
405  reinterpret_cast<DeviceWord*>(ptr),
406  words);
407 }
408 
409 
413 template <CacheStoreModifier MODIFIER, typename OutputIteratorT, typename T>
414 __device__ __forceinline__ void ThreadStore(OutputIteratorT itr, T val)
415 {
416  ThreadStore(
417  itr,
418  val,
419  Int2Type<MODIFIER>(),
420  Int2Type<IsPointer<OutputIteratorT>::VALUE>());
421 }
422 
423 
424 
425 #endif // DOXYGEN_SHOULD_SKIP_THIS
426 
427  // end group UtilIo
429 
430 
431 } // CUB namespace
432 CUB_NS_POSTFIX // Optional outer namespace(s)