38 #include "../util_ptx.cuh"
39 #include "../util_type.cuh"
40 #include "../util_namespace.cuh"
112 typename OutputIteratorT,
114 __device__ __forceinline__
void ThreadStore(OutputIteratorT itr, T val);
120 #ifndef DOXYGEN_SHOULD_SKIP_THIS // Do not document
124 template <
int COUNT,
int MAX>
125 struct IterateThreadStore
127 template <CacheStoreModifier MODIFIER,
typename T>
128 static __device__ __forceinline__
void Store(T *ptr, T *vals)
130 ThreadStore<MODIFIER>(ptr + COUNT, vals[COUNT]);
131 IterateThreadStore<COUNT + 1, MAX>::template Store<MODIFIER>(ptr, vals);
134 template <
typename OutputIteratorT,
typename T>
135 static __device__ __forceinline__
void Dereference(OutputIteratorT ptr, T *vals)
137 ptr[COUNT] = vals[COUNT];
138 IterateThreadStore<COUNT + 1, MAX>::Dereference(ptr, vals);
145 struct IterateThreadStore<MAX, MAX>
147 template <CacheStoreModifier MODIFIER,
typename T>
148 static __device__ __forceinline__
void Store(T *ptr, T *vals) {}
150 template <
typename OutputIteratorT,
typename T>
151 static __device__ __forceinline__
void Dereference(OutputIteratorT ptr, T *vals) {}
158 #define _CUB_STORE_16(cub_modifier, ptx_modifier) \
160 __device__ __forceinline__ void ThreadStore<cub_modifier, uint4*, uint4>(uint4* ptr, uint4 val) \
162 asm volatile ("st."#ptx_modifier".v4.u32 [%0], {%1, %2, %3, %4};" : : \
163 _CUB_ASM_PTR_(ptr), \
170 __device__ __forceinline__ void ThreadStore<cub_modifier, ulonglong2*, ulonglong2>(ulonglong2* ptr, ulonglong2 val) \
172 asm volatile ("st."#ptx_modifier".v2.u64 [%0], {%1, %2};" : : \
173 _CUB_ASM_PTR_(ptr), \
182 #define _CUB_STORE_8(cub_modifier, ptx_modifier) \
184 __device__ __forceinline__ void ThreadStore<cub_modifier, ushort4*, ushort4>(ushort4* ptr, ushort4 val) \
186 asm volatile ("st."#ptx_modifier".v4.u16 [%0], {%1, %2, %3, %4};" : : \
187 _CUB_ASM_PTR_(ptr), \
194 __device__ __forceinline__ void ThreadStore<cub_modifier, uint2*, uint2>(uint2* ptr, uint2 val) \
196 asm volatile ("st."#ptx_modifier".v2.u32 [%0], {%1, %2};" : : \
197 _CUB_ASM_PTR_(ptr), \
202 __device__ __forceinline__ void ThreadStore<cub_modifier, unsigned long long*, unsigned long long>(unsigned long long* ptr, unsigned long long val) \
204 asm volatile ("st."#ptx_modifier".u64 [%0], %1;" : : \
205 _CUB_ASM_PTR_(ptr), \
212 #define _CUB_STORE_4(cub_modifier, ptx_modifier) \
214 __device__ __forceinline__ void ThreadStore<cub_modifier, unsigned int*, unsigned int>(unsigned int* ptr, unsigned int val) \
216 asm volatile ("st."#ptx_modifier".u32 [%0], %1;" : : \
217 _CUB_ASM_PTR_(ptr), \
225 #define _CUB_STORE_2(cub_modifier, ptx_modifier) \
227 __device__ __forceinline__ void ThreadStore<cub_modifier, unsigned short*, unsigned short>(unsigned short* ptr, unsigned short val) \
229 asm volatile ("st."#ptx_modifier".u16 [%0], %1;" : : \
230 _CUB_ASM_PTR_(ptr), \
238 #define _CUB_STORE_1(cub_modifier, ptx_modifier) \
240 __device__ __forceinline__ void ThreadStore<cub_modifier, unsigned char*, unsigned char>(unsigned char* ptr, unsigned char val) \
245 " cvt.u8.u16 datum, %1;" \
246 " st."#ptx_modifier".u8 [%0], datum;" \
248 _CUB_ASM_PTR_(ptr), \
249 "h"((unsigned short) val)); \
255 #define _CUB_STORE_ALL(cub_modifier, ptx_modifier) \
256 _CUB_STORE_16(cub_modifier, ptx_modifier) \
257 _CUB_STORE_8(cub_modifier, ptx_modifier) \
258 _CUB_STORE_4(cub_modifier, ptx_modifier) \
259 _CUB_STORE_2(cub_modifier, ptx_modifier) \
260 _CUB_STORE_1(cub_modifier, ptx_modifier) \
266 #if CUB_PTX_ARCH >= 200
273 _CUB_STORE_ALL(STORE_CG, global)
274 _CUB_STORE_ALL(STORE_CS, global)
275 _CUB_STORE_ALL(STORE_WT, volatile.global)
280 #undef _CUB_STORE_ALL
291 template <
typename OutputIteratorT,
typename T>
295 Int2Type<STORE_DEFAULT> modifier,
296 Int2Type<false> is_pointer)
305 template <
typename T>
309 Int2Type<STORE_DEFAULT> modifier,
310 Int2Type<true> is_pointer)
319 template <
typename T>
320 __device__ __forceinline__
void ThreadStoreVolatilePtr(
323 Int2Type<true> is_primitive)
325 *
reinterpret_cast<volatile T*
>(ptr) = val;
332 template <
typename T>
333 __device__ __forceinline__
void ThreadStoreVolatilePtr(
336 Int2Type<false> is_primitive)
338 #if CUB_PTX_ARCH <= 130
341 __threadfence_block();
346 typedef typename UnitWord<T>::VolatileWord VolatileWord;
347 typedef typename UnitWord<T>::ShuffleWord ShuffleWord;
349 const int VOLATILE_MULTIPLE =
sizeof(T) /
sizeof(VolatileWord);
350 const int SHUFFLE_MULTIPLE =
sizeof(T) /
sizeof(ShuffleWord);
352 VolatileWord words[VOLATILE_MULTIPLE];
355 for (
int i = 0; i < SHUFFLE_MULTIPLE; ++i)
356 reinterpret_cast<ShuffleWord*>(words)[i] =
reinterpret_cast<ShuffleWord*
>(&val)[i];
358 IterateThreadStore<0, VOLATILE_MULTIPLE>::template Dereference(
359 reinterpret_cast<volatile VolatileWord*>(ptr),
362 #endif // CUB_PTX_ARCH <= 130
370 template <
typename T>
374 Int2Type<STORE_VOLATILE> modifier,
375 Int2Type<true> is_pointer)
377 ThreadStoreVolatilePtr(ptr, val, Int2Type<Traits<T>::PRIMITIVE>());
384 template <
typename T,
int MODIFIER>
388 Int2Type<MODIFIER> modifier,
389 Int2Type<true> is_pointer)
392 typedef typename UnitWord<T>::DeviceWord DeviceWord;
393 typedef typename UnitWord<T>::ShuffleWord ShuffleWord;
395 const int DEVICE_MULTIPLE =
sizeof(T) /
sizeof(DeviceWord);
396 const int SHUFFLE_MULTIPLE =
sizeof(T) /
sizeof(ShuffleWord);
398 DeviceWord words[DEVICE_MULTIPLE];
401 for (
int i = 0; i < SHUFFLE_MULTIPLE; ++i)
402 reinterpret_cast<ShuffleWord*>(words)[i] =
reinterpret_cast<ShuffleWord*
>(&val)[i];
404 IterateThreadStore<0, DEVICE_MULTIPLE>::template Store<CacheStoreModifier(MODIFIER)>(
405 reinterpret_cast<DeviceWord*
>(ptr),
413 template <CacheStoreModifier MODIFIER,
typename OutputIteratorT,
typename T>
414 __device__ __forceinline__
void ThreadStore(OutputIteratorT itr, T val)
419 Int2Type<MODIFIER>(),
420 Int2Type<IsPointer<OutputIteratorT>::VALUE>());
425 #endif // DOXYGEN_SHOULD_SKIP_THIS