39 #include "util_namespace.cuh"
60 #ifndef DOXYGEN_SHOULD_SKIP_THIS // Do not document
65 #if defined(_WIN64) || defined(__LP64__)
66 #define __CUB_LP64__ 1
68 #define _CUB_ASM_PTR_ "l"
69 #define _CUB_ASM_PTR_SIZE_ "u64"
71 #define __CUB_LP64__ 0
73 #define _CUB_ASM_PTR_ "r"
74 #define _CUB_ASM_PTR_SIZE_ "u32"
77 #endif // DOXYGEN_SHOULD_SKIP_THIS
87 __device__ __forceinline__
unsigned int SHR_ADD(
93 #if CUB_PTX_ARCH >= 200
94 asm volatile(
"vshr.u32.u32.u32.clamp.add %0, %1, %2, %3;" :
95 "=r"(ret) :
"r"(x),
"r"(shift),
"r"(addend));
97 ret = (x >> shift) + addend;
106 __device__ __forceinline__
unsigned int SHL_ADD(
112 #if CUB_PTX_ARCH >= 200
113 asm volatile(
"vshl.u32.u32.u32.clamp.add %0, %1, %2, %3;" :
114 "=r"(ret) :
"r"(x),
"r"(shift),
"r"(addend));
116 ret = (x << shift) + addend;
121 #ifndef DOXYGEN_SHOULD_SKIP_THIS // Do not document
126 template <
typename Un
signedBits,
int BYTE_LEN>
127 __device__ __forceinline__
unsigned int BFE(
129 unsigned int bit_start,
130 unsigned int num_bits,
131 Int2Type<BYTE_LEN> byte_len)
134 #if CUB_PTX_ARCH >= 200
135 asm volatile(
"bfe.u32 %0, %1, %2, %3;" :
"=r"(bits) :
"r"((
unsigned int) source),
"r"(bit_start),
"r"(num_bits));
137 const unsigned int MASK = (1 << num_bits) - 1;
138 bits = (source >> bit_start) & MASK;
147 template <
typename Un
signedBits>
148 __device__ __forceinline__
unsigned int BFE(
150 unsigned int bit_start,
151 unsigned int num_bits,
152 Int2Type<8> byte_len)
154 const unsigned long long MASK = (1ull << num_bits) - 1;
155 return (source >> bit_start) & MASK;
158 #endif // DOXYGEN_SHOULD_SKIP_THIS
163 template <
typename Un
signedBits>
164 __device__ __forceinline__
unsigned int BFE(
166 unsigned int bit_start,
167 unsigned int num_bits)
169 return BFE(source, bit_start, num_bits, Int2Type<
sizeof(UnsignedBits)>());
176 __device__ __forceinline__
void BFI(
180 unsigned int bit_start,
181 unsigned int num_bits)
183 #if CUB_PTX_ARCH >= 200
184 asm volatile(
"bfi.b32 %0, %1, %2, %3, %4;" :
185 "=r"(ret) :
"r"(y),
"r"(x),
"r"(bit_start),
"r"(num_bits));
188 unsigned int MASK_X = ((1 << num_bits) - 1) << bit_start;
189 unsigned int MASK_Y = ~MASK_X;
190 ret = (y & MASK_Y) | (x & MASK_X);
198 __device__ __forceinline__
unsigned int IADD3(
unsigned int x,
unsigned int y,
unsigned int z)
200 #if CUB_PTX_ARCH >= 200
201 asm volatile(
"vadd.u32.u32.u32.add %0, %1, %2, %3;" :
"=r"(x) :
"r"(x),
"r"(y),
"r"(z));
235 __device__ __forceinline__
int PRMT(
unsigned int a,
unsigned int b,
unsigned int index)
238 asm volatile(
"prmt.b32 %0, %1, %2, %3;" :
"=r"(ret) :
"r"(a),
"r"(b),
"r"(index));
242 #ifndef DOXYGEN_SHOULD_SKIP_THIS // Do not document
247 __device__ __forceinline__
void BAR(
int count)
249 asm volatile(
"bar.sync 1, %0;" : :
"r"(count));
256 __device__ __forceinline__
float FMUL_RZ(
float a,
float b)
259 asm volatile(
"mul.rz.f32 %0, %1, %2;" :
"=f"(d) :
"f"(a),
"f"(b));
267 __device__ __forceinline__
float FFMA_RZ(
float a,
float b,
float c)
270 asm volatile(
"fma.rz.f32 %0, %1, %2, %3;" :
"=f"(d) :
"f"(a),
"f"(b),
"f"(c));
274 #endif // DOXYGEN_SHOULD_SKIP_THIS
280 asm volatile(
"exit;");
287 __device__ __forceinline__
int RowMajorTid(
int block_dim_x,
int block_dim_y,
int block_dim_z)
289 return ((block_dim_z == 1) ? 0 : (threadIdx.z * block_dim_x * block_dim_y)) +
290 ((block_dim_y == 1) ? 0 : (threadIdx.y * block_dim_x)) +
298 __device__ __forceinline__
unsigned int LaneId()
301 asm volatile(
"mov.u32 %0, %laneid;" :
"=r"(ret) );
309 __device__ __forceinline__
unsigned int WarpId()
312 asm volatile(
"mov.u32 %0, %warpid;" :
"=r"(ret) );
322 asm volatile(
"mov.u32 %0, %lanemask_lt;" :
"=r"(ret) );
332 asm volatile(
"mov.u32 %0, %lanemask_le;" :
"=r"(ret) );
342 asm volatile(
"mov.u32 %0, %lanemask_gt;" :
"=r"(ret) );
352 asm volatile(
"mov.u32 %0, %lanemask_ge;" :
"=r"(ret) );
360 #ifndef DOXYGEN_SHOULD_SKIP_THIS // Do not document
366 template <
typename ShuffleWordT,
int STEP>
367 __device__ __forceinline__
void ShuffleUp(
369 ShuffleWordT* output,
374 unsigned int word = input[STEP];
375 asm volatile(
"shfl.up.b32 %0, %1, %2, %3;"
376 :
"=r"(word) :
"r"(word),
"r"(src_offset),
"r"(first_lane));
377 output[STEP] = (ShuffleWordT) word;
379 ShuffleUp(input, output, src_offset, first_lane, Int2Type<STEP - 1>());
386 template <
typename ShuffleWordT>
387 __device__ __forceinline__
void ShuffleUp(
389 ShuffleWordT* output,
400 template <
typename ShuffleWordT,
int STEP>
403 ShuffleWordT* output,
408 unsigned int word = input[STEP];
409 asm volatile(
"shfl.down.b32 %0, %1, %2, %3;"
410 :
"=r"(word) :
"r"(word),
"r"(src_offset),
"r"(last_lane));
411 output[STEP] = (ShuffleWordT) word;
413 ShuffleDown(input, output, src_offset, last_lane, Int2Type<STEP - 1>());
420 template <
typename ShuffleWordT>
423 ShuffleWordT* output,
433 template <
typename ShuffleWordT,
int STEP>
434 __device__ __forceinline__
void ShuffleIdx(
436 ShuffleWordT* output,
441 unsigned int word = input[STEP];
442 asm volatile(
"shfl.idx.b32 %0, %1, %2, %3;"
443 :
"=r"(word) :
"r"(word),
"r"(src_lane),
"r"(last_lane));
444 output[STEP] = (ShuffleWordT) word;
446 ShuffleIdx(input, output, src_lane, last_lane, Int2Type<STEP - 1>());
453 template <
typename ShuffleWordT>
454 __device__ __forceinline__
void ShuffleIdx(
456 ShuffleWordT* output,
465 #endif // DOXYGEN_SHOULD_SKIP_THIS // Do not document
497 template <
typename T>
503 typedef typename UnitWord<T>::ShuffleWord ShuffleWord;
505 const int WORDS = (
sizeof(T) +
sizeof(ShuffleWord) - 1) /
sizeof(ShuffleWord);
508 ShuffleWord *output_alias =
reinterpret_cast<ShuffleWord *
>(&output);
509 ShuffleWord *input_alias =
reinterpret_cast<ShuffleWord *
>(&input);
511 unsigned int shuffle_word;
512 asm volatile(
"shfl.up.b32 %0, %1, %2, %3;"
513 :
"=r"(shuffle_word) :
"r"((
unsigned int) input_alias[0]),
"r"(src_offset),
"r"(first_lane));
514 output_alias[0] = shuffle_word;
517 for (
int WORD = 1; WORD < WORDS; ++WORD)
519 asm volatile(
"shfl.up.b32 %0, %1, %2, %3;"
520 :
"=r"(shuffle_word) :
"r"((
unsigned int) input_alias[WORD]),
"r"(src_offset),
"r"(first_lane));
521 output_alias[WORD] = shuffle_word;
558 template <
typename T>
562 int last_lane = CUB_PTX_WARP_THREADS - 1)
564 typedef typename UnitWord<T>::ShuffleWord ShuffleWord;
566 const int WORDS = (
sizeof(T) +
sizeof(ShuffleWord) - 1) /
sizeof(ShuffleWord);
569 ShuffleWord *output_alias =
reinterpret_cast<ShuffleWord *
>(&output);
570 ShuffleWord *input_alias =
reinterpret_cast<ShuffleWord *
>(&input);
572 unsigned int shuffle_word;
573 asm volatile(
"shfl.down.b32 %0, %1, %2, %3;"
574 :
"=r"(shuffle_word) :
"r"((
unsigned int) input_alias[0]),
"r"(src_offset),
"r"(last_lane));
575 output_alias[0] = shuffle_word;
578 for (
int WORD = 1; WORD < WORDS; ++WORD)
580 asm volatile(
"shfl.down.b32 %0, %1, %2, %3;"
581 :
"=r"(shuffle_word) :
"r"((
unsigned int) input_alias[WORD]),
"r"(src_offset),
"r"(last_lane));
582 output_alias[WORD] = shuffle_word;
590 #ifndef DOXYGEN_SHOULD_SKIP_THIS // Do not document
599 template <
typename T>
603 int logical_warp_threads)
605 typedef typename UnitWord<T>::ShuffleWord ShuffleWord;
607 const int WORDS = (
sizeof(T) +
sizeof(ShuffleWord) - 1) /
sizeof(ShuffleWord);
610 ShuffleWord *output_alias =
reinterpret_cast<ShuffleWord *
>(&output);
611 ShuffleWord *input_alias =
reinterpret_cast<ShuffleWord *
>(&input);
613 unsigned int shuffle_word;
614 asm volatile(
"shfl.idx.b32 %0, %1, %2, %3;"
615 :
"=r"(shuffle_word) :
"r"((
unsigned int) input_alias[0]),
"r"(src_lane),
"r"(logical_warp_threads - 1));
616 output_alias[0] = shuffle_word;
619 for (
int WORD = 1; WORD < WORDS; ++WORD)
621 asm volatile(
"shfl.idx.b32 %0, %1, %2, %3;"
622 :
"=r"(shuffle_word) :
"r"((
unsigned int) input_alias[WORD]),
"r"(src_lane),
"r"(logical_warp_threads - 1));
623 output_alias[WORD] = shuffle_word;
631 #endif // DOXYGEN_SHOULD_SKIP_THIS
662 template <
typename T>
667 return ShuffleIndex(input, src_lane, CUB_PTX_WARP_THREADS);
678 __device__ __forceinline__
int WarpAll(
int cond)
680 #if CUB_PTX_ARCH < 120
682 __shared__
volatile int warp_signals[32];
685 warp_signals[
WarpId()] = 1;
688 warp_signals[
WarpId()] = 0;
690 return warp_signals[
WarpId()];
694 return ::__all(cond);
704 __device__ __forceinline__
int WarpAny(
int cond)
706 #if CUB_PTX_ARCH < 120
708 __shared__
volatile int warp_signals[32];
711 warp_signals[
WarpId()] = 0;
714 warp_signals[
WarpId()] = 1;
716 return warp_signals[
WarpId()];
720 return ::__any(cond);