// util_ptx.cuh -- CUB PTX intrinsics and inline-asm utilities
// (source listing extracted from the CUB Doxygen documentation)
1 /******************************************************************************
2  * Copyright (c) 2011, Duane Merrill. All rights reserved.
3  * Copyright (c) 2011-2016, NVIDIA CORPORATION. All rights reserved.
4  *
5  * Redistribution and use in source and binary forms, with or without
6  * modification, are permitted provided that the following conditions are met:
7  * * Redistributions of source code must retain the above copyright
8  * notice, this list of conditions and the following disclaimer.
9  * * Redistributions in binary form must reproduce the above copyright
10  * notice, this list of conditions and the following disclaimer in the
11  * documentation and/or other materials provided with the distribution.
12  * * Neither the name of the NVIDIA CORPORATION nor the
13  * names of its contributors may be used to endorse or promote products
14  * derived from this software without specific prior written permission.
15  *
16  * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
17  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
18  * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
19  * DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY
20  * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
21  * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
22  * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
23  * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
24  * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
25  * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
26  *
27  ******************************************************************************/
28 
35 #pragma once
36 
37 #include "util_type.cuh"
38 #include "util_arch.cuh"
39 #include "util_namespace.cuh"
40 #include "util_debug.cuh"
41 
42 
44 CUB_NS_PREFIX
45 
47 namespace cub {
48 
49 
56 /******************************************************************************
57  * PTX helper macros
58  ******************************************************************************/
59 
#ifndef DOXYGEN_SHOULD_SKIP_THIS    // Do not document

// Inline-asm register modifiers for pointer operands: pointers are 64-bit
// on LP64/Win64 platforms ("l" constraint) and 32-bit otherwise ("r").
#if defined(_WIN64) || defined(__LP64__)
    #define __CUB_LP64__ 1
    // 64-bit register modifier for inlined asm
    #define _CUB_ASM_PTR_ "l"
    #define _CUB_ASM_PTR_SIZE_ "u64"
#else
    #define __CUB_LP64__ 0
    // 32-bit register modifier for inlined asm
    #define _CUB_ASM_PTR_ "r"
    #define _CUB_ASM_PTR_SIZE_ "u32"
#endif

#endif // DOXYGEN_SHOULD_SKIP_THIS
78 
79 
80 /******************************************************************************
81  * Inlined PTX intrinsics
82  ******************************************************************************/
83 
/**
 * \brief Shift-right then add.  Returns <tt>(x >> shift) + addend</tt>.
 */
__device__ __forceinline__ unsigned int SHR_ADD(
    unsigned int x,
    unsigned int shift,
    unsigned int addend)
{
    unsigned int sum;
#if CUB_PTX_ARCH >= 200
    // Fused shift+add video instruction on SM20+
    asm volatile("vshr.u32.u32.u32.clamp.add %0, %1, %2, %3;" :
        "=r"(sum) : "r"(x), "r"(shift), "r"(addend));
#else
    sum = (x >> shift) + addend;
#endif
    return sum;
}
101 
102 
/**
 * \brief Shift-left then add.  Returns <tt>(x << shift) + addend</tt>.
 */
__device__ __forceinline__ unsigned int SHL_ADD(
    unsigned int x,
    unsigned int shift,
    unsigned int addend)
{
    unsigned int sum;
#if CUB_PTX_ARCH >= 200
    // Fused shift+add video instruction on SM20+
    asm volatile("vshl.u32.u32.u32.clamp.add %0, %1, %2, %3;" :
        "=r"(sum) : "r"(x), "r"(shift), "r"(addend));
#else
    sum = (x << shift) + addend;
#endif
    return sum;
}
120 
121 #ifndef DOXYGEN_SHOULD_SKIP_THIS // Do not document
122 
/**
 * \brief Bitfield-extract for source types of 4 bytes or fewer.
 *
 * Returns the \p num_bits bits of \p source beginning at \p bit_start
 * (uses the bfe.u32 instruction on SM20+, shift-and-mask otherwise).
 * The fallback path expects num_bits < 32.
 */
template <typename UnsignedBits, int BYTE_LEN>
__device__ __forceinline__ unsigned int BFE(
    UnsignedBits source,
    unsigned int bit_start,
    unsigned int num_bits,
    Int2Type<BYTE_LEN> byte_len)
{
    unsigned int bits;
#if CUB_PTX_ARCH >= 200
    asm volatile("bfe.u32 %0, %1, %2, %3;" : "=r"(bits) : "r"((unsigned int) source), "r"(bit_start), "r"(num_bits));
#else
    // Unsigned literal: (1 << 31) on a signed int is undefined behavior
    const unsigned int MASK = (1u << num_bits) - 1;
    bits = (source >> bit_start) & MASK;
#endif
    return bits;
}
142 
143 
/**
 * \brief Bitfield-extract specialization for 8-byte source types.
 *
 * bfe.u32 only handles 32-bit sources, so 64-bit types are extracted
 * with a plain 64-bit shift-and-mask.
 */
template <typename UnsignedBits>
__device__ __forceinline__ unsigned int BFE(
    UnsignedBits source,
    unsigned int bit_start,
    unsigned int num_bits,
    Int2Type<8> byte_len)
{
    unsigned long long mask = (1ull << num_bits) - 1;
    return (source >> bit_start) & mask;
}
157 
158 #endif // DOXYGEN_SHOULD_SKIP_THIS
159 
/**
 * \brief Bitfield-extract.  Returns the \p num_bits bits of \p source
 * starting at \p bit_start.
 *
 * Dispatches on sizeof(UnsignedBits) so that 8-byte types take the
 * 64-bit shift/mask specialization.
 */
template <typename UnsignedBits>
__device__ __forceinline__ unsigned int BFE(
    UnsignedBits source,
    unsigned int bit_start,
    unsigned int num_bits)
{
    return BFE(source, bit_start, num_bits, Int2Type<sizeof(UnsignedBits)>());
}
171 
172 
/**
 * \brief Bitfield-insert.  Inserts the \p num_bits low-order bits of \p x
 * into \p y at position \p bit_start, writing the result to \p ret
 * (uses the bfi.b32 instruction on SM20+).
 */
__device__ __forceinline__ void BFI(
    unsigned int &ret,
    unsigned int x,
    unsigned int y,
    unsigned int bit_start,
    unsigned int num_bits)
{
#if CUB_PTX_ARCH >= 200
    asm volatile("bfi.b32 %0, %1, %2, %3, %4;" :
        "=r"(ret) : "r"(y), "r"(x), "r"(bit_start), "r"(num_bits));
#else
    x <<= bit_start;
    // Unsigned literal: shifting into the sign bit of an int is undefined
    unsigned int MASK_X = ((1u << num_bits) - 1) << bit_start;
    unsigned int MASK_Y = ~MASK_X;
    ret = (y & MASK_Y) | (x & MASK_X);
#endif
}
193 
194 
/**
 * \brief Three-operand add.  Returns <tt>x + y + z</tt>.
 */
__device__ __forceinline__ unsigned int IADD3(unsigned int x, unsigned int y, unsigned int z)
{
#if CUB_PTX_ARCH >= 200
    // Single vadd video instruction with a fused second add on SM20+
    asm volatile("vadd.u32.u32.u32.add %0, %1, %2, %3;" : "=r"(x) : "r"(x), "r"(y), "r"(z));
#else
    x += y + z;
#endif
    return x;
}
207 
208 
/**
 * \brief Byte-permute.  Selects four bytes from the eight-byte pair
 * (\p b, \p a) according to the 4-bit selectors packed in \p index
 * (PTX prmt.b32 instruction).
 */
__device__ __forceinline__ int PRMT(unsigned int a, unsigned int b, unsigned int index)
{
    int result;
    asm volatile("prmt.b32 %0, %1, %2, %3;" : "=r"(result) : "r"(a), "r"(b), "r"(index));
    return result;
}
241 
242 #ifndef DOXYGEN_SHOULD_SKIP_THIS // Do not document
243 
/**
 * \brief Barrier for \p count threads at named barrier resource 1.
 *
 * Uses barrier 1 rather than barrier 0 (the one __syncthreads() uses).
 * NOTE(review): per the PTX ISA, the thread count for bar.sync must be a
 * multiple of the warp size -- confirm at call sites.
 */
__device__ __forceinline__ void BAR(int count)
{
    asm volatile("bar.sync 1, %0;" : : "r"(count));
}
251 
252 
/**
 * \brief Single-precision multiply with round-toward-zero (mul.rz.f32).
 */
__device__ __forceinline__ float FMUL_RZ(float a, float b)
{
    float product;
    asm volatile("mul.rz.f32 %0, %1, %2;" : "=f"(product) : "f"(a), "f"(b));
    return product;
}
262 
263 
/**
 * \brief Single-precision fused multiply-add with round-toward-zero
 * (fma.rz.f32).  Returns <tt>a * b + c</tt>.
 */
__device__ __forceinline__ float FFMA_RZ(float a, float b, float c)
{
    float result;
    asm volatile("fma.rz.f32 %0, %1, %2, %3;" : "=f"(result) : "f"(a), "f"(b), "f"(c));
    return result;
}
273 
274 #endif // DOXYGEN_SHOULD_SKIP_THIS
275 
/**
 * \brief Terminates the calling thread (PTX exit instruction).
 */
__device__ __forceinline__ void ThreadExit() {
    asm volatile("exit;");
}
282 
283 
/**
 * \brief Returns the row-major linear thread rank within a block of the
 * given dimensions.  Terms for unit dimensions are skipped so 1D/2D
 * launches pay nothing for the unused axes.
 */
__device__ __forceinline__ int RowMajorTid(int block_dim_x, int block_dim_y, int block_dim_z)
{
    int tid = threadIdx.x;
    if (block_dim_y != 1)
        tid += threadIdx.y * block_dim_x;
    if (block_dim_z != 1)
        tid += threadIdx.z * block_dim_x * block_dim_y;
    return tid;
}
293 
294 
/**
 * \brief Returns the warp lane index (0..31) of the calling thread
 * (special register %laneid).
 */
__device__ __forceinline__ unsigned int LaneId()
{
    unsigned int lane;
    asm volatile("mov.u32 %0, %laneid;" : "=r"(lane) );
    return lane;
}
304 
305 
/**
 * \brief Returns the warp ID of the calling thread (special register %warpid).
 *
 * NOTE(review): %warpid is the SM-resident warp slot -- unique among
 * concurrently resident warps, but per the PTX ISA it is not necessarily a
 * zero-based warp rank within the thread block and is described as volatile.
 * Confirm callers treat it only as an opaque identifier.
 */
__device__ __forceinline__ unsigned int WarpId()
{
    unsigned int ret;
    asm volatile("mov.u32 %0, %warpid;" : "=r"(ret) );
    return ret;
}
315 
/**
 * \brief Returns the 32-bit mask of lanes strictly below the calling lane
 * (special register %lanemask_lt).
 */
__device__ __forceinline__ unsigned int LaneMaskLt()
{
    unsigned int mask;
    asm volatile("mov.u32 %0, %lanemask_lt;" : "=r"(mask) );
    return mask;
}
325 
/**
 * \brief Returns the 32-bit mask of lanes at or below the calling lane
 * (special register %lanemask_le).
 */
__device__ __forceinline__ unsigned int LaneMaskLe()
{
    unsigned int mask;
    asm volatile("mov.u32 %0, %lanemask_le;" : "=r"(mask) );
    return mask;
}
335 
/**
 * \brief Returns the 32-bit mask of lanes strictly above the calling lane
 * (special register %lanemask_gt).
 */
__device__ __forceinline__ unsigned int LaneMaskGt()
{
    unsigned int mask;
    asm volatile("mov.u32 %0, %lanemask_gt;" : "=r"(mask) );
    return mask;
}
345 
/**
 * \brief Returns the 32-bit mask of lanes at or above the calling lane
 * (special register %lanemask_ge).
 */
__device__ __forceinline__ unsigned int LaneMaskGe()
{
    unsigned int mask;
    asm volatile("mov.u32 %0, %lanemask_ge;" : "=r"(mask) );
    return mask;
}
355  // end group UtilPtx
357 
358 
359 
360 #ifndef DOXYGEN_SHOULD_SKIP_THIS // Do not document
361 
362 
/**
 * \brief Shuffles word STEP of a multi-word value up the warp, then
 * recurses on the remaining lower-indexed words.
 */
template <typename ShuffleWordT, int STEP>
__device__ __forceinline__ void ShuffleUp(
    ShuffleWordT* input,
    ShuffleWordT* output,
    int src_offset,
    int first_lane,
    Int2Type<STEP> step)
{
    unsigned int shfl_word = input[STEP];
    asm volatile("shfl.up.b32 %0, %1, %2, %3;"
        : "=r"(shfl_word) : "r"(shfl_word), "r"(src_offset), "r"(first_lane));
    output[STEP] = (ShuffleWordT) shfl_word;

    // Recurse toward the Int2Type<-1> terminator
    ShuffleUp(input, output, src_offset, first_lane, Int2Type<STEP - 1>());
}
381 
382 
/**
 * \brief Recursion terminator for word-wise ShuffleUp (no words remain).
 */
template <typename ShuffleWordT>
__device__ __forceinline__ void ShuffleUp(
    ShuffleWordT* input,
    ShuffleWordT* output,
    int src_offset,
    int first_lane,
    Int2Type<-1> step)
{}
394 
395 
396 
/**
 * \brief Shuffles word STEP of a multi-word value down the warp, then
 * recurses on the remaining lower-indexed words.
 */
template <typename ShuffleWordT, int STEP>
__device__ __forceinline__ void ShuffleDown(
    ShuffleWordT* input,
    ShuffleWordT* output,
    int src_offset,
    int last_lane,
    Int2Type<STEP> step)
{
    unsigned int shfl_word = input[STEP];
    asm volatile("shfl.down.b32 %0, %1, %2, %3;"
        : "=r"(shfl_word) : "r"(shfl_word), "r"(src_offset), "r"(last_lane));
    output[STEP] = (ShuffleWordT) shfl_word;

    // Recurse toward the Int2Type<-1> terminator
    ShuffleDown(input, output, src_offset, last_lane, Int2Type<STEP - 1>());
}
415 
416 
/**
 * \brief Recursion terminator for word-wise ShuffleDown (no words remain).
 */
template <typename ShuffleWordT>
__device__ __forceinline__ void ShuffleDown(
    ShuffleWordT* input,
    ShuffleWordT* output,
    int src_offset,
    int last_lane,
    Int2Type<-1> step)
{}
428 
429 
/**
 * \brief Shuffles word STEP of a multi-word value from \p src_lane, then
 * recurses on the remaining lower-indexed words.
 */
template <typename ShuffleWordT, int STEP>
__device__ __forceinline__ void ShuffleIdx(
    ShuffleWordT* input,
    ShuffleWordT* output,
    int src_lane,
    int last_lane,
    Int2Type<STEP> step)
{
    unsigned int shfl_word = input[STEP];
    asm volatile("shfl.idx.b32 %0, %1, %2, %3;"
        : "=r"(shfl_word) : "r"(shfl_word), "r"(src_lane), "r"(last_lane));
    output[STEP] = (ShuffleWordT) shfl_word;

    // Recurse toward the Int2Type<-1> terminator
    ShuffleIdx(input, output, src_lane, last_lane, Int2Type<STEP - 1>());
}
448 
449 
/**
 * \brief Recursion terminator for word-wise ShuffleIdx (no words remain).
 */
template <typename ShuffleWordT>
__device__ __forceinline__ void ShuffleIdx(
    ShuffleWordT* input,
    ShuffleWordT* output,
    int src_lane,
    int last_lane,
    Int2Type<-1> step)
{}
461 
462 
463 
464 
465 #endif // DOXYGEN_SHOULD_SKIP_THIS // Do not document
466 
467 
468 
/**
 * \brief Shuffles \p input up the warp by \p src_offset lanes, word by
 * word, so arbitrary trivially-copyable T can be exchanged.
 *
 * \param input       Value contributed by the calling lane
 * \param src_offset  Lane distance to shuffle up
 * \param first_lane  Lowest lane participating in the shuffle segment
 */
template <typename T>
__device__ __forceinline__ T ShuffleUp(
    T input,
    int src_offset,
    int first_lane = 0)
{
    typedef typename UnitWord<T>::ShuffleWord ShuffleWord;

    // Number of shuffle words needed to cover T (ceiling division)
    const int WORDS = (sizeof(T) + sizeof(ShuffleWord) - 1) / sizeof(ShuffleWord);

    T output;
    ShuffleWord *dest_words = reinterpret_cast<ShuffleWord *>(&output);
    ShuffleWord *src_words  = reinterpret_cast<ShuffleWord *>(&input);

    // Exchange each constituent word with shfl.up.b32
    #pragma unroll
    for (int WORD = 0; WORD < WORDS; ++WORD)
    {
        unsigned int shfl_word;
        asm volatile("shfl.up.b32 %0, %1, %2, %3;"
            : "=r"(shfl_word) : "r"((unsigned int) src_words[WORD]), "r"(src_offset), "r"(first_lane));
        dest_words[WORD] = shfl_word;
    }

    return output;
}
528 
529 
/**
 * \brief Shuffles \p input down the warp by \p src_offset lanes, word by
 * word, so arbitrary trivially-copyable T can be exchanged.
 *
 * \param input       Value contributed by the calling lane
 * \param src_offset  Lane distance to shuffle down
 * \param last_lane   Highest lane participating in the shuffle segment
 */
template <typename T>
__device__ __forceinline__ T ShuffleDown(
    T input,
    int src_offset,
    int last_lane = CUB_PTX_WARP_THREADS - 1)
{
    typedef typename UnitWord<T>::ShuffleWord ShuffleWord;

    // Number of shuffle words needed to cover T (ceiling division)
    const int WORDS = (sizeof(T) + sizeof(ShuffleWord) - 1) / sizeof(ShuffleWord);

    T output;
    ShuffleWord *dest_words = reinterpret_cast<ShuffleWord *>(&output);
    ShuffleWord *src_words  = reinterpret_cast<ShuffleWord *>(&input);

    // Exchange each constituent word with shfl.down.b32
    #pragma unroll
    for (int WORD = 0; WORD < WORDS; ++WORD)
    {
        unsigned int shfl_word;
        asm volatile("shfl.down.b32 %0, %1, %2, %3;"
            : "=r"(shfl_word) : "r"((unsigned int) src_words[WORD]), "r"(src_offset), "r"(last_lane));
        dest_words[WORD] = shfl_word;
    }

    return output;
}
589 
590 #ifndef DOXYGEN_SHOULD_SKIP_THIS // Do not document
591 
/**
 * \brief Broadcast-shuffles \p input from \p src_lane within a logical
 * warp of \p logical_warp_threads threads, word by word, so arbitrary
 * trivially-copyable T can be exchanged.
 */
template <typename T>
__device__ __forceinline__ T ShuffleIndex(
    T input,
    int src_lane,
    int logical_warp_threads)
{
    typedef typename UnitWord<T>::ShuffleWord ShuffleWord;

    // Number of shuffle words needed to cover T (ceiling division)
    const int WORDS = (sizeof(T) + sizeof(ShuffleWord) - 1) / sizeof(ShuffleWord);

    T output;
    ShuffleWord *dest_words = reinterpret_cast<ShuffleWord *>(&output);
    ShuffleWord *src_words  = reinterpret_cast<ShuffleWord *>(&input);

    // Exchange each constituent word with shfl.idx.b32, clamped to the
    // logical warp width
    #pragma unroll
    for (int WORD = 0; WORD < WORDS; ++WORD)
    {
        unsigned int shfl_word;
        asm volatile("shfl.idx.b32 %0, %1, %2, %3;"
            : "=r"(shfl_word) : "r"((unsigned int) src_words[WORD]), "r"(src_lane), "r"(logical_warp_threads - 1));
        dest_words[WORD] = shfl_word;
    }

    return output;
}
630 
631 #endif // DOXYGEN_SHOULD_SKIP_THIS
632 
633 
/**
 * \brief Broadcast-shuffles \p input from \p src_lane across the full
 * physical warp (CUB_PTX_WARP_THREADS logical width).
 */
template <typename T>
__device__ __forceinline__ T ShuffleIndex(
    T input,
    int src_lane)
{
    return ShuffleIndex(input, src_lane, CUB_PTX_WARP_THREADS);
}
669 
670 
671 
672 
673 
/**
 * \brief Warp vote: returns non-zero iff \p cond is non-zero for every
 * thread in the calling warp.
 */
__device__ __forceinline__ int WarpAll(int cond)
{
#if CUB_PTX_ARCH < 120

    // Pre-SM12 has no __all(): emulate with a per-warp flag in shared memory.
    // NOTE(review): no barriers between the write and read -- this relies on
    // warp-synchronous execution on these architectures, and assumes at most
    // 32 resident warps per block.
    __shared__ volatile int warp_signals[32];

    // Lane 0 optimistically sets the flag...
    if (LaneId() == 0)
        warp_signals[WarpId()] = 1;

    // ...then any lane whose predicate is false clears it
    if (cond == 0)
        warp_signals[WarpId()] = 0;

    return warp_signals[WarpId()];

#else

    // Legacy mask-less vote intrinsic (this codebase predates *_sync variants)
    return ::__all(cond);

#endif
}
698 
699 
/**
 * \brief Warp vote: returns non-zero iff \p cond is non-zero for any
 * thread in the calling warp.
 */
__device__ __forceinline__ int WarpAny(int cond)
{
#if CUB_PTX_ARCH < 120

    // Pre-SM12 has no __any(): emulate with a per-warp flag in shared memory.
    // NOTE(review): no barriers between the write and read -- this relies on
    // warp-synchronous execution on these architectures, and assumes at most
    // 32 resident warps per block.
    __shared__ volatile int warp_signals[32];

    // Lane 0 optimistically clears the flag...
    if (LaneId() == 0)
        warp_signals[WarpId()] = 0;

    // ...then any lane whose predicate is true sets it
    if (cond)
        warp_signals[WarpId()] = 1;

    return warp_signals[WarpId()];

#else

    // Legacy mask-less vote intrinsic (this codebase predates *_sync variants)
    return ::__any(cond);

#endif
}
724 
725 
726 } // CUB namespace
727 CUB_NS_POSTFIX // Optional outer namespace(s)