block_store.cuh
/******************************************************************************
 * Copyright (c) 2011, Duane Merrill.  All rights reserved.
 * Copyright (c) 2011-2016, NVIDIA CORPORATION.  All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions are met:
 *     * Redistributions of source code must retain the above copyright
 *       notice, this list of conditions and the following disclaimer.
 *     * Redistributions in binary form must reproduce the above copyright
 *       notice, this list of conditions and the following disclaimer in the
 *       documentation and/or other materials provided with the distribution.
 *     * Neither the name of the NVIDIA CORPORATION nor the
 *       names of its contributors may be used to endorse or promote products
 *       derived from this software without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
 * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
 * DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY
 * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
 * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
 * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
 * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
 * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 *
 ******************************************************************************/

/**
 * \file
 * Operations for writing linear segments of data from the CUDA thread block
 */

#pragma once

#include <iterator>

#include "block_exchange.cuh"
#include "../util_ptx.cuh"
#include "../util_macro.cuh"
#include "../util_type.cuh"
#include "../util_namespace.cuh"

/// Optional outer namespace(s)
CUB_NS_PREFIX

/// CUB namespace
namespace cub {

/**
 * \addtogroup UtilIo
 * @{
 */

/******************************************************************
 * Blocked arrangement I/O (direct)
 ******************************************************************/

/**
 * Store a blocked arrangement of items across a thread block into a linear
 * segment of items.
 *
 * \param linear_tid    A suitable 1D thread-identifier for the calling thread
 * \param block_itr     The thread block's base output iterator for storing to
 * \param items         Data to store
 */
template <
    typename        T,
    int             ITEMS_PER_THREAD,
    typename        OutputIteratorT>
__device__ __forceinline__ void StoreDirectBlocked(
    int             linear_tid,
    OutputIteratorT block_itr,
    T               (&items)[ITEMS_PER_THREAD])
{
    // Store directly in thread-blocked order
    #pragma unroll
    for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ITEM++)
    {
        block_itr[(linear_tid * ITEMS_PER_THREAD) + ITEM] = items[ITEM];
    }
}
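
/*
 * Usage sketch (illustrative, not part of the library): a hypothetical kernel
 * in which each of 128 threads writes 4 consecutive ints in blocked order.
 * The kernel name and d_out are assumptions for the example.
 *
 *     __global__ void ExampleStoreBlocked(int *d_out)
 *     {
 *         int items[4];
 *         #pragma unroll
 *         for (int i = 0; i < 4; ++i)
 *             items[i] = threadIdx.x * 4 + i;     // compute items locally
 *
 *         // Thread t writes d_out[t*4 + 0] .. d_out[t*4 + 3]
 *         cub::StoreDirectBlocked(threadIdx.x, d_out, items);
 *     }
 */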


/**
 * Store a blocked arrangement of items across a thread block into a linear
 * segment of items, guarded by range.
 *
 * \param valid_items   Number of valid items to store
 */
template <
    typename        T,
    int             ITEMS_PER_THREAD,
    typename        OutputIteratorT>
__device__ __forceinline__ void StoreDirectBlocked(
    int             linear_tid,
    OutputIteratorT block_itr,
    T               (&items)[ITEMS_PER_THREAD],
    int             valid_items)
{
    // Store directly in thread-blocked order
    #pragma unroll
    for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ITEM++)
    {
        if (ITEM + (linear_tid * ITEMS_PER_THREAD) < valid_items)
        {
            block_itr[(linear_tid * ITEMS_PER_THREAD) + ITEM] = items[ITEM];
        }
    }
}
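
/*
 * Usage sketch for the guarded overload (illustrative): only outputs below
 * valid_items are written, e.g. for a partially-full final tile. The names
 * num_items and tile_offset are assumptions for the example.
 *
 *     // 128 threads x 4 items = 512-item tiles; the last tile may be short
 *     int valid_items = num_items - tile_offset;
 *     cub::StoreDirectBlocked(threadIdx.x, d_out + tile_offset, items, valid_items);
 */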


/**
 * Store a blocked arrangement of items across a thread block into a linear
 * segment of items, attempting to issue vectorized stores.
 *
 * The base pointer \p block_ptr must be suitably aligned for the vector type
 * used (up to 4 items wide); when ITEMS_PER_THREAD has no power-of-two
 * vector width that evenly divides it, the store degenerates to one item
 * per transaction.
 */
template <
    typename    T,
    int         ITEMS_PER_THREAD>
__device__ __forceinline__ void StoreDirectBlockedVectorized(
    int         linear_tid,
    T           *block_ptr,
    T           (&items)[ITEMS_PER_THREAD])
{
    enum
    {
        // Maximum CUDA vector size is 4 elements
        MAX_VEC_SIZE = CUB_MIN(4, ITEMS_PER_THREAD),

        // Vector size must be a power of two and an even divisor of the items per thread
        VEC_SIZE = ((((MAX_VEC_SIZE - 1) & MAX_VEC_SIZE) == 0) && ((ITEMS_PER_THREAD % MAX_VEC_SIZE) == 0)) ?
            MAX_VEC_SIZE :
            1,

        VECTORS_PER_THREAD = ITEMS_PER_THREAD / VEC_SIZE,
    };

    // Vector type
    typedef typename CubVector<T, VEC_SIZE>::Type Vector;

    // Alias global pointer
    Vector *block_ptr_vectors = reinterpret_cast<Vector*>(const_cast<T*>(block_ptr));

    // Alias pointers (use "raw" array here which should get optimized away to prevent conservative PTXAS lmem spilling)
    Vector raw_vector[VECTORS_PER_THREAD];
    T *raw_items = reinterpret_cast<T*>(raw_vector);

    // Copy
    #pragma unroll
    for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ITEM++)
    {
        raw_items[ITEM] = items[ITEM];
    }

    // Direct-store using vector types
    StoreDirectBlocked(linear_tid, block_ptr_vectors, raw_vector);
}
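
/*
 * Usage sketch (illustrative): with T = int and ITEMS_PER_THREAD = 4, each
 * thread issues a single 16-byte int4 store. This assumes d_out (a name
 * invented for the example) is 16-byte aligned, which holds for cudaMalloc
 * allocations.
 *
 *     cub::StoreDirectBlockedVectorized(threadIdx.x, d_out, items);
 */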



/******************************************************************
 * Striped arrangement I/O (direct)
 ******************************************************************/

/**
 * Store a striped arrangement of data across the thread block into a linear
 * segment of items.
 *
 * \tparam BLOCK_THREADS    The thread block size in threads
 */
template <
    int             BLOCK_THREADS,
    typename        T,
    int             ITEMS_PER_THREAD,
    typename        OutputIteratorT>
__device__ __forceinline__ void StoreDirectStriped(
    int             linear_tid,
    OutputIteratorT block_itr,
    T               (&items)[ITEMS_PER_THREAD])
{
    // Store directly in striped order
    #pragma unroll
    for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ITEM++)
    {
        block_itr[(ITEM * BLOCK_THREADS) + linear_tid] = items[ITEM];
    }
}
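
/*
 * Usage sketch (illustrative): with BLOCK_THREADS = 128 and 4 items per
 * thread, thread t writes d_out[t], d_out[t + 128], d_out[t + 256], and
 * d_out[t + 384], so each unrolled iteration is a fully coalesced
 * block-wide write. d_out is a name invented for the example.
 *
 *     cub::StoreDirectStriped<128>(threadIdx.x, d_out, items);
 */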


/**
 * Store a striped arrangement of data across the thread block into a linear
 * segment of items, guarded by range.
 */
template <
    int             BLOCK_THREADS,
    typename        T,
    int             ITEMS_PER_THREAD,
    typename        OutputIteratorT>
__device__ __forceinline__ void StoreDirectStriped(
    int             linear_tid,
    OutputIteratorT block_itr,
    T               (&items)[ITEMS_PER_THREAD],
    int             valid_items)
{
    // Store directly in striped order
    #pragma unroll
    for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ITEM++)
    {
        if ((ITEM * BLOCK_THREADS) + linear_tid < valid_items)
        {
            block_itr[(ITEM * BLOCK_THREADS) + linear_tid] = items[ITEM];
        }
    }
}



/******************************************************************
 * Warp-striped arrangement I/O (direct)
 ******************************************************************/

/**
 * Store a warp-striped arrangement of data across the thread block into a
 * linear segment of items.
 *
 * Usage is restricted to thread block sizes that are a multiple of the
 * architecture's warp size.
 */
template <
    typename        T,
    int             ITEMS_PER_THREAD,
    typename        OutputIteratorT>
__device__ __forceinline__ void StoreDirectWarpStriped(
    int             linear_tid,
    OutputIteratorT block_itr,
    T               (&items)[ITEMS_PER_THREAD])
{
    int tid         = linear_tid & (CUB_PTX_WARP_THREADS - 1);
    int wid         = linear_tid >> CUB_PTX_LOG_WARP_THREADS;
    int warp_offset = wid * CUB_PTX_WARP_THREADS * ITEMS_PER_THREAD;

    // Store directly in warp-striped order
    #pragma unroll
    for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ITEM++)
    {
        block_itr[warp_offset + tid + (ITEM * CUB_PTX_WARP_THREADS)] = items[ITEM];
    }
}


/**
 * Store a warp-striped arrangement of data across the thread block into a
 * linear segment of items, guarded by range.
 */
template <
    typename        T,
    int             ITEMS_PER_THREAD,
    typename        OutputIteratorT>
__device__ __forceinline__ void StoreDirectWarpStriped(
    int             linear_tid,
    OutputIteratorT block_itr,
    T               (&items)[ITEMS_PER_THREAD],
    int             valid_items)
{
    int tid         = linear_tid & (CUB_PTX_WARP_THREADS - 1);
    int wid         = linear_tid >> CUB_PTX_LOG_WARP_THREADS;
    int warp_offset = wid * CUB_PTX_WARP_THREADS * ITEMS_PER_THREAD;

    // Store directly in warp-striped order
    #pragma unroll
    for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ITEM++)
    {
        if (warp_offset + tid + (ITEM * CUB_PTX_WARP_THREADS) < valid_items)
        {
            block_itr[warp_offset + tid + (ITEM * CUB_PTX_WARP_THREADS)] = items[ITEM];
        }
    }
}
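
/*
 * Usage sketch (illustrative): items must already be in warp-striped order,
 * e.g. produced by BlockExchange::BlockedToWarpStriped. With 32-thread warps
 * and 4 items per thread, lane l of warp w writes d_out[w*128 + l + ITEM*32],
 * yielding coalesced warp-wide writes. d_out and valid_items are names
 * invented for the example.
 *
 *     cub::StoreDirectWarpStriped(threadIdx.x, d_out, items);               // full tile
 *     cub::StoreDirectWarpStriped(threadIdx.x, d_out, items, valid_items);  // guarded
 */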


/** @} */       // end group UtilIo


//-----------------------------------------------------------------------------
// Generic BlockStore abstraction
//-----------------------------------------------------------------------------

/**
 * cub::BlockStoreAlgorithm enumerates alternative algorithms for
 * cub::BlockStore to write a blocked arrangement of items across a CUDA
 * thread block to a linear segment of memory.
 */
enum BlockStoreAlgorithm
{
    /// Write directly in blocked order.  Low latency and no shared memory
    /// use, but memory accesses become less coalesced as ITEMS_PER_THREAD
    /// grows.
    BLOCK_STORE_DIRECT,

    /// Write in blocked order using vectorized stores.  Highest bandwidth,
    /// but only applies when the output is a naturally-aligned pointer to a
    /// built-in type and ITEMS_PER_THREAD permits vectorization; otherwise
    /// behaves like BLOCK_STORE_DIRECT.
    BLOCK_STORE_VECTORIZE,

    /// Locally transpose from blocked to striped order (via shared memory),
    /// then write out.  High coalescing at the cost of a block-wide
    /// exchange.
    BLOCK_STORE_TRANSPOSE,

    /// Locally transpose from blocked to warp-striped order, then write
    /// out.  Requires the block size to be a multiple of the warp size.
    BLOCK_STORE_WARP_TRANSPOSE,

    /// As BLOCK_STORE_WARP_TRANSPOSE, but the exchange is time-sliced so
    /// that only one warp's worth of shared memory is provisioned.
    BLOCK_STORE_WARP_TRANSPOSE_TIMESLICED,
};

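/*
 * Choosing an algorithm (illustrative note): BLOCK_STORE_DIRECT costs no
 * shared memory but sacrifices coalescing as ITEMS_PER_THREAD grows;
 * BLOCK_STORE_VECTORIZE is fastest when its pointer and alignment
 * requirements hold; the TRANSPOSE variants buy consistently coalesced
 * writes with a shared-memory exchange.  For example, a hypothetical alias
 * for 128 threads storing 4 ints each via warp transposition:
 *
 *     typedef cub::BlockStore<int*, 128, 4, cub::BLOCK_STORE_WARP_TRANSPOSE> BlockStoreT;
 */
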

/**
 * The BlockStore class provides collective data movement methods for
 * writing a blocked arrangement of items partitioned across a CUDA thread
 * block to a linear segment of memory, using the strategy specified by
 * \p ALGORITHM.
 */
template <
    typename            OutputIteratorT,
    int                 BLOCK_DIM_X,
    int                 ITEMS_PER_THREAD,
    BlockStoreAlgorithm ALGORITHM   = BLOCK_STORE_DIRECT,
    int                 BLOCK_DIM_Y = 1,
    int                 BLOCK_DIM_Z = 1,
    int                 PTX_ARCH    = CUB_PTX_ARCH>
class BlockStore
{
private:
    /******************************************************************************
     * Constants and typed definitions
     ******************************************************************************/

    /// Constants
    enum
    {
        /// The thread block size in threads
        BLOCK_THREADS = BLOCK_DIM_X * BLOCK_DIM_Y * BLOCK_DIM_Z,
    };

    // Data type of output iterator
    typedef typename std::iterator_traits<OutputIteratorT>::value_type T;


    /******************************************************************************
     * Algorithmic variants
     ******************************************************************************/

    /// Store helper, specialized per algorithm
    template <BlockStoreAlgorithm _POLICY, int DUMMY>
    struct StoreInternal;


    /**
     * BLOCK_STORE_DIRECT specialization of store helper
     */
    template <int DUMMY>
    struct StoreInternal<BLOCK_STORE_DIRECT, DUMMY>
    {
        /// Shared memory storage layout type
        typedef NullType TempStorage;

        /// Linear thread-id
        int linear_tid;

        /// Constructor
        __device__ __forceinline__ StoreInternal(
            TempStorage &temp_storage,
            int linear_tid)
        :
            linear_tid(linear_tid)
        {}

        /// Store items into a linear segment of memory
        __device__ __forceinline__ void Store(
            OutputIteratorT block_itr,
            T               (&items)[ITEMS_PER_THREAD])
        {
            StoreDirectBlocked(linear_tid, block_itr, items);
        }

        /// Store items into a linear segment of memory, guarded by range
        __device__ __forceinline__ void Store(
            OutputIteratorT block_itr,
            T               (&items)[ITEMS_PER_THREAD],
            int             valid_items)
        {
            StoreDirectBlocked(linear_tid, block_itr, items, valid_items);
        }
    };


    /**
     * BLOCK_STORE_VECTORIZE specialization of store helper
     */
    template <int DUMMY>
    struct StoreInternal<BLOCK_STORE_VECTORIZE, DUMMY>
    {
        /// Shared memory storage layout type
        typedef NullType TempStorage;

        /// Linear thread-id
        int linear_tid;

        /// Constructor
        __device__ __forceinline__ StoreInternal(
            TempStorage &temp_storage,
            int linear_tid)
        :
            linear_tid(linear_tid)
        {}

        /// Store items into a linear segment of memory, specialized for
        /// native pointer types (attempts vectorization)
        __device__ __forceinline__ void Store(
            T   *block_ptr,
            T   (&items)[ITEMS_PER_THREAD])
        {
            StoreDirectBlockedVectorized(linear_tid, block_ptr, items);
        }

        /// Store items into a linear segment of memory, specialized for
        /// opaque (non-pointer) iterators (skips vectorization)
        template <typename _OutputIteratorT>
        __device__ __forceinline__ void Store(
            _OutputIteratorT    block_itr,
            T                   (&items)[ITEMS_PER_THREAD])
        {
            StoreDirectBlocked(linear_tid, block_itr, items);
        }

        /// Store items into a linear segment of memory, guarded by range
        __device__ __forceinline__ void Store(
            OutputIteratorT block_itr,
            T               (&items)[ITEMS_PER_THREAD],
            int             valid_items)
        {
            StoreDirectBlocked(linear_tid, block_itr, items, valid_items);
        }
    };


    /**
     * BLOCK_STORE_TRANSPOSE specialization of store helper
     */
    template <int DUMMY>
    struct StoreInternal<BLOCK_STORE_TRANSPOSE, DUMMY>
    {
        // BlockExchange utility type for keys
        typedef BlockExchange<T, BLOCK_DIM_X, ITEMS_PER_THREAD, false, BLOCK_DIM_Y, BLOCK_DIM_Z, PTX_ARCH> BlockExchange;

        /// Shared memory storage layout type
        typedef typename BlockExchange::TempStorage _TempStorage;

        /// Alias wrapper allowing storage to be unioned
        struct TempStorage : Uninitialized<_TempStorage> {};

        /// Thread reference to shared storage
        _TempStorage &temp_storage;

        /// Linear thread-id
        int linear_tid;

        /// Constructor
        __device__ __forceinline__ StoreInternal(
            TempStorage &temp_storage,
            int linear_tid)
        :
            temp_storage(temp_storage.Alias()),
            linear_tid(linear_tid)
        {}

        /// Store items into a linear segment of memory
        __device__ __forceinline__ void Store(
            OutputIteratorT block_itr,
            T               (&items)[ITEMS_PER_THREAD])
        {
            BlockExchange(temp_storage).BlockedToStriped(items);
            StoreDirectStriped<BLOCK_THREADS>(linear_tid, block_itr, items);
        }

        /// Store items into a linear segment of memory, guarded by range
        __device__ __forceinline__ void Store(
            OutputIteratorT block_itr,
            T               (&items)[ITEMS_PER_THREAD],
            int             valid_items)
        {
            BlockExchange(temp_storage).BlockedToStriped(items);
            StoreDirectStriped<BLOCK_THREADS>(linear_tid, block_itr, items, valid_items);
        }
    };


    /**
     * BLOCK_STORE_WARP_TRANSPOSE specialization of store helper
     */
    template <int DUMMY>
    struct StoreInternal<BLOCK_STORE_WARP_TRANSPOSE, DUMMY>
    {
        enum
        {
            WARP_THREADS = CUB_WARP_THREADS(PTX_ARCH)
        };

        // Assert BLOCK_THREADS must be a multiple of WARP_THREADS
        CUB_STATIC_ASSERT((BLOCK_THREADS % WARP_THREADS == 0), "BLOCK_THREADS must be a multiple of WARP_THREADS");

        // BlockExchange utility type for keys
        typedef BlockExchange<T, BLOCK_DIM_X, ITEMS_PER_THREAD, false, BLOCK_DIM_Y, BLOCK_DIM_Z, PTX_ARCH> BlockExchange;

        /// Shared memory storage layout type
        typedef typename BlockExchange::TempStorage _TempStorage;

        /// Alias wrapper allowing storage to be unioned
        struct TempStorage : Uninitialized<_TempStorage> {};

        /// Thread reference to shared storage
        _TempStorage &temp_storage;

        /// Linear thread-id
        int linear_tid;

        /// Constructor
        __device__ __forceinline__ StoreInternal(
            TempStorage &temp_storage,
            int linear_tid)
        :
            temp_storage(temp_storage.Alias()),
            linear_tid(linear_tid)
        {}

        /// Store items into a linear segment of memory
        __device__ __forceinline__ void Store(
            OutputIteratorT block_itr,
            T               (&items)[ITEMS_PER_THREAD])
        {
            BlockExchange(temp_storage).BlockedToWarpStriped(items);
            StoreDirectWarpStriped(linear_tid, block_itr, items);
        }

        /// Store items into a linear segment of memory, guarded by range
        __device__ __forceinline__ void Store(
            OutputIteratorT block_itr,
            T               (&items)[ITEMS_PER_THREAD],
            int             valid_items)
        {
            BlockExchange(temp_storage).BlockedToWarpStriped(items);
            StoreDirectWarpStriped(linear_tid, block_itr, items, valid_items);
        }
    };


    /**
     * BLOCK_STORE_WARP_TRANSPOSE_TIMESLICED specialization of store helper
     */
    template <int DUMMY>
    struct StoreInternal<BLOCK_STORE_WARP_TRANSPOSE_TIMESLICED, DUMMY>
    {
        enum
        {
            WARP_THREADS = CUB_WARP_THREADS(PTX_ARCH)
        };

        // Assert BLOCK_THREADS must be a multiple of WARP_THREADS
        CUB_STATIC_ASSERT((BLOCK_THREADS % WARP_THREADS == 0), "BLOCK_THREADS must be a multiple of WARP_THREADS");

        // BlockExchange utility type for keys (time-sliced)
        typedef BlockExchange<T, BLOCK_DIM_X, ITEMS_PER_THREAD, true, BLOCK_DIM_Y, BLOCK_DIM_Z, PTX_ARCH> BlockExchange;

        /// Shared memory storage layout type
        typedef typename BlockExchange::TempStorage _TempStorage;

        /// Alias wrapper allowing storage to be unioned
        struct TempStorage : Uninitialized<_TempStorage> {};

        /// Thread reference to shared storage
        _TempStorage &temp_storage;

        /// Linear thread-id
        int linear_tid;

        /// Constructor
        __device__ __forceinline__ StoreInternal(
            TempStorage &temp_storage,
            int linear_tid)
        :
            temp_storage(temp_storage.Alias()),
            linear_tid(linear_tid)
        {}

        /// Store items into a linear segment of memory
        __device__ __forceinline__ void Store(
            OutputIteratorT block_itr,
            T               (&items)[ITEMS_PER_THREAD])
        {
            BlockExchange(temp_storage).BlockedToWarpStriped(items);
            StoreDirectWarpStriped(linear_tid, block_itr, items);
        }

        /// Store items into a linear segment of memory, guarded by range
        __device__ __forceinline__ void Store(
            OutputIteratorT block_itr,
            T               (&items)[ITEMS_PER_THREAD],
            int             valid_items)
        {
            BlockExchange(temp_storage).BlockedToWarpStriped(items);
            StoreDirectWarpStriped(linear_tid, block_itr, items, valid_items);
        }
    };

    /******************************************************************************
     * Type definitions
     ******************************************************************************/

    /// Internal store implementation to use
    typedef StoreInternal<ALGORITHM, 0> InternalStore;

    /// Shared memory storage layout type
    typedef typename InternalStore::TempStorage _TempStorage;


    /******************************************************************************
     * Utility methods
     ******************************************************************************/

    /// Internal storage allocator
    __device__ __forceinline__ _TempStorage& PrivateStorage()
    {
        __shared__ _TempStorage private_storage;
        return private_storage;
    }


    /******************************************************************************
     * Thread fields
     ******************************************************************************/

    /// Thread reference to shared storage
    _TempStorage &temp_storage;

    /// Linear thread-id
    int linear_tid;

public:

    /// The operations exposed by BlockStore require a temporary memory
    /// allocation of this nested type for thread communication
    struct TempStorage : Uninitialized<_TempStorage> {};


    /******************************************************************
     * Collective constructors
     ******************************************************************/

    /// Collective constructor using a private static allocation of shared
    /// memory as temporary storage
    __device__ __forceinline__ BlockStore()
    :
        temp_storage(PrivateStorage()),
        linear_tid(RowMajorTid(BLOCK_DIM_X, BLOCK_DIM_Y, BLOCK_DIM_Z))
    {}

    /// Collective constructor using the specified memory allocation as
    /// temporary storage
    __device__ __forceinline__ BlockStore(
        TempStorage &temp_storage)
    :
        temp_storage(temp_storage.Alias()),
        linear_tid(RowMajorTid(BLOCK_DIM_X, BLOCK_DIM_Y, BLOCK_DIM_Z))
    {}


    /******************************************************************
     * Data movement
     ******************************************************************/

    /// Store items into a linear segment of memory
    __device__ __forceinline__ void Store(
        OutputIteratorT block_itr,
        T               (&items)[ITEMS_PER_THREAD])
    {
        InternalStore(temp_storage, linear_tid).Store(block_itr, items);
    }

    /// Store items into a linear segment of memory, guarded by range
    __device__ __forceinline__ void Store(
        OutputIteratorT block_itr,
        T               (&items)[ITEMS_PER_THREAD],
        int             valid_items)
    {
        InternalStore(temp_storage, linear_tid).Store(block_itr, items, valid_items);
    }
};
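
/*
 * End-to-end usage sketch (illustrative, patterned after the library's
 * documented snippets): a block of 128 threads stores a tile of 512 ints,
 * guarded by valid_items. Kernel and variable names are assumptions.
 *
 *     #include <cub/cub.cuh>
 *
 *     __global__ void ExampleKernel(int *d_data, int valid_items)
 *     {
 *         // Specialize BlockStore for 128 threads owning 4 ints each
 *         typedef cub::BlockStore<int*, 128, 4, cub::BLOCK_STORE_WARP_TRANSPOSE> BlockStore;
 *
 *         // Allocate shared memory for the collective
 *         __shared__ typename BlockStore::TempStorage temp_storage;
 *
 *         // Obtain this thread's blocked segment (here, synthetic data)
 *         int thread_data[4];
 *         for (int i = 0; i < 4; ++i)
 *             thread_data[i] = threadIdx.x * 4 + i;
 *
 *         // Collectively store the tile
 *         BlockStore(temp_storage).Store(d_data, thread_data, valid_items);
 *     }
 */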

}               // CUB namespace
CUB_NS_POSTFIX  // Optional outer namespace(s)