39 #include "../util_ptx.cuh"
40 #include "../util_macro.cuh"
41 #include "../util_type.cuh"
42 #include "../util_namespace.cuh"
73 typename OutputIteratorT>
76 OutputIteratorT block_itr,
77 T (&items)[ITEMS_PER_THREAD])
81 for (
int ITEM = 0; ITEM < ITEMS_PER_THREAD; ITEM++)
83 block_itr[(linear_tid * ITEMS_PER_THREAD) + ITEM] = items[ITEM];
100 typename OutputIteratorT>
103 OutputIteratorT block_itr,
104 T (&items)[ITEMS_PER_THREAD],
109 for (
int ITEM = 0; ITEM < ITEMS_PER_THREAD; ITEM++)
111 if (ITEM + (linear_tid * ITEMS_PER_THREAD) < valid_items)
113 block_itr[(linear_tid * ITEMS_PER_THREAD) + ITEM] = items[ITEM];
138 int ITEMS_PER_THREAD>
142 T (&items)[ITEMS_PER_THREAD])
147 MAX_VEC_SIZE = CUB_MIN(4, ITEMS_PER_THREAD),
150 VEC_SIZE = ((((MAX_VEC_SIZE - 1) & MAX_VEC_SIZE) == 0) && ((ITEMS_PER_THREAD % MAX_VEC_SIZE) == 0)) ?
154 VECTORS_PER_THREAD = ITEMS_PER_THREAD / VEC_SIZE,
158 typedef typename CubVector<T, VEC_SIZE>::Type Vector;
161 Vector *block_ptr_vectors =
reinterpret_cast<Vector*
>(
const_cast<T*
>(block_ptr));
164 Vector raw_vector[VECTORS_PER_THREAD];
165 T *raw_items =
reinterpret_cast<T*
>(raw_vector);
169 for (
int ITEM = 0; ITEM < ITEMS_PER_THREAD; ITEM++)
171 raw_items[ITEM] = items[ITEM];
200 int ITEMS_PER_THREAD,
201 typename OutputIteratorT>
204 OutputIteratorT block_itr,
205 T (&items)[ITEMS_PER_THREAD])
209 for (
int ITEM = 0; ITEM < ITEMS_PER_THREAD; ITEM++)
211 block_itr[(ITEM * BLOCK_THREADS) + linear_tid] = items[ITEM];
229 int ITEMS_PER_THREAD,
230 typename OutputIteratorT>
233 OutputIteratorT block_itr,
234 T (&items)[ITEMS_PER_THREAD],
239 for (
int ITEM = 0; ITEM < ITEMS_PER_THREAD; ITEM++)
241 if ((ITEM * BLOCK_THREADS) + linear_tid < valid_items)
243 block_itr[(ITEM * BLOCK_THREADS) + linear_tid] = items[ITEM];
271 int ITEMS_PER_THREAD,
272 typename OutputIteratorT>
275 OutputIteratorT block_itr,
276 T (&items)[ITEMS_PER_THREAD])
278 int tid = linear_tid & (CUB_PTX_WARP_THREADS - 1);
279 int wid = linear_tid >> CUB_PTX_LOG_WARP_THREADS;
280 int warp_offset = wid * CUB_PTX_WARP_THREADS * ITEMS_PER_THREAD;
284 for (
int ITEM = 0; ITEM < ITEMS_PER_THREAD; ITEM++)
286 block_itr[warp_offset + tid + (ITEM * CUB_PTX_WARP_THREADS)] = items[ITEM];
305 int ITEMS_PER_THREAD,
306 typename OutputIteratorT>
309 OutputIteratorT block_itr,
310 T (&items)[ITEMS_PER_THREAD],
313 int tid = linear_tid & (CUB_PTX_WARP_THREADS - 1);
314 int wid = linear_tid >> CUB_PTX_LOG_WARP_THREADS;
315 int warp_offset = wid * CUB_PTX_WARP_THREADS * ITEMS_PER_THREAD;
319 for (
int ITEM = 0; ITEM < ITEMS_PER_THREAD; ITEM++)
321 if (warp_offset + tid + (ITEM * CUB_PTX_WARP_THREADS) < valid_items)
323 block_itr[warp_offset + tid + (ITEM * CUB_PTX_WARP_THREADS)] = items[ITEM];
496 typename OutputIteratorT,
498 int ITEMS_PER_THREAD,
502 int PTX_ARCH = CUB_PTX_ARCH>
514 BLOCK_THREADS = BLOCK_DIM_X * BLOCK_DIM_Y * BLOCK_DIM_Z,
518 typedef typename std::iterator_traits<OutputIteratorT>::value_type T;
526 template <BlockStoreAlgorithm _POLICY,
int DUMMY>
527 struct StoreInternal;
534 struct StoreInternal<BLOCK_STORE_DIRECT, DUMMY>
543 __device__ __forceinline__ StoreInternal(
547 linear_tid(linear_tid)
551 __device__ __forceinline__
void Store(
552 OutputIteratorT block_itr,
553 T (&items)[ITEMS_PER_THREAD])
559 __device__ __forceinline__
void Store(
560 OutputIteratorT block_itr,
561 T (&items)[ITEMS_PER_THREAD],
573 struct StoreInternal<BLOCK_STORE_VECTORIZE, DUMMY>
582 __device__ __forceinline__ StoreInternal(
586 linear_tid(linear_tid)
590 __device__ __forceinline__
void Store(
592 T (&items)[ITEMS_PER_THREAD])
598 template <
typename _OutputIteratorT>
599 __device__ __forceinline__
void Store(
600 _OutputIteratorT block_itr,
601 T (&items)[ITEMS_PER_THREAD])
607 __device__ __forceinline__
void Store(
608 OutputIteratorT block_itr,
609 T (&items)[ITEMS_PER_THREAD],
621 struct StoreInternal<BLOCK_STORE_TRANSPOSE, DUMMY>
639 __device__ __forceinline__ StoreInternal(
643 temp_storage(temp_storage.Alias()),
644 linear_tid(linear_tid)
648 __device__ __forceinline__
void Store(
649 OutputIteratorT block_itr,
650 T (&items)[ITEMS_PER_THREAD])
653 StoreDirectStriped<BLOCK_THREADS>(linear_tid, block_itr, items);
657 __device__ __forceinline__
void Store(
658 OutputIteratorT block_itr,
659 T (&items)[ITEMS_PER_THREAD],
663 StoreDirectStriped<BLOCK_THREADS>(linear_tid, block_itr, items, valid_items);
676 WARP_THREADS = CUB_WARP_THREADS(PTX_ARCH)
680 CUB_STATIC_ASSERT((BLOCK_THREADS % WARP_THREADS == 0),
"BLOCK_THREADS must be a multiple of WARP_THREADS");
683 typedef BlockExchange<T, BLOCK_DIM_X, ITEMS_PER_THREAD, false, BLOCK_DIM_Y, BLOCK_DIM_Z, PTX_ARCH> BlockExchange;
686 typedef typename BlockExchange::TempStorage _TempStorage;
698 __device__ __forceinline__ StoreInternal(
702 temp_storage(temp_storage.Alias()),
703 linear_tid(linear_tid)
707 __device__ __forceinline__
void Store(
708 OutputIteratorT block_itr,
709 T (&items)[ITEMS_PER_THREAD])
716 __device__ __forceinline__
void Store(
717 OutputIteratorT block_itr,
718 T (&items)[ITEMS_PER_THREAD],
735 WARP_THREADS = CUB_WARP_THREADS(PTX_ARCH)
739 CUB_STATIC_ASSERT((BLOCK_THREADS % WARP_THREADS == 0),
"BLOCK_THREADS must be a multiple of WARP_THREADS");
742 typedef BlockExchange<T, BLOCK_DIM_X, ITEMS_PER_THREAD, true, BLOCK_DIM_Y, BLOCK_DIM_Z, PTX_ARCH> BlockExchange;
745 typedef typename BlockExchange::TempStorage _TempStorage;
757 __device__ __forceinline__ StoreInternal(
761 temp_storage(temp_storage.Alias()),
762 linear_tid(linear_tid)
766 __device__ __forceinline__
void Store(
767 OutputIteratorT block_itr,
768 T (&items)[ITEMS_PER_THREAD])
775 __device__ __forceinline__
void Store(
776 OutputIteratorT block_itr,
777 T (&items)[ITEMS_PER_THREAD],
790 typedef StoreInternal<ALGORITHM, 0> InternalStore;
794 typedef typename InternalStore::TempStorage _TempStorage;
802 __device__ __forceinline__ _TempStorage& PrivateStorage()
804 __shared__ _TempStorage private_storage;
805 return private_storage;
814 _TempStorage &temp_storage;
836 temp_storage(PrivateStorage()),
837 linear_tid(
RowMajorTid(BLOCK_DIM_X, BLOCK_DIM_Y, BLOCK_DIM_Z))
847 temp_storage(temp_storage.Alias()),
848 linear_tid(
RowMajorTid(BLOCK_DIM_X, BLOCK_DIM_Y, BLOCK_DIM_Z))
899 __device__ __forceinline__
void Store(
900 OutputIteratorT block_itr,
901 T (&items)[ITEMS_PER_THREAD])
903 InternalStore(temp_storage, linear_tid).Store(block_itr, items);
947 __device__ __forceinline__
void Store(
948 OutputIteratorT block_itr,
949 T (&items)[ITEMS_PER_THREAD],
952 InternalStore(temp_storage, linear_tid).Store(block_itr, items, valid_items);