#include "specializations/block_reduce_raking.cuh"
#include "specializations/block_reduce_raking_commutative_only.cuh"
#include "specializations/block_reduce_warp_reductions.cuh"
#include "../util_ptx.cuh"
#include "../util_type.cuh"
#include "../thread/thread_operators.cuh"
#include "../util_namespace.cuh"
template <
    typename                T,
    int                     BLOCK_DIM_X,
    BlockReduceAlgorithm    ALGORITHM = BLOCK_REDUCE_WARP_REDUCTIONS,
    int                     BLOCK_DIM_Y = 1,
    int                     BLOCK_DIM_Z = 1,
    int                     PTX_ARCH = CUB_PTX_ARCH>
class BlockReduce
{
private:
    /// Constants
    enum
    {
        /// The thread block size in threads
        BLOCK_THREADS = BLOCK_DIM_X * BLOCK_DIM_Y * BLOCK_DIM_Z,
    };
    /// Internal specialization types, one per supported algorithm
    typedef BlockReduceWarpReductions<T, BLOCK_DIM_X, BLOCK_DIM_Y, BLOCK_DIM_Z, PTX_ARCH>           WarpReductions;
    typedef BlockReduceRakingCommutativeOnly<T, BLOCK_DIM_X, BLOCK_DIM_Y, BLOCK_DIM_Z, PTX_ARCH>    RakingCommutativeOnly;
    typedef BlockReduceRaking<T, BLOCK_DIM_X, BLOCK_DIM_Y, BLOCK_DIM_Z, PTX_ARCH>                   Raking;
    /// Select the internal specialization matching the ALGORITHM template parameter
    typedef typename If<(ALGORITHM == BLOCK_REDUCE_WARP_REDUCTIONS),
        WarpReductions,
        typename If<(ALGORITHM == BLOCK_REDUCE_RAKING_COMMUTATIVE_ONLY),
            RakingCommutativeOnly,
            Raking>::Type>::Type InternalBlockReduce;
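    // For illustration (assuming the BlockReduceAlgorithm enumerators declared
    // earlier in this header), the ALGORITHM parameter selects:
    //   BLOCK_REDUCE_WARP_REDUCTIONS          -> WarpReductions (the default)
    //   BLOCK_REDUCE_RAKING_COMMUTATIVE_ONLY  -> RakingCommutativeOnly
    //   otherwise (BLOCK_REDUCE_RAKING)       -> Raking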
    /// Shared memory storage layout type of the selected internal specialization
    typedef typename InternalBlockReduce::TempStorage _TempStorage;
    /// Internal storage allocator
    __device__ __forceinline__ _TempStorage& PrivateStorage()
    {
        __shared__ _TempStorage private_storage;
        return private_storage;
    }
    /// Shared storage reference
    _TempStorage &temp_storage;

    /// Linear thread-id
    unsigned int linear_tid;

public:

    /// The opaque shared memory type required by this collective
    struct TempStorage : Uninitialized<_TempStorage> {};
    /// Collective constructor using a private static allocation of shared memory as temporary storage.
    __device__ __forceinline__ BlockReduce()
    :
        temp_storage(PrivateStorage()),
        linear_tid(RowMajorTid(BLOCK_DIM_X, BLOCK_DIM_Y, BLOCK_DIM_Z))
    {}
    /// Collective constructor using the specified memory allocation as temporary storage.
    __device__ __forceinline__ BlockReduce(TempStorage &temp_storage)
    :
        temp_storage(temp_storage.Alias()),
        linear_tid(RowMajorTid(BLOCK_DIM_X, BLOCK_DIM_Y, BLOCK_DIM_Z))
    {}
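    // A hedged usage sketch (not part of this header; the block size is an
    // illustrative assumption). It contrasts the two construction styles: the
    // default constructor allocates its own private shared memory, while passing
    // a TempStorage lets the caller place, and later reuse, the allocation.
    //
    //   typedef cub::BlockReduce<int, 128> BlockReduce;
    //
    //   // (a) implicit private shared memory allocation
    //   BlockReduce reduce_implicit;
    //
    //   // (b) caller-provided shared memory allocation
    //   __shared__ typename BlockReduce::TempStorage temp_storage;
    //   BlockReduce reduce_explicit(temp_storage);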
    /// Computes a block-wide reduction for thread_0 using the specified binary
    /// reduction functor. Each thread contributes one input element.
    template <typename ReductionOp>
    __device__ __forceinline__ T Reduce(
        T               input,
        ReductionOp     reduction_op)
    {
        return InternalBlockReduce(temp_storage).template Reduce<true>(input, BLOCK_THREADS, reduction_op);
    }
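    // A hedged usage sketch (kernel name and buffers are illustrative
    // assumptions): a block-wide maximum using cub::Max() as the functor.
    //
    //   __global__ void MaxKernel(int *d_in, int *d_out)
    //   {
    //       typedef cub::BlockReduce<int, 128> BlockReduce;
    //       __shared__ typename BlockReduce::TempStorage temp_storage;
    //
    //       int thread_data = d_in[blockIdx.x * 128 + threadIdx.x];
    //
    //       // The aggregate is only valid in thread_0
    //       int block_max = BlockReduce(temp_storage).Reduce(thread_data, cub::Max());
    //       if (threadIdx.x == 0) d_out[blockIdx.x] = block_max;
    //   }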
    /// Computes a block-wide reduction for thread_0; each thread contributes an
    /// array of consecutive input elements.
    template <
        int             ITEMS_PER_THREAD,
        typename        ReductionOp>
    __device__ __forceinline__ T Reduce(
        T               (&inputs)[ITEMS_PER_THREAD],
        ReductionOp     reduction_op)
    {
        // Reduce each thread's items to a single partial, then reduce the partials across the block
        T partial = ThreadReduce(inputs, reduction_op);
        return Reduce(partial, reduction_op);
    }
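    // A hedged usage sketch (illustrative names): each of 128 threads supplies
    // four consecutive items in a blocked arrangement.
    //
    //   __global__ void MaxKernel4(int *d_in, int *d_out)
    //   {
    //       typedef cub::BlockReduce<int, 128> BlockReduce;
    //       __shared__ typename BlockReduce::TempStorage temp_storage;
    //
    //       int items[4];
    //       for (int i = 0; i < 4; ++i)
    //           items[i] = d_in[(blockIdx.x * 128 + threadIdx.x) * 4 + i];
    //
    //       int block_max = BlockReduce(temp_storage).Reduce(items, cub::Max());
    //       if (threadIdx.x == 0) d_out[blockIdx.x] = block_max;
    //   }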
    /// Computes a block-wide reduction for thread_0; only the first num_valid
    /// threads contribute inputs.
    template <typename ReductionOp>
    __device__ __forceinline__ T Reduce(
        T               input,
        ReductionOp     reduction_op,
        int             num_valid)
    {
        // Take the unguarded (full-tile) path when every thread holds a valid input
        if (num_valid >= BLOCK_THREADS)
            return InternalBlockReduce(temp_storage).template Reduce<true>(input, num_valid, reduction_op);
        else
            return InternalBlockReduce(temp_storage).template Reduce<false>(input, num_valid, reduction_op);
    }
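    // A hedged usage sketch (illustrative names): a guarded reduction for the
    // last, partially-full tile of an input that is not a multiple of the block size.
    //
    //   __global__ void MaxKernelGuarded(int *d_in, int *d_out, int num_items)
    //   {
    //       typedef cub::BlockReduce<int, 128> BlockReduce;
    //       __shared__ typename BlockReduce::TempStorage temp_storage;
    //
    //       int idx         = blockIdx.x * 128 + threadIdx.x;
    //       int num_valid   = min(128, num_items - blockIdx.x * 128);
    //       int thread_data = (idx < num_items) ? d_in[idx] : 0;  // padding is ignored when guarded
    //
    //       int block_max = BlockReduce(temp_storage).Reduce(thread_data, cub::Max(), num_valid);
    //       if (threadIdx.x == 0) d_out[blockIdx.x] = block_max;
    //   }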
    /// Computes a block-wide sum for thread_0 using addition (+) as the reduction
    /// operator. Each thread contributes one input element.
    __device__ __forceinline__ T Sum(
        T   input)
    {
        return InternalBlockReduce(temp_storage).template Sum<true>(input, BLOCK_THREADS);
    }
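    // A hedged usage sketch (illustrative names): block-wide sum, one item per thread.
    //
    //   __global__ void SumKernel(int *d_in, int *d_out)
    //   {
    //       typedef cub::BlockReduce<int, 128> BlockReduce;
    //       __shared__ typename BlockReduce::TempStorage temp_storage;
    //
    //       int thread_data = d_in[blockIdx.x * 128 + threadIdx.x];
    //
    //       // Equivalent to Reduce(thread_data, cub::Sum()); valid only in thread_0
    //       int block_sum = BlockReduce(temp_storage).Sum(thread_data);
    //       if (threadIdx.x == 0) d_out[blockIdx.x] = block_sum;
    //   }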
    /// Computes a block-wide sum for thread_0; each thread contributes an array
    /// of consecutive input elements.
    template <int ITEMS_PER_THREAD>
    __device__ __forceinline__ T Sum(
        T   (&inputs)[ITEMS_PER_THREAD])
    {
        // Reduce each thread's items to a single partial, then sum the partials across the block
        T partial = ThreadReduce(inputs, cub::Sum());
        return Sum(partial);
    }
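    // A hedged usage sketch (illustrative names): summing four consecutive items
    // per thread across a 128-thread block.
    //
    //   __global__ void SumKernel4(int *d_in, int *d_out)
    //   {
    //       typedef cub::BlockReduce<int, 128> BlockReduce;
    //       __shared__ typename BlockReduce::TempStorage temp_storage;
    //
    //       int items[4];
    //       for (int i = 0; i < 4; ++i)
    //           items[i] = d_in[(blockIdx.x * 128 + threadIdx.x) * 4 + i];
    //
    //       int block_sum = BlockReduce(temp_storage).Sum(items);
    //       if (threadIdx.x == 0) d_out[blockIdx.x] = block_sum;
    //   }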
    /// Computes a block-wide sum for thread_0; only the first num_valid threads
    /// contribute inputs.
    __device__ __forceinline__ T Sum(
        T   input,
        int num_valid)
    {
        // Take the unguarded (full-tile) path when every thread holds a valid input
        if (num_valid >= BLOCK_THREADS)
            return InternalBlockReduce(temp_storage).template Sum<true>(input, num_valid);
        else
            return InternalBlockReduce(temp_storage).template Sum<false>(input, num_valid);
    }
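    // A hedged usage sketch (illustrative names): guarded sum for the last,
    // partially-full block of an arbitrarily sized input.
    //
    //   __global__ void SumKernelGuarded(int *d_in, int *d_out, int num_items)
    //   {
    //       typedef cub::BlockReduce<int, 128> BlockReduce;
    //       __shared__ typename BlockReduce::TempStorage temp_storage;
    //
    //       int idx         = blockIdx.x * 128 + threadIdx.x;
    //       int num_valid   = min(128, num_items - blockIdx.x * 128);
    //       int thread_data = (idx < num_items) ? d_in[idx] : 0;
    //
    //       int block_sum = BlockReduce(temp_storage).Sum(thread_data, num_valid);
    //       if (threadIdx.x == 0) d_out[blockIdx.x] = block_sum;
    //   }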