39 #include "util_namespace.cuh"
40 #include "util_macro.cuh"
54 #ifndef DOXYGEN_SHOULD_SKIP_THIS // Do not document
60 template <
int ALLOCATIONS>
61 CUB_RUNTIME_FUNCTION __forceinline__
62 cudaError_t AliasTemporaries(
64 size_t &temp_storage_bytes,
65 void* (&allocations)[ALLOCATIONS],
66 size_t (&allocation_sizes)[ALLOCATIONS])
68 const int ALIGN_BYTES = 256;
69 const int ALIGN_MASK = ~(ALIGN_BYTES - 1);
72 size_t allocation_offsets[ALLOCATIONS];
73 size_t bytes_needed = 0;
74 for (
int i = 0; i < ALLOCATIONS; ++i)
76 size_t allocation_bytes = (allocation_sizes[i] + ALIGN_BYTES - 1) & ALIGN_MASK;
77 allocation_offsets[i] = bytes_needed;
78 bytes_needed += allocation_bytes;
80 bytes_needed += ALIGN_BYTES - 1;
85 temp_storage_bytes = bytes_needed;
90 if (temp_storage_bytes < bytes_needed)
92 return CubDebug(cudaErrorInvalidValue);
96 d_temp_storage = (
void *) ((
size_t(d_temp_storage) + ALIGN_BYTES - 1) & ALIGN_MASK);
97 for (
int i = 0; i < ALLOCATIONS; ++i)
99 allocations[i] =
static_cast<char*
>(d_temp_storage) + allocation_offsets[i];
109 template <
typename T>
110 __global__
void EmptyKernel(
void) { }
113 #endif // DOXYGEN_SHOULD_SKIP_THIS
118 CUB_RUNTIME_FUNCTION __forceinline__ cudaError_t
PtxVersion(
int &ptx_version)
123 typedef void (*EmptyKernelPtr)();
126 CUB_RUNTIME_FUNCTION __forceinline__
127 EmptyKernelPtr Empty()
129 return EmptyKernel<void>;
134 #ifndef CUB_RUNTIME_ENABLED
137 return cudaErrorInvalidConfiguration;
139 #elif (CUB_PTX_ARCH > 0)
141 ptx_version = CUB_PTX_ARCH;
146 cudaError_t error = cudaSuccess;
149 cudaFuncAttributes empty_kernel_attrs;
150 if (
CubDebug(error = cudaFuncGetAttributes(&empty_kernel_attrs, EmptyKernel<void>)))
break;
151 ptx_version = empty_kernel_attrs.ptxVersion * 10;
164 CUB_RUNTIME_FUNCTION __forceinline__ cudaError_t
SmVersion(
int &sm_version,
int device_ordinal)
166 #ifndef CUB_RUNTIME_ENABLED
169 return cudaErrorInvalidConfiguration;
173 cudaError_t error = cudaSuccess;
178 if (
CubDebug(error = cudaDeviceGetAttribute(&major, cudaDevAttrComputeCapabilityMajor, device_ordinal)))
break;
179 if (
CubDebug(error = cudaDeviceGetAttribute(&minor, cudaDevAttrComputeCapabilityMinor, device_ordinal)))
break;
180 sm_version = major * 100 + minor * 10;
190 #ifndef DOXYGEN_SHOULD_SKIP_THIS // Do not document
195 CUB_RUNTIME_FUNCTION __forceinline__
196 static cudaError_t SyncStream(cudaStream_t stream)
198 #if (CUB_PTX_ARCH == 0)
199 return cudaStreamSynchronize(stream);
202 return cudaDeviceSynchronize();
238 template <
typename KernelPtr>
239 CUB_RUNTIME_FUNCTION __forceinline__
240 cudaError_t MaxSmOccupancy(
241 int &max_sm_occupancy,
242 KernelPtr kernel_ptr,
244 int dynamic_smem_bytes = 0)
246 #ifndef CUB_RUNTIME_ENABLED
249 return CubDebug(cudaErrorInvalidConfiguration);
253 return cudaOccupancyMaxActiveBlocksPerMultiprocessor (
259 #endif // CUB_RUNTIME_ENABLED
273 int items_per_thread;
277 CUB_RUNTIME_FUNCTION __forceinline__
278 KernelConfig() : block_threads(0), items_per_thread(0), tile_size(0), sm_occupancy(0) {}
280 template <
typename AgentPolicyT,
typename KernelPtrT>
281 CUB_RUNTIME_FUNCTION __forceinline__
282 cudaError_t Init(KernelPtrT kernel_ptr)
284 block_threads = AgentPolicyT::BLOCK_THREADS;
285 items_per_thread = AgentPolicyT::ITEMS_PER_THREAD;
286 tile_size = block_threads * items_per_thread;
287 cudaError_t retval = MaxSmOccupancy(sm_occupancy, kernel_ptr, block_threads);
295 template <
int PTX_VERSION,
typename PolicyT,
typename PrevPolicyT>
299 typedef typename If<(CUB_PTX_ARCH < PTX_VERSION), typename PrevPolicyT::ActivePolicy, PolicyT>::Type ActivePolicy;
302 template <
typename FunctorT>
303 CUB_RUNTIME_FUNCTION __forceinline__
304 static cudaError_t Invoke(
int ptx_version, FunctorT &op)
306 if (ptx_version < PTX_VERSION) {
307 return PrevPolicyT::Invoke(ptx_version, op);
309 return op.template Invoke<PolicyT>();
314 template <
int PTX_VERSION,
typename PolicyT>
315 struct ChainedPolicy<PTX_VERSION, PolicyT, PolicyT>
318 typedef PolicyT ActivePolicy;
321 template <
typename FunctorT>
322 CUB_RUNTIME_FUNCTION __forceinline__
323 static cudaError_t Invoke(
int ptx_version, FunctorT &op) {
324 return op.template Invoke<PolicyT>();
331 #endif // Do not document