#include "util_type.cuh"
#include "util_arch.cuh"
#include "util_debug.cuh"
#include "util_namespace.cuh"
#include "util_macro.cuh"
#ifndef DOXYGEN_SHOULD_SKIP_THIS    // Do not document
/**
 * Alias temporaries to externally-allocated device storage (or simply return the amount of storage needed).
 */
template <int ALLOCATIONS>
CUB_RUNTIME_FUNCTION __forceinline__
cudaError_t AliasTemporaries(
    void*   d_temp_storage,                     ///< [in] Device-accessible allocation of temporary storage.  When NULL, the required size is returned in \p temp_storage_bytes and no work is done.
    size_t  &temp_storage_bytes,                ///< [in,out] Size in bytes of \p d_temp_storage allocation
    void*   (&allocations)[ALLOCATIONS],        ///< [in,out] Pointers to device allocations needed
    size_t  (&allocation_sizes)[ALLOCATIONS])   ///< [in] Sizes in bytes of device allocations needed
{
    const int ALIGN_BYTES = 256;
    const int ALIGN_MASK  = ~(ALIGN_BYTES - 1);

    // Compute exclusive prefix sum over allocation requests, rounding each
    // request up to the alignment boundary
    size_t allocation_offsets[ALLOCATIONS];
    size_t bytes_needed = 0;
    for (int i = 0; i < ALLOCATIONS; ++i)
    {
        size_t allocation_bytes = (allocation_sizes[i] + ALIGN_BYTES - 1) & ALIGN_MASK;
        allocation_offsets[i] = bytes_needed;
        bytes_needed += allocation_bytes;
    }
    // Caller is only requesting the required allocation size
    if (!d_temp_storage)
    {
        temp_storage_bytes = bytes_needed;
        return cudaSuccess;
    }

    // Check that enough storage was provided
    if (temp_storage_bytes < bytes_needed)
        return CubDebug(cudaErrorInvalidValue);

    // Alias each allocation into the provided storage
    for (int i = 0; i < ALLOCATIONS; ++i)
        allocations[i] = static_cast<char*>(d_temp_storage) + allocation_offsets[i];

    return cudaSuccess;
}
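
// Illustrative usage sketch (not part of the original header): the CUB
// double-call pattern applied to AliasTemporaries.  The first call with a
// NULL d_temp_storage only reports the combined size; the second carves the
// individual 256-byte-aligned allocations out of one device buffer.
// ExampleAliasTemporaries is hypothetical.
inline cudaError_t ExampleAliasTemporaries(int num_items)
{
    void*  allocations[2];
    size_t allocation_sizes[2];
    allocation_sizes[0] = num_items * sizeof(int);      // e.g., alternate keys buffer
    allocation_sizes[1] = num_items * sizeof(float);    // e.g., alternate values buffer

    // Pass 1: query the total number of bytes required
    void*  d_temp_storage     = NULL;
    size_t temp_storage_bytes = 0;
    cudaError_t error = AliasTemporaries(d_temp_storage, temp_storage_bytes, allocations, allocation_sizes);
    if (error != cudaSuccess) return error;

    // Pass 2: bind the allocation pointers into a single device buffer
    if (CubDebug(error = cudaMalloc(&d_temp_storage, temp_storage_bytes))) return error;
    error = AliasTemporaries(d_temp_storage, temp_storage_bytes, allocations, allocation_sizes);

    // allocations[0] / allocations[1] now alias disjoint regions of d_temp_storage
    cudaFree(d_temp_storage);
    return error;
}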
/**
 * Empty kernel for querying the PTX manifest of kernel entry points
 */
template <typename T>
__global__ void EmptyKernel(void) { }

#endif  // DOXYGEN_SHOULD_SKIP_THIS
/**
 * \brief Retrieves the PTX version that will be used on the current device (major * 100 + minor * 10)
 */
CUB_RUNTIME_FUNCTION __forceinline__ cudaError_t PtxVersion(int &ptx_version)
{
    // Local wrapper whose mention forces EmptyKernel<void> to be generated
    struct Dummy
    {
        /// Type definition of the EmptyKernel kernel entry point
        typedef void (*EmptyKernelPtr)();

        CUB_RUNTIME_FUNCTION __forceinline__
        EmptyKernelPtr Empty() { return EmptyKernel<void>; }
    };
#ifndef CUB_RUNTIME_ENABLED
    // CUDA API calls are not supported from this device
    return cudaErrorInvalidConfiguration;
#elif (CUB_PTX_ARCH > 0)
    // On the device, the PTX version is simply the arch we were compiled for
    ptx_version = CUB_PTX_ARCH;
    return cudaSuccess;
#else
    // On the host, query the PTX version from the EmptyKernel<void> manifest
    cudaError_t error = cudaSuccess;
    do
    {
        cudaFuncAttributes empty_kernel_attrs;
        if (CubDebug(error = cudaFuncGetAttributes(&empty_kernel_attrs, EmptyKernel<void>))) break;
        ptx_version = empty_kernel_attrs.ptxVersion * 10;
    } while (0);
    return error;
#endif
}
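
// Illustrative usage sketch (not part of the original header): decoding the
// major * 100 + minor * 10 encoding returned by PtxVersion.
// ExamplePtxVersion is hypothetical.
inline cudaError_t ExamplePtxVersion(int &major, int &minor)
{
    int ptx_version = 0;
    cudaError_t error = PtxVersion(ptx_version);
    if (error != cudaSuccess) return error;

    major = ptx_version / 100;          // e.g., 350 -> 3
    minor = (ptx_version / 10) % 10;    // e.g., 350 -> 5
    return cudaSuccess;
}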
/**
 * \brief Retrieves the SM version of \p device_ordinal (major * 100 + minor * 10)
 */
CUB_RUNTIME_FUNCTION __forceinline__ cudaError_t SmVersion(int &sm_version, int device_ordinal)
{
#ifndef CUB_RUNTIME_ENABLED
    // CUDA API calls are not supported from this device
    return cudaErrorInvalidConfiguration;
#else
    cudaError_t error = cudaSuccess;
    do
    {
        // Fill in SM version from the device's compute-capability attributes
        int major, minor;
        if (CubDebug(error = cudaDeviceGetAttribute(&major, cudaDevAttrComputeCapabilityMajor, device_ordinal))) break;
        if (CubDebug(error = cudaDeviceGetAttribute(&minor, cudaDevAttrComputeCapabilityMinor, device_ordinal))) break;
        sm_version = major * 100 + minor * 10;
    } while (0);
    return error;
#endif
}
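
// Illustrative usage sketch (not part of the original header): comparing a
// device's SM version against the compiled PTX version, as dispatch code
// typically does when selecting tuning policies.  Both values use the same
// major * 100 + minor * 10 encoding.  ExampleCheckVersions is hypothetical.
inline cudaError_t ExampleCheckVersions(bool &ptx_fits, int device_ordinal)
{
    cudaError_t error;
    int sm_version, ptx_version;
    if (CubDebug(error = SmVersion(sm_version, device_ordinal))) return error;
    if (CubDebug(error = PtxVersion(ptx_version))) return error;

    // e.g., sm_version == 520 with ptx_version == 350 means sm_52 hardware
    // running kernels compiled for PTX ISA 3.5
    ptx_fits = (ptx_version <= sm_version);
    return cudaSuccess;
}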
#ifndef DOXYGEN_SHOULD_SKIP_THIS    // Do not document

/// Synchronize the specified \p stream
CUB_RUNTIME_FUNCTION __forceinline__
static cudaError_t SyncStream(cudaStream_t stream)
{
#if (CUB_PTX_ARCH == 0)
    return cudaStreamSynchronize(stream);
#else
    // Device code cannot yet synchronize on a specific stream
    return cudaDeviceSynchronize();
#endif
}
/**
 * Computes maximum SM occupancy in thread blocks for executing the given kernel function pointer \p kernel_ptr on the current device with \p block_threads per thread block
 */
template <typename KernelPtr>
CUB_RUNTIME_FUNCTION __forceinline__
cudaError_t MaxSmOccupancy(
    int         &max_sm_occupancy,          ///< [out] Maximum number of thread blocks that can reside on a single SM
    KernelPtr   kernel_ptr,                 ///< [in] Kernel pointer for which to compute SM occupancy
    int         block_threads,              ///< [in] Number of threads per thread block
    int         dynamic_smem_bytes = 0)     ///< [in] Dynamically allocated shared memory in bytes
{
#ifndef CUB_RUNTIME_ENABLED
    // CUDA API calls are not supported from this device
    return CubDebug(cudaErrorInvalidConfiguration);
#else
    return cudaOccupancyMaxActiveBlocksPerMultiprocessor(
        &max_sm_occupancy,
        kernel_ptr,
        block_threads,
        dynamic_smem_bytes);
#endif  // CUB_RUNTIME_ENABLED
}
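
// Illustrative usage sketch (not part of the original header): scaling SM
// occupancy by the SM count to estimate a device-filling grid size, here for
// EmptyKernel<void> at 256 threads per block.  ExampleOccupancy is
// hypothetical.
inline cudaError_t ExampleOccupancy(int &grid_size, int device_ordinal)
{
    cudaError_t error;
    int sm_occupancy, sm_count;
    if (CubDebug(error = MaxSmOccupancy(sm_occupancy, EmptyKernel<void>, 256))) return error;
    if (CubDebug(error = cudaDeviceGetAttribute(&sm_count, cudaDevAttrMultiProcessorCount, device_ordinal))) return error;

    grid_size = sm_occupancy * sm_count;    // One full "wave" of resident thread blocks
    return cudaSuccess;
}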
/**
 * Kernel dispatch configuration
 */
struct KernelConfig
{
    int block_threads;
    int items_per_thread;
    int tile_size;
    int sm_occupancy;

    CUB_RUNTIME_FUNCTION __forceinline__
    KernelConfig() : block_threads(0), items_per_thread(0), tile_size(0), sm_occupancy(0) {}

    template <typename AgentPolicyT, typename KernelPtrT>
    CUB_RUNTIME_FUNCTION __forceinline__
    cudaError_t Init(KernelPtrT kernel_ptr)
    {
        block_threads    = AgentPolicyT::BLOCK_THREADS;
        items_per_thread = AgentPolicyT::ITEMS_PER_THREAD;
        tile_size        = block_threads * items_per_thread;
        return MaxSmOccupancy(sm_occupancy, kernel_ptr, block_threads);
    }
};
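
// Illustrative usage sketch (not part of the original header): filling a
// KernelConfig from a tuning policy.  ExampleAgentPolicy mirrors the shape of
// CUB's AgentPolicy types (compile-time BLOCK_THREADS / ITEMS_PER_THREAD);
// both names here are hypothetical.
struct ExampleAgentPolicy
{
    enum { BLOCK_THREADS = 128, ITEMS_PER_THREAD = 4 };
};

inline cudaError_t ExampleKernelConfig()
{
    KernelConfig config;
    cudaError_t error = config.Init<ExampleAgentPolicy>(EmptyKernel<void>);

    // On success, config.tile_size == 512 and config.sm_occupancy holds the
    // number of 128-thread EmptyKernel<void> blocks resident per SM
    return error;
}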
/// Helper for dispatching into a policy chain
template <int PTX_VERSION, typename PolicyT, typename PrevPolicyT>
struct ChainedPolicy
{
    /// The policy for the active compiler pass
    typedef typename If<(CUB_PTX_ARCH < PTX_VERSION), typename PrevPolicyT::ActivePolicy, PolicyT>::Type ActivePolicy;

    /// Specializes and dispatches \p op on the first policy in the chain with an adequate PTX version
    template <typename FunctorT>
    CUB_RUNTIME_FUNCTION __forceinline__
    static cudaError_t Invoke(int ptx_version, FunctorT &op)
    {
        if (ptx_version < PTX_VERSION) {
            // Runtime PTX version is too low for this policy; defer to its predecessor
            return PrevPolicyT::Invoke(ptx_version, op);
        }

        // Use this policy
        return op.template Invoke<PolicyT>();
    }
};
/// Helper for dispatching into a policy chain (end-of-chain specialization)
template <int PTX_VERSION, typename PolicyT>
struct ChainedPolicy<PTX_VERSION, PolicyT, PolicyT>
{
    /// The policy for the active compiler pass
    typedef PolicyT ActivePolicy;

    /// Specializes and dispatches \p op on the last policy in the chain
    template <typename FunctorT>
    CUB_RUNTIME_FUNCTION __forceinline__
    static cudaError_t Invoke(int ptx_version, FunctorT &op)
    {
        return op.template Invoke<PolicyT>();
    }
};
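
// Illustrative sketch (not part of the original header): a minimal two-link
// policy chain with a dispatch functor, mirroring how CUB's Dispatch* layers
// drive ChainedPolicy.  All names below (ExamplePolicyHub, ExampleDispatch,
// ExampleChainedDispatch) are hypothetical.
struct ExamplePolicyHub
{
    // End of the chain: a policy chained to itself selects the end-of-chain
    // specialization above and terminates Invoke recursion
    struct Policy200 : ChainedPolicy<200, Policy200, Policy200>
    {
        enum { BLOCK_THREADS = 128, ITEMS_PER_THREAD = 4 };
    };

    // Preferred on PTX 3.5 and newer; falls back to Policy200 otherwise
    struct Policy350 : ChainedPolicy<350, Policy350, Policy200>
    {
        enum { BLOCK_THREADS = 256, ITEMS_PER_THREAD = 8 };
    };

    typedef Policy350 MaxPolicy;    // Head of the chain
};

struct ExampleDispatch
{
    int selected_tile_size;

    // ChainedPolicy::Invoke instantiates this with the policy it selects
    template <typename ActivePolicyT>
    CUB_RUNTIME_FUNCTION __forceinline__
    cudaError_t Invoke()
    {
        selected_tile_size = ActivePolicyT::BLOCK_THREADS * ActivePolicyT::ITEMS_PER_THREAD;
        return cudaSuccess;
    }
};

// Walk the chain from the head using the runtime PTX version: PTX >= 3.5
// selects Policy350 (tile size 2048); anything lower selects Policy200
inline cudaError_t ExampleChainedDispatch(int &tile_size)
{
    cudaError_t error;
    int ptx_version;
    if (CubDebug(error = PtxVersion(ptx_version))) return error;

    ExampleDispatch op;
    if (CubDebug(error = ExamplePolicyHub::MaxPolicy::Invoke(ptx_version, op))) return error;

    tile_size = op.selected_tile_size;
    return cudaSuccess;
}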
#endif  // Do not document