CUB
 All Classes Namespaces Files Functions Variables Typedefs Enumerations Enumerator Friends Groups
util_device.cuh
Go to the documentation of this file.
1 /******************************************************************************
2  * Copyright (c) 2011, Duane Merrill. All rights reserved.
3  * Copyright (c) 2011-2015, NVIDIA CORPORATION. All rights reserved.
4  *
5  * Redistribution and use in source and binary forms, with or without
6  * modification, are permitted provided that the following conditions are met:
7  * * Redistributions of source code must retain the above copyright
8  * notice, this list of conditions and the following disclaimer.
9  * * Redistributions in binary form must reproduce the above copyright
10  * notice, this list of conditions and the following disclaimer in the
11  * documentation and/or other materials provided with the distribution.
12  * * Neither the name of the NVIDIA CORPORATION nor the
13  * names of its contributors may be used to endorse or promote products
14  * derived from this software without specific prior written permission.
15  *
16  * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
17  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
18  * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
19  * DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY
20  * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
21  * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
22  * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
23  * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
24  * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
25  * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
26  *
27  ******************************************************************************/
28 
34 #pragma once
35 
36 #include "util_type.cuh"
37 #include "util_arch.cuh"
38 #include "util_debug.cuh"
39 #include "util_namespace.cuh"
40 #include "util_macro.cuh"
41 
43 CUB_NS_PREFIX
44 
46 namespace cub {
47 
48 
54 #ifndef DOXYGEN_SHOULD_SKIP_THIS // Do not document
55 
56 
/**
 * Alias an array of sub-allocations onto a single caller-provided device
 * allocation (or, when \p d_temp_storage is NULL, report how many bytes the
 * caller must allocate).
 *
 * Each requested sub-allocation is rounded up to a 256-byte boundary and the
 * base pointer itself is aligned up, so every entry written to \p allocations
 * is 256-byte aligned.
 *
 * \param d_temp_storage    [in] Device allocation to carve up, or NULL to query the required size
 * \param temp_storage_bytes [in,out] Size in bytes of \p d_temp_storage (set when querying)
 * \param allocations       [out] Aliased pointers, one per requested sub-allocation
 * \param allocation_sizes  [in] Requested size in bytes of each sub-allocation
 * \return cudaSuccess, or cudaErrorInvalidValue when the provided storage is too small
 */
template <int ALLOCATIONS>
CUB_RUNTIME_FUNCTION __forceinline__
cudaError_t AliasTemporaries(
    void    *d_temp_storage,
    size_t  &temp_storage_bytes,
    void*   (&allocations)[ALLOCATIONS],
    size_t  (&allocation_sizes)[ALLOCATIONS])
{
    const int    ALIGN_BYTES = 256;
    // Type the mask as size_t explicitly so the bitwise-AND below does not
    // rely on sign-extension of a negative int when promoted to size_t.
    const size_t ALIGN_MASK  = ~(size_t(ALIGN_BYTES) - 1);

    // Compute exclusive prefix sum over allocation requests
    size_t allocation_offsets[ALLOCATIONS];
    size_t bytes_needed = 0;
    for (int i = 0; i < ALLOCATIONS; ++i)
    {
        // Round each request up to a multiple of ALIGN_BYTES
        size_t allocation_bytes = (allocation_sizes[i] + ALIGN_BYTES - 1) & ALIGN_MASK;
        allocation_offsets[i] = bytes_needed;
        bytes_needed += allocation_bytes;
    }
    // Extra slack so the base pointer itself can be aligned up below
    bytes_needed += ALIGN_BYTES - 1;

    // Check if the caller is simply requesting the size of the storage allocation
    if (!d_temp_storage)
    {
        temp_storage_bytes = bytes_needed;
        return cudaSuccess;
    }

    // Check if enough storage provided
    if (temp_storage_bytes < bytes_needed)
    {
        return CubDebug(cudaErrorInvalidValue);
    }

    // Align the base pointer, then carve out each sub-allocation at its offset
    d_temp_storage = (void *) ((size_t(d_temp_storage) + ALIGN_BYTES - 1) & ALIGN_MASK);
    for (int i = 0; i < ALLOCATIONS; ++i)
    {
        allocations[i] = static_cast<char*>(d_temp_storage) + allocation_offsets[i];
    }

    return cudaSuccess;
}
104 
105 
/**
 * Empty kernel whose only purpose is to exist as a compiled entry point:
 * PtxVersion() queries its cudaFuncAttributes to discover the PTX version
 * the runtime will use. The body is intentionally empty.
 */
template <typename T>
__global__ void EmptyKernel(void) { }
111 
112 
113 #endif // DOXYGEN_SHOULD_SKIP_THIS
114 
/**
 * Retrieves into \p ptx_version the PTX version that will be used on the
 * current device (encoded as major * 100 + minor * 10, e.g. 350 for PTX 3.5).
 * Returns cudaSuccess, cudaErrorInvalidConfiguration when device-side CUDA
 * API calls are unavailable, or the error from cudaFuncGetAttributes.
 */
CUB_RUNTIME_FUNCTION __forceinline__ cudaError_t PtxVersion(int &ptx_version)
{
    // Local wrapper that returns the address of EmptyKernel<void>; presumably
    // kept to force the kernel's instantiation in every compilation branch —
    // TODO(review): confirm it is still needed, Empty() is never called here.
    struct Dummy
    {
        /// Type of the EmptyKernel kernel entry point
        typedef void (*EmptyKernelPtr)();

        /// Returns the EmptyKernel<void> entry point
        CUB_RUNTIME_FUNCTION __forceinline__
        EmptyKernelPtr Empty()
        {
            return EmptyKernel<void>;
        }
    };


#ifndef CUB_RUNTIME_ENABLED

    // CUDA API calls not supported from this device
    return cudaErrorInvalidConfiguration;

#elif (CUB_PTX_ARCH > 0)

    // Device pass: the PTX version is a compile-time constant
    ptx_version = CUB_PTX_ARCH;
    return cudaSuccess;

#else

    // Host pass: ask the runtime which PTX version EmptyKernel<void> was
    // compiled for (ptxVersion reports e.g. 35, scaled here to 350)
    cudaError_t error = cudaSuccess;
    do
    {
        cudaFuncAttributes empty_kernel_attrs;
        if (CubDebug(error = cudaFuncGetAttributes(&empty_kernel_attrs, EmptyKernel<void>))) break;
        ptx_version = empty_kernel_attrs.ptxVersion * 10;
    }
    while (0);

    return error;

#endif
}
159 
160 
/**
 * Retrieves into \p sm_version the compute capability of the device given by
 * \p device_ordinal (encoded as major * 100 + minor * 10, e.g. 210 for SM 2.1).
 * Returns cudaSuccess, cudaErrorInvalidConfiguration when device-side CUDA
 * API calls are unavailable, or the error from cudaDeviceGetAttribute.
 */
CUB_RUNTIME_FUNCTION __forceinline__ cudaError_t SmVersion(int &sm_version, int device_ordinal)
{
#ifndef CUB_RUNTIME_ENABLED

    // CUDA API calls not supported from this device
    return cudaErrorInvalidConfiguration;

#else

    cudaError_t error;
    int cc_major;
    int cc_minor;

    // Query both halves of the compute capability, bailing out on failure
    if (CubDebug(error = cudaDeviceGetAttribute(&cc_major, cudaDevAttrComputeCapabilityMajor, device_ordinal)))
        return error;
    if (CubDebug(error = cudaDeviceGetAttribute(&cc_minor, cudaDevAttrComputeCapabilityMinor, device_ordinal)))
        return error;

    // Encode as major*100 + minor*10 to match CUB's SM-version convention
    sm_version = (cc_major * 100) + (cc_minor * 10);
    return cudaSuccess;

#endif
}
188 
189 
190 #ifndef DOXYGEN_SHOULD_SKIP_THIS // Do not document
191 
/**
 * Synchronizes on \p stream when compiled for the host; device-side it falls
 * back to synchronizing the whole device. Returns the runtime's error code.
 */
CUB_RUNTIME_FUNCTION __forceinline__
static cudaError_t SyncStream(cudaStream_t stream)
{
#if (CUB_PTX_ARCH == 0)
    // Host pass: wait on just the given stream
    return cudaStreamSynchronize(stream);
#else
    // Device can't yet sync on a specific stream
    return cudaDeviceSynchronize();
#endif
}
205 
206 
/**
 * Computes into \p max_sm_occupancy the maximum number of thread blocks of
 * \p kernel_ptr that can reside concurrently on a single SM.
 *
 * \param max_sm_occupancy   [out] Maximum resident thread blocks per SM
 * \param kernel_ptr         [in] Kernel entry point to evaluate
 * \param block_threads      [in] Threads per thread block
 * \param dynamic_smem_bytes [in] Dynamically-allocated shared memory per block (default 0)
 * \return cudaSuccess, cudaErrorInvalidConfiguration when device-side CUDA
 *         API calls are unavailable, or the occupancy-query error
 */
template <typename KernelPtr>
CUB_RUNTIME_FUNCTION __forceinline__
cudaError_t MaxSmOccupancy(
    int &max_sm_occupancy,
    KernelPtr kernel_ptr,
    int block_threads,
    int dynamic_smem_bytes = 0)
{
#ifndef CUB_RUNTIME_ENABLED

    // CUDA API calls not supported from this device
    return CubDebug(cudaErrorInvalidConfiguration);

#else

    // Route the result through CubDebug so failures are reported consistently
    // with the other runtime queries in this file (PtxVersion, SmVersion)
    return CubDebug(cudaOccupancyMaxActiveBlocksPerMultiprocessor(
        &max_sm_occupancy,
        kernel_ptr,
        block_threads,
        dynamic_smem_bytes));

#endif  // CUB_RUNTIME_ENABLED
}
261 
262 
263 /******************************************************************************
264  * Policy management
265  ******************************************************************************/
266 
/**
 * Kernel tuning configuration: launch geometry derived from an agent policy
 * plus the occupancy achievable by a particular kernel entry point.
 */
struct KernelConfig
{
    int block_threads;      // Threads per thread block
    int items_per_thread;   // Work items processed by each thread
    int tile_size;          // block_threads * items_per_thread
    int sm_occupancy;       // Max resident thread blocks per SM for the kernel

    /// Zero-initialize all fields
    CUB_RUNTIME_FUNCTION __forceinline__
    KernelConfig() : block_threads(0), items_per_thread(0), tile_size(0), sm_occupancy(0) {}

    /// Populate this configuration from the given agent policy and kernel
    /// pointer; returns the error code from the occupancy query
    template <typename AgentPolicyT, typename KernelPtrT>
    CUB_RUNTIME_FUNCTION __forceinline__
    cudaError_t Init(KernelPtrT kernel_ptr)
    {
        block_threads    = AgentPolicyT::BLOCK_THREADS;
        items_per_thread = AgentPolicyT::ITEMS_PER_THREAD;
        tile_size        = block_threads * items_per_thread;
        return MaxSmOccupancy(sm_occupancy, kernel_ptr, block_threads);
    }
};
291 
292 
293 
/// Helper for dispatching into a chain of tuning policies ordered by
/// descending PTX version, selecting the newest link not exceeding the target
template <int PTX_VERSION, typename PolicyT, typename PrevPolicyT>
struct ChainedPolicy
{
    /// The policy for the active compiler pass
    typedef typename If<(CUB_PTX_ARCH < PTX_VERSION), typename PrevPolicyT::ActivePolicy, PolicyT>::Type ActivePolicy;

    /// Specializes and dispatches \p op with the first policy in the chain
    /// whose PTX version does not exceed \p ptx_version
    template <typename FunctorT>
    CUB_RUNTIME_FUNCTION __forceinline__
    static cudaError_t Invoke(int ptx_version, FunctorT &op)
    {
        // Target is older than this link: defer to the previous (older) policy;
        // otherwise this link is the best match, so dispatch with it
        return (ptx_version < PTX_VERSION) ?
            PrevPolicyT::Invoke(ptx_version, op) :
            op.template Invoke<PolicyT>();
    }
};
312 
/// End-of-chain specialization (previous policy is this policy itself):
/// terminates the recursion by dispatching unconditionally
template <int PTX_VERSION, typename PolicyT>
struct ChainedPolicy<PTX_VERSION, PolicyT, PolicyT>
{
    /// The policy for the active compiler pass
    typedef PolicyT ActivePolicy;

    /// Specializes and dispatches \p op with this (oldest) policy regardless
    /// of \p ptx_version
    template <typename FunctorT>
    CUB_RUNTIME_FUNCTION __forceinline__
    static cudaError_t Invoke(int ptx_version, FunctorT &op) {
        return op.template Invoke<PolicyT>();
    }
};
327 
328 
329 
330 
331 #endif // Do not document
332 
333 
334 
335  // end group UtilMgmt
337 
338 } // CUB namespace
339 CUB_NS_POSTFIX // Optional outer namespace(s)