util_device.cuh
/******************************************************************************
 * Copyright (c) 2011, Duane Merrill.  All rights reserved.
 * Copyright (c) 2011-2015, NVIDIA CORPORATION.  All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions are met:
 *     * Redistributions of source code must retain the above copyright
 *       notice, this list of conditions and the following disclaimer.
 *     * Redistributions in binary form must reproduce the above copyright
 *       notice, this list of conditions and the following disclaimer in the
 *       documentation and/or other materials provided with the distribution.
 *     * Neither the name of the NVIDIA CORPORATION nor the
 *       names of its contributors may be used to endorse or promote products
 *       derived from this software without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
 * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
 * DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY
 * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
 * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
 * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
 * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
 * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 *
 ******************************************************************************/

/**
 * \file
 * Properties of a given CUDA device and the corresponding PTX bundle
 */

#pragma once

#include "util_type.cuh"
#include "util_arch.cuh"
#include "util_debug.cuh"
#include "util_namespace.cuh"
#include "util_macro.cuh"

/// Optional outer namespace(s)
CUB_NS_PREFIX

/// CUB namespace
namespace cub {

/**
 * \addtogroup UtilMgmt
 * @{
 */

#ifndef DOXYGEN_SHOULD_SKIP_THIS    // Do not document

/**
 * Alias temporaries to externally-allocated device storage (or simply return
 * the amount of storage needed).
 */
template <int ALLOCATIONS>
CUB_RUNTIME_FUNCTION __forceinline__
cudaError_t AliasTemporaries(
    void    *d_temp_storage,                    ///< [in] Device allocation of temporary storage.  When NULL, the required allocation size is written to \p temp_storage_bytes and no work is done.
    size_t  &temp_storage_bytes,                ///< [in,out] Size in bytes of \p d_temp_storage allocation
    void*   (&allocations)[ALLOCATIONS],        ///< [in,out] Pointers to device sub-allocations needed
    size_t  (&allocation_sizes)[ALLOCATIONS])   ///< [in] Sizes in bytes of device sub-allocations needed
{
    const int ALIGN_BYTES   = 256;
    const int ALIGN_MASK    = ~(ALIGN_BYTES - 1);

    // Compute exclusive prefix sum over allocation requests
    size_t allocation_offsets[ALLOCATIONS];
    size_t bytes_needed = 0;
    for (int i = 0; i < ALLOCATIONS; ++i)
    {
        size_t allocation_bytes = (allocation_sizes[i] + ALIGN_BYTES - 1) & ALIGN_MASK;
        allocation_offsets[i] = bytes_needed;
        bytes_needed += allocation_bytes;
    }

    // Check if the caller is simply requesting the size of the storage allocation
    if (!d_temp_storage)
    {
        temp_storage_bytes = bytes_needed;
        return cudaSuccess;
    }

    // Check if enough storage provided
    if (temp_storage_bytes < bytes_needed)
    {
        return CubDebug(cudaErrorInvalidValue);
    }

    // Alias
    for (int i = 0; i < ALLOCATIONS; ++i)
    {
        allocations[i] = static_cast<char*>(d_temp_storage) + allocation_offsets[i];
    }

    return cudaSuccess;
}
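
/*
 * Usage sketch for AliasTemporaries (illustrative only; the buffer sizes and
 * the two-pass size-then-allocate pattern below are hypothetical):
 *
 *     void   *allocations[2];
 *     size_t allocation_sizes[2] = {100 * sizeof(int), 200 * sizeof(float)};
 *
 *     // Pass 1: d_temp_storage is NULL, so only the total size is computed
 *     void   *d_temp_storage    = NULL;
 *     size_t temp_storage_bytes = 0;
 *     AliasTemporaries(d_temp_storage, temp_storage_bytes, allocations, allocation_sizes);
 *
 *     // Pass 2: carve 256-byte-aligned sub-allocations out of one device buffer
 *     cudaMalloc(&d_temp_storage, temp_storage_bytes);
 *     AliasTemporaries(d_temp_storage, temp_storage_bytes, allocations, allocation_sizes);
 */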

/**
 * Empty kernel whose function attributes are queried to discover the PTX
 * version the program was compiled against
 */
template <typename T>
__global__ void EmptyKernel(void) { }

#endif  // DOXYGEN_SHOULD_SKIP_THIS

/**
 * Retrieves the PTX version that will be used on the current device (major * 100 + minor * 10)
 */
CUB_RUNTIME_FUNCTION __forceinline__ cudaError_t PtxVersion(int &ptx_version)
{
    struct Dummy
    {
        /// Type definition of the EmptyKernel kernel entry point
        typedef void (*EmptyKernelPtr)();

        /// Force EmptyKernel<void> to be generated if this class is used
        CUB_RUNTIME_FUNCTION __forceinline__
        EmptyKernelPtr Empty()
        {
            return EmptyKernel<void>;
        }
    };

#ifndef CUB_RUNTIME_ENABLED

    // CUDA API calls not supported from this device
    return cudaErrorInvalidConfiguration;

#elif (CUB_PTX_ARCH > 0)

    // Device pass: the compiler pass targeting a real architecture already
    // knows its own PTX version
    ptx_version = CUB_PTX_ARCH;
    return cudaSuccess;

#else

    // Host pass: query the attributes of EmptyKernel<void> for the PTX
    // version it was compiled against
    cudaError_t error = cudaSuccess;
    do
    {
        cudaFuncAttributes empty_kernel_attrs;
        if (CubDebug(error = cudaFuncGetAttributes(&empty_kernel_attrs, EmptyKernel<void>))) break;
        ptx_version = empty_kernel_attrs.ptxVersion * 10;
    }
    while (0);

    return error;

#endif
}
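
/*
 * Usage sketch for PtxVersion (illustrative host-side caller):
 *
 *     int ptx_version;
 *     cudaError_t error = PtxVersion(ptx_version);
 *     // e.g., ptx_version == 520 for a kernel bundle compiled to PTX 5.2
 */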

/**
 * Retrieves the SM version of the given device (major * 100 + minor * 10)
 */
CUB_RUNTIME_FUNCTION __forceinline__ cudaError_t SmVersion(int &sm_version, int device_ordinal)
{
#ifndef CUB_RUNTIME_ENABLED

    // CUDA API calls not supported from this device
    return cudaErrorInvalidConfiguration;

#else

    cudaError_t error = cudaSuccess;
    do
    {
        // Fill in SM version
        int major, minor;
        if (CubDebug(error = cudaDeviceGetAttribute(&major, cudaDevAttrComputeCapabilityMajor, device_ordinal))) break;
        if (CubDebug(error = cudaDeviceGetAttribute(&minor, cudaDevAttrComputeCapabilityMinor, device_ordinal))) break;
        sm_version = major * 100 + minor * 10;
    }
    while (0);

    return error;

#endif
}
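
/*
 * Usage sketch for SmVersion (illustrative; assumes device ordinal 0):
 *
 *     int device_ordinal = 0;
 *     int sm_version;
 *     cudaError_t error = SmVersion(sm_version, device_ordinal);
 *     // e.g., sm_version == 350 on a compute-capability 3.5 device
 */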

#ifndef DOXYGEN_SHOULD_SKIP_THIS    // Do not document

/**
 * Synchronize the stream if specified, otherwise synchronize the device
 */
CUB_RUNTIME_FUNCTION __forceinline__
static cudaError_t SyncStream(cudaStream_t stream)
{
#if (CUB_PTX_ARCH == 0)
    // Host pass: synchronize on the specific stream
    return cudaStreamSynchronize(stream);
#else
    // Device pass: the device-side runtime can't yet sync on a specific stream
    return cudaDeviceSynchronize();
#endif
}

/**
 * Computes maximum SM occupancy in thread blocks for executing the given
 * kernel function pointer \p kernel_ptr on the current device with
 * \p block_threads per thread block.
 */
template <typename KernelPtr>
CUB_RUNTIME_FUNCTION __forceinline__
cudaError_t MaxSmOccupancy(
    int         &max_sm_occupancy,          ///< [out] Maximum number of thread blocks that can reside on a single SM
    KernelPtr   kernel_ptr,                 ///< [in] Kernel pointer for which to estimate occupancy
    int         block_threads,              ///< [in] Number of threads per thread block
    int         dynamic_smem_bytes = 0)     ///< [in] Dynamically-allocated shared memory in bytes.  Default is 0.
{
#ifndef CUB_RUNTIME_ENABLED

    // CUDA API calls not supported from this device
    return CubDebug(cudaErrorInvalidConfiguration);

#else

    return cudaOccupancyMaxActiveBlocksPerMultiprocessor(
        &max_sm_occupancy,
        kernel_ptr,
        block_threads,
        dynamic_smem_bytes);

#endif  // CUB_RUNTIME_ENABLED
}
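
/*
 * Usage sketch for MaxSmOccupancy (illustrative; ExampleKernel is a
 * hypothetical kernel written for this example):
 *
 *     template <typename T>
 *     __global__ void ExampleKernel()
 *     {
 *         // Allocate shared memory
 *         __shared__ volatile T buffer[4096];
 *         ...
 *     }
 *
 *     // Determine SM occupancy for ExampleKernel specialized for unsigned char
 *     int max_sm_occupancy;
 *     MaxSmOccupancy(max_sm_occupancy, ExampleKernel<unsigned char>, 64);
 *
 *     // max_sm_occupancy now holds the maximum number of resident blocks per SM
 */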

/******************************************************************************
 * Policy management
 ******************************************************************************/

/**
 * Kernel dispatch configuration
 */
struct KernelConfig
{
    int block_threads;
    int items_per_thread;
    int tile_size;
    int sm_occupancy;

    CUB_RUNTIME_FUNCTION __forceinline__
    KernelConfig() : block_threads(0), items_per_thread(0), tile_size(0), sm_occupancy(0) {}

    template <typename AgentPolicyT, typename KernelPtrT>
    CUB_RUNTIME_FUNCTION __forceinline__
    cudaError_t Init(KernelPtrT kernel_ptr)
    {
        block_threads       = AgentPolicyT::BLOCK_THREADS;
        items_per_thread    = AgentPolicyT::ITEMS_PER_THREAD;
        tile_size           = block_threads * items_per_thread;
        return MaxSmOccupancy(sm_occupancy, kernel_ptr, block_threads);
    }
};
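
/*
 * Usage sketch for KernelConfig (illustrative; MyAgentPolicy and my_kernel are
 * hypothetical stand-ins for a tuning policy type and its kernel):
 *
 *     KernelConfig config;
 *     cudaError_t error = config.Init<MyAgentPolicy>(my_kernel);
 *     // config.tile_size is the number of items consumed per thread block;
 *     // config.sm_occupancy is the number of resident blocks per SM
 */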

/// Helper for dispatching into a policy chain
template <int PTX_VERSION, typename PolicyT, typename PrevPolicyT>
struct ChainedPolicy
{
    /// The policy for the active compiler pass
    typedef typename If<(CUB_PTX_ARCH < PTX_VERSION), typename PrevPolicyT::ActivePolicy, PolicyT>::Type ActivePolicy;

    /// Specializes and dispatches op in accordance to the first policy in the chain of adequate PTX version
    template <typename FunctorT>
    CUB_RUNTIME_FUNCTION __forceinline__
    static cudaError_t Invoke(int ptx_version, FunctorT &op)
    {
        if (ptx_version < PTX_VERSION) {
            // This policy is too new for the target device; defer to the
            // previous (older) policy in the chain
            return PrevPolicyT::Invoke(ptx_version, op);
        }
        return op.template Invoke<PolicyT>();
    }
};

/// Helper for dispatching into a policy chain (end-of-chain specialization)
template <int PTX_VERSION, typename PolicyT>
struct ChainedPolicy<PTX_VERSION, PolicyT, PolicyT>
{
    /// The policy for the active compiler pass
    typedef PolicyT ActivePolicy;

    /// Specializes and dispatches op in accordance to the first policy in the chain of adequate PTX version
    template <typename FunctorT>
    CUB_RUNTIME_FUNCTION __forceinline__
    static cudaError_t Invoke(int ptx_version, FunctorT &op) {
        return op.template Invoke<PolicyT>();
    }
};
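
/*
 * Usage sketch for ChainedPolicy (illustrative; Policy200, Policy350, and
 * DispatchFunctor are hypothetical):
 *
 *     // Chain of tuning policies, oldest architecture at the root.  The
 *     // end-of-chain policy names itself as its own predecessor.
 *     struct Policy200 : ChainedPolicy<200, Policy200, Policy200> { ... };
 *     struct Policy350 : ChainedPolicy<350, Policy350, Policy200> { ... };
 *     typedef Policy350 MaxPolicy;
 *
 *     // At run time, walk the chain to the newest policy not exceeding the
 *     // target device's PTX version, then call op.Invoke<ThatPolicy>()
 *     int ptx_version;
 *     PtxVersion(ptx_version);
 *     DispatchFunctor op;
 *     MaxPolicy::Invoke(ptx_version, op);
 */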

#endif  // Do not document

/** @} */       // end group UtilMgmt

}               // CUB namespace
CUB_NS_POSTFIX  // Optional outer namespace(s)