util_allocator.cuh
/******************************************************************************
 * Copyright (c) 2011, Duane Merrill.  All rights reserved.
 * Copyright (c) 2011-2015, NVIDIA CORPORATION.  All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions are met:
 *     * Redistributions of source code must retain the above copyright
 *       notice, this list of conditions and the following disclaimer.
 *     * Redistributions in binary form must reproduce the above copyright
 *       notice, this list of conditions and the following disclaimer in the
 *       documentation and/or other materials provided with the distribution.
 *     * Neither the name of the NVIDIA CORPORATION nor the
 *       names of its contributors may be used to endorse or promote products
 *       derived from this software without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
 * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
 * DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY
 * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
 * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
 * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
 * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
 * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 *
 ******************************************************************************/

/******************************************************************************
 * Simple caching allocator for device memory allocations. The allocator is
 * thread-safe and capable of managing device allocations on multiple devices.
 ******************************************************************************/
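
/******************************************************************************
 * Usage sketch (hypothetical variable names, assuming the default-constructed
 * allocator defined below, whose bins grow geometrically by a factor of 8):
 *
 *     cub::CachingDeviceAllocator allocator;
 *     void *d_scratch = NULL;
 *
 *     // First request cudaMallocs a 4KB bin; later requests that round to the
 *     // same bin on the same stream (or a stream that has caught up to the
 *     // block's ready event) reuse the cached block without calling cudaMalloc
 *     allocator.DeviceAllocate(&d_scratch, 4096);
 *     // ... launch kernels using d_scratch on the default stream ...
 *     allocator.DeviceFree(d_scratch);    // returns the block to the cache
 ******************************************************************************/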

#pragma once

#if (CUB_PTX_ARCH == 0)
    #include <set>
    #include <map>      // NVCC (EDG, really) takes FOREVER to compile std::map
#endif

#include <math.h>

#include "util_namespace.cuh"
#include "util_debug.cuh"

#include "host/spinlock.cuh"

/// Optional outer namespace(s)
CUB_NS_PREFIX

/// CUB namespace
namespace cub {


/******************************************************************************
 * CachingDeviceAllocator (host use)
 ******************************************************************************/

/**
 * \brief A simple caching allocator for device memory allocations.
 */
struct CachingDeviceAllocator
{
#ifndef DOXYGEN_SHOULD_SKIP_THIS    // Do not document


    //---------------------------------------------------------------------
    // Type definitions and constants
    //---------------------------------------------------------------------

    enum
    {
        /// Invalid/unspecified device ordinal
        INVALID_DEVICE_ORDINAL = -1,
    };

    /**
     * Integer pow function for unsigned base and exponent, computed by
     * repeated squaring (e.g., IntPow(8, 3) == 512)
     */
    static unsigned int IntPow(
        unsigned int base,
        unsigned int exp)
    {
        unsigned int retval = 1;
        while (exp > 0)
        {
            if (exp & 1) {
                retval = retval * base;     // multiply the result by the current base
            }
            base = base * base;             // square the base
            exp = exp >> 1;                 // divide the exponent in half
        }
        return retval;
    }


    /**
     * Round up to the nearest power of the given base, returning both the
     * power and the rounded-up size in bytes
     */
    static void NearestPowerOf(
        unsigned int    &power,
        size_t          &rounded_bytes,
        unsigned int    base,
        size_t          value)
    {
        power = 0;
        rounded_bytes = 1;

        while (rounded_bytes < value)
        {
            rounded_bytes *= base;
            power++;
        }
    }
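
    // Example: with base = 8 (the default bin_growth), a request for 1,000 bytes
    // yields power = 4 and rounded_bytes = 4,096, since 8^3 = 512 < 1,000 <= 8^4 = 4,096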

    /**
     * Descriptor for device memory allocations
     */
    struct BlockDescriptor
    {
        int             device;             // Device ordinal
        void*           d_ptr;              // Device pointer
        cudaStream_t    associated_stream;  // Associated stream
        cudaEvent_t     ready_event;        // Signals when the associated stream has run to the point at which this block was freed
        size_t          bytes;              // Size of allocation in bytes
        unsigned int    bin;                // Bin enumeration

        // Constructor (descriptor for a live allocation with a known pointer)
        BlockDescriptor(void *d_ptr, int device) :
            device(device),
            d_ptr(d_ptr),
            associated_stream(0),
            ready_event(0),
            bytes(0),
            bin(0)
        {}

        // Constructor (search key for an allocation request)
        BlockDescriptor(size_t bytes, unsigned int bin, int device, cudaStream_t associated_stream) :
            device(device),
            d_ptr(NULL),
            associated_stream(associated_stream),
            ready_event(0),
            bytes(bytes),
            bin(bin)
        {}

        // Comparison functor for comparing device pointers
        static bool PtrCompare(const BlockDescriptor &a, const BlockDescriptor &b)
        {
            if (a.device == b.device)
                return (a.d_ptr < b.d_ptr);
            else
                return (a.device < b.device);
        }

        // Comparison functor for comparing allocation sizes
        static bool SizeCompare(const BlockDescriptor &a, const BlockDescriptor &b)
        {
            if (a.device == b.device)
                return (a.bytes < b.bytes);
            else
                return (a.device < b.device);
        }
    };

    /// BlockDescriptor comparator function interface
    typedef bool (*Compare)(const BlockDescriptor &, const BlockDescriptor &);

#if (CUB_PTX_ARCH == 0)     // Only define STL container members in host code

    /// Set type for cached blocks (ordered by size)
    typedef std::multiset<BlockDescriptor, Compare> CachedBlocks;

    /// Set type for live blocks (ordered by ptr)
    typedef std::multiset<BlockDescriptor, Compare> BusyBlocks;

    /// Map type of device ordinals to the number of cached bytes cached by each device
    typedef std::map<int, size_t> GpuCachedBytes;

#endif // CUB_PTX_ARCH
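
    // Design note: free blocks are kept in a multiset ordered by (device, bytes),
    // so DeviceAllocate can lower_bound() a search key to find the smallest cached
    // block of sufficient size within the request's bin on the right device. Live
    // blocks are ordered by (device, d_ptr), so DeviceFree can find a block's
    // descriptor given only the pointer the caller hands back.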

    //---------------------------------------------------------------------
    // Fields
    //---------------------------------------------------------------------

    Spinlock        spin_lock;          // Spinlock for thread-safety

    unsigned int    bin_growth;         // Geometric growth factor for bin-sizes
    unsigned int    min_bin;            // Minimum bin enumeration
    unsigned int    max_bin;            // Maximum bin enumeration

    size_t          min_bin_bytes;      // Minimum bin size
    size_t          max_bin_bytes;      // Maximum bin size
    size_t          max_cached_bytes;   // Maximum aggregate cached bytes per device

    bool            debug;              // Whether to print (de)allocation events to stdout
    bool            skip_cleanup;       // Whether to skip FreeAllCached() in the destructor

#if (CUB_PTX_ARCH == 0)     // Only define STL container members in host code

    GpuCachedBytes  cached_bytes;       // Aggregate cached bytes, per device
    CachedBlocks    cached_blocks;      // Set of cached device allocations available for reuse
    BusyBlocks      live_blocks;        // Set of live device allocations currently in use

#endif // CUB_PTX_ARCH

#endif // DOXYGEN_SHOULD_SKIP_THIS

    //---------------------------------------------------------------------
    // Methods
    //---------------------------------------------------------------------

    /**
     * \brief Constructor.
     */
    CachingDeviceAllocator(
        unsigned int    bin_growth,             // Geometric growth factor for bin-sizes
        unsigned int    min_bin,                // Minimum bin
        unsigned int    max_bin,                // Maximum bin
        size_t          max_cached_bytes,       // Maximum aggregate cached bytes per device
        bool            skip_cleanup = false)   // Whether to skip FreeAllCached() in the destructor
    :
    #if (CUB_PTX_ARCH == 0)     // Only define STL container members in host code
        cached_blocks(BlockDescriptor::SizeCompare),
        live_blocks(BlockDescriptor::PtrCompare),
    #endif
        skip_cleanup(skip_cleanup),     // previously left uninitialized by this constructor
        debug(false),
        spin_lock(0),
        bin_growth(bin_growth),
        min_bin(min_bin),
        max_bin(max_bin),
        min_bin_bytes(IntPow(bin_growth, min_bin)),
        max_bin_bytes(IntPow(bin_growth, max_bin)),
        max_cached_bytes(max_cached_bytes)
    {}


    /**
     * \brief Default constructor.
     */
    CachingDeviceAllocator(
        bool skip_cleanup = false)      // Whether to skip FreeAllCached() in the destructor
    :
    #if (CUB_PTX_ARCH == 0)     // Only define STL container members in host code
        cached_blocks(BlockDescriptor::SizeCompare),
        live_blocks(BlockDescriptor::PtrCompare),
    #endif
        skip_cleanup(skip_cleanup),
        debug(false),
        spin_lock(0),
        bin_growth(8),
        min_bin(3),
        max_bin(7),
        min_bin_bytes(IntPow(bin_growth, min_bin)),
        max_bin_bytes(IntPow(bin_growth, max_bin)),
        max_cached_bytes((max_bin_bytes * 3) - 1)
    {}
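
    // With these defaults (growth 8, bins 3..7), the cached bin sizes are
    // 8^3..8^7 bytes, i.e., 512B, 4KB, 32KB, 256KB, and 2MB, and at most
    // (max_bin_bytes * 3) - 1 = 6,291,455 bytes (just under 6MB) are cached per device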

    /**
     * \brief Sets the limit on the number of bytes this allocator is allowed to cache per device.
     */
    cudaError_t SetMaxCachedBytes(
        size_t max_cached_bytes)
    {
    #if (CUB_PTX_ARCH > 0)
        // Caching functionality only defined on host
        return CubDebug(cudaErrorInvalidConfiguration);
    #else

        // Lock
        Lock(&spin_lock);

        this->max_cached_bytes = max_cached_bytes;

        if (debug) _CubLog("New max_cached_bytes(%lld)\n", (long long) max_cached_bytes);

        // Unlock
        Unlock(&spin_lock);

        return cudaSuccess;

    #endif // CUB_PTX_ARCH
    }
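
    // For example, allocator.SetMaxCachedBytes(size_t(1) << 22) caps the cache
    // at 4MB per device; blocks returned beyond that limit are cudaFreed instead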

    /**
     * \brief Provides a suitable allocation of device memory for the given size on the specified device.
     */
    cudaError_t DeviceAllocate(
        int             device,
        void            **d_ptr,
        size_t          bytes,
        cudaStream_t    active_stream = 0)
    {
    #if (CUB_PTX_ARCH > 0)
        // Caching functionality only defined on host
        return CubDebug(cudaErrorInvalidConfiguration);
    #else

        *d_ptr                  = NULL;
        bool locked             = false;
        int entrypoint_device   = INVALID_DEVICE_ORDINAL;
        cudaError_t error       = cudaSuccess;

        do {

            if (CubDebug(error = cudaGetDevice(&entrypoint_device))) break;
            if (device == INVALID_DEVICE_ORDINAL)
                device = entrypoint_device;

            // Round up to nearest bin size
            unsigned int bin;
            size_t bin_bytes;
            NearestPowerOf(bin, bin_bytes, bin_growth, bytes);
            if (bin < min_bin) {
                bin = min_bin;
                bin_bytes = min_bin_bytes;
            }

            // Check if bin is greater than our maximum bin
            if (bin > max_bin)
            {
                // Allocate the request exactly and give it an out-of-range bin
                bin = (unsigned int) -1;
                bin_bytes = bytes;
            }

            BlockDescriptor search_key(bin_bytes, bin, device, active_stream);

            // Lock
            if (!locked) {
                Lock(&spin_lock);
                locked = true;
            }

            // Find the first cached block of sufficient size within the same bin on the same device
            CachedBlocks::iterator block_itr = cached_blocks.lower_bound(search_key);

            // Look for cached blocks freed by the active stream or by other, now-idle streams
            bool found = false;
            while ((block_itr != cached_blocks.end()) &&
                (block_itr->device == device) &&
                (block_itr->bin == search_key.bin))
            {
                cudaStream_t prev_stream = block_itr->associated_stream;
                if ((active_stream == prev_stream) || (cudaEventQuery(block_itr->ready_event) != cudaErrorNotReady))
                {
                    // Reuse existing cache block. Insert into live blocks.
                    found = true;
                    search_key = *block_itr;
                    search_key.associated_stream = active_stream;
                    live_blocks.insert(search_key);

                    // Remove from free blocks
                    cached_blocks.erase(block_itr);
                    cached_bytes[device] -= search_key.bytes;

                    if (debug) _CubLog("\tdevice %d reused cached block for stream %lld (%lld bytes, previously associated with stream %lld).\n\t\t %lld available blocks cached (%lld bytes), %lld live blocks outstanding.\n",
                        device, (long long) active_stream, (long long) search_key.bytes, (long long) prev_stream, (long long) cached_blocks.size(), (long long) cached_bytes[device], (long long) live_blocks.size());

                    break;
                }

                block_itr++;
            }

            if (!found)
            {
                // Need to allocate a new cache block. Unlock.
                if (locked) {
                    Unlock(&spin_lock);
                    locked = false;
                }

                // Set to specified device
                if (device != entrypoint_device) {
                    if (CubDebug(error = cudaSetDevice(device))) break;
                }

                // Allocate
                if (CubDebug(error = cudaMalloc(&search_key.d_ptr, search_key.bytes))) break;
                if (CubDebug(error = cudaEventCreateWithFlags(&search_key.ready_event, cudaEventDisableTiming))) break;

                // Lock
                if (!locked) {
                    Lock(&spin_lock);
                    locked = true;
                }

                // Insert into live blocks
                live_blocks.insert(search_key);

                if (debug) _CubLog("\tdevice %d allocating new device block %lld bytes associated with stream %lld.\n\t\t %lld available blocks cached (%lld bytes), %lld live blocks outstanding.\n",
                    device, (long long) search_key.bytes, (long long) search_key.associated_stream, (long long) cached_blocks.size(), (long long) cached_bytes[device], (long long) live_blocks.size());
            }

            // Copy device pointer to output parameter
            *d_ptr = search_key.d_ptr;

        } while(0);

        // Unlock
        if (locked) {
            Unlock(&spin_lock);
            locked = false;
        }

        // Attempt to revert back to previous device if necessary
        if ((entrypoint_device != INVALID_DEVICE_ORDINAL) && (entrypoint_device != device))
        {
            if (CubDebug(error = cudaSetDevice(entrypoint_device))) return error;
        }

        return error;

    #endif // CUB_PTX_ARCH
    }
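
    // For example, allocator.DeviceAllocate(1, &d_ptr, bytes, stream) serves the
    // request from device 1's pool regardless of which device is current on the
    // calling thread, temporarily switching devices (and switching back) as needed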

    /**
     * \brief Provides a suitable allocation of device memory for the given size on the current device.
     */
    cudaError_t DeviceAllocate(
        void            **d_ptr,
        size_t          bytes,
        cudaStream_t    active_stream = 0)
    {
    #if (CUB_PTX_ARCH > 0)
        // Caching functionality only defined on host
        return CubDebug(cudaErrorInvalidConfiguration);
    #else
        return DeviceAllocate(INVALID_DEVICE_ORDINAL, d_ptr, bytes, active_stream);
    #endif // CUB_PTX_ARCH
    }


    /**
     * \brief Frees a live allocation of device memory on the specified device, returning it to the allocator.
     */
    cudaError_t DeviceFree(
        int     device,
        void*   d_ptr)
    {
    #if (CUB_PTX_ARCH > 0)
        // Caching functionality only defined on host
        return CubDebug(cudaErrorInvalidConfiguration);
    #else

        bool locked             = false;
        int entrypoint_device   = INVALID_DEVICE_ORDINAL;
        cudaError_t error       = cudaSuccess;

        do {
            if (CubDebug(error = cudaGetDevice(&entrypoint_device))) break;
            if (device == INVALID_DEVICE_ORDINAL)
                device = entrypoint_device;

            // Set to specified device
            if (device != entrypoint_device) {
                if (CubDebug(error = cudaSetDevice(device))) break;
            }

            // Lock
            if (!locked) {
                Lock(&spin_lock);
                locked = true;
            }

            // Find corresponding block descriptor
            BlockDescriptor search_key(d_ptr, device);
            BusyBlocks::iterator block_itr = live_blocks.find(search_key);
            if (block_itr == live_blocks.end())
            {
                // Cannot find pointer
                if (CubDebug(error = cudaErrorUnknown)) break;
            }
            else
            {
                // Remove from live blocks
                search_key = *block_itr;
                live_blocks.erase(block_itr);

                // Keep the returned allocation if it won't exceed the per-device cache limit
                if (cached_bytes[device] + search_key.bytes <= max_cached_bytes)
                {
                    // Signal the event in the associated stream
                    if (CubDebug(error = cudaEventRecord(search_key.ready_event, search_key.associated_stream))) break;

                    // Insert returned allocation into free blocks
                    cached_blocks.insert(search_key);
                    cached_bytes[device] += search_key.bytes;

                    if (debug) _CubLog("\tdevice %d returned %lld bytes from associated stream %lld.\n\t\t %lld available blocks cached (%lld bytes), %lld live blocks outstanding.\n",
                        device, (long long) search_key.bytes, (long long) search_key.associated_stream, (long long) cached_blocks.size(), (long long) cached_bytes[device], (long long) live_blocks.size());
                }
                else
                {
                    // Free the returned allocation. Unlock.
                    if (locked) {
                        Unlock(&spin_lock);
                        locked = false;
                    }

                    // Free device memory
                    if (CubDebug(error = cudaFree(d_ptr))) break;
                    if (CubDebug(error = cudaEventDestroy(search_key.ready_event))) break;

                    if (debug) _CubLog("\tdevice %d freed %lld bytes from associated stream %lld.\n\t\t %lld available blocks cached (%lld bytes), %lld live blocks outstanding.\n",
                        device, (long long) search_key.bytes, (long long) search_key.associated_stream, (long long) cached_blocks.size(), (long long) cached_bytes[device], (long long) live_blocks.size());
                }
            }
        } while (0);

        // Unlock
        if (locked) {
            Unlock(&spin_lock);
            locked = false;
        }

        // Attempt to revert back to entry-point device if necessary
        if ((entrypoint_device != INVALID_DEVICE_ORDINAL) && (entrypoint_device != device))
        {
            if (CubDebug(error = cudaSetDevice(entrypoint_device))) return error;
        }

        return error;

    #endif // CUB_PTX_ARCH
    }


    /**
     * \brief Frees a live allocation of device memory on the current device, returning it to the allocator.
     */
    cudaError_t DeviceFree(
        void* d_ptr)
    {
    #if (CUB_PTX_ARCH > 0)
        // Caching functionality only defined on host
        return CubDebug(cudaErrorInvalidConfiguration);
    #else
        return DeviceFree(INVALID_DEVICE_ORDINAL, d_ptr);
    #endif // CUB_PTX_ARCH
    }


    /**
     * \brief Frees all cached device allocations on all devices.
     */
    cudaError_t FreeAllCached()
    {
    #if (CUB_PTX_ARCH > 0)
        // Caching functionality only defined on host
        return CubDebug(cudaErrorInvalidConfiguration);
    #else

        cudaError_t error       = cudaSuccess;
        bool locked             = false;
        int entrypoint_device   = INVALID_DEVICE_ORDINAL;
        int current_device      = INVALID_DEVICE_ORDINAL;

        // Lock
        if (!locked) {
            Lock(&spin_lock);
            locked = true;
        }

        while (!cached_blocks.empty())
        {
            // Get first block
            CachedBlocks::iterator begin = cached_blocks.begin();

            // Get entry-point device ordinal if necessary
            if (entrypoint_device == INVALID_DEVICE_ORDINAL)
            {
                if (CubDebug(error = cudaGetDevice(&entrypoint_device))) break;
            }

            // Set current device ordinal if necessary
            if (begin->device != current_device)
            {
                if (CubDebug(error = cudaSetDevice(begin->device))) break;
                current_device = begin->device;
            }

            // Free device memory
            if (CubDebug(error = cudaFree(begin->d_ptr))) break;
            if (CubDebug(error = cudaEventDestroy(begin->ready_event))) break;

            // Reduce balance and erase entry (copy the size out first: the
            // iterator is invalidated by the erase)
            size_t block_bytes = begin->bytes;
            cached_bytes[current_device] -= block_bytes;
            cached_blocks.erase(begin);

            if (debug) _CubLog("\tdevice %d freed %lld bytes.\n\t\t %lld available blocks cached (%lld bytes), %lld live blocks outstanding.\n",
                current_device, (long long) block_bytes, (long long) cached_blocks.size(), (long long) cached_bytes[current_device], (long long) live_blocks.size());
        }

        // Unlock
        if (locked) {
            Unlock(&spin_lock);
            locked = false;
        }

        // Attempt to revert back to entry-point device if necessary
        if (entrypoint_device != INVALID_DEVICE_ORDINAL)
        {
            if (CubDebug(error = cudaSetDevice(entrypoint_device))) return error;
        }

        return error;

    #endif // CUB_PTX_ARCH
    }


    /**
     * \brief Destructor.
     */
    virtual ~CachingDeviceAllocator()
    {
        if (!skip_cleanup)
            FreeAllCached();
    }

};



/** @} */       // end group UtilMgmt

}               // CUB namespace
CUB_NS_POSTFIX  // Optional outer namespace(s)