util_allocator.cuh
/******************************************************************************
 * Copyright (c) 2011, Duane Merrill. All rights reserved.
 * Copyright (c) 2011-2016, NVIDIA CORPORATION. All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions are met:
 *     * Redistributions of source code must retain the above copyright
 *       notice, this list of conditions and the following disclaimer.
 *     * Redistributions in binary form must reproduce the above copyright
 *       notice, this list of conditions and the following disclaimer in the
 *       documentation and/or other materials provided with the distribution.
 *     * Neither the name of the NVIDIA CORPORATION nor the
 *       names of its contributors may be used to endorse or promote products
 *       derived from this software without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
 * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
 * DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY
 * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
 * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
 * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
 * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
 * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 *
 ******************************************************************************/

/******************************************************************************
 * Simple caching allocator for device memory allocations. The allocator is
 * thread-safe and capable of managing device allocations on multiple devices.
 ******************************************************************************/

#pragma once

#include "util_namespace.cuh"
#include "util_debug.cuh"

#include <set>
#include <map>

#include "host/mutex.cuh"
#include <math.h>

/// Optional outer namespace(s)
CUB_NS_PREFIX

/// CUB namespace
namespace cub {

/**
 * \addtogroup UtilMgmt
 * @{
 */

/******************************************************************************
 * CachingDeviceAllocator (host use)
 ******************************************************************************/

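/**
 * \brief A simple caching allocator for device memory allocations.
 *
 * \par Overview
 * (Summary reconstructed from the implementation below.) Allocation requests
 * are rounded up to the nearest power of \p bin_growth and are satisfied,
 * when possible, from a per-device pool of previously freed blocks. Reuse
 * across streams is made safe by a per-block ready event: a cached block is
 * only handed to a different stream once the stream that freed it has
 * progressed past the free point. Requests larger than \p max_bin_bytes
 * bypass the cache entirely.
 *
 * \par
 * Illustrative usage (the bin geometry below is an example, not a default):
 * \code
 * // Bins grow by 8x, from 8^5 (32 KB) to 8^7 (2 MB); cache up to 6 MB per device
 * CachingDeviceAllocator allocator(8, 5, 7, 6 * 1024 * 1024);
 *
 * void *d_scratch = NULL;
 * allocator.DeviceAllocate(&d_scratch, 40 * 1024);  // rounded up to bin 6 (8^6 = 256 KB)
 * // ... use d_scratch ...
 * allocator.DeviceFree(d_scratch);                  // recached for reuse, not cudaFree'd
 * \endcode
 */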
struct CachingDeviceAllocator
{

    //---------------------------------------------------------------------
    // Constants
    //---------------------------------------------------------------------

    /// Out-of-bounds bin
    static const unsigned int INVALID_BIN = (unsigned int) -1;

    /// Invalid size
    static const size_t INVALID_SIZE = (size_t) -1;

#ifndef DOXYGEN_SHOULD_SKIP_THIS    // Do not document

    /// Invalid device ordinal
    static const int INVALID_DEVICE_ORDINAL = -1;

    //---------------------------------------------------------------------
    // Type definitions and helper types
    //---------------------------------------------------------------------

    /// Descriptor for device memory allocations
    struct BlockDescriptor
    {
        void*           d_ptr;              // Device pointer
        size_t          bytes;              // Size of allocation in bytes
        unsigned int    bin;                // Bin enumeration
        int             device;             // Device ordinal
        cudaStream_t    associated_stream;  // Associated stream
        cudaEvent_t     ready_event;        // Signals when the associated stream has run to the point at which this block was freed

        // Constructor (suitable for searching maps for a specific block, given its pointer and device)
        BlockDescriptor(void *d_ptr, int device) :
            d_ptr(d_ptr),
            bytes(0),
            bin(INVALID_BIN),
            device(device),
            associated_stream(0),
            ready_event(0)
        {}

        // Constructor (suitable for searching maps for a range of suitable blocks, given a device)
        BlockDescriptor(int device) :
            d_ptr(NULL),
            bytes(0),
            bin(INVALID_BIN),
            device(device),
            associated_stream(0),
            ready_event(0)
        {}

        // Comparison functor for ordering by device pointer
        static bool PtrCompare(const BlockDescriptor &a, const BlockDescriptor &b)
        {
            if (a.device == b.device)
                return (a.d_ptr < b.d_ptr);
            else
                return (a.device < b.device);
        }

        // Comparison functor for ordering by allocation size
        static bool SizeCompare(const BlockDescriptor &a, const BlockDescriptor &b)
        {
            if (a.device == b.device)
                return (a.bytes < b.bytes);
            else
                return (a.device < b.device);
        }
    };

    /// BlockDescriptor comparator function interface
    typedef bool (*Compare)(const BlockDescriptor &, const BlockDescriptor &);

    /// Per-device aggregate byte counts
    class TotalBytes
    {
    public:
        size_t free;    // Bytes held in the cache, available for reuse
        size_t live;    // Bytes currently checked out to callers
        TotalBytes() { free = live = 0; }
    };

    /// Set type for cached blocks (ordered by size)
    typedef std::multiset<BlockDescriptor, Compare> CachedBlocks;

    /// Set type for live blocks (ordered by ptr)
    typedef std::multiset<BlockDescriptor, Compare> BusyBlocks;

    /// Map type of device ordinals to the number of cached bytes on each device
    typedef std::map<int, TotalBytes> GpuCachedBytes;

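    // Because CachedBlocks is ordered by SizeCompare, DeviceAllocate() can use
    // lower_bound() with a (device, rounded-size) key to jump straight to the
    // first cached block on that device big enough for the request. A sketch
    // of that lookup (the variable names here are illustrative):
    //
    //     BlockDescriptor key(device);
    //     key.bytes = rounded_bytes;     // size after bin rounding
    //     CachedBlocks::iterator it = cached_blocks.lower_bound(key);
    //
    // BusyBlocks uses PtrCompare instead, so DeviceFree() can find a live
    // block from just its (device, pointer) pair.
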
    //---------------------------------------------------------------------
    // Utility functions
    //---------------------------------------------------------------------

    /// Integer pow function for unsigned base and exponent
    static unsigned int IntPow(
        unsigned int base,
        unsigned int exp)
    {
        unsigned int retval = 1;
        while (exp > 0)
        {
            if (exp & 1) {
                retval = retval * base;     // Multiply the result by the current base
            }
            base = base * base;             // Square the base
            exp = exp >> 1;                 // Divide the exponent in half
        }
        return retval;
    }

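    // A quick check of the squaring loop above: IntPow(8, 3) accumulates 8
    // (for exponent bit 0) and then 8^2 = 64 (for exponent bit 1), returning
    // 8 * 64 = 512.
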
    /// Round up to the nearest power of \p base, returning the power and the rounded byte count
    static void NearestPowerOf(
        unsigned int    &power,
        size_t          &rounded_bytes,
        unsigned int    base,
        size_t          value)
    {
        power = 0;
        rounded_bytes = 1;

        while (rounded_bytes < value)
        {
            rounded_bytes *= base;
            power++;
        }
    }

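    // For instance, NearestPowerOf(power, rounded_bytes, 8, 1000) yields
    // power == 4 and rounded_bytes == 4096, since 8^3 = 512 < 1000 <= 8^4.
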
    //---------------------------------------------------------------------
    // Fields
    //---------------------------------------------------------------------

    cub::Mutex      mutex;              // Mutex for thread-safety

    unsigned int    bin_growth;         // Geometric growth factor for bin-sizes
    unsigned int    min_bin;            // Minimum bin enumeration
    unsigned int    max_bin;            // Maximum bin enumeration

    size_t          min_bin_bytes;      // Minimum bin size
    size_t          max_bin_bytes;      // Maximum bin size
    size_t          max_cached_bytes;   // Maximum aggregate cached bytes per device

    const bool      skip_cleanup;       // Whether to skip a call to FreeAllCached() when the destructor is called (the CUDA runtime may have already shut down for statically declared allocators)
    bool            debug;              // Whether to print (de)allocation events to stdout

    GpuCachedBytes  cached_bytes;       // Map of device ordinal to aggregate cached bytes on that device
    CachedBlocks    cached_blocks;      // Set of cached device allocations available for reuse
    BusyBlocks      live_blocks;        // Set of live device allocations currently in use

#endif // DOXYGEN_SHOULD_SKIP_THIS

    //---------------------------------------------------------------------
    // Methods
    //---------------------------------------------------------------------

    /**
     * \brief Constructor.
     */
    CachingDeviceAllocator(
        unsigned int    bin_growth,                         ///< Geometric growth factor for bin-sizes
        unsigned int    min_bin          = 1,               ///< Minimum bin
        unsigned int    max_bin          = INVALID_BIN,     ///< Maximum bin
        size_t          max_cached_bytes = INVALID_SIZE,    ///< Maximum aggregate cached bytes per device
        bool            skip_cleanup     = false,           ///< Whether to skip a call to FreeAllCached() when the destructor is called
        bool            debug            = false)           ///< Whether to print (de)allocation events to stdout
    :
        bin_growth(bin_growth),
        min_bin(min_bin),
        max_bin(max_bin),
        min_bin_bytes(IntPow(bin_growth, min_bin)),
        max_bin_bytes(IntPow(bin_growth, max_bin)),
        max_cached_bytes(max_cached_bytes),
        skip_cleanup(skip_cleanup),
        debug(debug),
        cached_blocks(BlockDescriptor::SizeCompare),
        live_blocks(BlockDescriptor::PtrCompare)
    {}


    /**
     * \brief Default constructor.
     */
    CachingDeviceAllocator(
        bool skip_cleanup = false,  ///< Whether to skip a call to FreeAllCached() when the destructor is called
        bool debug        = false)  ///< Whether to print (de)allocation events to stdout
    :
        bin_growth(8),
        min_bin(3),
        max_bin(7),
        min_bin_bytes(IntPow(bin_growth, min_bin)),
        max_bin_bytes(IntPow(bin_growth, max_bin)),
        max_cached_bytes((max_bin_bytes * 3) - 1),
        skip_cleanup(skip_cleanup),
        debug(debug),
        cached_blocks(BlockDescriptor::SizeCompare),
        live_blocks(BlockDescriptor::PtrCompare)
    {}

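    // With the defaults above, the geometry works out to:
    //   min_bin_bytes    = 8^3 = 512 bytes
    //   max_bin_bytes    = 8^7 = 2,097,152 bytes (2 MB)
    //   max_cached_bytes = (2 MB * 3) - 1 = 6,291,455 bytes (just under 6 MB per device)
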
    /**
     * \brief Sets the limit on the number of bytes this allocator is allowed to cache per device.
     */
    cudaError_t SetMaxCachedBytes(
        size_t max_cached_bytes)
    {
        // Lock
        mutex.Lock();

        if (debug) _CubLog("Changing max_cached_bytes (%lld -> %lld)\n", (long long) this->max_cached_bytes, (long long) max_cached_bytes);

        this->max_cached_bytes = max_cached_bytes;

        // Unlock
        mutex.Unlock();

        return cudaSuccess;
    }

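    // Note that lowering the ceiling does not evict blocks that are already
    // cached; an over-limit block is simply released back to the runtime the
    // next time DeviceFree() returns it (see the threshold test there).
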
    /**
     * \brief Provides a suitable allocation of device memory for the given size on the specified device.
     *
     * Once freed, the allocation becomes available immediately for reuse within the
     * \p active_stream with which it was associated during allocation, and it becomes
     * available for reuse within other streams once all prior work submitted to
     * \p active_stream has completed.
     */
    cudaError_t DeviceAllocate(
        int             device,             ///< [in] Device on which to place the allocation
        void            **d_ptr,            ///< [out] Reference to pointer to the allocation
        size_t          bytes,              ///< [in] Minimum number of bytes for the allocation
        cudaStream_t    active_stream = 0)  ///< [in] The stream to be associated with this allocation
    {
        *d_ptr = NULL;
        int entrypoint_device = INVALID_DEVICE_ORDINAL;
        cudaError_t error = cudaSuccess;

        if (device == INVALID_DEVICE_ORDINAL)
        {
            if (CubDebug(error = cudaGetDevice(&entrypoint_device))) return error;
            device = entrypoint_device;
        }

        // Create a block descriptor for the requested allocation
        bool found = false;
        BlockDescriptor search_key(device);
        search_key.associated_stream = active_stream;
        NearestPowerOf(search_key.bin, search_key.bytes, bin_growth, bytes);

        if (search_key.bin > max_bin)
        {
            // Bin is greater than our maximum bin: allocate the request
            // exactly and mark it with the out-of-bounds bin. It will not
            // be cached for reuse when returned.
            search_key.bin = INVALID_BIN;
            search_key.bytes = bytes;
        }
        else
        {
            // Search for a suitable cached allocation: lock
            mutex.Lock();

            if (search_key.bin < min_bin)
            {
                // Bin is less than minimum bin: round up
                search_key.bin = min_bin;
                search_key.bytes = min_bin_bytes;
            }

            // Iterate through the range of cached blocks on the same device in the same bin
            CachedBlocks::iterator block_itr = cached_blocks.lower_bound(search_key);
            while ((block_itr != cached_blocks.end())
                    && (block_itr->device == device)
                    && (block_itr->bin == search_key.bin))
            {
                // To prevent races with reusing blocks returned by the host but still
                // in use by the device, only consider cached blocks that are either
                // from the active stream or from an idle stream
                if ((active_stream == block_itr->associated_stream) ||
                    (cudaEventQuery(block_itr->ready_event) != cudaErrorNotReady))
                {
                    // Reuse existing cache block. Insert into live blocks.
                    found = true;
                    search_key = *block_itr;
                    search_key.associated_stream = active_stream;
                    live_blocks.insert(search_key);

                    // Grab the previous stream before erasing (erase invalidates block_itr)
                    cudaStream_t prev_stream = block_itr->associated_stream;

                    // Remove from free blocks
                    cached_blocks.erase(block_itr);
                    cached_bytes[device].free -= search_key.bytes;
                    cached_bytes[device].live += search_key.bytes;

                    if (debug) _CubLog("\tDevice %d reused cached block at %p (%lld bytes) for stream %lld (previously associated with stream %lld).\n",
                        device, search_key.d_ptr, (long long) search_key.bytes, (long long) search_key.associated_stream, (long long) prev_stream);

                    break;
                }
                block_itr++;
            }

            // Done searching: unlock
            mutex.Unlock();
        }

        // Allocate the block if necessary
        if (!found)
        {
            // Set runtime's current device to the specified device (entrypoint may not be set)
            if (device != entrypoint_device)
            {
                if (CubDebug(error = cudaGetDevice(&entrypoint_device))) return error;
                if (CubDebug(error = cudaSetDevice(device))) return error;
            }

            // Attempt to allocate
            if (CubDebug(error = cudaMalloc(&search_key.d_ptr, search_key.bytes)) == cudaErrorMemoryAllocation)
            {
                // The allocation attempt failed: free all cached blocks on this device and retry
                error = cudaSuccess;    // Reset the error
                if (debug) _CubLog("\tDevice %d failed to allocate %lld bytes for stream %lld, retrying after freeing cached allocations.\n",
                    device, (long long) search_key.bytes, (long long) search_key.associated_stream);

                // Lock
                mutex.Lock();

                // Iterate the range of free blocks on the same device
                BlockDescriptor free_key(device);
                CachedBlocks::iterator block_itr = cached_blocks.lower_bound(free_key);

                while ((block_itr != cached_blocks.end()) && (block_itr->device == device))
                {
                    // No need to worry about synchronization with the device: cudaFree is
                    // blocking and will synchronize across all kernels executing
                    // on the current device

                    // Free device memory and destroy the stream event
                    if (CubDebug(error = cudaFree(block_itr->d_ptr))) break;
                    if (CubDebug(error = cudaEventDestroy(block_itr->ready_event))) break;

                    // Reduce balance and erase entry. Grab the byte count first and
                    // erase with post-increment: erasing invalidates the iterator.
                    size_t block_bytes = block_itr->bytes;
                    cached_bytes[device].free -= block_bytes;
                    cached_blocks.erase(block_itr++);

                    if (debug) _CubLog("\tDevice %d freed %lld bytes.\n\t\t %lld available blocks cached (%lld bytes), %lld live blocks (%lld bytes) outstanding.\n",
                        device, (long long) block_bytes, (long long) cached_blocks.size(), (long long) cached_bytes[device].free, (long long) live_blocks.size(), (long long) cached_bytes[device].live);
                }

                // Unlock
                mutex.Unlock();

                // Return if under error
                if (error) return error;

                // Try to allocate again
                if (CubDebug(error = cudaMalloc(&search_key.d_ptr, search_key.bytes))) return error;
            }

            // Create ready event
            if (CubDebug(error = cudaEventCreateWithFlags(&search_key.ready_event, cudaEventDisableTiming)))
                return error;

            // Insert into live blocks
            mutex.Lock();
            live_blocks.insert(search_key);
            cached_bytes[device].live += search_key.bytes;
            mutex.Unlock();

            if (debug) _CubLog("\tDevice %d allocated new device block at %p (%lld bytes associated with stream %lld).\n",
                device, search_key.d_ptr, (long long) search_key.bytes, (long long) search_key.associated_stream);

            // Attempt to revert back to previous device if necessary
            if ((entrypoint_device != INVALID_DEVICE_ORDINAL) && (entrypoint_device != device))
            {
                if (CubDebug(error = cudaSetDevice(entrypoint_device))) return error;
            }
        }

        // Copy device pointer to output parameter
        *d_ptr = search_key.d_ptr;

        if (debug) _CubLog("\t\t%lld available blocks cached (%lld bytes), %lld live blocks outstanding (%lld bytes).\n",
            (long long) cached_blocks.size(), (long long) cached_bytes[device].free, (long long) live_blocks.size(), (long long) cached_bytes[device].live);

        return error;
    }


    /**
     * \brief Provides a suitable allocation of device memory for the given size on the current device.
     */
    cudaError_t DeviceAllocate(
        void            **d_ptr,            ///< [out] Reference to pointer to the allocation
        size_t          bytes,              ///< [in] Minimum number of bytes for the allocation
        cudaStream_t    active_stream = 0)  ///< [in] The stream to be associated with this allocation
    {
        return DeviceAllocate(INVALID_DEVICE_ORDINAL, d_ptr, bytes, active_stream);
    }

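    // An illustrative round-trip through the allocator (the kernel, sizes,
    // and stream here are hypothetical):
    //
    //     void         *d_scratch = NULL;
    //     cudaStream_t stream     = 0;
    //     allocator.DeviceAllocate(&d_scratch, 2048 * sizeof(int), stream);
    //     my_kernel<<<128, 256, 0, stream>>>(d_scratch);
    //     allocator.DeviceFree(d_scratch);  // recached for reuse (if under the
    //                                       // cache ceiling) rather than cudaFree'd
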
    /**
     * \brief Frees a live allocation of device memory on the specified device, returning it to the allocator.
     *
     * Once freed, the allocation becomes available immediately for reuse within the stream
     * with which it was associated during allocation, and it becomes available for reuse
     * within other streams once all prior work submitted to that stream has completed.
     */
    cudaError_t DeviceFree(
        int     device,     ///< [in] Device on which the allocation resides
        void*   d_ptr)      ///< [in] Device pointer to free
    {
        int entrypoint_device = INVALID_DEVICE_ORDINAL;
        cudaError_t error = cudaSuccess;

        if (device == INVALID_DEVICE_ORDINAL)
        {
            if (CubDebug(error = cudaGetDevice(&entrypoint_device)))
                return error;
            device = entrypoint_device;
        }

        // Lock
        mutex.Lock();

        // Find corresponding block descriptor
        bool recached = false;
        BlockDescriptor search_key(d_ptr, device);
        BusyBlocks::iterator block_itr = live_blocks.find(search_key);
        if (block_itr != live_blocks.end())
        {
            // Remove from live blocks
            search_key = *block_itr;
            live_blocks.erase(block_itr);
            cached_bytes[device].live -= search_key.bytes;

            // Keep the returned allocation if its bin is valid and we won't exceed the max cached threshold
            if ((search_key.bin != INVALID_BIN) && (cached_bytes[device].free + search_key.bytes <= max_cached_bytes))
            {
                // Insert returned allocation into free blocks
                recached = true;
                cached_blocks.insert(search_key);
                cached_bytes[device].free += search_key.bytes;

                if (debug) _CubLog("\tDevice %d returned %lld bytes from associated stream %lld.\n\t\t %lld available blocks cached (%lld bytes), %lld live blocks (%lld bytes) outstanding.\n",
                    device, (long long) search_key.bytes, (long long) search_key.associated_stream, (long long) cached_blocks.size(),
                    (long long) cached_bytes[device].free, (long long) live_blocks.size(), (long long) cached_bytes[device].live);
            }
        }

        // Unlock
        mutex.Unlock();

        // First set the runtime to the specified device (entrypoint may not be set)
        if (device != entrypoint_device)
        {
            if (CubDebug(error = cudaGetDevice(&entrypoint_device))) return error;
            if (CubDebug(error = cudaSetDevice(device))) return error;
        }

        if (recached)
        {
            // Insert the ready event in the associated stream (must have the current device set properly)
            if (CubDebug(error = cudaEventRecord(search_key.ready_event, search_key.associated_stream))) return error;
        }
        else
        {
            // Free the allocation from the runtime and clean up the event.
            if (CubDebug(error = cudaFree(d_ptr))) return error;
            if (CubDebug(error = cudaEventDestroy(search_key.ready_event))) return error;

            if (debug) _CubLog("\tDevice %d freed %lld bytes from associated stream %lld.\n\t\t %lld available blocks cached (%lld bytes), %lld live blocks (%lld bytes) outstanding.\n",
                device, (long long) search_key.bytes, (long long) search_key.associated_stream, (long long) cached_blocks.size(), (long long) cached_bytes[device].free, (long long) live_blocks.size(), (long long) cached_bytes[device].live);
        }

        // Reset device
        if ((entrypoint_device != INVALID_DEVICE_ORDINAL) && (entrypoint_device != device))
        {
            if (CubDebug(error = cudaSetDevice(entrypoint_device))) return error;
        }

        return error;
    }

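    // The ready event recorded above is what makes cross-stream reuse safe:
    // DeviceAllocate() will only hand a recached block to a different stream
    // once cudaEventQuery() on this event stops returning cudaErrorNotReady,
    // i.e. once the freeing stream has run past the point at which the block
    // was returned.
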
    /**
     * \brief Frees a live allocation of device memory on the current device, returning it to the allocator.
     */
    cudaError_t DeviceFree(
        void* d_ptr)    ///< [in] Device pointer to free
    {
        return DeviceFree(INVALID_DEVICE_ORDINAL, d_ptr);
    }

    /**
     * \brief Frees all cached device allocations on all devices.
     */
    cudaError_t FreeAllCached()
    {
        cudaError_t error = cudaSuccess;
        int entrypoint_device = INVALID_DEVICE_ORDINAL;
        int current_device = INVALID_DEVICE_ORDINAL;

        mutex.Lock();

        while (!cached_blocks.empty())
        {
            // Get first block
            CachedBlocks::iterator begin = cached_blocks.begin();

            // Get entry-point device ordinal if necessary
            if (entrypoint_device == INVALID_DEVICE_ORDINAL)
            {
                if (CubDebug(error = cudaGetDevice(&entrypoint_device))) break;
            }

            // Set current device ordinal if necessary
            if (begin->device != current_device)
            {
                if (CubDebug(error = cudaSetDevice(begin->device))) break;
                current_device = begin->device;
            }

            // Free device memory and destroy the stream event
            if (CubDebug(error = cudaFree(begin->d_ptr))) break;
            if (CubDebug(error = cudaEventDestroy(begin->ready_event))) break;

            // Reduce balance and erase entry. Grab the byte count first:
            // erasing invalidates the iterator.
            size_t block_bytes = begin->bytes;
            cached_bytes[current_device].free -= block_bytes;
            cached_blocks.erase(begin);

            if (debug) _CubLog("\tDevice %d freed %lld bytes.\n\t\t %lld available blocks cached (%lld bytes), %lld live blocks (%lld bytes) outstanding.\n",
                current_device, (long long) block_bytes, (long long) cached_blocks.size(), (long long) cached_bytes[current_device].free, (long long) live_blocks.size(), (long long) cached_bytes[current_device].live);
        }

        mutex.Unlock();

        // Attempt to revert back to entry-point device if necessary
        if (entrypoint_device != INVALID_DEVICE_ORDINAL)
        {
            if (CubDebug(error = cudaSetDevice(entrypoint_device))) return error;
        }

        return error;
    }

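    // One reason to construct with skip_cleanup = true: for an allocator with
    // static storage duration, the CUDA runtime may already have shut down by
    // the time the destructor below runs, making the cudaFree() calls in
    // FreeAllCached() unsafe.
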
    /// Destructor
    virtual ~CachingDeviceAllocator()
    {
        if (!skip_cleanup)
            FreeAllCached();
    }

};


/** @} */       // end group UtilMgmt

}               // CUB namespace
CUB_NS_POSTFIX  // Optional outer namespace(s)