#include <set>
#include <map>

#include "util_namespace.cuh"
#include "util_debug.cuh"

#include "host/mutex.cuh"
#ifndef DOXYGEN_SHOULD_SKIP_THIS    // Do not document

/// Invalid device ordinal
static const int INVALID_DEVICE_ORDINAL = -1;
/**
 * Descriptor for device memory allocations
 */
struct BlockDescriptor
{
    void*           d_ptr;              // Device pointer
    size_t          bytes;              // Size of allocation in bytes
    unsigned int    bin;                // Bin enumeration
    int             device;             // Device ordinal
    cudaStream_t    associated_stream;  // Associated stream
    cudaEvent_t     ready_event;        // Signals when the associated stream has caught up to the point at which this block was freed

    // Constructor (suitable for searching for a specific block, given its pointer and device)
    BlockDescriptor(void *d_ptr, int device) :
        d_ptr(d_ptr), bytes(0), bin(INVALID_BIN), device(device),
        associated_stream(0), ready_event(0)
    {}

    // Constructor (suitable for searching for a range of suitable blocks, given a device)
    BlockDescriptor(int device) :
        d_ptr(NULL), bytes(0), bin(INVALID_BIN), device(device),
        associated_stream(0), ready_event(0)
    {}
    // Comparison functor for comparing device pointers
    static bool PtrCompare(const BlockDescriptor &a, const BlockDescriptor &b)
    {
        if (a.device == b.device)
            return (a.d_ptr < b.d_ptr);
        else
            return (a.device < b.device);
    }

    // Comparison functor for comparing allocation sizes
    static bool SizeCompare(const BlockDescriptor &a, const BlockDescriptor &b)
    {
        if (a.device == b.device)
            return (a.bytes < b.bytes);
        else
            return (a.device < b.device);
    }
};

/// BlockDescriptor comparator function interface
typedef bool (*Compare)(const BlockDescriptor &, const BlockDescriptor &);
/// Aggregate free and live byte counts for a device
struct TotalBytes
{
    size_t free;
    size_t live;
    TotalBytes() { free = live = 0; }
};

typedef std::multiset<BlockDescriptor, Compare> CachedBlocks;   // Cached blocks, ordered by size
typedef std::multiset<BlockDescriptor, Compare> BusyBlocks;     // Live blocks, ordered by pointer
typedef std::map<int, TotalBytes> GpuCachedBytes;               // Device ordinal -> cached/live byte totals
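
// Design note: the constructors below order cached_blocks with SizeCompare so that
// DeviceAllocate can find the smallest suitable cached block with a lower_bound search,
// and order live_blocks with PtrCompare so that DeviceFree can look up an outstanding
// allocation by its raw device pointer in logarithmic time.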
// Integer pow function for unsigned base and exponent (exponentiation by squaring)
static unsigned int IntPow(unsigned int base, unsigned int exp)
{
    unsigned int retval = 1;
    while (exp > 0)
    {
        if (exp & 1) retval = retval * base;    // Multiply in the current base when the low bit of exp is set
        base = base * base;
        exp >>= 1;
    }
    return retval;
}

// Round up to the nearest power-of-base, returning both the power and the rounded byte count
static void NearestPowerOf(unsigned int &power, size_t &rounded_bytes, unsigned int base, size_t value)
{
    power = 0;
    rounded_bytes = 1;
    while (rounded_bytes < value)
    {
        rounded_bytes *= base;
        power++;
    }
}
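
// Example (illustrative): with base 8, NearestPowerOf(power, rounded_bytes, 8, 1000)
// yields power = 4 and rounded_bytes = 4096, i.e. a 1000-byte request is served from
// the 4KB bin.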
cub::Mutex      mutex;              // Mutex for thread-safety

unsigned int    bin_growth;         // Geometric growth factor for bin-sizes
unsigned int    min_bin;            // Minimum bin enumeration
unsigned int    max_bin;            // Maximum bin enumeration

size_t          min_bin_bytes;      // Minimum bin size
size_t          max_bin_bytes;      // Maximum bin size
size_t          max_cached_bytes;   // Maximum aggregate cached bytes per device

const bool      skip_cleanup;       // Whether to skip a call to FreeAllCached() when the destructor is called
bool            debug;              // Whether to print (de)allocation events to stdout

GpuCachedBytes  cached_bytes;       // Map of device ordinal to aggregate cached bytes on that device
CachedBlocks    cached_blocks;      // Set of cached device allocations available for reuse
BusyBlocks      live_blocks;        // Set of live device allocations currently in use

#endif // DOXYGEN_SHOULD_SKIP_THIS
/**
 * \brief Constructor
 */
CachingDeviceAllocator(
    unsigned int    bin_growth,                             ///< Geometric growth factor for bin-sizes
    unsigned int    min_bin             = 1,                ///< Minimum bin (default is bin_growth^1)
    unsigned int    max_bin             = INVALID_BIN,      ///< Maximum bin (default is no max bin)
    size_t          max_cached_bytes    = INVALID_SIZE,     ///< Maximum aggregate cached bytes per device (default is no limit)
    bool            skip_cleanup        = false,            ///< Whether to skip a call to FreeAllCached() when the destructor is called (default is to deallocate)
    bool            debug               = false)            ///< Whether to print (de)allocation events to stdout (default is no output)
:
    bin_growth(bin_growth),
    min_bin(min_bin),
    max_bin(max_bin),
    min_bin_bytes(IntPow(bin_growth, min_bin)),
    max_bin_bytes(IntPow(bin_growth, max_bin)),
    max_cached_bytes(max_cached_bytes),
    skip_cleanup(skip_cleanup),
    debug(debug),
    cached_blocks(BlockDescriptor::SizeCompare),
    live_blocks(BlockDescriptor::PtrCompare)
{}
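
// Example (illustrative): CachingDeviceAllocator allocator(2, 10, 20) grows bins by powers
// of two, with the smallest bin holding 2^10 = 1KB blocks and the largest holding 2^20 = 1MB
// blocks; requests larger than 1MB are allocated exactly and are never cached for reuse.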
/**
 * \brief Default constructor (bin_growth = 8, min_bin = 3, max_bin = 7), which delineates
 * five bin sizes: 512B, 4KB, 32KB, 256KB, and 2MB.
 */
CachingDeviceAllocator(
    bool skip_cleanup = false,
    bool debug        = false)
:
    bin_growth(8),
    min_bin(3),
    max_bin(7),
    min_bin_bytes(IntPow(bin_growth, min_bin)),
    max_bin_bytes(IntPow(bin_growth, max_bin)),
    max_cached_bytes((max_bin_bytes * 3) - 1),
    skip_cleanup(skip_cleanup),
    debug(debug),
    cached_blocks(BlockDescriptor::SizeCompare),
    live_blocks(BlockDescriptor::PtrCompare)
{}
/**
 * \brief Sets the limit on the number of bytes this allocator is allowed to cache per device.
 */
cudaError_t SetMaxCachedBytes(
    size_t max_cached_bytes)
{
    if (debug) _CubLog("Changing max_cached_bytes (%lld -> %lld)\n",
        (long long) this->max_cached_bytes, (long long) max_cached_bytes);

    this->max_cached_bytes = max_cached_bytes;

    return cudaSuccess;
}
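
// Example (illustrative): allocator.SetMaxCachedBytes(size_t(1) << 30) caps the bytes held
// in reserve at 1GB per device; blocks returned beyond that ceiling are freed immediately
// by DeviceFree rather than cached.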
/**
 * \brief Provides a suitable allocation of device memory for the given size on the specified device.
 */
cudaError_t DeviceAllocate(
    int             device,             ///< [in] Device on which to place the allocation
    void            **d_ptr,            ///< [out] Reference to pointer to the allocation
    size_t          bytes,              ///< [in] Minimum number of bytes for the allocation
    cudaStream_t    active_stream = 0)  ///< [in] The stream to be associated with this allocation
{
    *d_ptr                  = NULL;
    int entrypoint_device   = INVALID_DEVICE_ORDINAL;
    cudaError_t error       = cudaSuccess;

    if (device == INVALID_DEVICE_ORDINAL)
    {
        if (CubDebug(error = cudaGetDevice(&entrypoint_device))) return error;
        device = entrypoint_device;
    }

    // Create a block descriptor for the requested allocation
    bool found = false;
    BlockDescriptor search_key(device);
    search_key.associated_stream = active_stream;
    NearestPowerOf(search_key.bin, search_key.bytes, bin_growth, bytes);

    if (search_key.bin > max_bin)
    {
        // Bin is greater than our maximum bin: allocate the request exactly and give an
        // out-of-bounds bin.  It will not be cached for reuse when returned.
        search_key.bin   = INVALID_BIN;
        search_key.bytes = bytes;
    }
    else
    {
        // Search the cache for a suitable free allocation
        if (search_key.bin < min_bin)
        {
            // Bin is less than minimum bin: round up
            search_key.bin   = min_bin;
            search_key.bytes = min_bin_bytes;
        }
        // Iterate through the range of cached blocks in the same bin on the same device
        CachedBlocks::iterator block_itr = cached_blocks.lower_bound(search_key);
        while ((block_itr != cached_blocks.end())
                && (block_itr->device == device)
                && (block_itr->bin == search_key.bin))
        {
            // Only reuse a cached block if it was last used on the requesting stream or if
            // its ready event has fired (i.e. its previous stream has caught up to the free)
            if ((active_stream == block_itr->associated_stream) ||
                (cudaEventQuery(block_itr->ready_event) != cudaErrorNotReady))
            {
                // Reuse existing cache block.  Insert into live blocks.
                found = true;
                search_key = *block_itr;
                search_key.associated_stream = active_stream;
                live_blocks.insert(search_key);

                // Transfer the bytes from the free balance to the live balance
                cached_bytes[device].free -= search_key.bytes;
                cached_bytes[device].live += search_key.bytes;

                if (debug) _CubLog("\tDevice %d reused cached block at %p (%lld bytes) for stream %lld (previously associated with stream %lld).\n",
                    device, search_key.d_ptr, (long long) search_key.bytes, (long long) search_key.associated_stream, (long long) block_itr->associated_stream);

                // Erase the reused entry only after the log above has read from *block_itr
                cached_blocks.erase(block_itr);
                break;
            }
            ++block_itr;
        }
    }
    // Allocate the block if no cached block was reused
    if (!found)
    {
        // Set the runtime's current device to the specified device (entrypoint may not be set)
        if (device != entrypoint_device)
        {
            if (CubDebug(error = cudaGetDevice(&entrypoint_device))) return error;
            if (CubDebug(error = cudaSetDevice(device))) return error;
        }

        // Attempt to allocate; on out-of-memory, free all cached blocks on this device and retry
        if (CubDebug(error = cudaMalloc(&search_key.d_ptr, search_key.bytes)) == cudaErrorMemoryAllocation)
        {
            if (debug) _CubLog("\tDevice %d failed to allocate %lld bytes for stream %lld, retrying after freeing cached allocations",
                device, (long long) search_key.bytes, (long long) search_key.associated_stream);

            error = cudaSuccess;    // Reset the error we will return
            cudaGetLastError();     // Reset CUDART's error
            // Iterate the range of free blocks on the same device
            BlockDescriptor free_key(device);
            CachedBlocks::iterator block_itr = cached_blocks.lower_bound(free_key);

            while ((block_itr != cached_blocks.end()) && (block_itr->device == device))
            {
                // No need to synchronize with the device: cudaFree is blocking and will
                // synchronize across all kernels executing on the current device

                // Free device memory and destroy the stream event
                if (CubDebug(error = cudaFree(block_itr->d_ptr))) break;
                if (CubDebug(error = cudaEventDestroy(block_itr->ready_event))) break;

                // Reduce the free balance and erase the entry (capture the byte count first)
                size_t block_bytes = block_itr->bytes;
                cached_bytes[device].free -= block_bytes;
                block_itr = cached_blocks.erase(block_itr);

                if (debug) _CubLog("\tDevice %d freed %lld bytes.\n\t\t %lld available blocks cached (%lld bytes), %lld live blocks (%lld bytes) outstanding.\n",
                    device, (long long) block_bytes, (long long) cached_blocks.size(), (long long) cached_bytes[device].free,
                    (long long) live_blocks.size(), (long long) cached_bytes[device].live);
            }
            // Return under error
            if (error) return error;

            // Try to allocate again
            if (CubDebug(error = cudaMalloc(&search_key.d_ptr, search_key.bytes))) return error;
        }

        // Create the ready event
        if (CubDebug(error = cudaEventCreateWithFlags(&search_key.ready_event, cudaEventDisableTiming)))
            return error;

        // Insert into live blocks
        live_blocks.insert(search_key);
        cached_bytes[device].live += search_key.bytes;

        if (debug) _CubLog("\tDevice %d allocated new device block at %p (%lld bytes associated with stream %lld).\n",
            device, search_key.d_ptr, (long long) search_key.bytes, (long long) search_key.associated_stream);
        // Attempt to revert back to the entry-point device if necessary
        if ((entrypoint_device != INVALID_DEVICE_ORDINAL) && (entrypoint_device != device))
        {
            if (CubDebug(error = cudaSetDevice(entrypoint_device))) return error;
        }
    }

    // Copy the device pointer to the output parameter
    *d_ptr = search_key.d_ptr;

    if (debug) _CubLog("\t\t%lld available blocks cached (%lld bytes), %lld live blocks outstanding (%lld bytes).\n",
        (long long) cached_blocks.size(), (long long) cached_bytes[device].free, (long long) live_blocks.size(), (long long) cached_bytes[device].live);

    return error;
}
// Provides a suitable allocation of device memory for the given size on the current device
cudaError_t DeviceAllocate(void **d_ptr, size_t bytes, cudaStream_t active_stream = 0)
{
    return DeviceAllocate(INVALID_DEVICE_ORDINAL, d_ptr, bytes, active_stream);
}
/**
 * \brief Frees a live allocation of device memory on the specified device, returning it to the allocator.
 */
cudaError_t DeviceFree(
    int     device,     ///< [in] Device on which the allocation resides
    void*   d_ptr)      ///< [in] Device pointer to free
{
    int entrypoint_device   = INVALID_DEVICE_ORDINAL;
    cudaError_t error       = cudaSuccess;

    if (device == INVALID_DEVICE_ORDINAL)
    {
        if (CubDebug(error = cudaGetDevice(&entrypoint_device))) return error;
        device = entrypoint_device;
    }

    // Find the corresponding block descriptor among the live blocks
    bool recached = false;
    BlockDescriptor search_key(d_ptr, device);
    BusyBlocks::iterator block_itr = live_blocks.find(search_key);
    if (block_itr != live_blocks.end())
    {
        // Remove from live blocks
        search_key = *block_itr;
        live_blocks.erase(block_itr);
        cached_bytes[device].live -= search_key.bytes;
        // Keep the returned allocation if the bin is valid and we won't exceed the max cached threshold
        if ((search_key.bin != INVALID_BIN) && (cached_bytes[device].free + search_key.bytes <= max_cached_bytes))
        {
            // Insert the returned allocation into the free blocks
            recached = true;
            cached_blocks.insert(search_key);
            cached_bytes[device].free += search_key.bytes;

            if (debug) _CubLog("\tDevice %d returned %lld bytes from associated stream %lld.\n\t\t %lld available blocks cached (%lld bytes), %lld live blocks outstanding (%lld bytes).\n",
                device, (long long) search_key.bytes, (long long) search_key.associated_stream, (long long) cached_blocks.size(),
                (long long) cached_bytes[device].free, (long long) live_blocks.size(), (long long) cached_bytes[device].live);
        }
    }
    // First, set to the specified device (entrypoint may not be set)
    if (device != entrypoint_device)
    {
        if (CubDebug(error = cudaGetDevice(&entrypoint_device))) return error;
        if (CubDebug(error = cudaSetDevice(device))) return error;
    }

    if (recached)
    {
        // Insert the ready event in the associated stream (must have the current device set properly)
        if (CubDebug(error = cudaEventRecord(search_key.ready_event, search_key.associated_stream))) return error;
    }
    else
    {
        // Free the allocation from the runtime and clean up the event
        if (CubDebug(error = cudaFree(d_ptr))) return error;
        if (CubDebug(error = cudaEventDestroy(search_key.ready_event))) return error;
        if (debug) _CubLog("\tDevice %d freed %lld bytes from associated stream %lld.\n\t\t %lld available blocks cached (%lld bytes), %lld live blocks (%lld bytes) outstanding.\n",
            device, (long long) search_key.bytes, (long long) search_key.associated_stream, (long long) cached_blocks.size(),
            (long long) cached_bytes[device].free, (long long) live_blocks.size(), (long long) cached_bytes[device].live);
    }

    // Reset the runtime's current device to the entry-point device if necessary
    if ((entrypoint_device != INVALID_DEVICE_ORDINAL) && (entrypoint_device != device))
    {
        if (CubDebug(error = cudaSetDevice(entrypoint_device))) return error;
    }

    return error;
}
// Frees a live allocation of device memory on the current device, returning it to the allocator
cudaError_t DeviceFree(void* d_ptr)
{
    return DeviceFree(INVALID_DEVICE_ORDINAL, d_ptr);
}
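
// Usage sketch (illustrative; assumes this header is compiled within the cub namespace and
// that 'stream' and 'num_bytes' are defined by the caller):
//
//     cub::CachingDeviceAllocator allocator;                  // default 8^3..8^7 bin configuration
//     void *d_temp = NULL;
//     allocator.DeviceAllocate(&d_temp, num_bytes, stream);   // draws from the cache when possible
//     // ... launch kernels that use d_temp on 'stream' ...
//     allocator.DeviceFree(d_temp);                           // returns the block to the cache for reuse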
/**
 * \brief Frees all cached device allocations on all devices
 */
cudaError_t FreeAllCached()
{
    cudaError_t error       = cudaSuccess;
    int entrypoint_device   = INVALID_DEVICE_ORDINAL;
    int current_device      = INVALID_DEVICE_ORDINAL;

    while (!cached_blocks.empty())
    {
        // Get the first block
        CachedBlocks::iterator begin = cached_blocks.begin();

        // Get the entry-point device ordinal if necessary
        if (entrypoint_device == INVALID_DEVICE_ORDINAL)
        {
            if (CubDebug(error = cudaGetDevice(&entrypoint_device))) break;
        }

        // Set the current device ordinal if necessary
        if (begin->device != current_device)
        {
            if (CubDebug(error = cudaSetDevice(begin->device))) break;
            current_device = begin->device;
        }

        // Free device memory and destroy the stream event
        if (CubDebug(error = cudaFree(begin->d_ptr))) break;
        if (CubDebug(error = cudaEventDestroy(begin->ready_event))) break;

        // Reduce the free balance and erase the entry (capture the byte count first)
        size_t block_bytes = begin->bytes;
        cached_bytes[current_device].free -= block_bytes;
        cached_blocks.erase(begin);

        if (debug) _CubLog("\tDevice %d freed %lld bytes.\n\t\t %lld available blocks cached (%lld bytes), %lld live blocks (%lld bytes) outstanding.\n",
            current_device, (long long) block_bytes, (long long) cached_blocks.size(), (long long) cached_bytes[current_device].free,
            (long long) live_blocks.size(), (long long) cached_bytes[current_device].live);
    }

    // Attempt to revert back to the entry-point device if necessary
    if (entrypoint_device != INVALID_DEVICE_ORDINAL)
    {
        if (CubDebug(error = cudaSetDevice(entrypoint_device))) return error;
    }

    return error;
}
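
// Note: the skip_cleanup flag controls whether the destructor (not shown in this section)
// calls FreeAllCached(); statically declared allocators typically skip it because the CUDA
// runtime may already have shut down by the time the destructor runs.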