36 #if (CUB_PTX_ARCH == 0)
43 #include "util_namespace.cuh"
46 #include "host/spinlock.cuh"
106 #ifndef DOXYGEN_SHOULD_SKIP_THIS // Do not document
116 INVALID_DEVICE_ORDINAL = -1,
122 static unsigned int IntPow(
126 unsigned int retval = 1;
130 retval = retval * base;
142 static void NearestPowerOf(
144 size_t &rounded_bytes,
151 while (rounded_bytes < value)
153 rounded_bytes *= base;
161 struct BlockDescriptor
165 cudaStream_t associated_stream;
166 cudaEvent_t ready_event;
171 BlockDescriptor(
void *d_ptr,
int device) :
176 associated_stream(0),
181 BlockDescriptor(
size_t bytes,
unsigned int bin,
int device, cudaStream_t associated_stream) :
186 associated_stream(associated_stream),
191 static bool PtrCompare(
const BlockDescriptor &a,
const BlockDescriptor &b)
193 if (a.device == b.device)
194 return (a.d_ptr < b.d_ptr);
196 return (a.device < b.device);
200 static bool SizeCompare(
const BlockDescriptor &a,
const BlockDescriptor &b)
202 if (a.device == b.device)
203 return (a.bytes < b.bytes);
205 return (a.device < b.device);
210 typedef bool (*Compare)(
const BlockDescriptor &,
const BlockDescriptor &);
212 #if (CUB_PTX_ARCH == 0) // Only define STL container members in host code
215 typedef std::multiset<BlockDescriptor, Compare> CachedBlocks;
218 typedef std::multiset<BlockDescriptor, Compare> BusyBlocks;
221 typedef std::map<int, size_t> GpuCachedBytes;
223 #endif // CUB_PTX_ARCH
231 unsigned int bin_growth;
232 unsigned int min_bin;
233 unsigned int max_bin;
235 size_t min_bin_bytes;
236 size_t max_bin_bytes;
237 size_t max_cached_bytes;
242 #if (CUB_PTX_ARCH == 0) // Only define STL container members in host code
244 GpuCachedBytes cached_bytes;
245 CachedBlocks cached_blocks;
246 BusyBlocks live_blocks;
248 #endif // CUB_PTX_ARCH
250 #endif // DOXYGEN_SHOULD_SKIP_THIS
260 unsigned int bin_growth,
261 unsigned int min_bin,
262 unsigned int max_bin,
263 size_t max_cached_bytes,
264 bool skip_cleanup =
false)
266 #if (CUB_PTX_ARCH == 0)
267 cached_blocks(BlockDescriptor::SizeCompare),
268 live_blocks(BlockDescriptor::PtrCompare),
272 bin_growth(bin_growth),
275 min_bin_bytes(IntPow(bin_growth, min_bin)),
276 max_bin_bytes(IntPow(bin_growth, max_bin)),
277 max_cached_bytes(max_cached_bytes)
295 bool skip_cleanup =
false)
297 #if (CUB_PTX_ARCH == 0)
298 cached_blocks(BlockDescriptor::SizeCompare),
299 live_blocks(BlockDescriptor::PtrCompare),
301 skip_cleanup(skip_cleanup),
307 min_bin_bytes(IntPow(bin_growth, min_bin)),
308 max_bin_bytes(IntPow(bin_growth, max_bin)),
309 max_cached_bytes((max_bin_bytes * 3) - 1)
317 size_t max_cached_bytes)
319 #if (CUB_PTX_ARCH > 0)
321 return CubDebug(cudaErrorInvalidConfiguration);
327 this->max_cached_bytes = max_cached_bytes;
329 if (debug)
_CubLog(
"New max_cached_bytes(%lld)\n", (
long long) max_cached_bytes);
336 #endif // CUB_PTX_ARCH
351 cudaStream_t active_stream = 0)
353 #if (CUB_PTX_ARCH > 0)
355 return CubDebug(cudaErrorInvalidConfiguration);
360 int entrypoint_device = INVALID_DEVICE_ORDINAL;
361 cudaError_t error = cudaSuccess;
365 if (
CubDebug(error = cudaGetDevice(&entrypoint_device)))
break;
366 if (device == INVALID_DEVICE_ORDINAL)
367 device = entrypoint_device;
372 NearestPowerOf(bin, bin_bytes, bin_growth, bytes);
375 bin_bytes = min_bin_bytes;
382 bin = (
unsigned int) -1;
386 BlockDescriptor search_key(bin_bytes, bin, device, active_stream);
395 CachedBlocks::iterator block_itr = cached_blocks.lower_bound(search_key);
399 while ((block_itr != cached_blocks.end()) &&
400 (block_itr->device == device) &&
401 (block_itr->bin == search_key.bin))
403 cudaStream_t prev_stream = block_itr->associated_stream;
404 if ((active_stream == prev_stream) || (cudaEventQuery(block_itr->ready_event) != cudaErrorNotReady))
408 search_key = *block_itr;
409 search_key.associated_stream = active_stream;
410 live_blocks.insert(search_key);
413 cached_blocks.erase(block_itr);
414 cached_bytes[device] -= search_key.bytes;
416 if (debug)
_CubLog(
"\tdevice %d reused cached block for stream %lld (%lld bytes, previously associated with stream %lld).\n\t\t %lld available blocks cached (%lld bytes), %lld live blocks outstanding.\n",
417 device, (
long long) active_stream, (
long long) search_key.bytes, (
long long) prev_stream, (
long long) cached_blocks.size(), (
long long) cached_bytes[device], (
long long) live_blocks.size());
434 if (device != entrypoint_device) {
435 if (
CubDebug(error = cudaSetDevice(device)))
break;
439 if (
CubDebug(error = cudaMalloc(&search_key.d_ptr, search_key.bytes)))
break;
440 if (
CubDebug(error = cudaEventCreateWithFlags(&search_key.ready_event, cudaEventDisableTiming)))
break;
449 live_blocks.insert(search_key);
451 if (debug)
_CubLog(
"\tdevice %d allocating new device block %lld bytes associated with stream %lld.\n\t\t %lld available blocks cached (%lld bytes), %lld live blocks outstanding.\n",
452 device, (
long long) search_key.bytes, (
long long) search_key.associated_stream, (
long long) cached_blocks.size(), (
long long) cached_bytes[device], (
long long) live_blocks.size());
456 *d_ptr = search_key.d_ptr;
467 if ((entrypoint_device != INVALID_DEVICE_ORDINAL) && (entrypoint_device != device))
469 if (
CubDebug(error = cudaSetDevice(entrypoint_device)))
return error;
474 #endif // CUB_PTX_ARCH
488 cudaStream_t active_stream = 0)
490 #if (CUB_PTX_ARCH > 0)
492 return CubDebug(cudaErrorInvalidConfiguration);
494 return DeviceAllocate(INVALID_DEVICE_ORDINAL, d_ptr, bytes, active_stream);
495 #endif // CUB_PTX_ARCH
510 #if (CUB_PTX_ARCH > 0)
512 return CubDebug(cudaErrorInvalidConfiguration);
516 int entrypoint_device = INVALID_DEVICE_ORDINAL;
517 cudaError_t error = cudaSuccess;
520 if (
CubDebug(error = cudaGetDevice(&entrypoint_device)))
break;
521 if (device == INVALID_DEVICE_ORDINAL)
522 device = entrypoint_device;
525 if (device != entrypoint_device) {
526 if (
CubDebug(error = cudaSetDevice(device)))
break;
536 BlockDescriptor search_key(d_ptr, device);
537 BusyBlocks::iterator block_itr = live_blocks.find(search_key);
538 if (block_itr == live_blocks.end())
541 if (
CubDebug(error = cudaErrorUnknown))
break;
546 search_key = *block_itr;
547 live_blocks.erase(block_itr);
550 if (cached_bytes[device] + search_key.bytes <= max_cached_bytes)
553 if (
CubDebug(error = cudaEventRecord(search_key.ready_event, search_key.associated_stream)))
break;
556 cached_blocks.insert(search_key);
557 cached_bytes[device] += search_key.bytes;
559 if (debug)
_CubLog(
"\tdevice %d returned %lld bytes from associated stream %lld.\n\t\t %lld available blocks cached (%lld bytes), %lld live blocks outstanding.\n",
560 device, (
long long) search_key.bytes, (
long long) search_key.associated_stream, (
long long) cached_blocks.size(), (
long long) cached_bytes[device], (
long long) live_blocks.size());
571 if (
CubDebug(error = cudaFree(d_ptr)))
break;
572 if (
CubDebug(error = cudaEventDestroy(search_key.ready_event)))
break;
574 if (debug)
_CubLog(
"\tdevice %d freed %lld bytes from associated stream %lld.\n\t\t %lld available blocks cached (%lld bytes), %lld live blocks outstanding.\n",
575 device, (
long long) search_key.bytes, (
long long) search_key.associated_stream, (
long long) cached_blocks.size(), (
long long) cached_bytes[device], (
long long) live_blocks.size());
586 if ((entrypoint_device != INVALID_DEVICE_ORDINAL) && (entrypoint_device != device))
588 if (
CubDebug(error = cudaSetDevice(entrypoint_device)))
return error;
593 #endif // CUB_PTX_ARCH
607 #if (CUB_PTX_ARCH > 0)
609 return CubDebug(cudaErrorInvalidConfiguration);
611 return DeviceFree(INVALID_DEVICE_ORDINAL, d_ptr);
612 #endif // CUB_PTX_ARCH
621 #if (CUB_PTX_ARCH > 0)
623 return CubDebug(cudaErrorInvalidConfiguration);
626 cudaError_t error = cudaSuccess;
628 int entrypoint_device = INVALID_DEVICE_ORDINAL;
629 int current_device = INVALID_DEVICE_ORDINAL;
637 while (!cached_blocks.empty())
640 CachedBlocks::iterator begin = cached_blocks.begin();
643 if (entrypoint_device == INVALID_DEVICE_ORDINAL)
645 if (
CubDebug(error = cudaGetDevice(&entrypoint_device)))
break;
649 if (begin->device != current_device)
651 if (
CubDebug(error = cudaSetDevice(begin->device)))
break;
652 current_device = begin->device;
656 if (
CubDebug(error = cudaFree(begin->d_ptr)))
break;
657 if (
CubDebug(error = cudaEventDestroy(begin->ready_event)))
break;
660 cached_bytes[current_device] -= begin->bytes;
661 cached_blocks.erase(begin);
663 if (debug)
_CubLog(
"\tdevice %d freed %lld bytes.\n\t\t %lld available blocks cached (%lld bytes), %lld live blocks outstanding.\n",
664 current_device, (
long long) begin->bytes, (
long long) cached_blocks.size(), (
long long) cached_bytes[current_device], (
long long) live_blocks.size());
674 if (entrypoint_device != INVALID_DEVICE_ORDINAL)
676 if (
CubDebug(error = cudaSetDevice(entrypoint_device)))
return error;
681 #endif // CUB_PTX_ARCH