5 #ifndef FML_GPU_ARCH_CUDA_NVML_H
6 #define FML_GPU_ARCH_CUDA_NVML_H
// Length used in this file to size the std::string buffers that are handed to
// NVML string queries (process name, board part number, name, serial, UUID).
27 const static int NVML_MAX_STRLEN = 128;
31 inline void check_nvml_ret(nvmlReturn_t check)
33 if (check != NVML_SUCCESS)
35 if (check == NVML_ERROR_UNINITIALIZED)
36 throw std::runtime_error(
"NVML was not successfully initialized");
37 else if (check == NVML_ERROR_INVALID_ARGUMENT)
38 throw std::runtime_error(
"invalid argument");
39 else if (check == NVML_ERROR_NOT_SUPPORTED)
40 throw std::runtime_error(
"device does not support requested feature");
41 else if (check == NVML_ERROR_NO_PERMISSION)
42 throw std::runtime_error(
"NVML does not have permission to talk to the driver");
45 else if (check == NVML_ERROR_NOT_FOUND)
46 throw std::runtime_error(
"process not found");
47 else if (check == NVML_ERROR_INSUFFICIENT_SIZE)
48 throw std::runtime_error(
"internal string buffer too small");
49 else if (check == NVML_ERROR_INSUFFICIENT_POWER)
50 throw std::runtime_error(
"device has improperly attached external power cable");
51 else if (check == NVML_ERROR_DRIVER_NOT_LOADED)
52 throw std::runtime_error(
"NVIDIA driver is not running");
53 else if (check == NVML_ERROR_TIMEOUT)
54 throw std::runtime_error(
"provided timeout has passed");
55 else if (check == NVML_ERROR_IRQ_ISSUE)
56 throw std::runtime_error(
"NVIDIA kernel detected an interrupt issue with the attached GPUs");
57 else if (check == NVML_ERROR_LIBRARY_NOT_FOUND)
58 throw std::runtime_error(
"NVML shared library could not be loaded");
59 else if (check == NVML_ERROR_FUNCTION_NOT_FOUND)
60 throw std::runtime_error(
"local NVML version does not support requested function");
61 else if (check == NVML_ERROR_CORRUPTED_INFOROM)
62 throw std::runtime_error(
"infoROM is corrupted");
63 else if (check == NVML_ERROR_GPU_IS_LOST)
64 throw std::runtime_error(
"GPU is inaccessible");
65 else if (check == NVML_ERROR_RESET_REQUIRED)
66 throw std::runtime_error(
"GPU needs to be reset before it can be used again");
67 else if (check == NVML_ERROR_OPERATING_SYSTEM)
68 throw std::runtime_error(
"GPU control device was blocked by the OS");
69 else if (check == NVML_ERROR_LIB_RM_VERSION_MISMATCH)
70 throw std::runtime_error(
"driver/library version mismatch");
71 else if (check == NVML_ERROR_IN_USE)
72 throw std::runtime_error(
"GPU currently in use");
73 else if (check == NVML_ERROR_MEMORY)
74 throw std::runtime_error(
"insufficient memory");
75 else if (check == NVML_ERROR_MEMORY)
76 throw std::runtime_error(
"no data");
77 else if (check == NVML_ERROR_VGPU_ECC_NOT_SUPPORTED)
78 throw std::runtime_error(
"operation is not available because ECC is enabled");
79 else if (check == NVML_ERROR_UNKNOWN)
80 throw std::runtime_error(
"unknown NVML error");
82 throw std::runtime_error(nvmlErrorString(check));
// Start NVML; check_nvml_ret throws std::runtime_error on any non-success status.
95 err::check_nvml_ret( nvmlInit() );
// Shut NVML down, with the same throw-on-failure handling.
104 err::check_nvml_ret( nvmlShutdown() );
// Ask NVML for the CUDA driver version; the result is written into `ret`,
// which is declared outside the lines visible here.
122 err::check_nvml_ret( nvmlSystemGetCudaDriverVersion(&ret) );
132 ret.resize(NVML_SYSTEM_DRIVER_VERSION_BUFFER_SIZE);
133 err::check_nvml_ret( nvmlSystemGetDriverVersion(&ret[0], ret.max_size()) );
143 ret.resize(NVML_SYSTEM_NVML_VERSION_BUFFER_SIZE);
144 err::check_nvml_ret( nvmlSystemGetNVMLVersion(&ret[0], ret.max_size()) );
156 ret.resize(NVML_MAX_STRLEN);
157 err::check_nvml_ret( nvmlSystemGetProcessName(pid, &ret[0], ret.max_size()) );
170 inline std::string get_board_part_number(nvmlDevice_t device)
173 ret.resize(NVML_MAX_STRLEN);
174 err::check_nvml_ret( nvmlDeviceGetBoardPartNumber(device, &ret[0], ret.max_size()) );
// Return a human-readable brand label for `device`.
// The brand enum is read with nvmlDeviceGetBrand (throwing via check_nvml_ret
// on failure) and then matched against the known NVML_BRAND_* values. The
// string returned by each branch is not visible in this view.
178 inline std::string get_brand(nvmlDevice_t device)
180 nvmlBrandType_t type;
181 err::check_nvml_ret( nvmlDeviceGetBrand(device, &type) );
182 if (type == NVML_BRAND_UNKNOWN)
184 else if (type == NVML_BRAND_QUADRO)
186 else if (type == NVML_BRAND_TESLA)
188 else if (type == NVML_BRAND_NVS)
190 else if (type == NVML_BRAND_GRID)
192 else if (type == NVML_BRAND_GEFORCE)
// NOTE(review): #ifdef only sees preprocessor macros. If NVML_BRAND_TITAN is a
// plain enumerator in nvml.h, this branch is never compiled in — confirm.
194 #ifdef NVML_BRAND_TITAN
195 else if (type == NVML_BRAND_TITAN)
// Fallback for any brand value not matched above.
199 return "missing from list; contact nvsmi devs";
// Return a string describing the compute mode of `device`.
// The mode is read with nvmlDeviceGetComputeMode (throwing via check_nvml_ret
// on failure) and compared against the four NVML_COMPUTEMODE_* values below.
// The string returned by each branch is not visible in this view.
202 inline std::string get_compute_mode(nvmlDevice_t device)
204 nvmlComputeMode_t mode;
205 err::check_nvml_ret( nvmlDeviceGetComputeMode(device, &mode) );
206 if (mode == NVML_COMPUTEMODE_DEFAULT)
208 else if (mode == NVML_COMPUTEMODE_EXCLUSIVE_THREAD)
210 else if (mode == NVML_COMPUTEMODE_PROHIBITED)
212 else if (mode == NVML_COMPUTEMODE_EXCLUSIVE_PROCESS)
218 inline int get_count()
220 unsigned int num_gpus;
221 err::check_nvml_ret( nvmlDeviceGetCount(&num_gpus) );
222 return (
int) num_gpus;
225 inline void get_cuda_compute_capability(nvmlDevice_t device,
int *major,
int *minor)
227 err::check_nvml_ret( nvmlDeviceGetCudaComputeCapability(device, major, minor) );
230 inline int get_curr_pcie_link_generation(nvmlDevice_t device)
232 unsigned int currLinkGen;
233 err::check_nvml_ret( nvmlDeviceGetCurrPcieLinkGeneration(device, &currLinkGen) );
234 return (
int) currLinkGen;
237 inline int get_curr_pcie_link_width(nvmlDevice_t device)
239 unsigned int currLinkWidth;
240 err::check_nvml_ret( nvmlDeviceGetCurrPcieLinkWidth(device, &currLinkWidth) );
241 return (
int) currLinkWidth;
// Query whether a display is active on `device` using
// nvmlDeviceGetDisplayActive; check_nvml_ret throws on failure.
// The conversion from the enable state to the int result is not visible here.
244 inline int get_display_active(nvmlDevice_t device)
246 nvmlEnableState_t disp;
247 err::check_nvml_ret( nvmlDeviceGetDisplayActive(device, &disp) );
// Query the fan speed of `device` with nvmlDeviceGetFanSpeed.
// `speed` is declared outside the lines visible here.
251 inline int get_fan_speed(nvmlDevice_t device)
254 nvmlReturn_t check = nvmlDeviceGetFanSpeed(device, &speed);
// NVML_ERROR_NOT_SUPPORTED is handled separately before the generic check
// below; what that branch does is not visible in this view.
255 if (check == NVML_ERROR_NOT_SUPPORTED)
// Every other non-success status is turned into an exception.
258 err::check_nvml_ret(check);
263 inline nvmlDevice_t get_handle_by_index(
int index)
266 err::check_nvml_ret( nvmlDeviceGetHandleByIndex(index, &device) );
// Query the NVML index of `device` with nvmlDeviceGetIndex; check_nvml_ret
// throws on failure. `index` is declared outside the lines visible here.
270 inline int get_index(nvmlDevice_t device)
273 err::check_nvml_ret( nvmlDeviceGetIndex(device, &index) );
277 inline void get_memory_info(nvmlDevice_t device,
double *memory_used,
double *memory_total)
280 err::check_nvml_ret( nvmlDeviceGetMemoryInfo(device, &memory) );
281 *memory_used = (double) memory.used;
282 *memory_total = (
double) memory.total;
285 inline std::string get_name(nvmlDevice_t device)
288 ret.resize(NVML_MAX_STRLEN);
289 err::check_nvml_ret( nvmlDeviceGetName(device, &ret[0], ret.max_size()) );
// Query the performance state of `device` with nvmlDeviceGetPerformanceState;
// check_nvml_ret throws on failure. How `pState` is converted to the int
// result is not visible here.
293 inline int get_performance_state(nvmlDevice_t device)
295 nvmlPstates_t pState;
296 err::check_nvml_ret( nvmlDeviceGetPerformanceState(device, &pState) );
// Query the persistence mode of `device` with nvmlDeviceGetPersistenceMode;
// check_nvml_ret throws on failure. How `mode` is converted to the int result
// is not visible here.
300 inline int get_persistence_mode(nvmlDevice_t device)
302 nvmlEnableState_t mode;
303 err::check_nvml_ret( nvmlDeviceGetPersistenceMode(device, &mode) );
307 inline int get_power_max(nvmlDevice_t device)
309 unsigned int power_min, power_max;
310 err::check_nvml_ret( nvmlDeviceGetPowerManagementLimitConstraints(device, &power_min, &power_max) );
311 return (
int) power_max;
// Query the power usage of `device` with nvmlDeviceGetPowerUsage;
// check_nvml_ret throws on failure. `power` is declared outside the lines
// visible here.
314 inline int get_power_usage(nvmlDevice_t device)
317 err::check_nvml_ret( nvmlDeviceGetPowerUsage(device, &power) );
321 inline std::string get_serial(nvmlDevice_t device)
324 ret.resize(NVML_MAX_STRLEN);
325 err::check_nvml_ret( nvmlDeviceGetSerial(device, &ret[0], ret.max_size()) );
// Query the temperature of `device` from the NVML_TEMPERATURE_GPU sensor;
// check_nvml_ret throws on failure. `temp` is declared outside the lines
// visible here.
329 inline int get_temperature(nvmlDevice_t device)
331 nvmlTemperatureSensors_t sensor = NVML_TEMPERATURE_GPU;
333 err::check_nvml_ret( nvmlDeviceGetTemperature(device, sensor, &temp) );
337 inline int get_utilization(nvmlDevice_t device)
339 nvmlUtilization_t utilization;
340 err::check_nvml_ret( nvmlDeviceGetUtilizationRates(device, &utilization) );
341 return (
int) utilization.gpu;
344 inline std::string get_uuid(nvmlDevice_t device)
347 ret.resize(NVML_MAX_STRLEN);
348 err::check_nvml_ret( nvmlDeviceGetUUID(device, &ret[0], ret.max_size()) );