fml  0.1-0
Fused Matrix Library
nvml.hh
1 // This file is part of fml which is released under the Boost Software
2 // License, Version 1.0. See accompanying file LICENSE or copy at
3 // https://www.boost.org/LICENSE_1_0.txt
4 
5 #ifndef FML_GPU_ARCH_CUDA_NVML_H
6 #define FML_GPU_ARCH_CUDA_NVML_H
7 #pragma once
8 
9 
10 #include <climits>
11 #include <cmath>
12 #include <stdexcept>
13 #include <string>
14 
15 // https://docs.nvidia.com/deploy/nvml-api/nvml-api-reference.html#nvml-api-reference
16 #include <nvml.h>
17 
18 
19 namespace fml
20 {
25 namespace nvml
26 {
/// Size, in characters, of the scratch buffers the wrappers in this file
/// resize their result strings to when no NVML-provided size macro is used.
static const int NVML_MAX_STRLEN = 128;
28 
29  namespace err
30  {
31  inline void check_nvml_ret(nvmlReturn_t check)
32  {
33  if (check != NVML_SUCCESS)
34  {
35  if (check == NVML_ERROR_UNINITIALIZED)
36  throw std::runtime_error("NVML was not successfully initialized");
37  else if (check == NVML_ERROR_INVALID_ARGUMENT)
38  throw std::runtime_error("invalid argument");
39  else if (check == NVML_ERROR_NOT_SUPPORTED)
40  throw std::runtime_error("device does not support requested feature");
41  else if (check == NVML_ERROR_NO_PERMISSION)
42  throw std::runtime_error("NVML does not have permission to talk to the driver");
43  // else if (check == NVML_ERROR_ALREADY_INITIALIZED) // deprecated
44  // throw std::runtime_error("already initialized")
45  else if (check == NVML_ERROR_NOT_FOUND)
46  throw std::runtime_error("process not found");
47  else if (check == NVML_ERROR_INSUFFICIENT_SIZE)
48  throw std::runtime_error("internal string buffer too small");
49  else if (check == NVML_ERROR_INSUFFICIENT_POWER)
50  throw std::runtime_error("device has improperly attached external power cable");
51  else if (check == NVML_ERROR_DRIVER_NOT_LOADED)
52  throw std::runtime_error("NVIDIA driver is not running");
53  else if (check == NVML_ERROR_TIMEOUT)
54  throw std::runtime_error("provided timeout has passed");
55  else if (check == NVML_ERROR_IRQ_ISSUE)
56  throw std::runtime_error("NVIDIA kernel detected an interrupt issue with the attached GPUs");
57  else if (check == NVML_ERROR_LIBRARY_NOT_FOUND)
58  throw std::runtime_error("NVML shared library could not be loaded");
59  else if (check == NVML_ERROR_FUNCTION_NOT_FOUND)
60  throw std::runtime_error("local NVML version does not support requested function");
61  else if (check == NVML_ERROR_CORRUPTED_INFOROM)
62  throw std::runtime_error("infoROM is corrupted");
63  else if (check == NVML_ERROR_GPU_IS_LOST)
64  throw std::runtime_error("GPU is inaccessible");
65  else if (check == NVML_ERROR_RESET_REQUIRED)
66  throw std::runtime_error("GPU needs to be reset before it can be used again");
67  else if (check == NVML_ERROR_OPERATING_SYSTEM)
68  throw std::runtime_error("GPU control device was blocked by the OS");
69  else if (check == NVML_ERROR_LIB_RM_VERSION_MISMATCH)
70  throw std::runtime_error("driver/library version mismatch");
71  else if (check == NVML_ERROR_IN_USE)
72  throw std::runtime_error("GPU currently in use");
73  else if (check == NVML_ERROR_MEMORY)
74  throw std::runtime_error("insufficient memory");
75  else if (check == NVML_ERROR_MEMORY)
76  throw std::runtime_error("no data");
77  else if (check == NVML_ERROR_VGPU_ECC_NOT_SUPPORTED)
78  throw std::runtime_error("operation is not available because ECC is enabled");
79  else if (check == NVML_ERROR_UNKNOWN)
80  throw std::runtime_error("unknown NVML error");
81  else
82  throw std::runtime_error(nvmlErrorString(check));
83  }
84  }
85  }
86 
87 
88 
93  inline void init()
94  {
95  err::check_nvml_ret( nvmlInit() );
96  }
97 
102  inline void shutdown()
103  {
104  err::check_nvml_ret( nvmlShutdown() );
105  }
106 
107 
108 
113  namespace system
114  {
120  {
121  int ret;
122  err::check_nvml_ret( nvmlSystemGetCudaDriverVersion(&ret) );
123  return ret;
124  }
125 
129  inline std::string get_driver_version()
130  {
131  std::string ret;
132  ret.resize(NVML_SYSTEM_DRIVER_VERSION_BUFFER_SIZE);
133  err::check_nvml_ret( nvmlSystemGetDriverVersion(&ret[0], ret.max_size()) );
134  return ret;
135  }
136 
140  inline std::string get_nvml_version()
141  {
142  std::string ret;
143  ret.resize(NVML_SYSTEM_NVML_VERSION_BUFFER_SIZE);
144  err::check_nvml_ret( nvmlSystemGetNVMLVersion(&ret[0], ret.max_size()) );
145  return ret;
146  }
147 
153  inline std::string get_process_name(unsigned int pid)
154  {
155  std::string ret;
156  ret.resize(NVML_MAX_STRLEN);
157  err::check_nvml_ret( nvmlSystemGetProcessName(pid, &ret[0], ret.max_size()) );
158  return ret;
159  }
160  }
161 
162 
163 
168  namespace device
169  {
170  inline std::string get_board_part_number(nvmlDevice_t device)
171  {
172  std::string ret;
173  ret.resize(NVML_MAX_STRLEN);
174  err::check_nvml_ret( nvmlDeviceGetBoardPartNumber(device, &ret[0], ret.max_size()) );
175  return ret;
176  }
177 
178  inline std::string get_brand(nvmlDevice_t device)
179  {
180  nvmlBrandType_t type;
181  err::check_nvml_ret( nvmlDeviceGetBrand(device, &type) );
182  if (type == NVML_BRAND_UNKNOWN)
183  return "unknown";
184  else if (type == NVML_BRAND_QUADRO)
185  return "quadro";
186  else if (type == NVML_BRAND_TESLA)
187  return "tesla";
188  else if (type == NVML_BRAND_NVS)
189  return "nvs";
190  else if (type == NVML_BRAND_GRID)
191  return "grid";
192  else if (type == NVML_BRAND_GEFORCE)
193  return "geforce";
194  #ifdef NVML_BRAND_TITAN
195  else if (type == NVML_BRAND_TITAN)
196  return "titan";
197  #endif
198  else
199  return "missing from list; contact nvsmi devs";
200  }
201 
202  inline std::string get_compute_mode(nvmlDevice_t device)
203  {
204  nvmlComputeMode_t mode;
205  err::check_nvml_ret( nvmlDeviceGetComputeMode(device, &mode) );
206  if (mode == NVML_COMPUTEMODE_DEFAULT)
207  return "Default";
208  else if (mode == NVML_COMPUTEMODE_EXCLUSIVE_THREAD)
209  return "E. Thread";
210  else if (mode == NVML_COMPUTEMODE_PROHIBITED)
211  return "Prohibited";
212  else if (mode == NVML_COMPUTEMODE_EXCLUSIVE_PROCESS)
213  return "E. Process";
214  else
215  return "";
216  }
217 
218  inline int get_count()
219  {
220  unsigned int num_gpus;
221  err::check_nvml_ret( nvmlDeviceGetCount(&num_gpus) );
222  return (int) num_gpus;
223  }
224 
225  inline void get_cuda_compute_capability(nvmlDevice_t device, int *major, int *minor)
226  {
227  err::check_nvml_ret( nvmlDeviceGetCudaComputeCapability(device, major, minor) );
228  }
229 
230  inline int get_curr_pcie_link_generation(nvmlDevice_t device)
231  {
232  unsigned int currLinkGen;
233  err::check_nvml_ret( nvmlDeviceGetCurrPcieLinkGeneration(device, &currLinkGen) );
234  return (int) currLinkGen;
235  }
236 
237  inline int get_curr_pcie_link_width(nvmlDevice_t device)
238  {
239  unsigned int currLinkWidth;
240  err::check_nvml_ret( nvmlDeviceGetCurrPcieLinkWidth(device, &currLinkWidth) );
241  return (int) currLinkWidth;
242  }
243 
244  inline int get_display_active(nvmlDevice_t device)
245  {
246  nvmlEnableState_t disp;
247  err::check_nvml_ret( nvmlDeviceGetDisplayActive(device, &disp) );
248  return (int) disp;
249  }
250 
251  inline int get_fan_speed(nvmlDevice_t device)
252  {
253  unsigned int speed;
254  nvmlReturn_t check = nvmlDeviceGetFanSpeed(device, &speed);
255  if (check == NVML_ERROR_NOT_SUPPORTED)
256  return INT_MIN;
257  else
258  err::check_nvml_ret(check);
259 
260  return (int) speed;
261  }
262 
263  inline nvmlDevice_t get_handle_by_index(int index)
264  {
265  nvmlDevice_t device;
266  err::check_nvml_ret( nvmlDeviceGetHandleByIndex(index, &device) );
267  return device;
268  }
269 
270  inline int get_index(nvmlDevice_t device)
271  {
272  unsigned int index;
273  err::check_nvml_ret( nvmlDeviceGetIndex(device, &index) );
274  return (int) index;
275  }
276 
277  inline void get_memory_info(nvmlDevice_t device, double *memory_used, double *memory_total)
278  {
279  nvmlMemory_t memory;
280  err::check_nvml_ret( nvmlDeviceGetMemoryInfo(device, &memory) );
281  *memory_used = (double) memory.used;
282  *memory_total = (double) memory.total;
283  }
284 
285  inline std::string get_name(nvmlDevice_t device)
286  {
287  std::string ret;
288  ret.resize(NVML_MAX_STRLEN);
289  err::check_nvml_ret( nvmlDeviceGetName(device, &ret[0], ret.max_size()) );
290  return ret;
291  }
292 
293  inline int get_performance_state(nvmlDevice_t device)
294  {
295  nvmlPstates_t pState;
296  err::check_nvml_ret( nvmlDeviceGetPerformanceState(device, &pState) );
297  return (int) pState;
298  }
299 
300  inline int get_persistence_mode(nvmlDevice_t device)
301  {
302  nvmlEnableState_t mode;
303  err::check_nvml_ret( nvmlDeviceGetPersistenceMode(device, &mode) );
304  return (int) mode;
305  }
306 
307  inline int get_power_max(nvmlDevice_t device)
308  {
309  unsigned int power_min, power_max;
310  err::check_nvml_ret( nvmlDeviceGetPowerManagementLimitConstraints(device, &power_min, &power_max) );
311  return (int) power_max;
312  }
313 
314  inline int get_power_usage(nvmlDevice_t device)
315  {
316  unsigned int power;
317  err::check_nvml_ret( nvmlDeviceGetPowerUsage(device, &power) );
318  return (int) power;
319  }
320 
321  inline std::string get_serial(nvmlDevice_t device)
322  {
323  std::string ret;
324  ret.resize(NVML_MAX_STRLEN);
325  err::check_nvml_ret( nvmlDeviceGetSerial(device, &ret[0], ret.max_size()) );
326  return ret;
327  }
328 
329  inline int get_temperature(nvmlDevice_t device)
330  {
331  nvmlTemperatureSensors_t sensor = NVML_TEMPERATURE_GPU;
332  unsigned int temp;
333  err::check_nvml_ret( nvmlDeviceGetTemperature(device, sensor, &temp) );
334  return (int) temp;
335  }
336 
337  inline int get_utilization(nvmlDevice_t device)
338  {
339  nvmlUtilization_t utilization;
340  err::check_nvml_ret( nvmlDeviceGetUtilizationRates(device, &utilization) );
341  return (int) utilization.gpu;
342  }
343 
344  inline std::string get_uuid(nvmlDevice_t device)
345  {
346  std::string ret;
347  ret.resize(NVML_MAX_STRLEN);
348  err::check_nvml_ret( nvmlDeviceGetUUID(device, &ret[0], ret.max_size()) );
349  return ret;
350  }
351  }
352 }
353 }
354 
355 
356 #endif
fml::nvml::system::get_driver_version
std::string get_driver_version()
System graphics driver version.
Definition: nvml.hh:129
fml::nvml::init
void init()
Initialize NVML.
Definition: nvml.hh:93
fml::nvml::shutdown
void shutdown()
Shut down NVML.
Definition: nvml.hh:102
fml
Core namespace.
Definition: dimops.hh:10
fml::nvml::system::get_cuda_driver_version
int get_cuda_driver_version()
System CUDA driver version.
Definition: nvml.hh:119
fml::nvml::system::get_nvml_version
std::string get_nvml_version()
Version of the NVML library.
Definition: nvml.hh:140
fml::nvml::system::get_process_name
std::string get_process_name(unsigned int pid)
Process name.
Definition: nvml.hh:153