-
Notifications
You must be signed in to change notification settings - Fork 1
/
printHumanReadableResults.cpp
168 lines (151 loc) · 6.88 KB
/
printHumanReadableResults.cpp
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
#include <iostream>
#include <fstream>
#include <sstream>
#include <cstring>
#include <ctime>
#include <vector>
#include <cuda_runtime_api.h>
#include <nvml.h>
#include "getters.h"
void printHumanReadableResults(){
nvmlReturn_t nvmlResult;
// Initialize NVML -- do not call nvmlSystemGet'ters before initializing!
nvmlResult = nvmlInit();
if (nvmlResult != NVML_SUCCESS) {
std::cerr << "Failed to initialize NVML: " << nvmlErrorString(nvmlResult) << std::endl;
}
cudaError_t cudaResult;
int deviceCount;
std::time_t unixTime = std::time(0);
// Get cuda and nvidia driber versions
char nvidiaDriverVersion[30];
int cudaDriverVersion=0;
nvmlSystemGetDriverVersion(nvidiaDriverVersion,sizeof(nvidiaDriverVersion));
nvmlSystemGetCudaDriverVersion(&cudaDriverVersion);
std::cout << "Timestamp: " << unixTime << std::endl;
std::cout << "NVIDIA Driver: " << nvidiaDriverVersion<<std::endl;
std::cout << "Cuda Driver: " << cudaDriverVersion<<std::endl;
// Get number of CUDA devices
cudaResult = cudaGetDeviceCount(&deviceCount);
if (cudaResult != cudaSuccess) {
std::cerr << "Failed to get CUDA device count: " << cudaGetErrorString(cudaResult) << std::endl;
nvmlShutdown();
}
nvmlDevice_t device;
nvmlUtilization_t utilization;
nvmlMemory_t memory;
unsigned int temperature, power;
char name[64];
// Iterate over devices
for (int i = 0; i < deviceCount; ++i) {
// Get handle to the NVML device
nvmlResult = nvmlDeviceGetHandleByIndex(i, &device);
if (nvmlResult != NVML_SUCCESS) {
std::cerr << "Failed to get handle for device " << i << ": " << nvmlErrorString(nvmlResult) << std::endl;
continue;
}
// Note to future self - see https://docs.nvidia.com/deploy/nvml-api/group__nvmlDeviceQueries.html#group__nvmlDeviceQueries
char serialnumber[NVML_DEVICE_SERIAL_BUFFER_SIZE];
unsigned int numCores=0;
unsigned long long energy = 0;
nvmlEccErrorCounts_t eccCounts;
nvmlEnableState_t currentECCMode, pendingECCMode;
nvmlMemoryErrorType_t errorType;
nvmlEccCounterType_t counterType;
nvmlEnableState_t mode;
nvmlPstates_t pState;
nvmlComputeMode_t computemode;
// Get device properties
nvmlResult = nvmlDeviceGetName(device, name, sizeof(name));
nvmlResult = nvmlDeviceGetSerial(device, serialnumber,NVML_DEVICE_SERIAL_BUFFER_SIZE);
nvmlResult = nvmlDeviceGetUtilizationRates(device, &utilization);
nvmlResult = nvmlDeviceGetPersistenceMode (device, &mode);
nvmlResult = nvmlDeviceGetMemoryInfo(device, &memory);
nvmlResult = nvmlDeviceGetTemperature(device, NVML_TEMPERATURE_GPU, &temperature);
nvmlResult = nvmlDeviceGetPowerUsage(device, &power);
nvmlResult = nvmlDeviceGetNumGpuCores(device, &numCores );
nvmlResult = nvmlDeviceGetPowerState(device,&pState);
nvmlResult = nvmlDeviceGetEccMode(device,¤tECCMode,&pendingECCMode);
nvmlResult = nvmlDeviceGetDetailedEccErrors(device, NVML_MEMORY_ERROR_TYPE_CORRECTED, NVML_VOLATILE_ECC, &eccCounts);
nvmlResult = nvmlDeviceGetComputeMode(device,&computemode);
nvmlResult = nvmlDeviceGetTotalEnergyConsumption (device, &energy );
std::cout << "\033[1;4mDevice " << i << "\033[0m - " << name << " Serial Number: " << serialnumber << std::endl;
std::cout << " Number GPU Cores: " << numCores << std::endl;
switch (computemode) {
case NVML_COMPUTEMODE_DEFAULT :
std::cout << " Compute Mode: Default compute mode -- multiple contexts per device." << std::endl;
break;
case NVML_COMPUTEMODE_EXCLUSIVE_THREAD:
std::cout << " Compute Mode: only one context per device, usable from one thread at a time." << std::endl;
break;
case NVML_COMPUTEMODE_PROHIBITED:
std::cout << " Compute Mode: no contexts per device." << std::endl;
break;
case NVML_COMPUTEMODE_EXCLUSIVE_PROCESS:
std::cout << " Compute Mode: only one context per device, usable from multiple threads at a time." << std::endl;
break;
default:
std::cout << " Compute Mode: Unable to determine compute mode." << std::endl;
std::cout << " Compute Mode: Unable to determine compute mode." << std::endl;
}
if ( NVML_FEATURE_ENABLED == mode ){
std::cout << " Persistence Mode: \033[1;32mON\033[0m" << std::endl;
} else {
std::cout << " Persistence Mode: \033[1;31mOFF\033[0m" << std::endl;
}
if ( NVML_PSTATE_0 == pState ){
std::cout << " Power State: \033[1;32mMax Performance\033[0m" << std::endl;
} else {
std::cout << " Power State: \033[1;31mSuboptimal Performance\033[0m" << std::endl;
}
if ( power <= 50000 ) {
std::cout << " Power: \033[1;36m" << power/1000. << "\033[0m W" << std::endl;
}
else if (power > 50000 && power <=70000) {
std::cout << " Power: \033[1;33m" << power/1000. << "\033[0m W" << std::endl;
}
else{
std::cout << " Power: \033[1;31m" << power/1000. << "\033[0m W" << std::endl;
}
std::cout << " Total Energy Consumed: " << energy/3.6e+9 << " kWh since last driver-reload" << std::endl;
std::cout << " Memory Usage: " << memory.used / 1048576 << " MB of " << memory.total / 1048576 << " MB" << std::endl;
if ( temperature <= 40 ) {
std::cout << " Temperature: \033[1;36m" << temperature << "\033[0m\u00b0 C" << std::endl;
}
else if (temperature > 40 && temperature <=70) {
std::cout << " Temperature: \033[1;33m" << temperature << "\033[0m\u00b0 C" << std::endl;
}
else{
std::cout << " Temperature: \033[1;31m" << temperature << "\033[0m\u00b0 C" << std::endl;
}
std::cout << " GPU Utilization: " << utilization.gpu << "%" << std::endl;
std::cout << " Memory Utilization: " << utilization.memory << "%" << std::endl;
if ( NVML_FEATURE_ENABLED == currentECCMode){
std::cout << " Current ECC Mode: \033[1;32mON\033[0m " << std::endl;
} else {
std::cout << " Current ECC Mode: \033[1;31mOFF\033[0m " << std::endl;
}
if ( NVML_FEATURE_ENABLED == pendingECCMode ){
std::cout << " Pending ECC Mode: \033[1;32mON\033[0m " << std::endl;
} else {
std::cout << " Pending ECC Mode: \033[1;31mOFF\033[0m " << std::endl;
}
std::cout << " ECC Mem Errors: " << eccCounts.deviceMemory << std::endl;
std::cout << " L1 Cache Errors: " << eccCounts.l1Cache << std::endl;
std::cout << " L2 Cache Errors: " << eccCounts.l2Cache << std::endl;
std::cout << " Register File Errors: " << eccCounts.registerFile << std::endl;
// Get process information
unsigned int processCount = 0;
nvmlResult = nvmlDeviceGetComputeRunningProcesses(device, &processCount, NULL);
std::vector<nvmlProcessInfo_t> processes(processCount);
nvmlResult = nvmlDeviceGetComputeRunningProcesses(device, &processCount, &processes[0]);
if (nvmlResult == NVML_SUCCESS && processCount > 0) {
for (unsigned int j = 0; j < processCount; ++j) {
std::string processName = getProcessName(processes[j].pid);
std::string commandName = getCommandName(processes[j].pid);
std::cout << " Process ID: " << processes[j].pid << " (" << commandName << ") using " << processes[j].usedGpuMemory / 1048576 << " MB" << std::endl;
}
}
}
nvmlShutdown();
}