Skip to content

Commit

Permalink
fix: get driver version and cuda version at a single command (#1754)
Browse files: browse the repository at this point in the history
Co-authored-by: vansangpfiev <[email protected]>
  • Loading branch information
vansangpfiev and sangjanai committed Dec 2, 2024
1 parent 045762c commit f40a377
Show file tree
Hide file tree
Showing 8 changed files with 36 additions and 42 deletions.
6 changes: 4 additions & 2 deletions engine/cli/commands/engine_install_cmd.cc
Original file line number Diff line number Diff line change
Expand Up @@ -37,7 +37,8 @@ bool EngineInstallCmd::Exec(const std::string& engine,
dp.Connect(host_, port_);
// engine can be small, so need to start ws first
auto dp_res = std::async(std::launch::deferred, [&dp] {
bool need_cuda_download = !system_info_utils::GetCudaVersion().empty();
bool need_cuda_download =
!system_info_utils::GetDriverAndCudaVersion().second.empty();
if (need_cuda_download) {
return dp.Handle({DownloadType::Engine, DownloadType::CudaToolkit});
} else {
Expand Down Expand Up @@ -149,7 +150,8 @@ bool EngineInstallCmd::Exec(const std::string& engine,
dp.Connect(host_, port_);
// engine can be small, so need to start ws first
auto dp_res = std::async(std::launch::deferred, [&dp] {
bool need_cuda_download = !system_info_utils::GetCudaVersion().empty();
bool need_cuda_download =
!system_info_utils::GetDriverAndCudaVersion().second.empty();
if (need_cuda_download) {
return dp.Handle({DownloadType::Engine, DownloadType::CudaToolkit});
} else {
Expand Down
3 changes: 2 additions & 1 deletion engine/cli/commands/engine_install_cmd.h
Original file line number Diff line number Diff line change
Expand Up @@ -14,7 +14,8 @@ class EngineInstallCmd {
port_(port),
show_menu_(show_menu),
hw_inf_{.sys_inf = system_info_utils::GetSystemInfo(),
.cuda_driver_version = system_info_utils::GetCudaVersion()} {};
.cuda_driver_version =
system_info_utils::GetDriverAndCudaVersion().second} {};

bool Exec(const std::string& engine, const std::string& version = "latest",
const std::string& src = "");
Expand Down
3 changes: 2 additions & 1 deletion engine/cli/commands/engine_update_cmd.cc
Original file line number Diff line number Diff line change
Expand Up @@ -25,7 +25,8 @@ bool EngineUpdateCmd::Exec(const std::string& host, int port,
dp.Connect(host, port);
// engine can be small, so need to start ws first
auto dp_res = std::async(std::launch::deferred, [&dp] {
bool need_cuda_download = !system_info_utils::GetCudaVersion().empty();
bool need_cuda_download =
!system_info_utils::GetDriverAndCudaVersion().second.empty();
if (need_cuda_download) {
return dp.Handle({DownloadType::Engine, DownloadType::CudaToolkit});
} else {
Expand Down
2 changes: 1 addition & 1 deletion engine/cli/commands/server_start_cmd.cc
Original file line number Diff line number Diff line change
Expand Up @@ -8,7 +8,7 @@ namespace commands {

namespace {
bool TryConnectToServer(const std::string& host, int port) {
constexpr const auto kMaxRetry = 3u;
constexpr const auto kMaxRetry = 4u;
auto count = 0u;
// Check if server is started
while (true) {
Expand Down
3 changes: 2 additions & 1 deletion engine/services/engine_service.h
Original file line number Diff line number Diff line change
Expand Up @@ -60,7 +60,8 @@ class EngineService : public EngineServiceI {
explicit EngineService(std::shared_ptr<DownloadService> download_service)
: download_service_{download_service},
hw_inf_{.sys_inf = system_info_utils::GetSystemInfo(),
.cuda_driver_version = system_info_utils::GetCudaVersion()} {}
.cuda_driver_version =
system_info_utils::GetDriverAndCudaVersion().second} {}

std::vector<EngineInfo> GetEngineInfoList() const;

Expand Down
4 changes: 2 additions & 2 deletions engine/services/hardware_service.cc
Original file line number Diff line number Diff line change
Expand Up @@ -16,7 +16,7 @@ namespace services {

namespace {
bool TryConnectToServer(const std::string& host, int port) {
constexpr const auto kMaxRetry = 3u;
constexpr const auto kMaxRetry = 4u;
auto count = 0u;
// Check if server is started
while (true) {
Expand Down Expand Up @@ -292,7 +292,7 @@ void HardwareService::UpdateHardwareInfos() {
}

#if defined(_WIN32) || defined(_WIN64) || defined(__linux__)
if (system_info_utils::IsNvidiaSmiAvailable()) {
if (!gpus.empty()) {
const char* value = std::getenv("CUDA_VISIBLE_DEVICES");
if (value) {
LOG_INFO << "CUDA_VISIBLE_DEVICES: " << value;
Expand Down
3 changes: 1 addition & 2 deletions engine/utils/hardware/gpu_info.h
Original file line number Diff line number Diff line change
Expand Up @@ -11,12 +11,11 @@ inline std::vector<GPU> GetGPUInfo() {
// Only support for nvidia for now
// auto gpus = hwinfo::getAllGPUs();
auto nvidia_gpus = system_info_utils::GetGpuInfoList();
auto cuda_version = system_info_utils::GetCudaVersion();
for (auto& n : nvidia_gpus) {
res.emplace_back(
GPU{.id = n.id,
.name = n.name,
.version = cuda_version,
.version = nvidia_gpus[0].cuda_driver_version.value_or("unknown"),
.add_info =
NvidiaAddInfo{
.driver_version = n.driver_version.value_or("unknown"),
Expand Down
54 changes: 22 additions & 32 deletions engine/utils/system_info_utils.h
Original file line number Diff line number Diff line change
Expand Up @@ -19,7 +19,8 @@ constexpr static auto kUnsupported{"Unsupported"};
constexpr static auto kCudaVersionRegex{R"(CUDA Version:\s*([\d\.]+))"};
constexpr static auto kDriverVersionRegex{R"(Driver Version:\s*(\d+\.\d+))"};
constexpr static auto kGpuQueryCommand{
"nvidia-smi --query-gpu=index,memory.total,memory.free,name,compute_cap,uuid "
"nvidia-smi "
"--query-gpu=index,memory.total,memory.free,name,compute_cap,uuid "
"--format=csv,noheader,nounits"};
constexpr static auto kGpuInfoRegex{
R"((\d+),\s*(\d+),\s*(\d+),\s*([^,]+),\s*([\d\.]+),\s*([^\n,]+))"};
Expand Down Expand Up @@ -100,53 +101,42 @@ inline bool IsNvidiaSmiAvailable() {
#endif
}

inline std::string GetDriverVersion() {
inline std::pair<std::string, std::string> GetDriverAndCudaVersion() {
if (!IsNvidiaSmiAvailable()) {
CTL_INF("nvidia-smi is not available!");
return "";
return {};
}
try {
std::string driver_version;
std::string cuda_version;
CommandExecutor cmd("nvidia-smi");
auto output = cmd.execute();

const std::regex driver_version_reg(kDriverVersionRegex);
std::smatch match;
std::smatch driver_match;

if (std::regex_search(output, match, driver_version_reg)) {
LOG_INFO << "Gpu Driver Version: " << match[1].str();
return match[1].str();
if (std::regex_search(output, driver_match, driver_version_reg)) {
LOG_INFO << "Gpu Driver Version: " << driver_match[1].str();
driver_version = driver_match[1].str();
} else {
LOG_ERROR << "Gpu Driver not found!";
return "";
return {};
}
} catch (const std::exception& e) {
LOG_ERROR << "Error: " << e.what();
return "";
}
}

inline std::string GetCudaVersion() {
if (!IsNvidiaSmiAvailable()) {
CTL_INF("nvidia-smi is not available!");
return "";
}
try {
CommandExecutor cmd("nvidia-smi");
auto output = cmd.execute();

const std::regex cuda_version_reg(kCudaVersionRegex);
std::smatch match;
std::smatch cuda_match;

if (std::regex_search(output, match, cuda_version_reg)) {
LOG_INFO << "CUDA Version: " << match[1].str();
return match[1].str();
if (std::regex_search(output, cuda_match, cuda_version_reg)) {
LOG_INFO << "CUDA Version: " << cuda_match[1].str();
cuda_version = cuda_match[1].str();
} else {
LOG_ERROR << "CUDA Version not found!";
return "";
return {};
}
return std::pair(driver_version, cuda_version);
} catch (const std::exception& e) {
LOG_ERROR << "Error: " << e.what();
return "";
return {};
}
}

Expand Down Expand Up @@ -227,9 +217,9 @@ inline std::vector<GpuInfo> GetGpuInfoList() {
if (!IsNvidiaSmiAvailable())
return gpuInfoList;
try {
// TODO: improve by parsing both in one command execution
auto driver_version = GetDriverVersion();
auto cuda_version = GetCudaVersion();
auto [driver_version, cuda_version] = GetDriverAndCudaVersion();
if (driver_version.empty() || cuda_version.empty())
return gpuInfoList;

CommandExecutor cmd(kGpuQueryCommand);
auto output = cmd.execute();
Expand All @@ -249,7 +239,7 @@ inline std::vector<GpuInfo> GetGpuInfoList() {
driver_version, // driver_version
cuda_version, // cuda_driver_version
match[5].str(), // compute_cap
match[6].str() // uuid
match[6].str() // uuid
};
gpuInfoList.push_back(gpuInfo);
search_start = match.suffix().first;
Expand Down

0 comments on commit f40a377

Please sign in to comment.