diff --git a/engine/common/hardware_common.h b/engine/common/hardware_common.h
index 576e2e826..e9cbee1be 100644
--- a/engine/common/hardware_common.h
+++ b/engine/common/hardware_common.h
@@ -78,7 +78,7 @@ inline Json::Value ToJson(const std::vector<GPU>& gpus) {
   Json::Value res(Json::arrayValue);
   for (size_t i = 0; i < gpus.size(); i++) {
     Json::Value gpu;
-    gpu["id"] = std::to_string(i);
+    gpu["id"] = gpus[i].id;
     gpu["name"] = gpus[i].name;
     gpu["version"] = gpus[i].version;
     Json::Value add_info;
diff --git a/engine/database/hardware.cc b/engine/database/hardware.cc
index 2ee1db968..47a8ac343 100644
--- a/engine/database/hardware.cc
+++ b/engine/database/hardware.cc
@@ -98,4 +98,45 @@ cpp::result<bool, std::string> Hardware::DeleteHardwareEntry(
     return cpp::fail(e.what());
   }
 }
+
+// Returns true when a row with the given uuid exists in the hardware table.
+// Query failures are logged and reported as "not found".
+bool Hardware::HasHardwareEntry(const std::string& id) {
+  try {
+    SQLite::Statement query(db_,
+                            "SELECT COUNT(*) FROM hardware WHERE uuid = ?");
+    query.bind(1, id);
+    if (query.executeStep()) {
+      return query.getColumn(0).getInt() > 0;
+    }
+    return false;
+  } catch (const std::exception& e) {
+    CTL_WRN(e.what());
+    return false;
+  }
+}
+
+// Rewrites hardware_id/software_id of the row whose uuid equals `id`.
+// Returns true when exactly one row was updated, false otherwise, and a
+// failure carrying the exception message if the statement throws.
+cpp::result<bool, std::string> Hardware::UpdateHardwareEntry(
+    const std::string& id, int hw_id, int sw_id) const {
+  try {
+    SQLite::Statement upd(
+        db_,
+        "UPDATE hardware "
+        "SET hardware_id = ?, software_id = ? "
+        "WHERE uuid = ?");
+    upd.bind(1, hw_id);
+    upd.bind(2, sw_id);
+    upd.bind(3, id);
+    if (upd.exec() == 1) {
+      CTL_INF("Updated: " << id << " " << hw_id << " " << sw_id);
+      return true;
+    }
+    return false;
+  } catch (const std::exception& e) {
+    return cpp::fail(e.what());
+  }
+}
 }  // namespace cortex::db
diff --git a/engine/database/hardware.h b/engine/database/hardware.h
index 04d0bbda1..394a440ad 100644
--- a/engine/database/hardware.h
+++ b/engine/database/hardware.h
@@ -43,5 +43,12 @@ class Hardware {
   cpp::result<bool, std::string> UpdateHardwareEntry(
       const std::string& id, const HardwareEntry& updated_entry);
   cpp::result<bool, std::string> DeleteHardwareEntry(const std::string& id);
+  // Returns true if an entry with this uuid is stored; false if it is absent
+  // or the lookup fails.
+  bool HasHardwareEntry(const std::string& id);
+  // Overload that only rewrites hardware_id/software_id of the entry `id`.
+  cpp::result<bool, std::string> UpdateHardwareEntry(const std::string& id,
+                                                     int hw_id,
+                                                     int sw_id) const;
 };
 }  // namespace cortex::db
\ No newline at end of file
diff --git a/engine/services/hardware_service.cc b/engine/services/hardware_service.cc
index bcba33b84..0f1ad846e 100644
--- a/engine/services/hardware_service.cc
+++ b/engine/services/hardware_service.cc
@@ -86,20 +86,15 @@ bool HardwareService::Restart(const std::string& host, int port) {
 
 #if defined(_WIN32) || defined(_WIN64) || defined(__linux__)
   std::string cuda_visible_devices = "";
-  std::string vk_visible_devices = "";
-  for (auto i : (*ahc_).gpus) {
+  auto cuda_config = GetCudaConfig();
+  for (auto i : cuda_config) {
     if (!cuda_visible_devices.empty())
       cuda_visible_devices += ",";
     cuda_visible_devices += std::to_string(i);
-
-    if(!vk_visible_devices.empty())
-      vk_visible_devices += ",";
-    vk_visible_devices += std::to_string(i);
   }
   if (cuda_visible_devices.empty())
     cuda_visible_devices += " ";
-  if (vk_visible_devices.empty())
-    vk_visible_devices += " ";
+
   // Set the CUDA_VISIBLE_DEVICES environment variable
   if (!set_env("CUDA_VISIBLE_DEVICES", cuda_visible_devices)) {
     LOG_WARN << "Error setting CUDA_VISIBLE_DEVICES";
@@ -113,7 +108,17 @@ bool HardwareService::Restart(const std::string& host, int port) {
     LOG_WARN << "CUDA_VISIBLE_DEVICES is not set.";
   }
 
-  if (!set_env("GGML_VK_VISIBLE_DEVICES", vk_visible_devices)) {
+  std::string vk_visible_devices = "";
+  for (auto i : (*ahc_).gpus) {
+    if (!vk_visible_devices.empty())
+      vk_visible_devices += ",";
+    vk_visible_devices += std::to_string(i);
+  }
+
+  if (vk_visible_devices.empty())
+    vk_visible_devices += " ";
+
+  if (!set_env("GGML_VK_VISIBLE_DEVICES", vk_visible_devices)) {
     LOG_WARN << "Error setting GGML_VK_VISIBLE_DEVICES";
     return false;
   }
@@ -255,7 +260,7 @@ bool HardwareService::SetActivateHardwareConfig(
     }
   }
   std::sort(activated_ids.begin(), activated_ids.end(),
-            [](auto& p1, auto& p2) { return p1.second < p2.second; });
+            [](auto& p1, auto& p2) { return p1.second < p2.second; });
   if (ahc_gpus.size() != activated_ids.size()) {
     need_update = true;
   } else {
@@ -292,6 +297,33 @@ void HardwareService::UpdateHardwareInfos() {
   auto gpus = cortex::hw::GetGPUInfo();
   cortex::db::Hardware hw_db;
   auto b = hw_db.LoadHardwareList();
+  if (b.has_error()) {
+    // Without the stored list there is nothing to reconcile; bail out instead
+    // of calling value() on a failed result.
+    CTL_WRN(b.error());
+    return;
+  }
+
+  // Drop DB entries whose uuid is not among the GPUs detected above.
+  auto exists = [&gpus](const std::string& uuid) {
+    return std::any_of(gpus.begin(), gpus.end(),
+                       [&uuid](auto const& g) { return g.uuid == uuid; });
+  };
+  for (auto const& he : b.value()) {
+    if (exists(he.uuid))
+      continue;
+    auto res = hw_db.DeleteHardwareEntry(he.uuid);
+    if (res.has_error()) {
+      CTL_WRN(res.error());
+    }
+  }
+
+  // Reload so the bookkeeping below only sees the surviving entries.
+  b = hw_db.LoadHardwareList();
+  if (b.has_error()) {
+    CTL_WRN(b.error());
+    return;
+  }
   std::vector<std::pair<int, int>> activated_gpu_bf;
   std::string debug_b;
   for (auto const& he : b.value()) {
@@ -304,14 +336,23 @@ void HardwareService::UpdateHardwareInfos() {
   for (auto const& gpu : gpus) {
     // ignore error
     // Note: only support NVIDIA for now, so hardware_id = software_id
-    auto res = hw_db.AddHardwareEntry(HwEntry{.uuid = gpu.uuid,
-                                              .type = "gpu",
-                                              .hardware_id = std::stoi(gpu.id),
-                                              .software_id = std::stoi(gpu.id),
-                                              .activated = true,
-                                              .priority = INT_MAX});
-    if (res.has_error()) {
-      CTL_WRN(res.error());
+    if (hw_db.HasHardwareEntry(gpu.uuid)) {
+      auto res = hw_db.UpdateHardwareEntry(gpu.uuid, std::stoi(gpu.id),
+                                           std::stoi(gpu.id));
+      if (res.has_error()) {
+        CTL_WRN(res.error());
+      }
+    } else {
+      auto res =
+          hw_db.AddHardwareEntry(HwEntry{.uuid = gpu.uuid,
+                                         .type = "gpu",
+                                         .hardware_id = std::stoi(gpu.id),
+                                         .software_id = std::stoi(gpu.id),
+                                         .activated = true,
+                                         .priority = INT_MAX});
+      if (res.has_error()) {
+        CTL_WRN(res.error());
+      }
     }
   }
 
@@ -336,7 +377,7 @@ void HardwareService::UpdateHardwareInfos() {
       need_restart = true;
     } else {
       for (size_t i = 0; i < activated_gpu_bf.size(); i++) {
-        if (activated_gpu_bf[i].first != activated_gpu_af[i].first) {
+        if (activated_gpu_bf[i].first != activated_gpu_af[i].first) {
           need_restart = true;
           break;
         }
@@ -354,7 +395,7 @@ void HardwareService::UpdateHardwareInfos() {
     }
 
     const char* vk_value = std::getenv("GGML_VK_VISIBLE_DEVICES");
-    if (vk_value) {
+    if (vk_value) {
       LOG_INFO << "GGML_VK_VISIBLE_DEVICES: " << vk_value;
     } else {
       need_restart = true;
@@ -390,4 +431,35 @@ bool HardwareService::IsValidConfig(
   }
   return false;
 }
+
+// Translates the activated GPU ids in ahc_ into the ids reported by
+// system_info_utils::GetGpuInfoList(), matching devices by uuid. Activated
+// GPUs with no uuid match in that list are skipped. Returns an empty list
+// when ahc_ is not set. NOTE(review): std::stoi throws on a non-numeric id.
+std::vector<int> HardwareService::GetCudaConfig() {
+  std::vector<int> res;
+  if (!ahc_)
+    return res;
+  auto nvidia_gpus = system_info_utils::GetGpuInfoList();
+  auto all_gpus = cortex::hw::GetGPUInfo();
+  // Map id with uuid
+  std::vector<std::string> uuids;
+  for (auto i : (*ahc_).gpus) {
+    for (auto const& gpu : all_gpus) {
+      if (i == std::stoi(gpu.id)) {
+        uuids.push_back(gpu.uuid);
+      }
+    }
+  }
+
+  // Map uuid back to nvidia id
+  for (auto const& uuid : uuids) {
+    for (auto const& ngpu : nvidia_gpus) {
+      if (uuid == ngpu.uuid) {
+        res.push_back(std::stoi(ngpu.id));
+      }
+    }
+  }
+  return res;
+}
 }  // namespace services
diff --git a/engine/services/hardware_service.h b/engine/services/hardware_service.h
index 48ab7a4b1..6a7e4932f 100644
--- a/engine/services/hardware_service.h
+++ b/engine/services/hardware_service.h
@@ -31,6 +31,10 @@ class HardwareService {
 
   void UpdateHardwareInfos();
   bool IsValidConfig(const cortex::hw::ActivateHardwareConfig& ahc);
  private:
+  // Maps the activated GPU ids to the ids used for CUDA_VISIBLE_DEVICES.
+  // Returns an empty list when no hardware config is active.
+  std::vector<int> GetCudaConfig();
+
   std::optional<cortex::hw::ActivateHardwareConfig> ahc_;
 };
diff --git a/engine/utils/hardware/gpu_info.h b/engine/utils/hardware/gpu_info.h
index f08e0345d..c6ffaa193 100644
--- a/engine/utils/hardware/gpu_info.h
+++ b/engine/utils/hardware/gpu_info.h
@@ -23,6 +23,10 @@ inline std::vector<GPU> GetGPUInfo() {
         vulkan_gpus[j].add_info = NvidiaAddInfo{
             .driver_version = nvidia_gpus[i].driver_version.value_or("unknown"),
             .compute_cap = nvidia_gpus[i].compute_cap.value_or("unknown")};
+        // Overwrite the VRAM fields with the values from the NVIDIA entry.
+        // NOTE(review): std::stoll throws if the text is not numeric.
+        vulkan_gpus[j].free_vram = std::stoll(nvidia_gpus[i].vram_free);
+        vulkan_gpus[j].total_vram = std::stoll(nvidia_gpus[i].vram_total);
       }
     }
   }