Skip to content

Commit

Permalink
fix: map cuda gpus
Browse files Browse the repository at this point in the history
  • Loading branch information
sangjanai committed Dec 18, 2024
1 parent f72b111 commit 5d270af
Show file tree
Hide file tree
Showing 6 changed files with 123 additions and 21 deletions.
2 changes: 1 addition & 1 deletion engine/common/hardware_common.h
Original file line number Diff line number Diff line change
Expand Up @@ -78,7 +78,7 @@ inline Json::Value ToJson(const std::vector<GPU>& gpus) {
Json::Value res(Json::arrayValue);
for (size_t i = 0; i < gpus.size(); i++) {
Json::Value gpu;
gpu["id"] = std::to_string(i);
gpu["id"] = gpus[i].id;
gpu["name"] = gpus[i].name;
gpu["version"] = gpus[i].version;
Json::Value add_info;
Expand Down
37 changes: 37 additions & 0 deletions engine/database/hardware.cc
Original file line number Diff line number Diff line change
Expand Up @@ -98,4 +98,41 @@ cpp::result<bool, std::string> Hardware::DeleteHardwareEntry(
return cpp::fail(e.what());
}
}

bool Hardware::HasHardwareEntry(const std::string& id) {
try {
SQLite::Statement query(db_,
"SELECT COUNT(*) FROM hardware WHERE uuid = ?");
query.bind(1, id);
if (query.executeStep()) {
return query.getColumn(0).getInt() > 0;
}
return false;
} catch (const std::exception& e) {
CTL_WRN(e.what());
return false;
}
}

// Overwrites the `hardware_id` and `software_id` columns of the row whose
// `uuid` equals `id`.
//
// Params:
//   id    - uuid used in the WHERE clause.
//   hw_id - new value for hardware_id.
//   sw_id - new value for software_id.
//
// Returns true (and logs an info line) when exec() reports exactly 1,
// false for any other exec() result, and cpp::fail(<what()>) when a
// std::exception is thrown.
cpp::result<bool, std::string> Hardware::UpdateHardwareEntry(
    const std::string& id, int hw_id, int sw_id) const {
  try {
    SQLite::Statement stmt(db_,
                           "UPDATE hardware "
                           "SET hardware_id = ?, software_id = ? "
                           "WHERE uuid = ?");
    // Placeholders are bound in the order they appear in the statement.
    stmt.bind(1, hw_id);
    stmt.bind(2, sw_id);
    stmt.bind(3, id);

    const bool exactly_one = (stmt.exec() == 1);
    if (!exactly_one) {
      return false;
    }

    CTL_INF("Updated: " << id << " " << hw_id << " " << sw_id);
    return true;
  } catch (const std::exception& e) {
    return cpp::fail(e.what());
  }
}
} // namespace cortex::db
4 changes: 4 additions & 0 deletions engine/database/hardware.h
Original file line number Diff line number Diff line change
Expand Up @@ -43,5 +43,9 @@ class Hardware {
cpp::result<bool, std::string> UpdateHardwareEntry(
const std::string& id, const HardwareEntry& updated_entry);
cpp::result<bool, std::string> DeleteHardwareEntry(const std::string& id);
bool HasHardwareEntry(const std::string& id);
cpp::result<bool, std::string> UpdateHardwareEntry(const std::string& id,
int hw_id,
int sw_id) const;
};
} // namespace cortex::db
97 changes: 77 additions & 20 deletions engine/services/hardware_service.cc
Original file line number Diff line number Diff line change
Expand Up @@ -86,20 +86,15 @@ bool HardwareService::Restart(const std::string& host, int port) {

#if defined(_WIN32) || defined(_WIN64) || defined(__linux__)
std::string cuda_visible_devices = "";
std::string vk_visible_devices = "";
for (auto i : (*ahc_).gpus) {
auto cuda_config = GetCudaConfig();
for (auto i : cuda_config) {
if (!cuda_visible_devices.empty())
cuda_visible_devices += ",";
cuda_visible_devices += std::to_string(i);

if(!vk_visible_devices.empty())
vk_visible_devices += ",";
vk_visible_devices += std::to_string(i);
}
if (cuda_visible_devices.empty())
cuda_visible_devices += " ";
if (vk_visible_devices.empty())
vk_visible_devices += " ";

// Set the CUDA_VISIBLE_DEVICES environment variable
if (!set_env("CUDA_VISIBLE_DEVICES", cuda_visible_devices)) {
LOG_WARN << "Error setting CUDA_VISIBLE_DEVICES";
Expand All @@ -113,7 +108,17 @@ bool HardwareService::Restart(const std::string& host, int port) {
LOG_WARN << "CUDA_VISIBLE_DEVICES is not set.";
}

if (!set_env("GGML_VK_VISIBLE_DEVICES", vk_visible_devices)) {
std::string vk_visible_devices = "";
for (auto i : (*ahc_).gpus) {
if (!vk_visible_devices.empty())
vk_visible_devices += ",";
vk_visible_devices += std::to_string(i);
}

if (vk_visible_devices.empty())
vk_visible_devices += " ";

if (!set_env("GGML_VK_VISIBLE_DEVICES", vk_visible_devices)) {
LOG_WARN << "Error setting GGML_VK_VISIBLE_DEVICES";
return false;
}
Expand Down Expand Up @@ -255,7 +260,7 @@ bool HardwareService::SetActivateHardwareConfig(
}
}
std::sort(activated_ids.begin(), activated_ids.end(),
[](auto& p1, auto& p2) { return p1.second < p2.second; });
[](auto& p1, auto& p2) { return p1.second < p2.second; });
if (ahc_gpus.size() != activated_ids.size()) {
need_update = true;
} else {
Expand Down Expand Up @@ -292,6 +297,22 @@ void HardwareService::UpdateHardwareInfos() {
auto gpus = cortex::hw::GetGPUInfo();
cortex::db::Hardware hw_db;
auto b = hw_db.LoadHardwareList();
// Delete stored hardware entries whose GPU is no longer detected
auto exists = [&gpus](const std::string& uuid) {
for (auto const& g : gpus) {
if (g.uuid == uuid)
return true;
}
return false;
};
for (auto const& he : b.value()) {
if (!exists(he.uuid)) {
hw_db.DeleteHardwareEntry(he.uuid);
}
}

// Get updated list
b = hw_db.LoadHardwareList();
std::vector<std::pair<int, int>> activated_gpu_bf;
std::string debug_b;
for (auto const& he : b.value()) {
Expand All @@ -304,14 +325,23 @@ void HardwareService::UpdateHardwareInfos() {
for (auto const& gpu : gpus) {
// ignore error
// Note: only support NVIDIA for now, so hardware_id = software_id
auto res = hw_db.AddHardwareEntry(HwEntry{.uuid = gpu.uuid,
.type = "gpu",
.hardware_id = std::stoi(gpu.id),
.software_id = std::stoi(gpu.id),
.activated = true,
.priority = INT_MAX});
if (res.has_error()) {
CTL_WRN(res.error());
if (hw_db.HasHardwareEntry(gpu.uuid)) {
auto res = hw_db.UpdateHardwareEntry(gpu.uuid, std::stoi(gpu.id),
std::stoi(gpu.id));
if (res.has_error()) {
CTL_WRN(res.error());
}
} else {
auto res =
hw_db.AddHardwareEntry(HwEntry{.uuid = gpu.uuid,
.type = "gpu",
.hardware_id = std::stoi(gpu.id),
.software_id = std::stoi(gpu.id),
.activated = true,
.priority = INT_MAX});
if (res.has_error()) {
CTL_WRN(res.error());
}
}
}

Expand All @@ -336,7 +366,7 @@ void HardwareService::UpdateHardwareInfos() {
need_restart = true;
} else {
for (size_t i = 0; i < activated_gpu_bf.size(); i++) {
if (activated_gpu_bf[i].first != activated_gpu_af[i].first) {
if (activated_gpu_bf[i].first != activated_gpu_af[i].first) {
need_restart = true;
break;
}
Expand All @@ -354,7 +384,7 @@ void HardwareService::UpdateHardwareInfos() {
}

const char* vk_value = std::getenv("GGML_VK_VISIBLE_DEVICES");
if (vk_value) {
if (vk_value) {
LOG_INFO << "GGML_VK_VISIBLE_DEVICES: " << vk_value;
} else {
need_restart = true;
Expand Down Expand Up @@ -390,4 +420,31 @@ bool HardwareService::IsValidConfig(
}
return false;
}

// Translates the activated GPU ids held in `ahc_` into the ids reported by
// system_info_utils::GetGpuInfoList().
//
// The translation goes through the GPU uuid:
//   1. each activated id is matched against cortex::hw::GetGPUInfo() to
//      collect the uuid of every GPU carrying that id;
//   2. each collected uuid is matched against the
//      system_info_utils::GetGpuInfoList() entries and the matching
//      entry's id is appended to the result.
//
// Returns an empty vector when `ahc_` holds no value. Ids are parsed with
// std::stoi, so a non-numeric id string on a compared entry will throw.
std::vector<int> HardwareService::GetCudaConfig() {
  std::vector<int> cuda_ids;
  if (!ahc_) {
    return cuda_ids;
  }

  auto nvidia_list = system_info_utils::GetGpuInfoList();
  auto detected_gpus = cortex::hw::GetGPUInfo();

  // Step 1: activated id -> uuid.
  std::vector<std::string> active_uuids;
  for (auto active_id : ahc_->gpus) {
    for (auto const& dev : detected_gpus) {
      if (active_id != std::stoi(dev.id)) {
        continue;
      }
      active_uuids.push_back(dev.uuid);
    }
  }

  // Step 2: uuid -> id as listed by system_info_utils::GetGpuInfoList().
  for (auto const& wanted : active_uuids) {
    for (auto const& nv : nvidia_list) {
      if (wanted != nv.uuid) {
        continue;
      }
      cuda_ids.push_back(std::stoi(nv.id));
    }
  }

  return cuda_ids;
}
} // namespace services
2 changes: 2 additions & 0 deletions engine/services/hardware_service.h
Original file line number Diff line number Diff line change
Expand Up @@ -31,6 +31,8 @@ class HardwareService {
void UpdateHardwareInfos();
bool IsValidConfig(const cortex::hw::ActivateHardwareConfig& ahc);

private:
std::vector<int> GetCudaConfig();
private:
std::optional<cortex::hw::ActivateHardwareConfig> ahc_;
};
Expand Down
2 changes: 2 additions & 0 deletions engine/utils/hardware/gpu_info.h
Original file line number Diff line number Diff line change
Expand Up @@ -23,6 +23,8 @@ inline std::vector<GPU> GetGPUInfo() {
vulkan_gpus[j].add_info = NvidiaAddInfo{
.driver_version = nvidia_gpus[i].driver_version.value_or("unknown"),
.compute_cap = nvidia_gpus[i].compute_cap.value_or("unknown")};
vulkan_gpus[j].free_vram = std::stoll(nvidia_gpus[i].vram_free);
vulkan_gpus[j].total_vram = std::stoll(nvidia_gpus[i].vram_total);
}
}
}
Expand Down

0 comments on commit 5d270af

Please sign in to comment.