From fa8207871c957a012818263e37334ab4017415fb Mon Sep 17 00:00:00 2001 From: vansangpfiev Date: Fri, 14 Jun 2024 13:34:05 +0700 Subject: [PATCH] feat: cortex.onnx (#660) --- cortex-cpp/controllers/server.cc | 29 +++++++++++++++++++++-------- cortex-cpp/controllers/server.h | 3 +++ cortex-cpp/utils/cortex_utils.h | 1 + 3 files changed, 25 insertions(+), 8 deletions(-) diff --git a/cortex-cpp/controllers/server.cc b/cortex-cpp/controllers/server.cc index 8bdab4757..fb7495c93 100644 --- a/cortex-cpp/controllers/server.cc +++ b/cortex-cpp/controllers/server.cc @@ -16,6 +16,7 @@ namespace inferences { namespace { constexpr static auto kLlamaEngine = "cortex.llamacpp"; constexpr static auto kPythonRuntimeEngine = "cortex.python"; +constexpr static auto kOnnxEngine = "cortex.onnx"; } // namespace server::server(){ @@ -32,7 +33,7 @@ void server::ChatCompletion( const HttpRequestPtr& req, std::function&& callback) { auto engine_type = - (*(req->getJsonObject())).get("engine", kLlamaEngine).asString(); + (*(req->getJsonObject())).get("engine", cur_engine_type_).asString(); if (!IsEngineLoaded(engine_type)) { Json::Value res; res["message"] = "Engine is not loaded yet"; @@ -91,7 +92,7 @@ void server::UnloadModel( const HttpRequestPtr& req, std::function&& callback) { auto engine_type = - (*(req->getJsonObject())).get("engine", kLlamaEngine).asString(); + (*(req->getJsonObject())).get("engine", cur_engine_type_).asString(); if (!IsEngineLoaded(engine_type)) { Json::Value res; res["message"] = "Engine is not loaded yet"; @@ -118,7 +119,7 @@ void server::ModelStatus( const HttpRequestPtr& req, std::function&& callback) { auto engine_type = - (*(req->getJsonObject())).get("engine", kLlamaEngine).asString(); + (*(req->getJsonObject())).get("engine", cur_engine_type_).asString(); if (!IsEngineLoaded(engine_type)) { Json::Value res; res["message"] = "Engine is not loaded yet"; @@ -144,8 +145,7 @@ void server::ModelStatus( void server::GetModels(const HttpRequestPtr& req, std::function&& callback) { - // TODO(sang) need to change this when we support Tensorrt-llm - if (!IsEngineLoaded(kLlamaEngine)) { + if (!IsEngineLoaded(cur_engine_type_)) { Json::Value res; res["message"] = "Engine is not loaded yet"; auto resp = cortex_utils::nitroHttpJsonResponse(res); @@ -156,7 +156,7 @@ void server::GetModels(const HttpRequestPtr& req, } LOG_TRACE << "Start to get models"; - auto& en = std::get(engines_[kLlamaEngine].engine); + auto& en = std::get(engines_[cur_engine_type_].engine); if (en->IsSupported("GetModels")) { en->GetModels( req->getJsonObject(), @@ -257,11 +257,13 @@ void server::LoadModel(const HttpRequestPtr& req, // We have not loaded engine yet, should load it before using it if (engines_.find(engine_type) == engines_.end()) { - // TODO(sang) we cannot run cortex.llamacpp and cortex.tensorrt-llm at the same time. - // So need an unload engine machanism to handle. + // We only use single engine so unload all engines before load new engine + UnloadEngines(); auto get_engine_path = [](std::string_view e) { if (e == kLlamaEngine) { return cortex_utils::kLlamaLibPath; + } else if(e == kOnnxEngine) { + return cortex_utils::kOnnxLibPath; } return cortex_utils::kLlamaLibPath; }; @@ -292,6 +294,7 @@ void server::LoadModel(const HttpRequestPtr& req, callback(resp); return; } + cur_engine_type_ = engine_type; auto func = engines_[engine_type].dl->get_function("get_engine"); @@ -358,4 +361,14 @@ bool server::IsEngineLoaded(const std::string& e) { return engines_.find(e) != engines_.end(); } +void server::UnloadEngines() { + // We unload all engines except python engine + for (auto it = engines_.begin(); it != engines_.end();) { + if (it->first != kPythonRuntimeEngine) { + it = engines_.erase(it); + } else + it++; + } +} + } // namespace inferences \ No newline at end of file diff --git a/cortex-cpp/controllers/server.h b/cortex-cpp/controllers/server.h index 3f214a9f0..9057d44de 100644 --- a/cortex-cpp/controllers/server.h +++ b/cortex-cpp/controllers/server.h @@ -99,6 +99,8 @@ class server : public drogon::HttpController, SyncQueue& q); bool IsEngineLoaded(const std::string& e); + void UnloadEngines(); + private: struct SyncQueue { void push(std::pair&& p) { @@ -145,5 +147,6 @@ class server : public drogon::HttpController, EngineV engine; }; std::unordered_map engines_; + std::string cur_engine_type_; }; }; // namespace inferences \ No newline at end of file diff --git a/cortex-cpp/utils/cortex_utils.h b/cortex-cpp/utils/cortex_utils.h index c0670a431..dc64bfed1 100644 --- a/cortex-cpp/utils/cortex_utils.h +++ b/cortex-cpp/utils/cortex_utils.h @@ -27,6 +27,7 @@ namespace cortex_utils { constexpr static auto kLlamaLibPath = "/engines/cortex.llamacpp"; constexpr static auto kPythonRuntimeLibPath = "/engines/cortex.python"; +constexpr static auto kOnnxLibPath = "/engines/cortex.onnx"; inline std::string models_folder = "./models";