From c08043f4a7e5a5679911eda4c2358ac72a4621ba Mon Sep 17 00:00:00 2001
From: vansangpfiev
Date: Mon, 13 May 2024 10:36:11 +0700
Subject: [PATCH] feat: engines endpoint and cortex.python

---
 .../e2e-test-python-runtime-linux-and-mac.sh  |   0
 .../e2e-test-python-runtime-windows.bat       |   0
 cortex-cpp/CMakeLists.txt                     |   1 +
 cortex-cpp/common/base.h                      |  11 +-
 cortex-cpp/controllers/server.cc              | 187 ++++++++++++------
 cortex-cpp/controllers/server.h               |  30 ++-
 cortex-cpp/cortex-common/EngineI.h            |  11 +-
 cortex-cpp/cortex-common/cortexpythoni.h      |  22 +++
 cortex-cpp/engines/cortex.python/engine.cmake |  38 ++++
 cortex-cpp/main.cc                            |  21 ++
 cortex-cpp/utils/cortex_utils.h               |   2 +
 11 files changed, 253 insertions(+), 70 deletions(-)
 create mode 100644 .github/scripts/e2e-test-python-runtime-linux-and-mac.sh
 create mode 100644 .github/scripts/e2e-test-python-runtime-windows.bat
 create mode 100644 cortex-cpp/cortex-common/cortexpythoni.h
 create mode 100644 cortex-cpp/engines/cortex.python/engine.cmake

diff --git a/.github/scripts/e2e-test-python-runtime-linux-and-mac.sh b/.github/scripts/e2e-test-python-runtime-linux-and-mac.sh
new file mode 100644
index 000000000..e69de29bb
diff --git a/.github/scripts/e2e-test-python-runtime-windows.bat b/.github/scripts/e2e-test-python-runtime-windows.bat
new file mode 100644
index 000000000..e69de29bb
diff --git a/cortex-cpp/CMakeLists.txt b/cortex-cpp/CMakeLists.txt
index 8c01d2256..ae9c31ad2 100644
--- a/cortex-cpp/CMakeLists.txt
+++ b/cortex-cpp/CMakeLists.txt
@@ -2,6 +2,7 @@ cmake_minimum_required(VERSION 3.5)
 project(cortex-cpp C CXX)
 
 include(engines/cortex.llamacpp/engine.cmake)
+include(engines/cortex.python/engine.cmake)
 include(CheckIncludeFileCXX)
 
 check_include_file_cxx(any HAS_ANY)
diff --git a/cortex-cpp/common/base.h b/cortex-cpp/common/base.h
index 43d612c1b..3156c54ae 100644
--- a/cortex-cpp/common/base.h
+++ b/cortex-cpp/common/base.h
@@ -8,14 +8,21 @@ class BaseModel {
   virtual ~BaseModel() {}
 
   // Model management
-  virtual void LoadModel(const HttpRequestPtr& req,
-                         std::function<void(const HttpResponsePtr&)>&& callback) = 0;
+  virtual void LoadModel(
+      const HttpRequestPtr& req,
+      std::function<void(const HttpResponsePtr&)>&& callback) = 0;
   virtual void UnloadModel(
       const HttpRequestPtr& req,
       std::function<void(const HttpResponsePtr&)>&& callback) = 0;
   virtual void ModelStatus(
       const HttpRequestPtr& req,
       std::function<void(const HttpResponsePtr&)>&& callback) = 0;
+  virtual void GetEngines(
+      const HttpRequestPtr& req,
+      std::function<void(const HttpResponsePtr&)>&& callback) = 0;
+  virtual void FineTuning(
+      const HttpRequestPtr& req,
+      std::function<void(const HttpResponsePtr&)>&& callback) = 0;
 };
 
 class BaseChatCompletion {
diff --git a/cortex-cpp/controllers/server.cc b/cortex-cpp/controllers/server.cc
index af8a91df8..eff9eebd1 100644
--- a/cortex-cpp/controllers/server.cc
+++ b/cortex-cpp/controllers/server.cc
@@ -5,32 +5,33 @@
 #include
 
 #include "trantor/utils/Logger.h"
-#include "utils/logging_utils.h"
+#include "utils/cortex_utils.h"
+#include "utils/logging_utils.h"
 
 using namespace inferences;
 using json = nlohmann::json;
 
 namespace inferences {
 namespace {
 constexpr static auto kLlamaEngine = "cortex.llamacpp";
-constexpr static auto kLlamaLibPath = "./engines/cortex.llamacpp";
+constexpr static auto kPythonRuntimeEngine = "cortex.python";
 }  // namespace
 
-server::server()
-    : engine_{nullptr} {
+server::server(){
 
-    // Some default values for now below
-    // log_disable();  // Disable the log to file feature, reduce bloat for
-    // target
-    // system ()
-  };
+  // Some default values for now below
+  // log_disable();  // Disable the log to file feature, reduce bloat for
+  // target
+  // system ()
+};
 
 server::~server() {}
 
 void server::ChatCompletion(
     const HttpRequestPtr& req,
     std::function<void(const HttpResponsePtr&)>&& callback) {
-  if (!IsEngineLoaded()) {
+  auto engine_type =
+      (*(req->getJsonObject())).get("engine", kLlamaEngine).asString();
+  if (!IsEngineLoaded(engine_type)) {
     Json::Value res;
     res["message"] = "Engine is not loaded yet";
     auto resp = cortex_utils::nitroHttpJsonResponse(res);
@@ -44,10 +45,10 @@ void server::ChatCompletion(
   auto json_body = req->getJsonObject();
   bool is_stream = (*json_body).get("stream", false).asBool();
   auto q = std::make_shared<SyncQueue>();
-  engine_->HandleChatCompletion(json_body,
-                                [q](Json::Value status, Json::Value res) {
-                                  q->push(std::make_pair(status, res));
-                                });
+  std::get<EngineI*>(engines_[engine_type].engine)->HandleChatCompletion(
+      json_body, [q](Json::Value status, Json::Value res) {
+        q->push(std::make_pair(status, res));
+      });
   LOG_TRACE << "Wait to chat completion responses";
   if (is_stream) {
     ProcessStreamRes(std::move(callback), q);
@@ -57,10 +58,11 @@ void server::ChatCompletion(
   LOG_TRACE << "Done chat completion";
 }
 
-void server::Embedding(
-    const HttpRequestPtr& req,
-    std::function<void(const HttpResponsePtr&)>&& callback) {
-  if (!IsEngineLoaded()) {
+void server::Embedding(const HttpRequestPtr& req,
+                       std::function<void(const HttpResponsePtr&)>&& callback) {
+  auto engine_type =
+      (*(req->getJsonObject())).get("engine", kLlamaEngine).asString();
+  if (!IsEngineLoaded(engine_type)) {
     Json::Value res;
     res["message"] = "Engine is not loaded yet";
     auto resp = cortex_utils::nitroHttpJsonResponse(res);
@@ -72,10 +74,10 @@ void server::Embedding(
 
   LOG_TRACE << "Start embedding";
   SyncQueue q;
-  engine_->HandleEmbedding(req->getJsonObject(),
-                           [&q](Json::Value status, Json::Value res) {
-                             q.push(std::make_pair(status, res));
-                           });
+  std::get<EngineI*>(engines_[engine_type].engine)->HandleEmbedding(
+      req->getJsonObject(), [&q](Json::Value status, Json::Value res) {
+        q.push(std::make_pair(status, res));
+      });
   LOG_TRACE << "Wait to embedding";
   ProcessNonStreamRes(std::move(callback), q);
   LOG_TRACE << "Done embedding";
@@ -84,7 +86,9 @@ void server::Embedding(
 void server::UnloadModel(
     const HttpRequestPtr& req,
     std::function<void(const HttpResponsePtr&)>&& callback) {
-  if (!IsEngineLoaded()) {
+  auto engine_type =
+      (*(req->getJsonObject())).get("engine", kLlamaEngine).asString();
+  if (!IsEngineLoaded(engine_type)) {
     Json::Value res;
     res["message"] = "Engine is not loaded yet";
     auto resp = cortex_utils::nitroHttpJsonResponse(res);
@@ -94,7 +98,7 @@ void server::UnloadModel(
     return;
   }
   LOG_TRACE << "Start unload model";
-  engine_->UnloadModel(
+  std::get<EngineI*>(engines_[engine_type].engine)->UnloadModel(
       req->getJsonObject(),
       [cb = std::move(callback)](Json::Value status, Json::Value res) {
         auto resp = cortex_utils::nitroHttpJsonResponse(res);
@@ -108,7 +112,9 @@ void server::UnloadModel(
 void server::ModelStatus(
     const HttpRequestPtr& req,
     std::function<void(const HttpResponsePtr&)>&& callback) {
-  if (!IsEngineLoaded()) {
+  auto engine_type =
+      (*(req->getJsonObject())).get("engine", kLlamaEngine).asString();
+  if (!IsEngineLoaded(engine_type)) {
     Json::Value res;
     res["message"] = "Engine is not loaded yet";
     auto resp = cortex_utils::nitroHttpJsonResponse(res);
@@ -119,7 +125,7 @@ void server::ModelStatus(
   }
 
   LOG_TRACE << "Start to get model status";
-  engine_->GetModelStatus(
+  std::get<EngineI*>(engines_[engine_type].engine)->GetModelStatus(
       req->getJsonObject(),
       [cb = std::move(callback)](Json::Value status, Json::Value res) {
         auto resp = cortex_utils::nitroHttpJsonResponse(res);
@@ -130,57 +136,126 @@ void server::ModelStatus(
   LOG_TRACE << "Done get model status";
 }
 
-void server::LoadModel(
+void server::GetEngines(
+    const HttpRequestPtr& req,
+    std::function<void(const HttpResponsePtr&)>&& callback) {
+  Json::Value res;
+  Json::Value engine_array(Json::arrayValue);
+  for (const auto& [s, _] : engines_) {
+    Json::Value val;
+    val["id"] = s;
+    val["object"] = "engine";
+    engine_array.append(val);
+  }
+
+  res["object"] = "list";
+  res["data"] = engine_array;
+
+  auto resp = cortex_utils::nitroHttpJsonResponse(res);
+  callback(resp);
+}
+
+void server::FineTuning(
     const HttpRequestPtr& req,
     std::function<void(const HttpResponsePtr&)>&& callback) {
+  auto engine_type =
+      (*(req->getJsonObject())).get("engine", kPythonRuntimeEngine).asString();
+
+  if (engines_.find(engine_type) == engines_.end()) {
+    try {
+      engines_[engine_type].dl = std::make_unique<dylib>(
+          cortex_utils::kPythonRuntimeLibPath, "engine");
+    } catch (const dylib::load_error& e) {
+      LOG_ERROR << "Could not load engine: " << e.what();
+      engines_.erase(engine_type);
+
+      Json::Value res;
+      res["message"] = "Could not load engine " + engine_type;
+      auto resp = cortex_utils::nitroHttpJsonResponse(res);
+      resp->setStatusCode(k500InternalServerError);
+      callback(resp);
+      return;
+    }
+
+    auto func =
+        engines_[engine_type].dl->get_function<CortexPythonEngineI*()>("get_engine");
+    engines_[engine_type].engine = func();
+    LOG_INFO << "Loaded engine: " << engine_type;
+  }
+
+  LOG_TRACE << "Start fine-tuning";
+  auto& en = std::get<CortexPythonEngineI*>(engines_[engine_type].engine);
+  if (en->IsSupported("HandlePythonFileExecutionRequest")) {
+    en->HandlePythonFileExecutionRequest(
+        req->getJsonObject(),
+        [cb = std::move(callback)](Json::Value status, Json::Value res) {
+          auto resp = cortex_utils::nitroHttpJsonResponse(res);
+          resp->setStatusCode(static_cast<drogon::HttpStatusCode>(
+              status["status_code"].asInt()));
+          cb(resp);
+        });
+  } else {
+    Json::Value res;
+    res["message"] = "Method is not supported yet";
+    auto resp = cortex_utils::nitroHttpJsonResponse(res);
+    resp->setStatusCode(k500InternalServerError);
+    callback(resp);
+    LOG_WARN << "Method is not supported yet";
+  }
+  LOG_TRACE << "Done fine-tuning";
+}
+
+void server::LoadModel(const HttpRequestPtr& req,
+                       std::function<void(const HttpResponsePtr&)>&& callback) {
   auto engine_type =
       (*(req->getJsonObject())).get("engine", kLlamaEngine).asString();
-  if (!dylib_ || engine_type != cur_engine_name_) {
-    cur_engine_name_ = engine_type;
-    // TODO: change this when we get more engines
+
+  // We have not loaded the engine yet; load it before using it.
+  if (engines_.find(engine_type) == engines_.end()) {
+    // TODO(sang) we cannot run cortex.llamacpp and cortex.tensorrt-llm at the
+    // same time, so we need an unload-engine mechanism to handle this.
     auto get_engine_path = [](std::string_view e) {
       if (e == kLlamaEngine) {
-        return kLlamaLibPath;
+        return cortex_utils::kLlamaLibPath;
       }
-      return kLlamaLibPath;
+      return cortex_utils::kLlamaLibPath;
     };
     try {
-      dylib_ =
-          std::make_unique<dylib>(get_engine_path(cur_engine_name_), "engine");
+      engines_[engine_type].dl =
+          std::make_unique<dylib>(get_engine_path(engine_type), "engine");
     } catch (const dylib::load_error& e) {
       LOG_ERROR << "Could not load engine: " << e.what();
-      dylib_.reset();
-      engine_ = nullptr;
-    }
+      engines_.erase(engine_type);
 
-    if (!dylib_) {
       Json::Value res;
-      res["message"] = "Could not load engine " + cur_engine_name_;
+      res["message"] = "Could not load engine " + engine_type;
       auto resp = cortex_utils::nitroHttpJsonResponse(res);
       resp->setStatusCode(k500InternalServerError);
       callback(resp);
       return;
     }
-    auto func = dylib_->get_function<EngineI*()>("get_engine");
-    engine_ = func();
-    LOG_INFO << "Loaded engine: " << cur_engine_name_;
+
+    auto func =
+        engines_[engine_type].dl->get_function<EngineI*()>("get_engine");
+    engines_[engine_type].engine = func();
+    LOG_INFO << "Loaded engine: " << engine_type;
   }
 
   LOG_TRACE << "Load model";
-  engine_->LoadModel(
-      req->getJsonObject(),
-      [cb = std::move(callback)](Json::Value status, Json::Value res) {
-        auto resp = cortex_utils::nitroHttpJsonResponse(res);
-        resp->setStatusCode(
-            static_cast<drogon::HttpStatusCode>(status["status_code"].asInt()));
-        cb(resp);
-      });
+  auto& en = std::get<EngineI*>(engines_[engine_type].engine);
+  en->LoadModel(req->getJsonObject(), [cb = std::move(callback)](
+                                          Json::Value status, Json::Value res) {
+    auto resp = cortex_utils::nitroHttpJsonResponse(res);
+    resp->setStatusCode(
+        static_cast<drogon::HttpStatusCode>(status["status_code"].asInt()));
+    cb(resp);
+  });
   LOG_TRACE << "Done load model";
 }
 
 void server::ProcessStreamRes(std::function<void(const HttpResponsePtr&)> cb,
-                              std::shared_ptr<SyncQueue> q) {
+                              std::shared_ptr<SyncQueue> q) {
   auto err_or_done = std::make_shared<std::atomic_bool>(false);
   auto chunked_content_provider =
       [q, err_or_done](char* buf, std::size_t buf_size) -> std::size_t {
@@ -209,12 +284,12 @@ void server::ProcessStreamRes(std::function<void(const HttpResponsePtr&)> cb,
   };
 
   auto resp = cortex_utils::nitroStreamResponse(chunked_content_provider,
-                                                "chat_completions.txt");
+                                                "chat_completions.txt");
   cb(resp);
 }
 
-void server::ProcessNonStreamRes(
-    std::function<void(const HttpResponsePtr&)> cb, SyncQueue& q) {
+void server::ProcessNonStreamRes(std::function<void(const HttpResponsePtr&)> cb,
+                                 SyncQueue& q) {
   auto [status, res] = q.wait_and_pop();
   auto resp = cortex_utils::nitroHttpJsonResponse(res);
   resp->setStatusCode(
@@ -222,8 +297,8 @@ void server::ProcessNonStreamRes(
   cb(resp);
 }
 
-bool server::IsEngineLoaded() {
-  return !!engine_;
+bool server::IsEngineLoaded(const std::string& e) {
+  return engines_.find(e) != engines_.end();
 }
 
 }  // namespace inferences
\ No newline at end of file
diff --git a/cortex-cpp/controllers/server.h b/cortex-cpp/controllers/server.h
index d8d6fa004..29be2497e 100644
--- a/cortex-cpp/controllers/server.h
+++ b/cortex-cpp/controllers/server.h
@@ -14,9 +14,11 @@
 #include
 #include
 #include
+#include
 
 #include "common/base.h"
 #include "cortex-common/EngineI.h"
+#include "cortex-common/cortexpythoni.h"
 #include "trantor/utils/SerialTaskQueue.h"
 #include "utils/dylib.h"
 #include "utils/json.hpp"
@@ -31,9 +33,9 @@
 using namespace drogon;
 
 namespace inferences {
 class server : public drogon::HttpController<server>,
-                public BaseModel,
-                public BaseChatCompletion,
-                public BaseEmbedding {
+               public BaseModel,
+               public BaseChatCompletion,
+               public BaseEmbedding {
   struct SyncQueue;
 
  public:
@@ -46,9 +48,14 @@ class server : public drogon::HttpController<server>,
   METHOD_ADD(server::LoadModel, "loadmodel", Post);
   METHOD_ADD(server::UnloadModel, "unloadmodel", Post);
   METHOD_ADD(server::ModelStatus, "modelstatus", Post);
+  METHOD_ADD(server::GetEngines, "engines", Get);
+
+  // cortex.python API
+  METHOD_ADD(server::FineTuning, "finetuning", Post);
 
   // Openai compatible path
   ADD_METHOD_TO(server::ChatCompletion, "/v1/chat/completions", Post);
+  ADD_METHOD_TO(server::FineTuning, "/v1/fine_tuning/job", Post);
 
   // ADD_METHOD_TO(server::handlePrelight, "/v1/chat/completions", Options);
   // NOTE: prelight will be added back when browser support is properly planned
@@ -72,13 +79,19 @@ class server : public drogon::HttpController<server>,
   void ModelStatus(
       const HttpRequestPtr& req,
       std::function<void(const HttpResponsePtr&)>&& callback) override;
+  void GetEngines(
+      const HttpRequestPtr& req,
+      std::function<void(const HttpResponsePtr&)>&& callback) override;
+
+  void FineTuning(const HttpRequestPtr& req,
+                  std::function<void(const HttpResponsePtr&)>&& callback) override;
 
  private:
   void ProcessStreamRes(std::function<void(const HttpResponsePtr&)> cb,
                         std::shared_ptr<SyncQueue> q);
   void ProcessNonStreamRes(std::function<void(const HttpResponsePtr&)> cb,
                            SyncQueue& q);
-  bool IsEngineLoaded();
+  bool IsEngineLoaded(const std::string& e);
 
  private:
   struct SyncQueue {
@@ -120,8 +133,11 @@ class server : public drogon::HttpController<server>,
   };
 
  private:
-  std::unique_ptr<dylib> dylib_;
-  EngineI* engine_;
-  std::string cur_engine_name_;
+  using EngineV = std::variant<EngineI*, CortexPythonEngineI*>;
+  struct EngineInfo {
+    std::unique_ptr<dylib> dl;
+    EngineV engine;
+  };
+  std::unordered_map<std::string, EngineInfo> engines_;
 };
 };  // namespace inferences
\ No newline at end of file
diff --git a/cortex-cpp/cortex-common/EngineI.h b/cortex-cpp/cortex-common/EngineI.h
index b8770b230..53956624b 100644
--- a/cortex-cpp/cortex-common/EngineI.h
+++ b/cortex-cpp/cortex-common/EngineI.h
@@ -9,19 +9,20 @@ class EngineI {
  public:
   virtual ~EngineI() {}
 
+  // cortex.llamacpp interface
   virtual void HandleChatCompletion(
-      std::shared_ptr<Json::Value> jsonBody,
+      std::shared_ptr<Json::Value> json_body,
       std::function<void(Json::Value&&, Json::Value&&)>&& callback) = 0;
   virtual void HandleEmbedding(
-      std::shared_ptr<Json::Value> jsonBody,
+      std::shared_ptr<Json::Value> json_body,
       std::function<void(Json::Value&&, Json::Value&&)>&& callback) = 0;
   virtual void LoadModel(
-      std::shared_ptr<Json::Value> jsonBody,
+      std::shared_ptr<Json::Value> json_body,
      std::function<void(Json::Value&&, Json::Value&&)>&& callback) = 0;
   virtual void UnloadModel(
-      std::shared_ptr<Json::Value> jsonBody,
+      std::shared_ptr<Json::Value> json_body,
       std::function<void(Json::Value&&, Json::Value&&)>&& callback) = 0;
   virtual void GetModelStatus(
-      std::shared_ptr<Json::Value> jsonBody,
+      std::shared_ptr<Json::Value> json_body,
       std::function<void(Json::Value&&, Json::Value&&)>&& callback) = 0;
 };
diff --git a/cortex-cpp/cortex-common/cortexpythoni.h b/cortex-cpp/cortex-common/cortexpythoni.h
new file mode 100644
index 000000000..06a79838f
--- /dev/null
+++ b/cortex-cpp/cortex-common/cortexpythoni.h
@@ -0,0 +1,22 @@
+#pragma once
+
+#include
+#include
+
+#include "json/value.h"
+
+class CortexPythonEngineI {
+ public:
+  virtual ~CortexPythonEngineI() {}
+
+  virtual bool IsSupported(const std::string& f) = 0;
+
+  virtual void ExecutePythonFile(std::string binary_execute_path,
+                                 std::string file_execution_path,
+                                 std::string python_library_path) = 0;
+
+  virtual void HandlePythonFileExecutionRequest(
+      std::shared_ptr<Json::Value> json_body,
+      std::function<void(Json::Value&&, Json::Value&&)>&& callback) = 0;
+};
+
diff --git a/cortex-cpp/engines/cortex.python/engine.cmake b/cortex-cpp/engines/cortex.python/engine.cmake
new file mode 100644
index 000000000..2a6a6ec0d
--- /dev/null
+++ b/cortex-cpp/engines/cortex.python/engine.cmake
@@ -0,0 +1,38 @@
+# cortex.python release version
+set(VERSION 0.1.1)
+set(ENGINE_VERSION v${VERSION})
+set(ENGINE_NAME cortex.python)
+
+# MESSAGE("ENGINE_VERSION=" ${ENGINE_VERSION})
+
+# Download library based on instructions
+if(UNIX AND NOT APPLE)
+  set(LIBRARY_NAME ${ENGINE_NAME}-${VERSION}-linux-amd64.tar.gz)
+elseif(UNIX)
+  if(MAC_ARM64)
+    set(LIBRARY_NAME ${ENGINE_NAME}-${VERSION}-mac-arm64.tar.gz)
+  else()
+    set(LIBRARY_NAME ${ENGINE_NAME}-${VERSION}-mac-amd64.tar.gz)
+  endif()
+else()
+  set(LIBRARY_NAME ${ENGINE_NAME}-${VERSION}-windows-amd64.tar.gz)
+endif()
+
+
+set(LIBPYTHONRUNTIME_ENGINE_URL https://github.com/janhq/cortex.python/releases/download/${ENGINE_VERSION}/${LIBRARY_NAME})
+MESSAGE("LIBPYTHONRUNTIME_ENGINE_URL=" ${LIBPYTHONRUNTIME_ENGINE_URL})
+MESSAGE("LIBRARY_NAME=" ${LIBRARY_NAME})
+set(LIBPYTHONRUNTIME_ENGINE_PATH ${CMAKE_BINARY_DIR}/engines/${LIBRARY_NAME})
+
+# MESSAGE("CMAKE_BINARY_DIR = " ${CMAKE_BINARY_DIR})
+
+file(DOWNLOAD ${LIBPYTHONRUNTIME_ENGINE_URL} ${LIBPYTHONRUNTIME_ENGINE_PATH} STATUS LIBPYTHONRUNTIME_ENGINE_DOWNLOAD_STATUS)
+list(GET LIBPYTHONRUNTIME_ENGINE_DOWNLOAD_STATUS 0 LIBPYTHONRUNTIME_ENGINE_DOWNLOAD_STATUS_NO)
+# MESSAGE("file = " ${CMAKE_BINARY_DIR}/engines/${LIBRARY_NAME})
+
+if(LIBPYTHONRUNTIME_ENGINE_DOWNLOAD_STATUS_NO)
+  message(STATUS "Pre-built library not downloaded. (${LIBPYTHONRUNTIME_ENGINE_DOWNLOAD_STATUS})")
+else()
+  message(STATUS "Linking downloaded pre-built library.")
+  file(ARCHIVE_EXTRACT INPUT ${CMAKE_BINARY_DIR}/engines/${LIBRARY_NAME} DESTINATION ${CMAKE_BINARY_DIR}/engines/)
+endif()
\ No newline at end of file
diff --git a/cortex-cpp/main.cc b/cortex-cpp/main.cc
index 53c65cd37..6b31e7d6a 100644
--- a/cortex-cpp/main.cc
+++ b/cortex-cpp/main.cc
@@ -3,6 +3,8 @@
 #include  // for PATH_MAX
 #include
 #include "utils/cortex_utils.h"
+#include "utils/dylib.h"
+#include "cortex-common/cortexpythoni.h"
 
 #if defined(__APPLE__) && defined(__MACH__)
 #include  // for dirname()
@@ -18,6 +20,25 @@
 #endif
 
 int main(int argc, char* argv[]) {
+  // Check if this process is for python execution
+  if (argc > 1) {
+    if (strcmp(argv[1], "--run_python_file") == 0) {
+      std::string py_home_path = (argc > 3) ? argv[3] : "";
+      std::unique_ptr<dylib> dl;
+      try {
+        dl = std::make_unique<dylib>(cortex_utils::kPythonRuntimeLibPath, "engine");
+      } catch (const dylib::load_error& e) {
+        LOG_ERROR << "Could not load engine: " << e.what();
+        return 1;
+      }
+
+      auto func = dl->get_function<CortexPythonEngineI*()>("get_engine");
+      auto e = func();
+      e->ExecutePythonFile(argv[0], argv[2], py_home_path);
+      return 0;
+    }
+  }
+
   int thread_num = 1;
   std::string host = "127.0.0.1";
   int port = 3928;
diff --git a/cortex-cpp/utils/cortex_utils.h b/cortex-cpp/utils/cortex_utils.h
index 2790e2d38..e6c202598 100644
--- a/cortex-cpp/utils/cortex_utils.h
+++ b/cortex-cpp/utils/cortex_utils.h
@@ -19,6 +19,8 @@
 #endif
 
 namespace cortex_utils {
+constexpr static auto kLlamaLibPath = "./engines/cortex.llamacpp";
+constexpr static auto kPythonRuntimeLibPath = "./engines/cortex.python";
 inline std::string models_folder = "./models";
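
The server refactor in this patch replaces the single dylib_/engine_ pair with a per-request registry keyed by the "engine" field of the request body. The following standalone sketch (not part of the patch; FakeLlamaEngine and its hard-coded registration are illustrative stand-ins for what the dylib's exported get_engine() returns) shows how the std::variant plus std::get dispatch in server.cc works once an engine is present in the map:

#include <iostream>
#include <string>
#include <unordered_map>
#include <variant>

// Stand-ins for the real interfaces declared in EngineI.h / cortexpythoni.h.
struct EngineI {
  virtual ~EngineI() = default;
  virtual void LoadModel() = 0;
};
struct CortexPythonEngineI {
  virtual ~CortexPythonEngineI() = default;
  virtual void ExecutePythonFile() = 0;
};

struct FakeLlamaEngine : EngineI {
  void LoadModel() override { std::cout << "llama: load model\n"; }
};

int main() {
  // Mirrors server.h: one EngineInfo per engine name, the engine held in a variant.
  using EngineV = std::variant<EngineI*, CortexPythonEngineI*>;
  struct EngineInfo {
    // The real struct also owns a std::unique_ptr<dylib>; omitted here.
    EngineV engine;
  };
  std::unordered_map<std::string, EngineInfo> engines;

  // In the patch this pointer comes from the shared library's get_engine().
  FakeLlamaEngine llama;
  engines["cortex.llamacpp"].engine = static_cast<EngineI*>(&llama);

  // IsEngineLoaded() equivalent: presence in the map.
  std::string engine_type = "cortex.llamacpp";
  if (engines.find(engine_type) == engines.end()) {
    std::cout << "Engine is not loaded yet\n";
    return 1;
  }

  // Dispatch the way server.cc does: std::get<EngineI*> for the llama routes,
  // std::get<CortexPythonEngineI*> for the fine-tuning route.
  std::get<EngineI*>(engines[engine_type].engine)->LoadModel();
  return 0;
}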
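On the engine side, server.cc and main.cc assume the downloaded cortex.python library exports a get_engine symbol that dylib can resolve and call to obtain a CortexPythonEngineI*. The real implementation lives in the cortex.python repository and is not part of this patch; the class below is only a placeholder sketch of that contract, with no actual Python execution:

// Placeholder engine illustrating the exported-factory contract assumed by
// dl->get_function<CortexPythonEngineI*()>("get_engine"). Not the real engine.
#include <functional>
#include <memory>
#include <string>

#include "cortex-common/cortexpythoni.h"
#include "json/value.h"

class PlaceholderPythonEngine : public CortexPythonEngineI {
 public:
  bool IsSupported(const std::string& f) override {
    return f == "HandlePythonFileExecutionRequest";
  }

  void ExecutePythonFile(std::string binary_execute_path,
                         std::string file_execution_path,
                         std::string python_library_path) override {
    // Real engine: start the embedded Python runtime and run the file.
  }

  void HandlePythonFileExecutionRequest(
      std::shared_ptr<Json::Value> json_body,
      std::function<void(Json::Value&&, Json::Value&&)>&& callback) override {
    Json::Value status;
    status["status_code"] = 200;
    Json::Value res;
    res["message"] = "placeholder: no execution performed";
    callback(std::move(status), std::move(res));
  }
};

// Exported with C linkage so the loader can resolve the unmangled symbol name.
extern "C" CortexPythonEngineI* get_engine() {
  static PlaceholderPythonEngine engine;
  return &engine;
}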
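The TODO(sang) note in LoadModel points out that cortex.llamacpp and cortex.tensorrt-llm cannot be resident at the same time and calls for an unload-engine mechanism. One possible shape for that handler is sketched below; it is not part of this patch, the UnloadEngine name is hypothetical, and it only reuses names that already exist in the refactored server:

// Hypothetical controller method mirroring the structure of the handlers above.
// Erasing the EngineInfo destroys its std::unique_ptr<dylib>, releasing the
// shared library (assuming the engine holds no loaded models at that point).
void server::UnloadEngine(
    const HttpRequestPtr& req,
    std::function<void(const HttpResponsePtr&)>&& callback) {
  auto engine_type =
      (*(req->getJsonObject())).get("engine", kLlamaEngine).asString();
  Json::Value res;
  if (!IsEngineLoaded(engine_type)) {
    res["message"] = "Engine is not loaded yet";
  } else {
    engines_.erase(engine_type);
    res["message"] = "Unloaded engine " + engine_type;
  }
  auto resp = cortex_utils::nitroHttpJsonResponse(res);
  callback(resp);
}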