From c08043f4a7e5a5679911eda4c2358ac72a4621ba Mon Sep 17 00:00:00 2001
From: vansangpfiev
Date: Mon, 13 May 2024 10:36:11 +0700
Subject: [PATCH] feat: engines endpoint and cortex.python

---
 .../e2e-test-python-runtime-linux-and-mac.sh  |   0
 .../e2e-test-python-runtime-windows.bat       |   0
 cortex-cpp/CMakeLists.txt                     |   1 +
 cortex-cpp/common/base.h                      |  11 +-
 cortex-cpp/controllers/server.cc              | 187 ++++++++++++------
 cortex-cpp/controllers/server.h               |  30 ++-
 cortex-cpp/cortex-common/EngineI.h            |  11 +-
 cortex-cpp/cortex-common/cortexpythoni.h      |  22 +++
 cortex-cpp/engines/cortex.python/engine.cmake |  38 ++++
 cortex-cpp/main.cc                            |  21 ++
 cortex-cpp/utils/cortex_utils.h               |   2 +
 11 files changed, 253 insertions(+), 70 deletions(-)
 create mode 100644 .github/scripts/e2e-test-python-runtime-linux-and-mac.sh
 create mode 100644 .github/scripts/e2e-test-python-runtime-windows.bat
 create mode 100644 cortex-cpp/cortex-common/cortexpythoni.h
 create mode 100644 cortex-cpp/engines/cortex.python/engine.cmake

diff --git a/.github/scripts/e2e-test-python-runtime-linux-and-mac.sh b/.github/scripts/e2e-test-python-runtime-linux-and-mac.sh
new file mode 100644
index 000000000..e69de29bb
diff --git a/.github/scripts/e2e-test-python-runtime-windows.bat b/.github/scripts/e2e-test-python-runtime-windows.bat
new file mode 100644
index 000000000..e69de29bb
diff --git a/cortex-cpp/CMakeLists.txt b/cortex-cpp/CMakeLists.txt
index 8c01d2256..ae9c31ad2 100644
--- a/cortex-cpp/CMakeLists.txt
+++ b/cortex-cpp/CMakeLists.txt
@@ -2,6 +2,7 @@ cmake_minimum_required(VERSION 3.5)
 project(cortex-cpp C CXX)
 
 include(engines/cortex.llamacpp/engine.cmake)
+include(engines/cortex.python/engine.cmake)
 include(CheckIncludeFileCXX)
 
 check_include_file_cxx(any HAS_ANY)
diff --git a/cortex-cpp/common/base.h b/cortex-cpp/common/base.h
index 43d612c1b..3156c54ae 100644
--- a/cortex-cpp/common/base.h
+++ b/cortex-cpp/common/base.h
@@ -8,14 +8,21 @@ class BaseModel {
   virtual ~BaseModel() {}
 
   // Model management
-  virtual void LoadModel(const HttpRequestPtr& req,
-                         std::function<void(const HttpResponsePtr&)>&& callback) = 0;
+  virtual void LoadModel(
+      const HttpRequestPtr& req,
+      std::function<void(const HttpResponsePtr&)>&& callback) = 0;
   virtual void UnloadModel(
       const HttpRequestPtr& req,
       std::function<void(const HttpResponsePtr&)>&& callback) = 0;
   virtual void ModelStatus(
       const HttpRequestPtr& req,
       std::function<void(const HttpResponsePtr&)>&& callback) = 0;
+  virtual void GetEngines(
+      const HttpRequestPtr& req,
+      std::function<void(const HttpResponsePtr&)>&& callback) = 0;
+  virtual void FineTuning(
+      const HttpRequestPtr& req,
+      std::function<void(const HttpResponsePtr&)>&& callback) = 0;
 };
 
 class BaseChatCompletion {
diff --git a/cortex-cpp/controllers/server.cc b/cortex-cpp/controllers/server.cc
index af8a91df8..eff9eebd1 100644
--- a/cortex-cpp/controllers/server.cc
+++ b/cortex-cpp/controllers/server.cc
@@ -5,32 +5,33 @@
 #include
 
 #include "trantor/utils/Logger.h"
-#include "utils/logging_utils.h"
+#include "utils/cortex_utils.h"
+#include "utils/logging_utils.h"
 
 using namespace inferences;
 using json = nlohmann::json;
 
 namespace inferences {
 namespace {
 constexpr static auto kLlamaEngine = "cortex.llamacpp";
-constexpr static auto kLlamaLibPath = "./engines/cortex.llamacpp";
+constexpr static auto kPythonRuntimeEngine = "cortex.python";
 }  // namespace
 
-server::server()
-    : engine_{nullptr} {
+server::server(){
 
-    // Some default values for now below
-    // log_disable();  // Disable the log to file feature, reduce bloat for
-    // target
-    // system ()
-  };
+  // Some default values for now below
+  // log_disable();  // Disable the log to file feature, reduce bloat for
+  // target
+  // system ()
+};
 
 server::~server() {}
 
 void server::ChatCompletion(
     const HttpRequestPtr& req,
     std::function<void(const HttpResponsePtr&)>&& callback) {
-  if (!IsEngineLoaded()) {
+  auto engine_type =
+      (*(req->getJsonObject())).get("engine", kLlamaEngine).asString();
+  if (!IsEngineLoaded(engine_type)) {
     Json::Value res;
     res["message"] = "Engine is not loaded yet";
     auto resp = cortex_utils::nitroHttpJsonResponse(res);
@@ -44,10 +45,10 @@ void server::ChatCompletion(
   auto json_body = req->getJsonObject();
   bool is_stream = (*json_body).get("stream", false).asBool();
   auto q = std::make_shared<SyncQueue>();
-  engine_->HandleChatCompletion(json_body,
-                                [q](Json::Value status, Json::Value res) {
-                                  q->push(std::make_pair(status, res));
-                                });
+  std::get<EngineI*>(engines_[engine_type].engine)->HandleChatCompletion(
+      json_body, [q](Json::Value status, Json::Value res) {
+        q->push(std::make_pair(status, res));
+      });
   LOG_TRACE << "Wait to chat completion responses";
   if (is_stream) {
     ProcessStreamRes(std::move(callback), q);
@@ -57,10 +58,11 @@ void server::ChatCompletion(
   LOG_TRACE << "Done chat completion";
 }
 
-void server::Embedding(
-    const HttpRequestPtr& req,
-    std::function<void(const HttpResponsePtr&)>&& callback) {
-  if (!IsEngineLoaded()) {
+void server::Embedding(const HttpRequestPtr& req,
+                       std::function<void(const HttpResponsePtr&)>&& callback) {
+  auto engine_type =
+      (*(req->getJsonObject())).get("engine", kLlamaEngine).asString();
+  if (!IsEngineLoaded(engine_type)) {
     Json::Value res;
     res["message"] = "Engine is not loaded yet";
     auto resp = cortex_utils::nitroHttpJsonResponse(res);
@@ -72,10 +74,10 @@ void server::Embedding(
 
   LOG_TRACE << "Start embedding";
   SyncQueue q;
-  engine_->HandleEmbedding(req->getJsonObject(),
-                           [&q](Json::Value status, Json::Value res) {
-                             q.push(std::make_pair(status, res));
-                           });
+  std::get<EngineI*>(engines_[engine_type].engine)->HandleEmbedding(
+      req->getJsonObject(), [&q](Json::Value status, Json::Value res) {
+        q.push(std::make_pair(status, res));
+      });
   LOG_TRACE << "Wait to embedding";
   ProcessNonStreamRes(std::move(callback), q);
   LOG_TRACE << "Done embedding";
@@ -84,7 +86,9 @@ void server::Embedding(
 void server::UnloadModel(
     const HttpRequestPtr& req,
     std::function<void(const HttpResponsePtr&)>&& callback) {
-  if (!IsEngineLoaded()) {
+  auto engine_type =
+      (*(req->getJsonObject())).get("engine", kLlamaEngine).asString();
+  if (!IsEngineLoaded(engine_type)) {
     Json::Value res;
     res["message"] = "Engine is not loaded yet";
     auto resp = cortex_utils::nitroHttpJsonResponse(res);
@@ -94,7 +98,7 @@ void server::UnloadModel(
     return;
   }
   LOG_TRACE << "Start unload model";
-  engine_->UnloadModel(
+  std::get<EngineI*>(engines_[engine_type].engine)->UnloadModel(
       req->getJsonObject(),
       [cb = std::move(callback)](Json::Value status, Json::Value res) {
         auto resp = cortex_utils::nitroHttpJsonResponse(res);
@@ -108,7 +112,9 @@ void server::UnloadModel(
 void server::ModelStatus(
     const HttpRequestPtr& req,
     std::function<void(const HttpResponsePtr&)>&& callback) {
-  if (!IsEngineLoaded()) {
+  auto engine_type =
+      (*(req->getJsonObject())).get("engine", kLlamaEngine).asString();
+  if (!IsEngineLoaded(engine_type)) {
     Json::Value res;
     res["message"] = "Engine is not loaded yet";
     auto resp = cortex_utils::nitroHttpJsonResponse(res);
@@ -119,7 +125,7 @@ void server::ModelStatus(
   }
 
   LOG_TRACE << "Start to get model status";
-  engine_->GetModelStatus(
+  std::get<EngineI*>(engines_[engine_type].engine)->GetModelStatus(
       req->getJsonObject(),
       [cb = std::move(callback)](Json::Value status, Json::Value res) {
         auto resp = cortex_utils::nitroHttpJsonResponse(res);
@@ -130,57 +136,126 @@ void server::ModelStatus(
   LOG_TRACE << "Done get model status";
 }
 
-void server::LoadModel(
+void server::GetEngines(
+    const HttpRequestPtr& req,
+    std::function<void(const HttpResponsePtr&)>&& callback) {
+  Json::Value res;
+  Json::Value engine_array(Json::arrayValue);
+  for (const auto& [s, _] : engines_) {
+    Json::Value val;
+    val["id"] = s;
+    val["object"] = "engine";
+    engine_array.append(val);
+  }
+
+  res["object"] = "list";
+  res["data"] = engine_array;
+
+  auto resp = cortex_utils::nitroHttpJsonResponse(res);
+  callback(resp);
+}
+
+void server::FineTuning(
     const HttpRequestPtr& req,
     std::function<void(const HttpResponsePtr&)>&& callback) {
+  auto engine_type =
+      (*(req->getJsonObject())).get("engine", kPythonRuntimeEngine).asString();
+
+  if (engines_.find(engine_type) == engines_.end()) {
+    try {
+      engines_[engine_type].dl = std::make_unique<dylib>(
+          cortex_utils::kPythonRuntimeLibPath, "engine");
+    } catch (const dylib::load_error& e) {
+      LOG_ERROR << "Could not load engine: " << e.what();
+      engines_.erase(engine_type);
+
+      Json::Value res;
+      res["message"] = "Could not load engine " + engine_type;
+      auto resp = cortex_utils::nitroHttpJsonResponse(res);
+      resp->setStatusCode(k500InternalServerError);
+      callback(resp);
+      return;
+    }
+
+    auto func =
+        engines_[engine_type].dl->get_function<CortexPythonEngineI*()>("get_engine");
+    engines_[engine_type].engine = func();
+    LOG_INFO << "Loaded engine: " << engine_type;
+  }
+
+  LOG_TRACE << "Start fine-tuning";
+  auto& en = std::get<CortexPythonEngineI*>(engines_[engine_type].engine);
+  if (en->IsSupported("HandlePythonFileExecutionRequest")) {
+    en->HandlePythonFileExecutionRequest(
+        req->getJsonObject(),
+        [cb = std::move(callback)](Json::Value status, Json::Value res) {
+          auto resp = cortex_utils::nitroHttpJsonResponse(res);
+          resp->setStatusCode(static_cast<drogon::HttpStatusCode>(
+              status["status_code"].asInt()));
+          cb(resp);
+        });
+  } else {
+    Json::Value res;
+    res["message"] = "Method is not supported yet";
+    auto resp = cortex_utils::nitroHttpJsonResponse(res);
+    resp->setStatusCode(k500InternalServerError);
+    callback(resp);
+    LOG_WARN << "Method is not supported yet";
+  }
+  LOG_TRACE << "Done fine-tuning";
+}
+
+void server::LoadModel(const HttpRequestPtr& req,
+                       std::function<void(const HttpResponsePtr&)>&& callback) {
   auto engine_type =
       (*(req->getJsonObject())).get("engine", kLlamaEngine).asString();
-  if (!dylib_ || engine_type != cur_engine_name_) {
-    cur_engine_name_ = engine_type;
-    // TODO: change this when we get more engines
+
+  // We have not loaded the engine yet; load it before using it.
+  if (engines_.find(engine_type) == engines_.end()) {
+    // TODO(sang) we cannot run cortex.llamacpp and cortex.tensorrt-llm at the
+    // same time, so we need an unload-engine mechanism to handle this.
     auto get_engine_path = [](std::string_view e) {
       if (e == kLlamaEngine) {
-        return kLlamaLibPath;
+        return cortex_utils::kLlamaLibPath;
       }
-      return kLlamaLibPath;
+      return cortex_utils::kLlamaLibPath;
     };
     try {
-      dylib_ =
-          std::make_unique<dylib>(get_engine_path(cur_engine_name_), "engine");
+      engines_[engine_type].dl =
+          std::make_unique<dylib>(get_engine_path(engine_type), "engine");
     } catch (const dylib::load_error& e) {
       LOG_ERROR << "Could not load engine: " << e.what();
-      dylib_.reset();
-      engine_ = nullptr;
-    }
+      engines_.erase(engine_type);
 
-    if (!dylib_) {
       Json::Value res;
-      res["message"] = "Could not load engine " + cur_engine_name_;
+      res["message"] = "Could not load engine " + engine_type;
       auto resp = cortex_utils::nitroHttpJsonResponse(res);
       resp->setStatusCode(k500InternalServerError);
       callback(resp);
       return;
     }
-    auto func = dylib_->get_function<EngineI*()>("get_engine");
-    engine_ = func();
-    LOG_INFO << "Loaded engine: " << cur_engine_name_;
+
+    auto func =
+        engines_[engine_type].dl->get_function<EngineI*()>("get_engine");
+    engines_[engine_type].engine = func();
+    LOG_INFO << "Loaded engine: " << engine_type;
   }
 
   LOG_TRACE << "Load model";
-  engine_->LoadModel(
-      req->getJsonObject(),
-      [cb = std::move(callback)](Json::Value status, Json::Value res) {
-        auto resp = cortex_utils::nitroHttpJsonResponse(res);
-        resp->setStatusCode(
-            static_cast<drogon::HttpStatusCode>(status["status_code"].asInt()));
-        cb(resp);
-      });
+  auto& en = std::get<EngineI*>(engines_[engine_type].engine);
+  en->LoadModel(req->getJsonObject(), [cb = std::move(callback)](
+                                          Json::Value status, Json::Value res) {
+    auto resp = cortex_utils::nitroHttpJsonResponse(res);
+    resp->setStatusCode(
+        static_cast<drogon::HttpStatusCode>(status["status_code"].asInt()));
+    cb(resp);
+  });
   LOG_TRACE << "Done load model";
 }
 
 void server::ProcessStreamRes(std::function<void(const HttpResponsePtr&)> cb,
-                              std::shared_ptr<SyncQueue> q) {
+                              std::shared_ptr<SyncQueue> q) {
   auto err_or_done = std::make_shared<std::atomic_bool>(false);
   auto chunked_content_provider =
       [q, err_or_done](char* buf, std::size_t buf_size) -> std::size_t {
@@ -209,12 +284,12 @@ void server::ProcessStreamRes(std::function<void(const HttpResponsePtr&)> cb,
   };
 
   auto resp = cortex_utils::nitroStreamResponse(chunked_content_provider,
-                                                "chat_completions.txt");
+                                                "chat_completions.txt");
   cb(resp);
 }
 
-void server::ProcessNonStreamRes(
-    std::function<void(const HttpResponsePtr&)> cb, SyncQueue& q) {
+void server::ProcessNonStreamRes(std::function<void(const HttpResponsePtr&)> cb,
+                                 SyncQueue& q) {
   auto [status, res] = q.wait_and_pop();
   auto resp = cortex_utils::nitroHttpJsonResponse(res);
   resp->setStatusCode(
@@ -222,8 +297,8 @@ void server::ProcessNonStreamRes(
   cb(resp);
 }
 
-bool server::IsEngineLoaded() {
-  return !!engine_;
+bool server::IsEngineLoaded(const std::string& e) {
+  return engines_.find(e) != engines_.end();
 }
 
 }  // namespace inferences
\ No newline at end of file
diff --git a/cortex-cpp/controllers/server.h b/cortex-cpp/controllers/server.h
index d8d6fa004..29be2497e 100644
--- a/cortex-cpp/controllers/server.h
+++ b/cortex-cpp/controllers/server.h
@@ -14,9 +14,11 @@
 #include
 #include
 #include
+#include
 
 #include "common/base.h"
 #include "cortex-common/EngineI.h"
+#include "cortex-common/cortexpythoni.h"
 #include "trantor/utils/SerialTaskQueue.h"
 #include "utils/dylib.h"
 #include "utils/json.hpp"
@@ -31,9 +33,9 @@
 using namespace drogon;
 
 namespace inferences {
 class server : public drogon::HttpController<server>,
-                public BaseModel,
-                public BaseChatCompletion,
-                public BaseEmbedding {
+               public BaseModel,
+               public BaseChatCompletion,
+               public BaseEmbedding {
   struct SyncQueue;
 
  public:
@@ -46,9 +48,14 @@ class server : public drogon::HttpController<server>,
   METHOD_ADD(server::LoadModel, "loadmodel", Post);
   METHOD_ADD(server::UnloadModel, "unloadmodel", Post);
   METHOD_ADD(server::ModelStatus, "modelstatus", Post);
+  METHOD_ADD(server::GetEngines, "engines", Get);
+
+  // cortex.python API
+  METHOD_ADD(server::FineTuning, "finetuning", Post);
 
   // Openai compatible path
   ADD_METHOD_TO(server::ChatCompletion, "/v1/chat/completions", Post);
+  ADD_METHOD_TO(server::FineTuning, "/v1/fine_tuning/job", Post);
 
   // ADD_METHOD_TO(server::handlePrelight, "/v1/chat/completions", Options);
   // NOTE: prelight will be added back when browser support is properly planned
@@ -72,13 +79,19 @@ class server : public drogon::HttpController<server>,
   void ModelStatus(
       const HttpRequestPtr& req,
       std::function<void(const HttpResponsePtr&)>&& callback) override;
+  void GetEngines(
+      const HttpRequestPtr& req,
+      std::function<void(const HttpResponsePtr&)>&& callback) override;
+
+  void FineTuning(const HttpRequestPtr& req,
+                  std::function<void(const HttpResponsePtr&)>&& callback) override;
 
  private:
   void ProcessStreamRes(std::function<void(const HttpResponsePtr&)> cb,
                         std::shared_ptr<SyncQueue> q);
   void ProcessNonStreamRes(std::function<void(const HttpResponsePtr&)> cb,
                            SyncQueue& q);
-  bool IsEngineLoaded();
+  bool IsEngineLoaded(const std::string& e);
 
  private:
   struct SyncQueue {
@@ -120,8 +133,11 @@ class server : public drogon::HttpController<server>,
   };
 
  private:
-  std::unique_ptr<dylib> dylib_;
-  EngineI* engine_;
-  std::string cur_engine_name_;
+  using EngineV = std::variant<EngineI*, CortexPythonEngineI*>;
+  struct EngineInfo {
+    std::unique_ptr<dylib> dl;
+    EngineV engine;
+  };
+  std::unordered_map<std::string, EngineInfo> engines_;
 };
 };  // namespace inferences
\ No newline at end of file
diff --git a/cortex-cpp/cortex-common/EngineI.h b/cortex-cpp/cortex-common/EngineI.h
index b8770b230..53956624b 100644
--- a/cortex-cpp/cortex-common/EngineI.h
+++ b/cortex-cpp/cortex-common/EngineI.h
@@ -9,19 +9,20 @@ class EngineI {
  public:
   virtual ~EngineI() {}
 
+  // cortex.llamacpp interface
   virtual void HandleChatCompletion(
-      std::shared_ptr<Json::Value> jsonBody,
+      std::shared_ptr<Json::Value> json_body,
       std::function<void(Json::Value&&, Json::Value&&)>&& callback) = 0;
   virtual void HandleEmbedding(
-      std::shared_ptr<Json::Value> jsonBody,
+      std::shared_ptr<Json::Value> json_body,
       std::function<void(Json::Value&&, Json::Value&&)>&& callback) = 0;
   virtual void LoadModel(
-      std::shared_ptr<Json::Value> jsonBody,
+      std::shared_ptr<Json::Value> json_body,
      std::function<void(Json::Value&&, Json::Value&&)>&& callback) = 0;
   virtual void UnloadModel(
-      std::shared_ptr<Json::Value> jsonBody,
+      std::shared_ptr<Json::Value> json_body,
       std::function<void(Json::Value&&, Json::Value&&)>&& callback) = 0;
   virtual void GetModelStatus(
-      std::shared_ptr<Json::Value> jsonBody,
+      std::shared_ptr<Json::Value> json_body,
       std::function<void(Json::Value&&, Json::Value&&)>&& callback) = 0;
 };
diff --git a/cortex-cpp/cortex-common/cortexpythoni.h b/cortex-cpp/cortex-common/cortexpythoni.h
new file mode 100644
index 000000000..06a79838f
--- /dev/null
+++ b/cortex-cpp/cortex-common/cortexpythoni.h
@@ -0,0 +1,22 @@
+#pragma once
+
+#include
+#include
+
+#include "json/value.h"
+
+class CortexPythonEngineI {
+ public:
+  virtual ~CortexPythonEngineI() {}
+
+  virtual bool IsSupported(const std::string& f) = 0;
+
+  virtual void ExecutePythonFile(std::string binary_execute_path,
+                                 std::string file_execution_path,
+                                 std::string python_library_path) = 0;
+
+  virtual void HandlePythonFileExecutionRequest(
+      std::shared_ptr<Json::Value> json_body,
+      std::function<void(Json::Value&&, Json::Value&&)>&& callback) = 0;
+};
+
diff --git a/cortex-cpp/engines/cortex.python/engine.cmake b/cortex-cpp/engines/cortex.python/engine.cmake
new file mode 100644
index 000000000..2a6a6ec0d
--- /dev/null
+++ b/cortex-cpp/engines/cortex.python/engine.cmake
@@ -0,0 +1,38 @@
+# cortex.python release version
+set(VERSION 0.1.1)
+set(ENGINE_VERSION v${VERSION})
+set(ENGINE_NAME cortex.python)
+
+# MESSAGE("ENGINE_VERSION=" ${ENGINE_VERSION})
+
+# Download library based on instructions
+if(UNIX AND NOT APPLE)
+  set(LIBRARY_NAME ${ENGINE_NAME}-${VERSION}-linux-amd64.tar.gz)
+elseif(UNIX)
+  if(MAC_ARM64)
+    set(LIBRARY_NAME ${ENGINE_NAME}-${VERSION}-mac-arm64.tar.gz)
+  else()
+    set(LIBRARY_NAME ${ENGINE_NAME}-${VERSION}-mac-amd64.tar.gz)
+  endif()
+else()
+  set(LIBRARY_NAME ${ENGINE_NAME}-${VERSION}-windows-amd64.tar.gz)
+endif()
+
+
+set(LIBPYTHONRUNTIME_ENGINE_URL https://github.com/janhq/cortex.python/releases/download/${ENGINE_VERSION}/${LIBRARY_NAME})
+MESSAGE("LIBPYTHONRUNTIME_ENGINE_URL=" ${LIBPYTHONRUNTIME_ENGINE_URL})
+MESSAGE("LIBRARY_NAME=" ${LIBRARY_NAME})
+set(LIBPYTHONRUNTIME_ENGINE_PATH ${CMAKE_BINARY_DIR}/engines/${LIBRARY_NAME})
+
+# MESSAGE("CMAKE_BINARY_DIR = " ${CMAKE_BINARY_DIR})
+
+file(DOWNLOAD ${LIBPYTHONRUNTIME_ENGINE_URL} ${LIBPYTHONRUNTIME_ENGINE_PATH} STATUS LIBPYTHONRUNTIME_ENGINE_DOWNLOAD_STATUS)
+list(GET LIBPYTHONRUNTIME_ENGINE_DOWNLOAD_STATUS 0 LIBPYTHONRUNTIME_ENGINE_DOWNLOAD_STATUS_NO)
+# MESSAGE("file = " ${CMAKE_BINARY_DIR}/engines/${LIBRARY_NAME})
+
+if(LIBPYTHONRUNTIME_ENGINE_DOWNLOAD_STATUS_NO)
+  message(STATUS "Pre-built library not downloaded. (${LIBPYTHONRUNTIME_ENGINE_DOWNLOAD_STATUS})")
+else()
+  message(STATUS "Linking downloaded pre-built library.")
+  file(ARCHIVE_EXTRACT INPUT ${CMAKE_BINARY_DIR}/engines/${LIBRARY_NAME} DESTINATION ${CMAKE_BINARY_DIR}/engines/)
+endif()
\ No newline at end of file
diff --git a/cortex-cpp/main.cc b/cortex-cpp/main.cc
index 53c65cd37..6b31e7d6a 100644
--- a/cortex-cpp/main.cc
+++ b/cortex-cpp/main.cc
@@ -3,6 +3,8 @@
 #include  // for PATH_MAX
 #include
 #include "utils/cortex_utils.h"
+#include "utils/dylib.h"
+#include "cortex-common/cortexpythoni.h"
 
 #if defined(__APPLE__) && defined(__MACH__)
 #include  // for dirname()
@@ -18,6 +20,25 @@
 #endif
 
 int main(int argc, char* argv[]) {
+  // Check if this process is for python execution
+  if (argc > 1) {
+    if (strcmp(argv[1], "--run_python_file") == 0) {
+      std::string py_home_path = (argc > 3) ? argv[3] : "";
+      std::unique_ptr<dylib> dl;
+      try {
+        dl = std::make_unique<dylib>(cortex_utils::kPythonRuntimeLibPath, "engine");
+      } catch (const dylib::load_error& e) {
+        LOG_ERROR << "Could not load engine: " << e.what();
+        return 1;
+      }
+
+      auto func = dl->get_function<CortexPythonEngineI*()>("get_engine");
+      auto e = func();
+      e->ExecutePythonFile(argv[0], argv[2], py_home_path);
+      return 0;
+    }
+  }
+
   int thread_num = 1;
   std::string host = "127.0.0.1";
   int port = 3928;
diff --git a/cortex-cpp/utils/cortex_utils.h b/cortex-cpp/utils/cortex_utils.h
index 2790e2d38..e6c202598 100644
--- a/cortex-cpp/utils/cortex_utils.h
+++ b/cortex-cpp/utils/cortex_utils.h
@@ -19,6 +19,8 @@
 #endif
 
 namespace cortex_utils {
+constexpr static auto kLlamaLibPath = "./engines/cortex.llamacpp";
+constexpr static auto kPythonRuntimeLibPath = "./engines/cortex.python";
 inline std::string models_folder = "./models";
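
The server refactor in this patch replaces the single dylib_/engine_ pair with a per-request registry keyed by the "engine" field of the request body. The following standalone sketch (not part of the patch; FakeLlamaEngine and its hard-coded registration are illustrative stand-ins for what the dylib's exported get_engine() returns) shows how the std::variant plus std::get dispatch in server.cc works once an engine is present in the map:

#include <iostream>
#include <string>
#include <unordered_map>
#include <variant>

// Stand-ins for the real interfaces declared in EngineI.h / cortexpythoni.h.
struct EngineI {
  virtual ~EngineI() = default;
  virtual void LoadModel() = 0;
};
struct CortexPythonEngineI {
  virtual ~CortexPythonEngineI() = default;
  virtual void ExecutePythonFile() = 0;
};

struct FakeLlamaEngine : EngineI {
  void LoadModel() override { std::cout << "llama: load model\n"; }
};

int main() {
  // Mirrors server.h: one EngineInfo per engine name, the engine held in a variant.
  using EngineV = std::variant<EngineI*, CortexPythonEngineI*>;
  struct EngineInfo {
    // The real struct also owns a std::unique_ptr<dylib>; omitted here.
    EngineV engine;
  };
  std::unordered_map<std::string, EngineInfo> engines;

  // In the patch this pointer comes from the shared library's get_engine().
  FakeLlamaEngine llama;
  engines["cortex.llamacpp"].engine = static_cast<EngineI*>(&llama);

  // IsEngineLoaded() equivalent: presence in the map.
  std::string engine_type = "cortex.llamacpp";
  if (engines.find(engine_type) == engines.end()) {
    std::cout << "Engine is not loaded yet\n";
    return 1;
  }

  // Dispatch the way server.cc does: std::get<EngineI*> for the llama routes,
  // std::get<CortexPythonEngineI*> for the fine-tuning route.
  std::get<EngineI*>(engines[engine_type].engine)->LoadModel();
  return 0;
}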
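On the engine side, server.cc and main.cc assume the downloaded cortex.python library exports a get_engine symbol that dylib can resolve and call to obtain a CortexPythonEngineI*. The real implementation lives in the cortex.python repository and is not part of this patch; the class below is only a placeholder sketch of that contract, with no actual Python execution:

// Placeholder engine illustrating the exported-factory contract assumed by
// dl->get_function<CortexPythonEngineI*()>("get_engine"). Not the real engine.
#include <functional>
#include <memory>
#include <string>

#include "cortex-common/cortexpythoni.h"
#include "json/value.h"

class PlaceholderPythonEngine : public CortexPythonEngineI {
 public:
  bool IsSupported(const std::string& f) override {
    return f == "HandlePythonFileExecutionRequest";
  }

  void ExecutePythonFile(std::string binary_execute_path,
                         std::string file_execution_path,
                         std::string python_library_path) override {
    // Real engine: start the embedded Python runtime and run the file.
  }

  void HandlePythonFileExecutionRequest(
      std::shared_ptr<Json::Value> json_body,
      std::function<void(Json::Value&&, Json::Value&&)>&& callback) override {
    Json::Value status;
    status["status_code"] = 200;
    Json::Value res;
    res["message"] = "placeholder: no execution performed";
    callback(std::move(status), std::move(res));
  }
};

// Exported with C linkage so the loader can resolve the unmangled symbol name.
extern "C" CortexPythonEngineI* get_engine() {
  static PlaceholderPythonEngine engine;
  return &engine;
}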
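The TODO(sang) note in LoadModel points out that cortex.llamacpp and cortex.tensorrt-llm cannot be resident at the same time and calls for an unload-engine mechanism. One possible shape for that handler is sketched below; it is not part of this patch, the UnloadEngine name is hypothetical, and it only reuses names that already exist in the refactored server:

// Hypothetical controller method mirroring the structure of the handlers above.
// Erasing the EngineInfo destroys its std::unique_ptr<dylib>, releasing the
// shared library (assuming the engine holds no loaded models at that point).
void server::UnloadEngine(
    const HttpRequestPtr& req,
    std::function<void(const HttpResponsePtr&)>&& callback) {
  auto engine_type =
      (*(req->getJsonObject())).get("engine", kLlamaEngine).asString();
  Json::Value res;
  if (!IsEngineLoaded(engine_type)) {
    res["message"] = "Engine is not loaded yet";
  } else {
    engines_.erase(engine_type);
    res["message"] = "Unloaded engine " + engine_type;
  }
  auto resp = cortex_utils::nitroHttpJsonResponse(res);
  callback(resp);
}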