Merge pull request #68 from janhq/66-feat-loadunload-model-with-config-at-runtime

66 feat load unload model with config at runtime
tikikun authored Oct 11, 2023
2 parents 93ee354 + a7a3818 commit ac5c5be
Showing 4 changed files with 123 additions and 90 deletions.
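
With this change the server no longer loads a model from config.json at startup; instead a client POSTs the former custom_config fields (llama_model_path, ctx_len, ngl, embedding) to the new loadmodel route once the server is running. The sketch below shows one way to drive that from a Drogon client. The route prefix /inferences/llamacpp/ follows Drogon's default controller path mapping and the model path is a placeholder, so treat both as assumptions rather than values taken from this commit.

```cpp
// Hypothetical client sketch: load a model at runtime, then use the server.
// The route prefix and the model path are assumptions, not taken from the commit.
#include <drogon/drogon.h>
#include <drogon/HttpClient.h>
#include <json/json.h>
#include <iostream>

int main() {
  auto client = drogon::HttpClient::newHttpClient("http://127.0.0.1:3928");

  Json::Value body;
  body["llama_model_path"] = "/path/to/model.gguf"; // placeholder path
  body["ctx_len"] = 2048;
  body["ngl"] = 100;
  body["embedding"] = true;

  auto req = drogon::HttpRequest::newHttpJsonRequest(body);
  req->setMethod(drogon::Post);
  req->setPath("/inferences/llamacpp/loadmodel"); // assumed controller prefix

  client->sendRequest(req, [](drogon::ReqResult result,
                              const drogon::HttpResponsePtr &resp) {
    if (result == drogon::ReqResult::Ok && resp) {
      std::cout << resp->getBody() << std::endl; // e.g. {"message":"Model loaded successfully"}
    } else {
      std::cerr << "loadmodel request failed" << std::endl;
    }
    drogon::app().quit(); // stop the event loop once the reply arrives
  });

  drogon::app().run(); // the client callback runs on Drogon's event loop
  return 0;
}
```

chat_completion and embedding requests sent before this call now receive the 500 "Model is not loaded yet" response added in controllers/llamaCPP.cc.
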
8 changes: 1 addition & 7 deletions config.json
@@ -4,11 +4,5 @@
"address": "127.0.0.1",
"port": 3928
}
],
"custom_config": {
"llama_model_path": "/Users/alandao/Documents/codes/nitro.cpp_temp/models/llama2_7b_chat_uncensored.Q4_0.gguf",
"ctx_len": 2048,
"ngl": 100,
"embedding":true
}
]
}
67 changes: 67 additions & 0 deletions controllers/llamaCPP.cc
@@ -4,6 +4,7 @@
#include <chrono>
#include <cstring>
#include <drogon/HttpResponse.h>
#include <drogon/HttpTypes.h>
#include <regex>
#include <thread>

@@ -41,6 +42,15 @@ std::string create_return_json(const std::string &id, const std::string &model,
void llamaCPP::chatCompletion(
const HttpRequestPtr &req,
std::function<void(const HttpResponsePtr &)> &&callback) {
if (!model_loaded) {
Json::Value jsonResp;
jsonResp["message"] = "Model is not loaded yet";
auto resp = drogon::HttpResponse::newHttpJsonResponse(jsonResp);
resp->setStatusCode(drogon::k500InternalServerError);
callback(resp);
return;
}

const auto &jsonBody = req->getJsonObject();
std::string formatted_output =
"Below is a conversation between an AI system named ASSISTANT and USER\n";
@@ -203,6 +213,15 @@ void llamaCPP::chatCompletion(
void llamaCPP::embedding(
const HttpRequestPtr &req,
std::function<void(const HttpResponsePtr &)> &&callback) {
if (!model_loaded) {
Json::Value jsonResp;
jsonResp["message"] = "Model is not loaded yet";
auto resp = drogon::HttpResponse::newHttpJsonResponse(jsonResp);
resp->setStatusCode(drogon::k500InternalServerError);
callback(resp);
return;
}

auto lock = llama.lock();

const auto &jsonBody = req->getJsonObject();
@@ -225,3 +244,51 @@ void llamaCPP::embedding(
resp->setContentTypeString("application/json");
callback(resp);
}

void llamaCPP::loadModel(
const HttpRequestPtr &req,
std::function<void(const HttpResponsePtr &)> &&callback) {

const auto &jsonBody = req->getJsonObject();

gpt_params params;
if (jsonBody) {
params.model = (*jsonBody)["llama_model_path"].asString();
params.n_gpu_layers = (*jsonBody)["ngl"].asInt();
params.n_ctx = (*jsonBody)["ctx_len"].asInt();
params.embedding = (*jsonBody)["embedding"].asBool();
}
#ifdef GGML_USE_CUBLAS
LOG_INFO << "Setting up GGML CUBLAS PARAMS";
params.mul_mat_q = false;
#endif // GGML_USE_CUBLAS
if (params.model_alias == "unknown") {
params.model_alias = params.model;
}

llama_backend_init(params.numa);

LOG_INFO_LLAMA("build info",
{{"build", BUILD_NUMBER}, {"commit", BUILD_COMMIT}});
LOG_INFO_LLAMA("system info",
{
{"n_threads", params.n_threads},
{"total_threads", std::thread::hardware_concurrency()},
{"system_info", llama_print_system_info()},
});

// load the model
if (!llama.loadModel(params)) {
LOG_ERROR << "Error loading the model will exit the program";
Json::Value jsonResp;
jsonResp["message"] = "Model loaded failed";
auto resp = drogon::HttpResponse::newHttpJsonResponse(jsonResp);
resp->setStatusCode(drogon::k500InternalServerError);
callback(resp);
}
Json::Value jsonResp;
jsonResp["message"] = "Model loaded successfully";
model_loaded = true;
auto resp = drogon::HttpResponse::newHttpJsonResponse(jsonResp);
callback(resp);
}
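
loadModel() reads the four request fields directly, so a body that omits one of them silently falls back to jsoncpp's null conversions (empty string, 0, false). A small sketch of the same parsing with explicit defaults is shown below; the struct, helper name, and default values are illustrative assumptions, not part of this commit.

```cpp
// Hypothetical variant of the request parsing in loadModel(), using
// Json::Value::get() so that missing fields resolve to explicit defaults.
// The struct, helper name, and default values are illustrative assumptions.
#include <json/json.h>
#include <string>

struct LoadModelConfig {
  std::string model_path;
  int ctx_len = 2048;
  int ngl = 100;
  bool embedding = false;
};

LoadModelConfig parse_load_request(const Json::Value &body) {
  LoadModelConfig cfg;
  cfg.model_path = body.get("llama_model_path", "").asString();
  cfg.ctx_len = body.get("ctx_len", cfg.ctx_len).asInt();
  cfg.ngl = body.get("ngl", cfg.ngl).asInt();
  cfg.embedding = body.get("embedding", cfg.embedding).asBool();
  return cfg;
}
```

A caller could then reject the request up front when model_path comes back empty instead of passing an empty path to llama.loadModel().
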
72 changes: 39 additions & 33 deletions controllers/llamaCPP.h
@@ -1,3 +1,4 @@
#include <drogon/HttpTypes.h>
#if defined(_WIN32)
#define NOMINMAX
#endif
@@ -1311,51 +1312,56 @@ namespace inferences {
class llamaCPP : public drogon::HttpController<llamaCPP> {
public:
llamaCPP() {
gpt_params params;
auto conf = drogon::app().getCustomConfig();
params.model = conf["llama_model_path"].asString();
params.n_gpu_layers = conf["ngl"].asInt();
params.n_ctx = conf["ctx_len"].asInt();
params.embedding = conf["embedding"].asBool();
#ifdef GGML_USE_CUBLAS
LOG_INFO << "Setting up GGML CUBLAS PARAMS";
params.mul_mat_q = false;
#endif // GGML_USE_CUBLAS
if (params.model_alias == "unknown") {
params.model_alias = params.model;
}

llama_backend_init(params.numa);

LOG_INFO_LLAMA("build info",
{{"build", BUILD_NUMBER}, {"commit", BUILD_COMMIT}});
LOG_INFO_LLAMA("system info",
{
{"n_threads", params.n_threads},
{"total_threads", std::thread::hardware_concurrency()},
{"system_info", llama_print_system_info()},
});

// load the model
if (!llama.loadModel(params)) {
LOG_ERROR << "Error loading the model will exit the program";
std::terminate();
}
nitro_utils::nitro_logo();
// gpt_params params;
// auto conf = drogon::app().getCustomConfig();
// params.model = conf["llama_model_path"].asString();
// params.n_gpu_layers = conf["ngl"].asInt();
// params.n_ctx = conf["ctx_len"].asInt();
// params.embedding = conf["embedding"].asBool();
// #ifdef GGML_USE_CUBLAS
// LOG_INFO << "Setting up GGML CUBLAS PARAMS";
// params.mul_mat_q = false;
// #endif // GGML_USE_CUBLAS
// if (params.model_alias == "unknown") {
// params.model_alias = params.model;
// }
//
// llama_backend_init(params.numa);
//
// LOG_INFO_LLAMA("build info",
// {{"build", BUILD_NUMBER}, {"commit", BUILD_COMMIT}});
// LOG_INFO_LLAMA("system info",
// {
// {"n_threads", params.n_threads},
// {"total_threads",
// std::thread::hardware_concurrency()},
// {"system_info", llama_print_system_info()},
// });
//
// // load the model
// if (!llama.loadModel(params)) {
// LOG_ERROR << "Error loading the model will exit the program";
// std::terminate();
// }
// deprecate this if find no usecase
}
METHOD_LIST_BEGIN
// list path definitions here;
METHOD_ADD(llamaCPP::chatCompletion, "chat_completion");
METHOD_ADD(llamaCPP::embedding,"embedding");
METHOD_ADD(llamaCPP::chatCompletion, "chat_completion", Post);
METHOD_ADD(llamaCPP::embedding, "embedding", Post);
METHOD_ADD(llamaCPP::loadModel, "loadmodel", Post);
// PATH_ADD("/llama/chat_completion", Post);
METHOD_LIST_END
void chatCompletion(const HttpRequestPtr &req,
std::function<void(const HttpResponsePtr &)> &&callback);
void embedding(const HttpRequestPtr &req,
std::function<void(const HttpResponsePtr &)> &&callback);
void loadModel(const HttpRequestPtr &req,
std::function<void(const HttpResponsePtr &)> &&callback);

private:
llama_server_context llama;
bool model_loaded = false;
size_t sent_count = 0;
size_t sent_token_probs_index = 0;
};
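
One design note on the new model_loaded member: Drogon dispatches handlers on a pool of worker threads, so chatCompletion or embedding can run concurrently with loadModel. A possible hardening, shown below as a sketch and not part of this commit, is to make the flag atomic.

```cpp
#include <atomic>

// Alternative declaration for the flag added above (a suggestion only; the
// commit itself uses a plain bool).
std::atomic<bool> model_loaded{false};

// The request guards would then read the flag with model_loaded.load(), and
// loadModel() would publish a successful load with model_loaded.store(true).
```
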
66 changes: 16 additions & 50 deletions main.cc
@@ -1,4 +1,4 @@

#include "controllers/nitro_utils.h"
#include <climits> // for PATH_MAX
#include <drogon/drogon.h>

@@ -14,59 +14,25 @@
#error "Unsupported platform!"
#endif

int main() {
std::string configPath;
int main(int argc, char *argv[]) {

#if defined(__APPLE__) && defined(__MACH__)
char path[PATH_MAX];
uint32_t size = sizeof(path);
if (_NSGetExecutablePath(path, &size) == 0) {
path[size] = '\0'; // Null-terminate the string
char *dir = dirname(path);
configPath = std::string(dir) + "/config/config.json";
} else {
LOG_ERROR << "Failed to get binary location!";
return 1;
}
#elif defined(__linux__)
char path[PATH_MAX];
ssize_t len = readlink("/proc/self/exe", path, sizeof(path) - 1);
if (len != -1) {
path[len] = '\0';
char *dir = dirname(path);
configPath = std::string(dir) + "/config/config.json";
} else {
LOG_ERROR << "Failed to get binary location!";
return 1;
}
#elif defined(_WIN32)
char path[MAX_PATH];
char dir[MAX_PATH];
// char dir[MAX_PATH];
if(GetModuleFileNameA(NULL, path, sizeof(path))) {
char* lastBackslash = strrchr(path, '\\');
if (lastBackslash == nullptr) {
return 1;
}
lastBackslash[0] = '\0';
strcpy(dir, path);
configPath = std::string(dir) + "/config/config.json";
}
else {
LOG_ERROR << "Failed to get binary location!";
return 1;
std::string host = "127.0.0.1";
int port = 3928;

// Check for host argument
if (argc > 1) {
host = argv[1];
}
#else
LOG_ERROR << "Unsupported platform!";
return 1;
#endif

// Set HTTP listener address and port
drogon::app().loadConfigFile(configPath);
auto app_conf = drogon::app().getCustomConfig();
// Check for port argument
if (argc > 2) {
port = std::atoi(argv[2]); // Convert string argument to int
}

LOG_INFO << app_conf["llama_model_file"].asString();
// drogon::app().addListener("0.0.0.0", 8080);
nitro_utils::nitro_logo();
LOG_INFO << "Server started, listening at: " << host << ":" << port;
LOG_INFO << "Please load your model";
drogon::app().addListener(host, port);
drogon::app().run();

return 0;
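
main() now takes the listener host and port as optional positional arguments instead of locating and loading config.json. Because std::atoi returns 0 on malformed input, a slightly more defensive variant of the argument handling could look like the sketch below; it keeps the 127.0.0.1:3928 defaults from the commit, while the validation itself is an illustrative addition.

```cpp
// Hypothetical variant of the argument parsing in main(): keeps the
// 127.0.0.1:3928 defaults but rejects a non-numeric or out-of-range port
// instead of silently listening on port 0.
#include <cstdlib>
#include <iostream>
#include <string>

int main(int argc, char *argv[]) {
  std::string host = "127.0.0.1";
  int port = 3928;

  if (argc > 1) {
    host = argv[1];
  }
  if (argc > 2) {
    char *end = nullptr;
    long parsed = std::strtol(argv[2], &end, 10);
    if (end == argv[2] || *end != '\0' || parsed <= 0 || parsed > 65535) {
      std::cerr << "Invalid port: " << argv[2] << std::endl;
      return 1;
    }
    port = static_cast<int>(parsed);
  }

  std::cout << "Would listen at " << host << ":" << port << std::endl;
  return 0;
}
```
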