Merge pull request #68 from janhq/66-feat-loadunload-model-with-config-at-runtime

66 feat load unload model with config at runtime
tikikun authored Oct 11, 2023
2 parents 93ee354 + a7a3818 commit ac5c5be
Showing 4 changed files with 123 additions and 90 deletions.
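
With this change the server no longer loads a model from config.json at startup; instead a client POSTs the former custom_config fields (llama_model_path, ctx_len, ngl, embedding) to the new loadmodel route once the server is running. The sketch below shows one way to drive that from a Drogon client. The route prefix /inferences/llamacpp/ follows Drogon's default controller path mapping and the model path is a placeholder, so treat both as assumptions rather than values taken from this commit.

```cpp
// Hypothetical client sketch: load a model at runtime, then use the server.
// The route prefix and the model path are assumptions, not taken from the commit.
#include <drogon/drogon.h>
#include <drogon/HttpClient.h>
#include <json/json.h>
#include <iostream>

int main() {
  auto client = drogon::HttpClient::newHttpClient("http://127.0.0.1:3928");

  Json::Value body;
  body["llama_model_path"] = "/path/to/model.gguf"; // placeholder path
  body["ctx_len"] = 2048;
  body["ngl"] = 100;
  body["embedding"] = true;

  auto req = drogon::HttpRequest::newHttpJsonRequest(body);
  req->setMethod(drogon::Post);
  req->setPath("/inferences/llamacpp/loadmodel"); // assumed controller prefix

  client->sendRequest(req, [](drogon::ReqResult result,
                              const drogon::HttpResponsePtr &resp) {
    if (result == drogon::ReqResult::Ok && resp) {
      std::cout << resp->getBody() << std::endl; // e.g. {"message":"Model loaded successfully"}
    } else {
      std::cerr << "loadmodel request failed" << std::endl;
    }
    drogon::app().quit(); // stop the event loop once the reply arrives
  });

  drogon::app().run(); // the client callback runs on Drogon's event loop
  return 0;
}
```

chat_completion and embedding requests sent before this call now receive the 500 "Model is not loaded yet" response added in controllers/llamaCPP.cc.
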
8 changes: 1 addition & 7 deletions config.json
@@ -4,11 +4,5 @@
"address": "127.0.0.1",
"port": 3928
}
],
"custom_config": {
"llama_model_path": "/Users/alandao/Documents/codes/nitro.cpp_temp/models/llama2_7b_chat_uncensored.Q4_0.gguf",
"ctx_len": 2048,
"ngl": 100,
"embedding":true
}
]
}
67 changes: 67 additions & 0 deletions controllers/llamaCPP.cc
@@ -4,6 +4,7 @@
#include <chrono>
#include <cstring>
#include <drogon/HttpResponse.h>
#include <drogon/HttpTypes.h>
#include <regex>
#include <thread>

@@ -41,6 +42,15 @@ std::string create_return_json(const std::string &id, const std::string &model,
void llamaCPP::chatCompletion(
const HttpRequestPtr &req,
std::function<void(const HttpResponsePtr &)> &&callback) {
if (!model_loaded) {
Json::Value jsonResp;
jsonResp["message"] = "Model is not loaded yet";
auto resp = drogon::HttpResponse::newHttpJsonResponse(jsonResp);
resp->setStatusCode(drogon::k500InternalServerError);
callback(resp);
return;
}

const auto &jsonBody = req->getJsonObject();
std::string formatted_output =
"Below is a conversation between an AI system named ASSISTANT and USER\n";
@@ -203,6 +213,15 @@ void llamaCPP::chatCompletion(
void llamaCPP::embedding(
const HttpRequestPtr &req,
std::function<void(const HttpResponsePtr &)> &&callback) {
if (!model_loaded) {
Json::Value jsonResp;
jsonResp["message"] = "Model is not loaded yet";
auto resp = drogon::HttpResponse::newHttpJsonResponse(jsonResp);
resp->setStatusCode(drogon::k500InternalServerError);
callback(resp);
return;
}

auto lock = llama.lock();

const auto &jsonBody = req->getJsonObject();
@@ -225,3 +244,51 @@ void llamaCPP::embedding(
resp->setContentTypeString("application/json");
callback(resp);
}

void llamaCPP::loadModel(
const HttpRequestPtr &req,
std::function<void(const HttpResponsePtr &)> &&callback) {

const auto &jsonBody = req->getJsonObject();

gpt_params params;
if (jsonBody) {
params.model = (*jsonBody)["llama_model_path"].asString();
params.n_gpu_layers = (*jsonBody)["ngl"].asInt();
params.n_ctx = (*jsonBody)["ctx_len"].asInt();
params.embedding = (*jsonBody)["embedding"].asBool();
}
#ifdef GGML_USE_CUBLAS
LOG_INFO << "Setting up GGML CUBLAS PARAMS";
params.mul_mat_q = false;
#endif // GGML_USE_CUBLAS
if (params.model_alias == "unknown") {
params.model_alias = params.model;
}

llama_backend_init(params.numa);

LOG_INFO_LLAMA("build info",
{{"build", BUILD_NUMBER}, {"commit", BUILD_COMMIT}});
LOG_INFO_LLAMA("system info",
{
{"n_threads", params.n_threads},
{"total_threads", std::thread::hardware_concurrency()},
{"system_info", llama_print_system_info()},
});

// load the model
if (!llama.loadModel(params)) {
LOG_ERROR << "Error loading the model will exit the program";
Json::Value jsonResp;
jsonResp["message"] = "Model loaded failed";
auto resp = drogon::HttpResponse::newHttpJsonResponse(jsonResp);
resp->setStatusCode(drogon::k500InternalServerError);
callback(resp);
}
Json::Value jsonResp;
jsonResp["message"] = "Model loaded successfully";
model_loaded = true;
auto resp = drogon::HttpResponse::newHttpJsonResponse(jsonResp);
callback(resp);
}
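
loadModel() reads the four request fields directly, so a body that omits one of them silently falls back to jsoncpp's null conversions (empty string, 0, false). A small sketch of the same parsing with explicit defaults is shown below; the struct, helper name, and default values are illustrative assumptions, not part of this commit.

```cpp
// Hypothetical variant of the request parsing in loadModel(), using
// Json::Value::get() so that missing fields resolve to explicit defaults.
// The struct, helper name, and default values are illustrative assumptions.
#include <json/json.h>
#include <string>

struct LoadModelConfig {
  std::string model_path;
  int ctx_len = 2048;
  int ngl = 100;
  bool embedding = false;
};

LoadModelConfig parse_load_request(const Json::Value &body) {
  LoadModelConfig cfg;
  cfg.model_path = body.get("llama_model_path", "").asString();
  cfg.ctx_len = body.get("ctx_len", cfg.ctx_len).asInt();
  cfg.ngl = body.get("ngl", cfg.ngl).asInt();
  cfg.embedding = body.get("embedding", cfg.embedding).asBool();
  return cfg;
}
```

A caller could then reject the request up front when model_path comes back empty instead of passing an empty path to llama.loadModel().
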
72 changes: 39 additions & 33 deletions controllers/llamaCPP.h
@@ -1,3 +1,4 @@
#include <drogon/HttpTypes.h>
#if defined(_WIN32)
#define NOMINMAX
#endif
@@ -1311,51 +1312,56 @@ namespace inferences {
class llamaCPP : public drogon::HttpController<llamaCPP> {
public:
llamaCPP() {
gpt_params params;
auto conf = drogon::app().getCustomConfig();
params.model = conf["llama_model_path"].asString();
params.n_gpu_layers = conf["ngl"].asInt();
params.n_ctx = conf["ctx_len"].asInt();
params.embedding = conf["embedding"].asBool();
#ifdef GGML_USE_CUBLAS
LOG_INFO << "Setting up GGML CUBLAS PARAMS";
params.mul_mat_q = false;
#endif // GGML_USE_CUBLAS
if (params.model_alias == "unknown") {
params.model_alias = params.model;
}

llama_backend_init(params.numa);

LOG_INFO_LLAMA("build info",
{{"build", BUILD_NUMBER}, {"commit", BUILD_COMMIT}});
LOG_INFO_LLAMA("system info",
{
{"n_threads", params.n_threads},
{"total_threads", std::thread::hardware_concurrency()},
{"system_info", llama_print_system_info()},
});

// load the model
if (!llama.loadModel(params)) {
LOG_ERROR << "Error loading the model will exit the program";
std::terminate();
}
nitro_utils::nitro_logo();
// gpt_params params;
// auto conf = drogon::app().getCustomConfig();
// params.model = conf["llama_model_path"].asString();
// params.n_gpu_layers = conf["ngl"].asInt();
// params.n_ctx = conf["ctx_len"].asInt();
// params.embedding = conf["embedding"].asBool();
// #ifdef GGML_USE_CUBLAS
// LOG_INFO << "Setting up GGML CUBLAS PARAMS";
// params.mul_mat_q = false;
// #endif // GGML_USE_CUBLAS
// if (params.model_alias == "unknown") {
// params.model_alias = params.model;
// }
//
// llama_backend_init(params.numa);
//
// LOG_INFO_LLAMA("build info",
// {{"build", BUILD_NUMBER}, {"commit", BUILD_COMMIT}});
// LOG_INFO_LLAMA("system info",
// {
// {"n_threads", params.n_threads},
// {"total_threads",
// std::thread::hardware_concurrency()},
// {"system_info", llama_print_system_info()},
// });
//
// // load the model
// if (!llama.loadModel(params)) {
// LOG_ERROR << "Error loading the model will exit the program";
// std::terminate();
// }
// deprecate this if find no usecase
}
METHOD_LIST_BEGIN
// list path definitions here;
METHOD_ADD(llamaCPP::chatCompletion, "chat_completion");
METHOD_ADD(llamaCPP::embedding,"embedding");
METHOD_ADD(llamaCPP::chatCompletion, "chat_completion", Post);
METHOD_ADD(llamaCPP::embedding, "embedding", Post);
METHOD_ADD(llamaCPP::loadModel, "loadmodel", Post);
// PATH_ADD("/llama/chat_completion", Post);
METHOD_LIST_END
void chatCompletion(const HttpRequestPtr &req,
std::function<void(const HttpResponsePtr &)> &&callback);
void embedding(const HttpRequestPtr &req,
std::function<void(const HttpResponsePtr &)> &&callback);
void loadModel(const HttpRequestPtr &req,
std::function<void(const HttpResponsePtr &)> &&callback);

private:
llama_server_context llama;
bool model_loaded = false;
size_t sent_count = 0;
size_t sent_token_probs_index = 0;
};
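
One design note on the new model_loaded member: Drogon dispatches handlers on a pool of worker threads, so chatCompletion or embedding can run concurrently with loadModel. A possible hardening, shown below as a sketch and not part of this commit, is to make the flag atomic.

```cpp
#include <atomic>

// Alternative declaration for the flag added above (a suggestion only; the
// commit itself uses a plain bool).
std::atomic<bool> model_loaded{false};

// The request guards would then read the flag with model_loaded.load(), and
// loadModel() would publish a successful load with model_loaded.store(true).
```
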
66 changes: 16 additions & 50 deletions main.cc
@@ -1,4 +1,4 @@

#include "controllers/nitro_utils.h"
#include <climits> // for PATH_MAX
#include <drogon/drogon.h>

@@ -14,59 +14,25 @@
#error "Unsupported platform!"
#endif

int main() {
std::string configPath;
int main(int argc, char *argv[]) {

#if defined(__APPLE__) && defined(__MACH__)
char path[PATH_MAX];
uint32_t size = sizeof(path);
if (_NSGetExecutablePath(path, &size) == 0) {
path[size] = '\0'; // Null-terminate the string
char *dir = dirname(path);
configPath = std::string(dir) + "/config/config.json";
} else {
LOG_ERROR << "Failed to get binary location!";
return 1;
}
#elif defined(__linux__)
char path[PATH_MAX];
ssize_t len = readlink("/proc/self/exe", path, sizeof(path) - 1);
if (len != -1) {
path[len] = '\0';
char *dir = dirname(path);
configPath = std::string(dir) + "/config/config.json";
} else {
LOG_ERROR << "Failed to get binary location!";
return 1;
}
#elif defined(_WIN32)
char path[MAX_PATH];
char dir[MAX_PATH];
// char dir[MAX_PATH];
if(GetModuleFileNameA(NULL, path, sizeof(path))) {
char* lastBackslash = strrchr(path, '\\');
if (lastBackslash == nullptr) {
return 1;
}
lastBackslash[0] = '\0';
strcpy(dir, path);
configPath = std::string(dir) + "/config/config.json";
}
else {
LOG_ERROR << "Failed to get binary location!";
return 1;
std::string host = "127.0.0.1";
int port = 3928;

// Check for host argument
if (argc > 1) {
host = argv[1];
}
#else
LOG_ERROR << "Unsupported platform!";
return 1;
#endif

// Set HTTP listener address and port
drogon::app().loadConfigFile(configPath);
auto app_conf = drogon::app().getCustomConfig();
// Check for port argument
if (argc > 2) {
port = std::atoi(argv[2]); // Convert string argument to int
}

LOG_INFO << app_conf["llama_model_file"].asString();
// drogon::app().addListener("0.0.0.0", 8080);
nitro_utils::nitro_logo();
LOG_INFO << "Server started, listening at: " << host << ":" << port;
LOG_INFO << "Please load your model";
drogon::app().addListener(host, port);
drogon::app().run();

return 0;
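
main() now takes the listener host and port as optional positional arguments instead of locating and loading config.json. Because std::atoi returns 0 on malformed input, a slightly more defensive variant of the argument handling could look like the sketch below; it keeps the 127.0.0.1:3928 defaults from the commit, while the validation itself is an illustrative addition.

```cpp
// Hypothetical variant of the argument parsing in main(): keeps the
// 127.0.0.1:3928 defaults but rejects a non-numeric or out-of-range port
// instead of silently listening on port 0.
#include <cstdlib>
#include <iostream>
#include <string>

int main(int argc, char *argv[]) {
  std::string host = "127.0.0.1";
  int port = 3928;

  if (argc > 1) {
    host = argv[1];
  }
  if (argc > 2) {
    char *end = nullptr;
    long parsed = std::strtol(argv[2], &end, 10);
    if (end == argv[2] || *end != '\0' || parsed <= 0 || parsed > 65535) {
      std::cerr << "Invalid port: " << argv[2] << std::endl;
      return 1;
    }
    port = static_cast<int>(parsed);
  }

  std::cout << "Would listen at " << host << ":" << port << std::endl;
  return 0;
}
```
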