Merge pull request #83 from janhq/63-feat-nitro-speed-up-for-1st-inference-time-after-model-loaded

63 feat nitro speed up for 1st inference time after model loaded
tikikun authored Oct 18, 2023
2 parents ae30f12 + e7ee054 commit 1e51200
Showing 2 changed files with 69 additions and 19 deletions.
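
This change adds a one-token warm-up generation immediately after the model is loaded, so the first real request no longer pays the one-time cold-start cost (paging in memory-mapped weights, allocating buffers and caches). A minimal standalone sketch of the idea in C++; Session, generate and the simulated delay are hypothetical stand-ins, not Nitro's actual classes:

// Sketch only: a fake session whose first generate() call pays a one-time
// setup cost, standing in for the real cold start after model load.
#include <chrono>
#include <iostream>
#include <string>
#include <thread>

struct Session {
  bool warmed = false;

  std::string generate(const std::string &prompt, int n_predict) {
    if (!warmed) {
      // Simulated cold-start work (weight paging, buffer allocation, ...).
      std::this_thread::sleep_for(std::chrono::milliseconds(500));
      warmed = true;
    }
    return "token";
  }

  // Same trick as warmupModel() below: generate one throwaway token at load time.
  void warmup() { generate("hello", /*n_predict=*/1); }
};

int main() {
  Session s;
  s.warmup(); // cold-start cost is paid here, not on the first user request

  auto t0 = std::chrono::steady_clock::now();
  s.generate("real prompt", 16);
  auto t1 = std::chrono::steady_clock::now();
  std::cout << "first user request took "
            << std::chrono::duration_cast<std::chrono::milliseconds>(t1 - t0).count()
            << " ms\n";
  return 0;
}

In the commit itself the same effect comes from calling warmupModel() inside loadModel() (see controllers/llamaCPP.cc below) before the load response is returned.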
45 changes: 45 additions & 0 deletions controllers/llamaCPP.cc
@@ -7,6 +7,7 @@
#include <drogon/HttpTypes.h>
#include <regex>
#include <thread>
#include <trantor/utils/Logger.h>

using namespace inferences;

@@ -39,6 +40,49 @@ std::string create_return_json(const std::string &id, const std::string &model,
return Json::writeString(writer, root);
}

void llamaCPP::warmupModel() {
auto lock = llama.lock();
llama.rewind();
llama_reset_timings(llama.ctx);

llama.prompt = "hello";
llama.params.n_predict = 1;
llama.loadPrompt();
llama.beginCompletion();
size_t stop_pos = std::string::npos;

while (llama.has_next_token) {
const completion_token_output token_with_probs = llama.doCompletion();
const std::string token_text =
token_with_probs.tok == -1
? ""
: llama_token_to_piece(llama.ctx, token_with_probs.tok);

stop_pos = llama.findStoppingStrings(llama.generated_text,
token_text.size(), STOP_FULL);
}

if (stop_pos == std::string::npos) {
stop_pos = llama.findStoppingStrings(llama.generated_text, 0, STOP_PARTIAL);
}
if (stop_pos != std::string::npos) {
llama.generated_text.erase(llama.generated_text.begin() + stop_pos,
llama.generated_text.end());
}
auto probs = llama.generated_token_probs;
if (llama.params.sampling_params.n_probs > 0 && llama.stopped_word) {
const std::vector<llama_token> stop_word_toks =
llama_tokenize(llama.ctx, llama.stopping_word, false);
probs = std::vector<completion_token_output>(
llama.generated_token_probs.begin(),
llama.generated_token_probs.end() - stop_word_toks.size());
}

LOG_INFO << "Warm-up generated text:" << llama.generated_text;
LOG_INFO << "Warm-up finish";
return;
}

void llamaCPP::chatCompletion(
const HttpRequestPtr &req,
std::function<void(const HttpResponsePtr &)> &&callback) {
@@ -297,5 +341,6 @@ void llamaCPP::loadModel(
jsonResp["message"] = "Model loaded successfully";
model_loaded = true;
auto resp = nitro_utils::nitroHttpJsonResponse(jsonResp);
warmupModel();
callback(resp);
}
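
Note that warmupModel() runs before callback(resp), so the load-model HTTP response is held until the warm-up token has been generated. If that extra latency on the load call ever mattered, one hypothetical variant (not part of this commit) would be to reply first and warm up in the background, assuming the llama.lock() taken inside warmupModel() is enough to serialize access to the shared context. A self-contained sketch of that pattern with stand-in types:

// Hypothetical alternative, not this commit's behavior: acknowledge the load
// first, then warm up off the request path. Engine and the sleep are stand-ins.
#include <chrono>
#include <future>
#include <iostream>
#include <mutex>
#include <thread>

struct Engine {
  std::mutex mu; // stand-in for the llama.lock() used in warmupModel()

  void warmup() {
    std::lock_guard<std::mutex> g(mu);
    std::this_thread::sleep_for(std::chrono::milliseconds(300)); // fake warm-up
  }
};

int main() {
  Engine e;
  std::cout << "model loaded, response sent\n"; // reply to the caller right away
  // Warm up asynchronously; real requests would still queue on the lock.
  auto pending = std::async(std::launch::async, [&e] { e.warmup(); });
  pending.wait(); // a server would just keep the future alive instead
  std::cout << "warm-up finished\n";
  return 0;
}

The synchronous call used in the commit is simpler and guarantees the warm-up has finished before any client sees a successful load response.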
43 changes: 24 additions & 19 deletions controllers/llamaCPP.h
@@ -525,12 +525,12 @@ struct llama_server_context {
if (llama_decode(ctx,
llama_batch_get_one(&embd[n_past], n_eval, n_past, 0))) {
LOG_ERROR_LLAMA("failed to eval",
{
{"n_eval", n_eval},
{"n_past", n_past},
{"embd",
tokens_to_str(ctx, embd.cbegin() + n_past, embd.cend())},
});
{
{"n_eval", n_eval},
{"n_past", n_past},
{"embd", tokens_to_str(ctx, embd.cbegin() + n_past,
embd.cend())},
});
has_next_token = false;
return result;
}
@@ -677,9 +677,9 @@ struct llama_server_context {
static const int n_embd = llama_n_embd(model);
if (!params.embedding) {
LOG_WARNING_LLAMA("embedding disabled",
{
{"params.embedding", params.embedding},
});
{
{"params.embedding", params.embedding},
});
return std::vector<float>(n_embd, 0.0f);
}
const float *data = llama_get_embeddings(ctx);
@@ -891,17 +891,19 @@ static void server_params_parse(int argc, char **argv, server_params &sparams,
}
}
#else
LOG_WARNING_LLAMA("llama.cpp was compiled without cuBLAS. It is not possible "
"to set a tensor split.\n",
{});
LOG_WARNING_LLAMA(
"llama.cpp was compiled without cuBLAS. It is not possible "
"to set a tensor split.\n",
{});
#endif // GGML_USE_CUBLAS
} else if (arg == "--no-mul-mat-q" || arg == "-nommq") {
#ifdef GGML_USE_CUBLAS
params.mul_mat_q = false;
#else
LOG_WARNING_LLAMA("warning: llama.cpp was compiled without cuBLAS. Disabling "
"mul_mat_q kernels has no effect.\n",
{});
LOG_WARNING_LLAMA(
"warning: llama.cpp was compiled without cuBLAS. Disabling "
"mul_mat_q kernels has no effect.\n",
{});
#endif // GGML_USE_CUBLAS
} else if (arg == "--main-gpu" || arg == "-mg") {
if (++i >= argc) {
@@ -911,9 +913,10 @@
#ifdef GGML_USE_CUBLAS
params.main_gpu = std::stoi(argv[i]);
#else
LOG_WARNING_LLAMA("llama.cpp was compiled without cuBLAS. It is not possible "
"to set a main GPU.",
{});
LOG_WARNING_LLAMA(
"llama.cpp was compiled without cuBLAS. It is not possible "
"to set a main GPU.",
{});
#endif
} else if (arg == "--lora") {
if (++i >= argc) {
@@ -1260,7 +1263,8 @@ class llamaCPP : public drogon::HttpController<llamaCPP> {
public:
llamaCPP() {
// Some default values for now below
log_disable(); //Disable the log to file feature, reduce bloat for target system ()
log_disable(); // Disable the log to file feature, reduce bloat for target
// system ()
}
METHOD_LIST_BEGIN
// list path definitions here;
@@ -1275,6 +1279,7 @@ class llamaCPP : public drogon::HttpController<llamaCPP> {
std::function<void(const HttpResponsePtr &)> &&callback);
void loadModel(const HttpRequestPtr &req,
std::function<void(const HttpResponsePtr &)> &&callback);
void warmupModel();

private:
llama_server_context llama;