diff --git a/controllers/llamaCPP.cc b/controllers/llamaCPP.cc index a050b8558..24e10d9be 100644 --- a/controllers/llamaCPP.cc +++ b/controllers/llamaCPP.cc @@ -7,6 +7,7 @@ #include #include #include +#include using namespace inferences; @@ -39,6 +40,49 @@ std::string create_return_json(const std::string &id, const std::string &model, return Json::writeString(writer, root); } +void llamaCPP::warmupModel() { + auto lock = llama.lock(); + llama.rewind(); + llama_reset_timings(llama.ctx); + + llama.prompt = "hello"; + llama.params.n_predict = 1; + llama.loadPrompt(); + llama.beginCompletion(); + size_t stop_pos = std::string::npos; + + while (llama.has_next_token) { + const completion_token_output token_with_probs = llama.doCompletion(); + const std::string token_text = + token_with_probs.tok == -1 + ? "" + : llama_token_to_piece(llama.ctx, token_with_probs.tok); + + stop_pos = llama.findStoppingStrings(llama.generated_text, + token_text.size(), STOP_FULL); + } + + if (stop_pos == std::string::npos) { + stop_pos = llama.findStoppingStrings(llama.generated_text, 0, STOP_PARTIAL); + } + if (stop_pos != std::string::npos) { + llama.generated_text.erase(llama.generated_text.begin() + stop_pos, + llama.generated_text.end()); + } + auto probs = llama.generated_token_probs; + if (llama.params.sampling_params.n_probs > 0 && llama.stopped_word) { + const std::vector stop_word_toks = + llama_tokenize(llama.ctx, llama.stopping_word, false); + probs = std::vector( + llama.generated_token_probs.begin(), + llama.generated_token_probs.end() - stop_word_toks.size()); + } + + LOG_INFO << "Warm-up generated text:" << llama.generated_text; + LOG_INFO << "Warm-up finish"; + return; +} + void llamaCPP::chatCompletion( const HttpRequestPtr &req, std::function &&callback) { @@ -297,5 +341,6 @@ void llamaCPP::loadModel( jsonResp["message"] = "Model loaded successfully"; model_loaded = true; auto resp = nitro_utils::nitroHttpJsonResponse(jsonResp); + warmupModel(); callback(resp); } diff --git a/controllers/llamaCPP.h b/controllers/llamaCPP.h index 6df804e75..5cc7eb94d 100644 --- a/controllers/llamaCPP.h +++ b/controllers/llamaCPP.h @@ -525,12 +525,12 @@ struct llama_server_context { if (llama_decode(ctx, llama_batch_get_one(&embd[n_past], n_eval, n_past, 0))) { LOG_ERROR_LLAMA("failed to eval", - { - {"n_eval", n_eval}, - {"n_past", n_past}, - {"embd", - tokens_to_str(ctx, embd.cbegin() + n_past, embd.cend())}, - }); + { + {"n_eval", n_eval}, + {"n_past", n_past}, + {"embd", tokens_to_str(ctx, embd.cbegin() + n_past, + embd.cend())}, + }); has_next_token = false; return result; } @@ -677,9 +677,9 @@ struct llama_server_context { static const int n_embd = llama_n_embd(model); if (!params.embedding) { LOG_WARNING_LLAMA("embedding disabled", - { - {"params.embedding", params.embedding}, - }); + { + {"params.embedding", params.embedding}, + }); return std::vector(n_embd, 0.0f); } const float *data = llama_get_embeddings(ctx); @@ -891,17 +891,19 @@ static void server_params_parse(int argc, char **argv, server_params &sparams, } } #else - LOG_WARNING_LLAMA("llama.cpp was compiled without cuBLAS. It is not possible " - "to set a tensor split.\n", - {}); + LOG_WARNING_LLAMA( + "llama.cpp was compiled without cuBLAS. It is not possible " + "to set a tensor split.\n", + {}); #endif // GGML_USE_CUBLAS } else if (arg == "--no-mul-mat-q" || arg == "-nommq") { #ifdef GGML_USE_CUBLAS params.mul_mat_q = false; #else - LOG_WARNING_LLAMA("warning: llama.cpp was compiled without cuBLAS. Disabling " - "mul_mat_q kernels has no effect.\n", - {}); + LOG_WARNING_LLAMA( + "warning: llama.cpp was compiled without cuBLAS. Disabling " + "mul_mat_q kernels has no effect.\n", + {}); #endif // GGML_USE_CUBLAS } else if (arg == "--main-gpu" || arg == "-mg") { if (++i >= argc) { @@ -911,9 +913,10 @@ static void server_params_parse(int argc, char **argv, server_params &sparams, #ifdef GGML_USE_CUBLAS params.main_gpu = std::stoi(argv[i]); #else - LOG_WARNING_LLAMA("llama.cpp was compiled without cuBLAS. It is not possible " - "to set a main GPU.", - {}); + LOG_WARNING_LLAMA( + "llama.cpp was compiled without cuBLAS. It is not possible " + "to set a main GPU.", + {}); #endif } else if (arg == "--lora") { if (++i >= argc) { @@ -1260,7 +1263,8 @@ class llamaCPP : public drogon::HttpController { public: llamaCPP() { // Some default values for now below - log_disable(); //Disable the log to file feature, reduce bloat for target system () + log_disable(); // Disable the log to file feature, reduce bloat for target + // system () } METHOD_LIST_BEGIN // list path definitions here; @@ -1275,6 +1279,7 @@ class llamaCPP : public drogon::HttpController { std::function &&callback); void loadModel(const HttpRequestPtr &req, std::function &&callback); + void warmupModel(); private: llama_server_context llama;