Merge pull request #83 from janhq/63-feat-nitro-speed-up-for-1st-inference-time-after-model-loaded

63 feat nitro speed up for 1st inference time after model loaded
tikikun authored Oct 18, 2023
2 parents ae30f12 + e7ee054 commit 1e51200
Showing 2 changed files with 69 additions and 19 deletions.
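
This change adds a one-token warm-up generation immediately after the model is loaded, so the first real request no longer pays the one-time cold-start cost (paging in memory-mapped weights, allocating buffers and caches). A minimal standalone sketch of the idea in C++; Session, generate and the simulated delay are hypothetical stand-ins, not Nitro's actual classes:

// Sketch only: a fake session whose first generate() call pays a one-time
// setup cost, standing in for the real cold start after model load.
#include <chrono>
#include <iostream>
#include <string>
#include <thread>

struct Session {
  bool warmed = false;

  std::string generate(const std::string &prompt, int n_predict) {
    if (!warmed) {
      // Simulated cold-start work (weight paging, buffer allocation, ...).
      std::this_thread::sleep_for(std::chrono::milliseconds(500));
      warmed = true;
    }
    return "token";
  }

  // Same trick as warmupModel() below: generate one throwaway token at load time.
  void warmup() { generate("hello", /*n_predict=*/1); }
};

int main() {
  Session s;
  s.warmup(); // cold-start cost is paid here, not on the first user request

  auto t0 = std::chrono::steady_clock::now();
  s.generate("real prompt", 16);
  auto t1 = std::chrono::steady_clock::now();
  std::cout << "first user request took "
            << std::chrono::duration_cast<std::chrono::milliseconds>(t1 - t0).count()
            << " ms\n";
  return 0;
}

In the commit itself the same effect comes from calling warmupModel() inside loadModel() (see controllers/llamaCPP.cc below) before the load response is returned.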
45 changes: 45 additions & 0 deletions controllers/llamaCPP.cc
@@ -7,6 +7,7 @@
#include <drogon/HttpTypes.h>
#include <regex>
#include <thread>
#include <trantor/utils/Logger.h>

using namespace inferences;

@@ -39,6 +40,49 @@ std::string create_return_json(const std::string &id, const std::string &model,
return Json::writeString(writer, root);
}

void llamaCPP::warmupModel() {
auto lock = llama.lock();
llama.rewind();
llama_reset_timings(llama.ctx);

llama.prompt = "hello";
llama.params.n_predict = 1;
llama.loadPrompt();
llama.beginCompletion();
size_t stop_pos = std::string::npos;

while (llama.has_next_token) {
const completion_token_output token_with_probs = llama.doCompletion();
const std::string token_text =
token_with_probs.tok == -1
? ""
: llama_token_to_piece(llama.ctx, token_with_probs.tok);

stop_pos = llama.findStoppingStrings(llama.generated_text,
token_text.size(), STOP_FULL);
}

if (stop_pos == std::string::npos) {
stop_pos = llama.findStoppingStrings(llama.generated_text, 0, STOP_PARTIAL);
}
if (stop_pos != std::string::npos) {
llama.generated_text.erase(llama.generated_text.begin() + stop_pos,
llama.generated_text.end());
}
auto probs = llama.generated_token_probs;
if (llama.params.sampling_params.n_probs > 0 && llama.stopped_word) {
const std::vector<llama_token> stop_word_toks =
llama_tokenize(llama.ctx, llama.stopping_word, false);
probs = std::vector<completion_token_output>(
llama.generated_token_probs.begin(),
llama.generated_token_probs.end() - stop_word_toks.size());
}

LOG_INFO << "Warm-up generated text:" << llama.generated_text;
LOG_INFO << "Warm-up finish";
return;
}

void llamaCPP::chatCompletion(
const HttpRequestPtr &req,
std::function<void(const HttpResponsePtr &)> &&callback) {
@@ -297,5 +341,6 @@ void llamaCPP::loadModel(
jsonResp["message"] = "Model loaded successfully";
model_loaded = true;
auto resp = nitro_utils::nitroHttpJsonResponse(jsonResp);
warmupModel();
callback(resp);
}
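
Note that warmupModel() runs before callback(resp), so the load-model HTTP response is held until the warm-up token has been generated. If that extra latency on the load call ever mattered, one hypothetical variant (not part of this commit) would be to reply first and warm up in the background, assuming the llama.lock() taken inside warmupModel() is enough to serialize access to the shared context. A self-contained sketch of that pattern with stand-in types:

// Hypothetical alternative, not this commit's behavior: acknowledge the load
// first, then warm up off the request path. Engine and the sleep are stand-ins.
#include <chrono>
#include <future>
#include <iostream>
#include <mutex>
#include <thread>

struct Engine {
  std::mutex mu; // stand-in for the llama.lock() used in warmupModel()

  void warmup() {
    std::lock_guard<std::mutex> g(mu);
    std::this_thread::sleep_for(std::chrono::milliseconds(300)); // fake warm-up
  }
};

int main() {
  Engine e;
  std::cout << "model loaded, response sent\n"; // reply to the caller right away
  // Warm up asynchronously; real requests would still queue on the lock.
  auto pending = std::async(std::launch::async, [&e] { e.warmup(); });
  pending.wait(); // a server would just keep the future alive instead
  std::cout << "warm-up finished\n";
  return 0;
}

The synchronous call used in the commit is simpler and guarantees the warm-up has finished before any client sees a successful load response.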
43 changes: 24 additions & 19 deletions controllers/llamaCPP.h
@@ -525,12 +525,12 @@ struct llama_server_context {
if (llama_decode(ctx,
llama_batch_get_one(&embd[n_past], n_eval, n_past, 0))) {
LOG_ERROR_LLAMA("failed to eval",
{
{"n_eval", n_eval},
{"n_past", n_past},
{"embd",
tokens_to_str(ctx, embd.cbegin() + n_past, embd.cend())},
});
{
{"n_eval", n_eval},
{"n_past", n_past},
{"embd", tokens_to_str(ctx, embd.cbegin() + n_past,
embd.cend())},
});
has_next_token = false;
return result;
}
@@ -677,9 +677,9 @@ struct llama_server_context {
static const int n_embd = llama_n_embd(model);
if (!params.embedding) {
LOG_WARNING_LLAMA("embedding disabled",
{
{"params.embedding", params.embedding},
});
{
{"params.embedding", params.embedding},
});
return std::vector<float>(n_embd, 0.0f);
}
const float *data = llama_get_embeddings(ctx);
@@ -891,17 +891,19 @@ static void server_params_parse(int argc, char **argv, server_params &sparams,
}
}
#else
LOG_WARNING_LLAMA("llama.cpp was compiled without cuBLAS. It is not possible "
"to set a tensor split.\n",
{});
LOG_WARNING_LLAMA(
"llama.cpp was compiled without cuBLAS. It is not possible "
"to set a tensor split.\n",
{});
#endif // GGML_USE_CUBLAS
} else if (arg == "--no-mul-mat-q" || arg == "-nommq") {
#ifdef GGML_USE_CUBLAS
params.mul_mat_q = false;
#else
LOG_WARNING_LLAMA("warning: llama.cpp was compiled without cuBLAS. Disabling "
"mul_mat_q kernels has no effect.\n",
{});
LOG_WARNING_LLAMA(
"warning: llama.cpp was compiled without cuBLAS. Disabling "
"mul_mat_q kernels has no effect.\n",
{});
#endif // GGML_USE_CUBLAS
} else if (arg == "--main-gpu" || arg == "-mg") {
if (++i >= argc) {
@@ -911,9 +913,10 @@
#ifdef GGML_USE_CUBLAS
params.main_gpu = std::stoi(argv[i]);
#else
LOG_WARNING_LLAMA("llama.cpp was compiled without cuBLAS. It is not possible "
"to set a main GPU.",
{});
LOG_WARNING_LLAMA(
"llama.cpp was compiled without cuBLAS. It is not possible "
"to set a main GPU.",
{});
#endif
} else if (arg == "--lora") {
if (++i >= argc) {
@@ -1260,7 +1263,8 @@ class llamaCPP : public drogon::HttpController<llamaCPP> {
public:
llamaCPP() {
// Some default values for now below
log_disable(); //Disable the log to file feature, reduce bloat for target system ()
log_disable(); // Disable the log to file feature, reduce bloat for target
// system ()
}
METHOD_LIST_BEGIN
// list path definitions here;
@@ -1275,6 +1279,7 @@ class llamaCPP : public drogon::HttpController<llamaCPP> {
std::function<void(const HttpResponsePtr &)> &&callback);
void loadModel(const HttpRequestPtr &req,
std::function<void(const HttpResponsePtr &)> &&callback);
void warmupModel();

private:
llama_server_context llama;