This repository has been archived by the owner on Aug 30, 2024. It is now read-only.

Commit

-4 -> n_keep
Zhenzhong1 committed Mar 7, 2024
1 parent c4b34c0 commit c6b5fcd
Showing 4 changed files with 16 additions and 16 deletions.
22 changes: 11 additions & 11 deletions neural_speed/application/main_pybind.cpp
@@ -533,12 +533,12 @@ const std::vector<float>& Model::evaluate_(const std::vector<std::vector<model_t
     } else if (!curr_input_ids[bs].empty()) {
       fprintf(stderr, "%s: error: prompt confliction\n", __func__);
       return empty_ret;
-    } else if (input_id_cb.size() > n_ctx - 4) {  // long input_id_cb and empty curr_input_ids[bs]
+    } else if (input_id_cb.size() > n_ctx - n_keep) {  // long input_id_cb and empty curr_input_ids[bs]
       fprintf(stderr, "\n%s: Warning: prompt is too long (%zu tokens, max %d), will be truncated\n", __func__,
-              input_id_cb.size(), n_ctx - 4);
-      curr_input_ids[bs].resize(n_ctx - 4);
-      std::copy(input_id_cb.end() - n_ctx - 8, input_id_cb.end(), curr_input_ids[bs].begin() + 4);
-      std::copy(input_id_cb.begin(), input_id_cb.begin() + 4, curr_input_ids[bs].begin());
+              input_id_cb.size(), n_ctx - n_keep);
+      curr_input_ids[bs].resize(n_ctx - n_keep);
+      std::copy(input_id_cb.end() - n_ctx - n_keep * 2, input_id_cb.end(), curr_input_ids[bs].begin() + n_keep);
+      std::copy(input_id_cb.begin(), input_id_cb.begin() + n_keep, curr_input_ids[bs].begin());
     } else {  // good input_id_cb and empty curr_input_ids[bs]
       curr_input_ids[bs] = input_id_cb;
     }
@@ -648,13 +648,13 @@ std::vector<std::vector<model_token>> Model::generate_tokens(const std::vector<s
   }

   if (curr_input_ids[STATIC_INPUT_HEAD_IDX].empty()) {
-    if (input_ids[STATIC_INPUT_HEAD_IDX].size() > n_ctx - 4) {
+    if (input_ids[STATIC_INPUT_HEAD_IDX].size() > n_ctx - n_keep) {
       fprintf(stderr, "\n%s: Warning: prompt is too long (%zu tokens, max %d), will be truncated\n", __func__,
-              input_ids[STATIC_INPUT_HEAD_IDX].size(), n_ctx - 4);
-      curr_input_ids[STATIC_INPUT_HEAD_IDX].resize(n_ctx - 4);
-      std::copy(input_ids[STATIC_INPUT_HEAD_IDX].end() - n_ctx - 8, input_ids[STATIC_INPUT_HEAD_IDX].end(),
-                curr_input_ids[STATIC_INPUT_HEAD_IDX].begin() + 4);
-      std::copy(input_ids[STATIC_INPUT_HEAD_IDX].begin(), input_ids[STATIC_INPUT_HEAD_IDX].begin() + 4,
+              input_ids[STATIC_INPUT_HEAD_IDX].size(), n_ctx - n_keep);
+      curr_input_ids[STATIC_INPUT_HEAD_IDX].resize(n_ctx - n_keep);
+      std::copy(input_ids[STATIC_INPUT_HEAD_IDX].end() - n_ctx - n_keep * 2, input_ids[STATIC_INPUT_HEAD_IDX].end(),
+                curr_input_ids[STATIC_INPUT_HEAD_IDX].begin() + n_keep);
+      std::copy(input_ids[STATIC_INPUT_HEAD_IDX].begin(), input_ids[STATIC_INPUT_HEAD_IDX].begin() + n_keep,
                 curr_input_ids[STATIC_INPUT_HEAD_IDX].begin());
     } else {
       curr_input_ids[STATIC_INPUT_HEAD_IDX] = input_ids[STATIC_INPUT_HEAD_IDX];
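Both main_pybind.cpp hunks above apply the same truncation scheme: when a prompt exceeds n_ctx - n_keep tokens, the first n_keep tokens are preserved and the rest of the window is filled from the tail of the prompt. Below is a minimal standalone sketch of that scheme, not code from neural_speed; the helper name, the integer token type, and the reading of end() - n_ctx - n_keep * 2 as "take the last n_ctx - 2*n_keep tokens" are assumptions made for illustration.

    // Standalone sketch -- not part of neural_speed. Keeps the first n_keep
    // tokens of an over-long prompt and fills the remaining slots from the
    // tail, so the result fits in n_ctx - n_keep tokens.
    #include <algorithm>
    #include <cstdint>
    #include <cstdio>
    #include <vector>

    using model_token = int32_t;  // assumption: tokens are plain integer ids

    std::vector<model_token> truncate_prompt(const std::vector<model_token>& prompt,
                                             int n_ctx, int n_keep) {
      const size_t max_len = static_cast<size_t>(n_ctx - n_keep);
      if (prompt.size() <= max_len) return prompt;  // already fits, keep as-is

      std::fprintf(stderr, "warning: prompt too long (%zu tokens, max %zu), truncating\n",
                   prompt.size(), max_len);

      std::vector<model_token> out(max_len);
      // Head: the first n_keep tokens (e.g. BOS / system prefix) survive verbatim.
      std::copy(prompt.begin(), prompt.begin() + n_keep, out.begin());
      // Tail: the last n_ctx - 2*n_keep tokens fill the rest of the window.
      const int tail = static_cast<int>(max_len) - n_keep;
      std::copy(prompt.end() - tail, prompt.end(), out.begin() + n_keep);
      return out;
    }

    int main() {
      std::vector<model_token> prompt(3000);
      for (size_t i = 0; i < prompt.size(); ++i) prompt[i] = static_cast<model_token>(i);

      const int n_ctx = 2048, n_keep = 4;
      const auto truncated = truncate_prompt(prompt, n_ctx, n_keep);

      // Expect 2044 tokens: ids 0..3 followed by the last 2040 ids of the prompt.
      std::printf("size=%zu first=%d after_head=%d last=%d\n", truncated.size(),
                  static_cast<int>(truncated[0]), static_cast<int>(truncated[n_keep]),
                  static_cast<int>(truncated.back()));
    }

Run as-is with the assumed values above, the sketch keeps ids 0-3 and the last 2040 ids of a 3000-token prompt inside a 2048-token context.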
4 changes: 2 additions & 2 deletions neural_speed/application/main_run.cpp
@@ -241,9 +241,9 @@ int main(int argc, char** argv) { // NOLINT

   const int n_ctx = model_n_ctx(ctx);

-  if (static_cast<int>(embd_inp.size()) > n_ctx - 4) {
+  if (static_cast<int>(embd_inp.size()) > n_ctx - n_keep) {
     fprintf(stderr, "%s: error: prompt is too long (%d tokens, max %d)\n", __func__, static_cast<int>(embd_inp.size()),
-            n_ctx - 4);
+            n_ctx - n_keep);
     return 1;
   }

4 changes: 2 additions & 2 deletions neural_speed/application/pybind_gptj.cpp
@@ -35,9 +35,9 @@ static model_context** g_ctx;

 bool gptj_model_eval_ids(model_context* ctx, model_token* tokens, size_t n_eval, size_t n_past, size_t n_threads) {
   const int n_ctx = model_n_ctx(ctx);
-  if (static_cast<int>(n_eval) > n_ctx - 4) {
+  if (static_cast<int>(n_eval) > n_ctx - n_keep) {
     fprintf(stderr, "%s: error: prompt is too long (%d tokens, max %d)\n", __func__, static_cast<int>(n_eval),
-            n_ctx - 4);
+            n_ctx - n_keep);
     return true;
   }

2 changes: 1 addition & 1 deletion neural_speed/models/model_utils/model_utils.cpp
@@ -912,7 +912,7 @@ struct model_context* model_init_from_file(const char* path_model, struct model_
   ctx->cont_batching = params.cont_batching;
   ctx->generation_conf = params.gen_conf;

-  ctx->scratch_size_ratio = params.scratch_size_ratio * params.max_request_num * params.beam_size
+  ctx->scratch_size_ratio = params.scratch_size_ratio * params.max_request_num * params.beam_size;

   const model_archs arch = params.arch;

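The model_utils.cpp hunk above appears to just terminate the scratch-ratio assignment; the value it assigns scales the configured ratio by the maximum number of in-flight requests and the beam width. A minimal restatement of that relationship follows (hypothetical free function with assumed example values, not the library's API):

    // Sketch only: the scaling relationship from the hunk above.
    // More concurrent requests and wider beams each multiply the scratch budget.
    #include <cstdio>

    static double effective_scratch_size_ratio(double base_ratio, int max_request_num, int beam_size) {
      return base_ratio * max_request_num * beam_size;
    }

    int main() {
      // Assumed example: base ratio 1.0, 8 concurrent requests, beam size 4 -> 32.0
      std::printf("%.1f\n", effective_scratch_size_ratio(1.0, 8, 4));
    }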
