This repository has been archived by the owner on Aug 30, 2024. It is now read-only.

[Runtime Enhance] Extend long input token length #157

Merged
merged 22 commits on Mar 13, 2024
6 changes: 3 additions & 3 deletions developer_document.md
@@ -8,7 +8,7 @@ For simplicity, we take [polyglot](https://huggingface.co/EleutherAI/polyglot-ko

First, we need to add its temp buffer in the [related model-arch header file](neural_speed/models/gptneox/gptneox.h) and [re-compile](README.md#Install).
```diff
static const model_scratch gptneox_mem_req(int n_layers) {
static const model_scratch gptneox_mem_req(int n_layers, float scratch_size_ratio = 1.0f) {
switch (n_layers) {
case 44:
return {2048ull * MB, 2048ull * MB, 4096ull * MB};
@@ -167,7 +167,7 @@ and update [model_name_to_arch()](neural_speed/models/model_utils/model_types.h#
+ NEW_MODEL_13B,
+};

+static const model_scratch new_model_mem_req(int n_layers) {
+static const model_scratch new_model_mem_req(int n_layers, float scratch_size_ratio = 1.0f) {
+ switch (n_layers) {
+ case N:
+ return {8192ull * MB, 8192ull * MB, 8192ull * MB};
@@ -390,7 +390,7 @@ We recommend to use continuous batching way since it has no padding effect and c
+ ne_view_2d(ctx0, KQV_merged_contiguous, head_size * n_head, attn_sl * attn_bs, head_size * n_head * ne_element_size(KQV_merged_contiguous), ne_element_size(KQV_merged_contiguous) * off_sl)));
+ off_sl += head_size * n_head * attn_sl * attn_bs;
```
>Note: You can set larger [`NE_MAX_NODES`](neural_speed/core/ne.h#43) and [`model_scratch_enlarge_scale`](neural_speed/models/llama/llama.h#29) values if out of memory when the inputs' batch size becomes larger.
>Note: You can set larger [`NE_MAX_NODES`](neural_speed/core/ne.h#43) and [`scratch_size_ratio`](neural_speed/models/llama/llama.h#29) values if out of memory when the inputs' batch size becomes larger.
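For reference, a minimal runtime sketch of these settings through the Python API this PR extends (the checkpoint name and values below are illustrative assumptions; `ctx_size` and `scratch_size_ratio` are forwarded to `init_model()` via `**generate_kwargs`):

```python
from neural_speed import Model

model = Model()
model.init("meta-llama/Llama-2-7b-chat-hf")  # hypothetical checkpoint; converted and quantized on first use

# If scratch_size_ratio is omitted, init_from_bin() picks one from the ctx_size heuristic added in this PR.
model.init_from_bin(model.model_type, model.bin_file, ctx_size=4096, scratch_size_ratio=2)
```

Changing `NE_MAX_NODES`, by contrast, is a compile-time adjustment and still requires re-compiling.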

## 2.3. Application
- Q4_0 quant: We can quantize the model produced by `convert` into an int4 low-bit file by adding a quant layer class, which yields better inference performance. Register the quant layer class in your new_model_utils.cpp, as in [gptneox_utils.cpp](neural_speed/models/gptneox/gptneox_utils.cpp#L163), replacing `gptneox_quant_layer` with your `new_model_quant_layer`.
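As a companion to the C++ registration step above, here is a hedged sketch of driving that quantization from Python; the keyword arguments mirror `Model.init()` in `neural_speed/__init__.py`, and the checkpoint name is only an example:

```python
from neural_speed import Model

model = Model()
# use_quant=True first converts the checkpoint to an FP32 binary, then quantizes it into
# an int4 low-bit file using the quant layer class registered for this model architecture.
model.init("EleutherAI/polyglot-ko-5.8b",
           use_quant=True,
           weight_dtype="int4",
           alg="sym",
           group_size=32,
           scale_dtype="fp32",
           compute_dtype="int8")
print(model.bin_file)  # path to the quantized model used for inference
```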
2 changes: 1 addition & 1 deletion docs/gptq_and_awq.md
@@ -13,7 +13,7 @@ Validated GPTQ & AWQ models directly from the HuggingFace:
* [Qwen-7B-Chat-GPTQ](https://huggingface.co/TheBloke/Qwen-7B-Chat-GPTQ) & [Qwen-7B-Chat-AWQ](https://huggingface.co/TheBloke/Qwen-7B-Chat-AWQ) & [Qwen1.5-7B-Chat-GPTQ-Int4](https://huggingface.co/Qwen/Qwen1.5-7B-Chat-GPTQ-Int4)
* [SOLAR-10.7B-v1.0-GPTQ](https://huggingface.co/TheBloke/SOLAR-10.7B-v1.0-GPTQ)

Please check more validated GPTQ & AWQ models in the list of [supported_models](./docs/supported_models.md).
Please check more validated GPTQ & AWQ models in the list of [supported_models](./supported_models.md).

## Examples

19 changes: 19 additions & 0 deletions docs/supported_models.md
@@ -10,6 +10,7 @@ Neural Speed supports the following models:
<th colspan="4">INT8</th>
<th colspan="4">INT4</th>
<th rowspan="2">Transformer Version</th>
<th rowspan="2">Max tokens length</th>
</tr>
<tr>
<th>RTN</th>
@@ -36,6 +37,7 @@ Neural Speed supports the following models:
<td>✅</td>
<td>✅</td>
<td>Latest</td>
<td>4096</td>
</tr>
<tr>
<td><a href="https://huggingface.co/decapoda-research/llama-7b-hf" target="_blank" rel="noopener noreferrer">LLaMA-7B</a>,
@@ -49,6 +51,7 @@ Neural Speed supports the following models:
<td>✅</td>
<td>✅</td>
<td>Latest</td>
<td>2048</td>
</tr>
<td><a href="https://huggingface.co/codellama/CodeLlama-7b-Instruct-hf" target="_blank" rel="noopener noreferrer">CodeLlama-7b</a></td>
<td>✅</td>
@@ -60,6 +63,7 @@ Neural Speed supports the following models:
<td>✅</td>
<td>✅</td>
<td>Latest</td>
<td>16384</td>
</tr>
</tr>
<td><a href="https://huggingface.co/upstage/SOLAR-10.7B-Instruct-v1.0" target="_blank" rel="noopener noreferrer">Solar-10.7B</a></td>
@@ -72,6 +76,7 @@ Neural Speed supports the following models:
<td>✅</td>
<td>✅</td>
<td>Latest</td>
<td>4096</td>
</tr>
<tr>
<td><a href="https://huggingface.co/Intel/neural-chat-7b-v3-1" target="_blank" rel="noopener noreferrer">Neural-Chat-7B-v3-1</a>,
@@ -85,6 +90,7 @@ Neural Speed supports the following models:
<td>✅</td>
<td>✅</td>
<td>Latest</td>
<td>32768</td>
</tr>
<tr>
<td><a href="https://huggingface.co/mistralai/Mistral-7B-v0.1" target="_blank" rel="noopener noreferrer">Mistral-7B</a>,
@@ -98,6 +104,7 @@ Neural Speed supports the following models:
<td>✅</td>
<td>✅</td>
<td>4.36.0 or newer</td>
<td>32768</td>
</tr>
<tr>
<td><a href="https://huggingface.co/Qwen/Qwen-7B-Chat" target="_blank" rel="noopener noreferrer">Qwen-7B</a>,
@@ -113,6 +120,7 @@ Neural Speed supports the following models:
<td>✅</td>
<td>✅</td>
<td>Latest</td>
<td>8192 / 32768</td>
</tr>
<tr>
<td><a href="https://huggingface.co/EleutherAI/gpt-j-6b" target="_blank" rel="noopener noreferrer">GPT-J-6B</a></td>
@@ -125,6 +133,7 @@ Neural Speed supports the following models:
<td>✅</td>
<td>✅</td>
<td>Latest</td>
<td>2048</td>
</tr>
<tr>
<td><a href="https://huggingface.co/EleutherAI/gpt-neox-20b" target="_blank" rel="noopener noreferrer">GPT-NeoX-20B</a></td>
@@ -137,6 +146,7 @@ Neural Speed supports the following models:
<td> </td>
<td> </td>
<td>Latest</td>
<td>2048</td>
</tr>
<tr>
<td><a href="https://huggingface.co/databricks/dolly-v2-3b" target="_blank" rel="noopener noreferrer">Dolly-v2-3B</a></td>
@@ -149,6 +159,7 @@ Neural Speed supports the following models:
<td> </td>
<td> </td>
<td>4.28.1 or newer</td>
<td>2048</td>
</tr>
<tr>
<td><a href="https://huggingface.co/mosaicml/mpt-7b" target="_blank" rel="noopener noreferrer">MPT-7B</a>,
@@ -162,6 +173,7 @@ Neural Speed supports the following models:
<td> </td>
<td> </td>
<td>Latest</td>
<td>2048</td>
</tr>
<tr>
<td><a href="https://huggingface.co/tiiuae/falcon-7b" target="_blank" rel="noopener noreferrer">Falcon-7B</a>,
@@ -175,6 +187,7 @@ Neural Speed supports the following models:
<td> </td>
<td> </td>
<td>Latest</td>
<td>2048</td>
</tr>
<tr>
<td><a href="https://huggingface.co/bigscience/bloomz-7b1" target="_blank" rel="noopener noreferrer">BLOOM-7B</a></td>
@@ -187,6 +200,7 @@ Neural Speed supports the following models:
<td> </td>
<td> </td>
<td>Latest</td>
<td>2048</td>
</tr>
<tr>
<td><a href="https://huggingface.co/facebook/opt-125m" target="_blank" rel="noopener noreferrer">OPT-125m</a>,
@@ -201,6 +215,7 @@ Neural Speed supports the following models:
<td> </td>
<td> </td>
<td>Latest</td>
<td>2048</td>
</tr>
<tr>
<td><a href="https://huggingface.co/THUDM/chatglm-6b" target="_blank" rel="noopener noreferrer">ChatGLM-6B</a>,
@@ -214,6 +229,7 @@ Neural Speed supports the following models:
<td> </td>
<td> </td>
<td>4.33.1</td>
<td>2048 / 32768</td>
</tr>
<tr>
<td><a href="https://huggingface.co/baichuan-inc/Baichuan-13B-Chat" target="_blank" rel="noopener noreferrer">Baichuan-13B-Chat</a>,
@@ -227,6 +243,7 @@ Neural Speed supports the following models:
<td> </td>
<td> </td>
<td>4.33.1</td>
<td>4096</td>
</tr>
<tr>
<td><a href="https://huggingface.co/microsoft/phi-2" target="_blank" rel="noopener noreferrer">phi-2</a>,
@@ -241,6 +258,7 @@ Neural Speed supports the following models:
<td> </td>
<td> </td>
<td>Latest</td>
<td>2048</td>
</tr>
<tr>
<td><a href="https://huggingface.co/openai/whisper-tiny" target="_blank" rel="noopener noreferrer">Whisper-tiny</a>,
@@ -257,6 +275,7 @@ Neural Speed supports the following models:
<td> </td>
<td> </td>
<td>Latest</td>
<td>448</td>
</tr>
</tbody>
</table>
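
The new "Max token length" column comes from each checkpoint's Hugging Face config. Below is a minimal sketch of that lookup, mirroring the `get_max_seq_length()` helper this PR adds to `neural_speed/__init__.py` (the field priority and the 512 fallback follow that helper; Falcon configs carry none of these fields and are special-cased to 2048 there):

```python
from transformers import AutoConfig

def max_token_length(model_name: str) -> int:
    """Best-effort lookup of a model's maximum supported sequence length."""
    config = AutoConfig.from_pretrained(model_name, trust_remote_code=True).to_dict()
    for key in ("seq_length",               # chatglm2, bloom
                "max_position_embeddings",  # llama, qwen, opt, phi, gptneox, dolly
                "model_max_length",         # baichuan, baichuan2
                "n_positions",              # gptj
                "max_seq_len",              # mpt
                "max_sequence_length",      # chatglm
                "max_length"):              # whisper
        if key in config:
            return config[key]
    return 512  # fallback used by this PR when no known field is present
```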
136 changes: 116 additions & 20 deletions neural_speed/__init__.py
@@ -24,6 +24,7 @@


class Model:

def __init__(self):
self.module = None
self.model = None
@@ -84,9 +85,19 @@ def get_model_type(model_config):
model_type = "chatglm2"
return model_type

def init(self, model_name, use_quant=True, use_gptq=False, use_awq=False, use_autoround=False,
weight_dtype="int4", alg="sym", group_size=32,
scale_dtype="fp32", compute_dtype="int8", use_ggml=False, model_hub="huggingface"):
def init(self,
model_name,
use_quant=True,
use_gptq=False,
use_awq=False,
use_autoround=False,
weight_dtype="int4",
alg="sym",
group_size=32,
scale_dtype="fp32",
compute_dtype="int8",
use_ggml=False,
model_hub="huggingface"):
if model_hub == "modelscope":
from modelscope import AutoConfig
self.config = AutoConfig.from_pretrained(model_name, trust_remote_code=True)
@@ -124,24 +135,28 @@ def init(self, model_name, use_quant=True, use_gptq=False, use_awq=False, use_au
self.bin_file = quant_bin

if os.path.exists(self.bin_file):
print("{} existed, will use cache file. Otherwise please remove the file".
format(self.bin_file))
print("{} existed, will use cache file. Otherwise please remove the file".format(self.bin_file))
return

if use_gptq or use_awq or use_autoround:
convert_model(model_name, quant_bin, use_quantized_model=True)
return

if not os.path.exists(fp32_bin):
convert_model(model_name, fp32_bin, "f32", model_hub = model_hub)
convert_model(model_name, fp32_bin, "f32", model_hub=model_hub)
assert os.path.exists(fp32_bin), "Fail to convert pytorch model"

if not use_quant:
print("FP32 model will be used.")
return
self.module.Model.quant_model(model_path=fp32_bin, out_path=quant_bin,
weight_dtype=weight_dtype, alg=alg, group_size=group_size,
scale_dtype=scale_dtype, compute_dtype=compute_dtype, use_ggml=use_ggml)
self.module.Model.quant_model(model_path=fp32_bin,
out_path=quant_bin,
weight_dtype=weight_dtype,
alg=alg,
group_size=group_size,
scale_dtype=scale_dtype,
compute_dtype=compute_dtype,
use_ggml=use_ggml)
assert os.path.exists(quant_bin), "Fail to quantize model"

# clean
@@ -150,9 +165,11 @@ def init(self, model_name, use_quant=True, use_gptq=False, use_awq=False, use_au
def init_from_bin(self, model_type, model_path, **generate_kwargs):
self.__import_package(model_type)
self.model = self.module.Model()

if self.max_request_num == -1:
self.max_request_num = max(generate_kwargs.get("max_request_num",
max_request_num_default), generate_kwargs.get("batch_size", 1))
self.max_request_num = max(generate_kwargs.get("max_request_num", max_request_num_default),
generate_kwargs.get("batch_size", 1))

if "threads" not in generate_kwargs:
threads = os.getenv("OMP_NUM_THREADS")
import platform
@@ -165,29 +182,107 @@ def init_from_bin(self, model_type, model_path, **generate_kwargs):
generate_kwargs["threads"] = len(os.sched_getaffinity(0))
else:
generate_kwargs["threads"] = int(threads)
self.model.init_model(model_path, **generate_kwargs)

# Setting scratch_size_ratio according to the ctx_size & tokens_length
# If scratch_size_ratio has been set, will not enter this branch.
if generate_kwargs.get("ctx_size") is not None and generate_kwargs.get(
"ctx_size") > 2048 and generate_kwargs.get("scratch_size_ratio") is None:

def get_max_seq_length():
config = self.config.to_dict()
# chatglm2, bloom
if 'seq_length' in config:
return config['seq_length']
# qwen2, llama-2, llama, dolly, gptneox, qwen, qwen1.5, opt, phi
elif 'max_position_embeddings' in config:
return config['max_position_embeddings']
# baichuan, baichuan2
elif 'model_max_length' in config:
return config['model_max_length']
# gptj
elif 'n_positions' in config:
return config['n_positions']
# mpt
elif 'max_seq_len' in config:
return config['max_seq_len']
# chatglm
elif 'max_sequence_length' in config:
return config['max_sequence_length']
# whisper
elif 'max_length' in config:
return config['max_length']
# Falcon does not have these parameters.
elif model_type == "falcon":
return 2048
else:
print("Not found max seq length, setting to default 512")
return 512

# when tokens less than 10240
def get_scratch_size_ratio(size):
if size > 2048 and size <= 4096:
return 2
elif size > 4096 and size <= 8192:
return 4
elif size > 8192 and size <= 10240:
return 8
else:
# more than 10240
return -1

max_seq_length = get_max_seq_length()
ctx_size = generate_kwargs.get("ctx_size")

if ctx_size > max_seq_length:
print(f'max_seq_length is {max_seq_length}, but ctx_size is {ctx_size}. Please reduce ctx_size.')
exit(0)
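# Summary of the scratch_size_ratio heuristic below (added in this PR): the ratio is
# chosen from the model's max sequence length and then refined by the requested ctx_size.
#   max_seq_length in (2048, 4096]                -> 2
#   max_seq_length in (4096, 8192]                -> 4
#   max_seq_length > 8192 and ctx_size <= 10240   -> 2 / 4 / 8 by ctx_size tier
#   max_seq_length == 16384 and ctx_size > 10240  -> 12
#   max_seq_length == 32768 and ctx_size > 10240  -> 20 (ctx_size < 20480) or 35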

if max_seq_length > 2048 and max_seq_length <= 4096:
generate_kwargs["scratch_size_ratio"] = 2
elif max_seq_length > 4096 and max_seq_length <= 8192:
generate_kwargs["scratch_size_ratio"] = 4
elif max_seq_length > 8192:
if get_scratch_size_ratio(ctx_size) != -1:
generate_kwargs["scratch_size_ratio"] = get_scratch_size_ratio(ctx_size)
else:
if max_seq_length == 16384:
generate_kwargs["scratch_size_ratio"] = 12
elif max_seq_length == 32768:
if ctx_size < 20480:
generate_kwargs["scratch_size_ratio"] = 20
else:
generate_kwargs["scratch_size_ratio"] = 35

self.model.init_model(model_path, **generate_kwargs)

def quant_model(self, model_type, model_path, out_path, **quant_kwargs):
self.__import_package(model_type)
self.module.Model.quant_model(model_path=model_path, out_path=out_path, **quant_kwargs)

def generate(self,
input_ids,
streamer=None,
interactive=False,
ignore_prompt=False,
stopping_criteria=None,
**generate_kwargs):
batch_size = input_ids.shape[0]

def generate(self, input_ids, streamer=None, interactive=False, ignore_prompt=False,
stopping_criteria=None, **generate_kwargs):
max_new_tokens = generate_kwargs.get("max_new_tokens", -1)
input_bs = input_ids.shape[0]
max_request_num = generate_kwargs.pop("max_request_num", max_request_num_default)
reinit_from_bin = False
if max_request_num > self.max_request_num or input_bs > self.max_request_num:
if max_request_num > self.max_request_num or batch_size > self.max_request_num:
reinit_from_bin = True
if self.max_request_num > 0:
print("Will start to reinit model from bin due to different max request num.")
self.max_request_num = max(input_bs, max_request_num)
self.max_request_num = max(batch_size, max_request_num)

if self.model is None or reinit_from_bin:
self.init_from_bin(self.model_type, self.bin_file, batch_size=input_bs,
max_request_num = self.max_request_num, **generate_kwargs)
self.init_from_bin(self.model_type,
self.bin_file,
batch_size=batch_size,
max_request_num=self.max_request_num,
**generate_kwargs)
self.generate_round = 0
elif not interactive:
self.model.reinit()
@@ -208,6 +303,7 @@ def generate(self, input_ids, streamer=None, interactive=False, ignore_prompt=Fa
assert input_ids.shape[0] == 1, "Streamer only supports batch size 1."
assert beam_search == False, "ERROR, can not use streamer when use beam search for generation! \
Make sure that `num_beams` is set to 1."

if self.generate_round == 0 and not ignore_prompt:
streamer.put(input_ids)

@@ -284,6 +380,6 @@ def _cont_batching_input(self, input_ids, pad_token_id=None):
for il in range(len(input_list)):
count = input_list[il].count(pti)
# padding left
del input_list[il][0: count]
del input_list[il][0:count]
assert input_list[il] != [], "there are all pad tokens in batch {}.".format(il)
return input_list
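
Taken together, a hedged end-to-end usage sketch of the Python API this PR touches (the checkpoint and generation settings are illustrative assumptions):

```python
from transformers import AutoTokenizer, TextStreamer
from neural_speed import Model

model_name = "Intel/neural-chat-7b-v3-1"  # example checkpoint from the supported-models table
tokenizer = AutoTokenizer.from_pretrained(model_name, trust_remote_code=True)
inputs = tokenizer("Once upon a time, there existed a little girl,", return_tensors="pt").input_ids

model = Model()
model.init(model_name, use_quant=True, weight_dtype="int4", compute_dtype="int8")

# A ctx_size above 2048 triggers the scratch_size_ratio heuristic above unless a
# ratio is passed explicitly alongside it.
streamer = TextStreamer(tokenizer)
outputs = model.generate(inputs, streamer=streamer, max_new_tokens=300, ctx_size=4096)
```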