intel · VincyZhang · Mar 27, 2024 · Mar 25, 2024 · Mar 25, 2024 · Mar 25, 2024
diff --git a/docs/supported_models.md b/docs/supported_models.md
@@ -94,6 +94,7 @@ Neural Speed supports the following models:
   </tr>
   <tr>
     <td><a href="https://huggingface.co/mistralai/Mistral-7B-v0.1" target="_blank" rel="noopener noreferrer">Mistral-7B</a>,
+     <a href="https://huggingface.co/mistralai/Mistral-7B-Instruct-v0.2" target="_blank" rel="noopener noreferrer">Mistral-7B-Instruct-v0.2</a>,
      <a href="https://huggingface.co/mistralai/Mixtral-8x7B-Instruct-v0.1" target="_blank" rel="noopener noreferrer">Mixtral-8x7B</a></td>
     <td>✅</td>
     <td>✅</td>
@@ -402,15 +403,15 @@ Neural Speed supports the following models:
     <td></td>
   </tr>
   <tr>
-    <td><a href="https://huggingface.co/TheBloke/Mistral-7B-v0.1-GGUF" target="_blank" rel="noopener noreferrer">TheBloke/Mistral-7B-v0.1-GGUF</a>,
+    <td><a href="https://huggingface.co/TheBloke/Mistral-7B-v0.1-GGUF" target="_blank" rel="noopener noreferrer">TheBloke/Mistral-7B-v0.1-GGUF</a>, <a href="https://huggingface.co/TheBloke/Mistral-7B-v0.2-GGUF" target="_blank" rel="noopener noreferrer">TheBloke/Mistral-7B-v0.2-GGUF</a></td>
     <td>✅</td>
     <td>✅</td>
     <td>✅</td>
     <td>✅</td>
     <td></td>
   </tr>
   <tr>
-    <td><a href="https://huggingface.co/TheBloke/Mixtral-8x7B-Instruct-v0.1-GGUF" target="_blank" rel="noopener noreferrer">TheBloke/Mixtral-8x7B-Instruct-v0.1-GGUFF</a>,
+    <td><a href="https://huggingface.co/TheBloke/Mixtral-8x7B-Instruct-v0.1-GGUF" target="_blank" rel="noopener noreferrer">TheBloke/Mixtral-8x7B-Instruct-v0.1-GGUF</a></td>
     <td>✅</td>
     <td>✅</td>
     <td>✅</td>
@@ -425,18 +426,16 @@ Neural Speed supports the following models:
     <td>✅</td>
     <td></td>
   </tr>
-    </tr>
     <tr>
-    <td><a href="https://huggingface.co/codellama/CodeLlama-7b-hf" target="_blank" rel="noopener noreferrer">TheBloke/CodeLlama-7B-GGUF</a></td>
+    <td><a href="https://huggingface.co/codellama/CodeLlama-7b-hf" target="_blank" rel="noopener noreferrer">TheBloke/CodeLlama-7B-GGUF</a>, <a href="https://huggingface.co/codellama/CodeLlama-13b-hf" target="_blank" rel="noopener noreferrer">TheBloke/CodeLlama-13B-GGUF</a></td>
     <td>✅</td>
     <td>✅</td>
     <td>✅</td>
     <td>✅</td>
     <td></td>
   </tr>
-    </tr>
     <tr>
-    <td><a href="https://huggingface.co/codellama/CodeLlama-13b-hf" target="_blank" rel="noopener noreferrer">TheBloke/CodeLlama-13B-GGUF</a></td>
+    <td><a href="https://huggingface.co/Qwen/Qwen1.5-7B-Chat-GGUF" target="_blank" rel="noopener noreferrer">Qwen1.5-7B-Chat-GGUF</a></td>
     <td>✅</td>
     <td>✅</td>
     <td>✅</td>
@@ -470,7 +469,7 @@ Neural Speed supports the following models:
   </tr>
   <tr>
     <td><a href="https://huggingface.co/Qwen/Qwen-7B-Chat" target="_blank" rel="noopener noreferrer">Qwen-7B-Chat</a>,
-    <a href="https://huggingface.co/Qwen/Qwen1.5-7B-Chat-GGUF" target="_blank" rel="noopener noreferrer">Qwen1.5-7B-Chat-GGUF</a></td>
+    <a href="https://huggingface.co/Qwen/Qwen1.5-7B-Chat" target="_blank" rel="noopener noreferrer">Qwen1.5-7B-Chat</a></td>
     <td>✅</td>
     <td>✅</td>
     <td>✅</td>

diff --git a/neural_speed/models/model_utils/model_files.h b/neural_speed/models/model_utils/model_files.h
@@ -1138,15 +1138,15 @@ struct model_file_loader {
     printf("%-16s %d.hparams.original_max_position_embeddings = %-30d\n", __func__, count++,
            hparams.original_max_position_embeddings);
     printf("%-16s %d.hparams.use_yarn = %-30d\n", __func__, count++, hparams.use_yarn);
-    unsigned int total = 25;
+    unsigned int total = 26;
     if (count != total) {
-      fprintf(stderr, "The number of ne_parameters is wrong.\n");
+      fprintf(stderr, "The number of ne_parameters is wrong, total = %d, count = %d.\n", total, count);
     }
   }
 
   void load_ne_vocab() {
     unsigned int count = 0;
-    unsigned int ne_hparams_total = 25;
+    unsigned int ne_hparams_total = 26;
     file.read_raw(&vocab.bos_token_id, sizeof(model_vocab::id));
     file.read_raw(&vocab.eos_token_id, sizeof(model_vocab::id));
     file.read_raw(&vocab.pad_token_id, sizeof(model_vocab::id));

diff --git a/scripts/python_api_example_for_gguf.py b/scripts/python_api_example_for_gguf.py
@@ -50,7 +50,7 @@ def main(args_in: Optional[List[str]] = None) -> None:
 
     gguf_path = args.model.as_posix()
 
-    prompt = "Once upon a time"
+    prompt = args.prompt
     tokenizer = AutoTokenizer.from_pretrained(args.model_path, trust_remote_code=True)
     inputs = tokenizer(prompt, return_tensors="pt").input_ids
     streamer = TextStreamer(tokenizer)