diff --git a/docs/supported_models.md b/docs/supported_models.md
index df8135677..115dd693a 100644
--- a/docs/supported_models.md
+++ b/docs/supported_models.md
@@ -94,6 +94,7 @@ Neural Speed supports the following models:
Mistral-7B,
+ Mistral-7B-Instruct-v0.2,
Mixtral-8x7B |
✅ |
✅ |
@@ -402,7 +403,7 @@ Neural Speed supports the following models:
|
- TheBloke/Mistral-7B-v0.1-GGUF,
+ TheBloke/Mistral-7B-v0.1-GGUF, TheBloke/Mistral-7B-v0.2-GGUF,
| ✅ |
✅ |
✅ |
@@ -410,7 +411,7 @@ Neural Speed supports the following models:
|
- TheBloke/Mixtral-8x7B-Instruct-v0.1-GGUFF,
+ TheBloke/Mixtral-8x7B-Instruct-v0.1-GGUF,
| ✅ |
✅ |
✅ |
@@ -425,18 +426,16 @@ Neural Speed supports the following models:
✅ |
|
-
- TheBloke/CodeLlama-7B-GGUF |
+ TheBloke/CodeLlama-7B-GGUF, TheBloke/CodeLlama-13B-GGUF |
✅ |
✅ |
✅ |
✅ |
|
-
- TheBloke/CodeLlama-13B-GGUF |
+ Qwen1.5-7B-Chat-GGUF |
✅ |
✅ |
✅ |
@@ -470,7 +469,7 @@ Neural Speed supports the following models:
Qwen-7B-Chat,
- Qwen1.5-7B-Chat-GGUF |
+ Qwen1.5-7B-Chat |
✅ |
✅ |
✅ |
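
For the new GGUF rows, usage follows the example script touched at the end of this patch. A minimal sketch, assuming a locally downloaded GGUF file and the `Model.init_from_bin`/`generate` entry points that script relies on; the model name, file name, and `"mistral"` architecture tag below are illustrative assumptions:

```python
from transformers import AutoTokenizer, TextStreamer
from neural_speed import Model

# Illustrative pair: any model/GGUF combination from the table above works the same way.
model_path = "mistralai/Mistral-7B-Instruct-v0.2"   # tokenizer source (assumed)
gguf_path = "mistral-7b-instruct-v0.2.Q4_0.gguf"    # local GGUF weights (assumed)

tokenizer = AutoTokenizer.from_pretrained(model_path, trust_remote_code=True)
inputs = tokenizer("Once upon a time", return_tensors="pt").input_ids
streamer = TextStreamer(tokenizer)

model = Model()
# The first argument names the architecture; the GGUF file supplies the weights.
model.init_from_bin("mistral", gguf_path)
model.generate(inputs, streamer=streamer, max_new_tokens=32)
```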
diff --git a/neural_speed/models/model_utils/model_files.h b/neural_speed/models/model_utils/model_files.h
index b586d0ea5..1a72d4c02 100644
--- a/neural_speed/models/model_utils/model_files.h
+++ b/neural_speed/models/model_utils/model_files.h
@@ -1138,15 +1138,15 @@ struct model_file_loader {
printf("%-16s %d.hparams.original_max_position_embeddings = %-30d\n", __func__, count++,
hparams.original_max_position_embeddings);
printf("%-16s %d.hparams.use_yarn = %-30d\n", __func__, count++, hparams.use_yarn);
- unsigned int total = 25;
+ unsigned int total = 26;
if (count != total) {
- fprintf(stderr, "The number of ne_parameters is wrong.\n");
+ fprintf(stderr, "The number of ne_parameters is wrong, total = %u, count = %u.\n", total, count);
}
}
void load_ne_vocab() {
unsigned int count = 0;
- unsigned int ne_hparams_total = 25;
+ unsigned int ne_hparams_total = 26;
file.read_raw(&vocab.bos_token_id, sizeof(model_vocab::id));
file.read_raw(&vocab.eos_token_id, sizeof(model_vocab::id));
file.read_raw(&vocab.pad_token_id, sizeof(model_vocab::id));
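
The 25 → 26 bump keeps the reader's expected field count in sync with an extra hyperparameter presumably written into the binary header elsewhere in this change (not shown in these hunks). The count/total pair is a simple serialization guard; a minimal Python sketch of the same pattern, with an illustrative field list rather than the real 26 names:

```python
import struct

EXPECTED_NE_HPARAMS = 26  # must track the writer; bumped from 25 in this patch

def read_hparams(f, names):
    """Read one little-endian int32 per field, counting reads like the C++ loader."""
    count = 0
    hparams = {}
    for name in names:
        (hparams[name],) = struct.unpack("<i", f.read(4))
        count += 1
    if count != EXPECTED_NE_HPARAMS:
        # Mirrors the fprintf above: report both numbers so the mismatch is obvious.
        raise ValueError(f"ne_parameters mismatch: total={EXPECTED_NE_HPARAMS}, count={count}")
    return hparams
```

(The real header mixes integer and float fields; a single int32 per field is a simplification for the sketch.)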
diff --git a/scripts/python_api_example_for_gguf.py b/scripts/python_api_example_for_gguf.py
index 905071ec4..ccd5d5688 100644
--- a/scripts/python_api_example_for_gguf.py
+++ b/scripts/python_api_example_for_gguf.py
@@ -50,7 +50,7 @@ def main(args_in: Optional[List[str]] = None) -> None:
gguf_path = args.model.as_posix()
- prompt = "Once upon a time"
+ prompt = args.prompt
tokenizer = AutoTokenizer.from_pretrained(args.model_path, trust_remote_code=True)
inputs = tokenizer(prompt, return_tensors="pt").input_ids
streamer = TextStreamer(tokenizer)
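
With the hard-coded string gone, the prompt comes from the command line. The flag definition itself is outside this hunk; a plausible sketch, where the flag name and default are assumptions inferred from the replaced literal:

```python
import argparse

parser = argparse.ArgumentParser(description="Run a GGUF model with Neural Speed")
# Assumed flag: only `args.prompt` is visible in the diff.
parser.add_argument("-p", "--prompt", type=str, default="Once upon a time",
                    help="prompt to start generation with")
args = parser.parse_args()
```

Keeping the old literal as the default would preserve the previous behavior when no prompt is passed.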