diff --git a/docs/supported_models.md b/docs/supported_models.md
index df8135677..115dd693a 100644
--- a/docs/supported_models.md
+++ b/docs/supported_models.md
@@ -94,6 +94,7 @@ Neural Speed supports the following models:
 Mistral-7B,
+Mistral-7B-Instruct-v0.2,
 Mixtral-8x7B
 ✅
 ✅
@@ -402,7 +403,7 @@ Neural Speed supports the following models:
-TheBloke/Mistral-7B-v0.1-GGUF,
+TheBloke/Mistral-7B-v0.1-GGUF, TheBloke/Mistral-7B-v0.2-GGUF,
 ✅
 ✅
 ✅
@@ -410,7 +411,7 @@ Neural Speed supports the following models:
-TheBloke/Mixtral-8x7B-Instruct-v0.1-GGUFF,
+TheBloke/Mixtral-8x7B-Instruct-v0.1-GGUF
 ✅
 ✅
 ✅
@@ -425,18 +426,16 @@ Neural Speed supports the following models:
 ✅
-
-TheBloke/CodeLlama-7B-GGUF
+TheBloke/CodeLlama-7B-GGUF, TheBloke/CodeLlama-13B-GGUF
 ✅
 ✅
 ✅
 ✅
-
-TheBloke/CodeLlama-13B-GGUF
+Qwen1.5-7B-Chat-GGUF
 ✅
 ✅
 ✅
@@ -470,7 +469,7 @@ Neural Speed supports the following models:
 Qwen-7B-Chat,
-Qwen1.5-7B-Chat-GGUF
+Qwen1.5-7B-Chat
 ✅
 ✅
 ✅
diff --git a/neural_speed/models/model_utils/model_files.h b/neural_speed/models/model_utils/model_files.h
index b586d0ea5..1a72d4c02 100644
--- a/neural_speed/models/model_utils/model_files.h
+++ b/neural_speed/models/model_utils/model_files.h
@@ -1138,15 +1138,15 @@ struct model_file_loader {
     printf("%-16s %d.hparams.original_max_position_embeddings = %-30d\n", __func__, count++,
            hparams.original_max_position_embeddings);
     printf("%-16s %d.hparams.use_yarn = %-30d\n", __func__, count++, hparams.use_yarn);
-    unsigned int total = 25;
+    unsigned int total = 26;
     if (count != total) {
-      fprintf(stderr, "The number of ne_parameters is wrong.\n");
+      fprintf(stderr, "The number of ne_parameters is wrong, total = %d, count = %d.\n", total, count);
     }
   }

   void load_ne_vocab() {
     unsigned int count = 0;
-    unsigned int ne_hparams_total = 25;
+    unsigned int ne_hparams_total = 26;
     file.read_raw(&vocab.bos_token_id, sizeof(model_vocab::id));
     file.read_raw(&vocab.eos_token_id, sizeof(model_vocab::id));
     file.read_raw(&vocab.pad_token_id, sizeof(model_vocab::id));
diff --git a/scripts/python_api_example_for_gguf.py b/scripts/python_api_example_for_gguf.py
index 905071ec4..ccd5d5688 100644
--- a/scripts/python_api_example_for_gguf.py
+++ b/scripts/python_api_example_for_gguf.py
@@ -50,7 +50,7 @@ def main(args_in: Optional[List[str]] = None) -> None:
     gguf_path = args.model.as_posix()

-    prompt = "Once upon a time"
+    prompt = args.prompt
     tokenizer = AutoTokenizer.from_pretrained(args.model_path, trust_remote_code=True)
     inputs = tokenizer(prompt, return_tensors="pt").input_ids
     streamer = TextStreamer(tokenizer)
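
Note on the last hunk: `prompt = args.prompt` assumes the script's argument parser already exposes a prompt option, which lies outside the lines shown. Below is a minimal sketch of the wiring this relies on; the flag name `--prompt`, its default value, and the help strings are assumptions for illustration, not taken from this diff:

    import argparse
    from pathlib import Path

    # Hypothetical parser setup; only args.model, args.model_path, and
    # args.prompt are actually referenced by the script in the hunk above.
    parser = argparse.ArgumentParser(description="run a GGUF model via Neural Speed")
    parser.add_argument("--model", type=Path, help="path to the GGUF model file")
    parser.add_argument("--model_path", type=Path,
                        help="Hugging Face model directory used for the tokenizer")
    parser.add_argument("--prompt", type=str, default="Once upon a time",
                        help="generation prompt; replaces the previously hardcoded string")
    args = parser.parse_args()

If the parser keeps the old string as its default, existing invocations behave unchanged while callers can now override the prompt, e.g. `python scripts/python_api_example_for_gguf.py --model ./model.gguf --model_path ./hf_model --prompt "Hello"`.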