diff --git a/README.md b/README.md
index 57121ab41..563274fbb 100644
--- a/README.md
+++ b/README.md
@@ -60,7 +60,22 @@
 streamer = TextStreamer(tokenizer)
 model = AutoModelForCausalLM.from_pretrained(model_name, model_file = model_file)
 outputs = model.generate(inputs, streamer=streamer, max_new_tokens=300)
 ```
+PyTorch format ModelScope model
+```python
+import sys
+from modelscope import AutoTokenizer
+from transformers import TextStreamer
+from neural_speed import Model
+model_name = "qwen/Qwen1.5-7B-Chat"     # ModelScope model_id or local model
+prompt = "Once upon a time, there existed a little girl,"
+tokenizer = AutoTokenizer.from_pretrained(model_name, trust_remote_code=True)
+inputs = tokenizer(prompt, return_tensors="pt").input_ids
+streamer = TextStreamer(tokenizer)
+model = Model()
+model.init(model_name, weight_dtype="int4", compute_dtype="int8", model_hub="modelscope")
+outputs = model.generate(inputs, streamer=streamer, max_new_tokens=300)
+```
 Please refer [this link](./docs/supported_models.md) to check supported models.
 
 If you want to use [Transformer-based API](https://github.com/intel/intel-extension-for-transformers/blob/main/docs/weightonlyquant.md#llm-runtime-example-code) in [ITREX(Intel extension for transformers)](https://github.com/intel/intel-extension-for-transformers). Please refer to [ITREX Installation Page](https://github.com/intel/intel-extension-for-transformers/blob/main/docs/installation.md).
diff --git a/neural_speed/convert/convert_bloom.py b/neural_speed/convert/convert_bloom.py
index d68c5f56b..6b9415f96 100644
--- a/neural_speed/convert/convert_bloom.py
+++ b/neural_speed/convert/convert_bloom.py
@@ -70,15 +70,16 @@ def main(args_in: Optional[List[str]] = None) -> None:
         from modelscope import AutoConfig, AutoModelForCausalLM, AutoTokenizer
     else:
         from transformers import AutoConfig, AutoModelForCausalLM, AutoTokenizer
-    tokenizer = AutoTokenizer.from_pretrained(dir_model)
     config = AutoConfig.from_pretrained(dir_model, trust_remote_code=True)
     hparams = config.to_dict()
     print("Loading model: ", dir_model)
     model = AutoModelForCausalLM.from_pretrained(dir_model,
                                                  config=config,
                                                  torch_dtype=torch.float16 if ftype == 1 else torch.float32,
                                                  low_cpu_mem_usage=True,
                                                  trust_remote_code=True)
+    print("Model loaded: ", dir_model)
+    tokenizer = AutoTokenizer.from_pretrained(dir_model)
 
     fout = open(fname_out, "wb")
 
diff --git a/neural_speed/convert/convert_chatglm.py b/neural_speed/convert/convert_chatglm.py
index 171ffce6a..562cd7763 100644
--- a/neural_speed/convert/convert_chatglm.py
+++ b/neural_speed/convert/convert_chatglm.py
@@ -633,9 +633,9 @@ def main(args_in: Optional[List[str]] = None) -> None:
         from modelscope import AutoConfig, AutoModel, AutoTokenizer
     else:
         from transformers import AutoConfig, AutoModel, AutoTokenizer
+    model = AutoModel.from_pretrained(dir_model, low_cpu_mem_usage=True, trust_remote_code=True)
     config = AutoConfig.from_pretrained(dir_model, trust_remote_code=True)
     tokenizer = AutoTokenizer.from_pretrained(dir_model, trust_remote_code=True)
-    model = AutoModel.from_pretrained(dir_model, low_cpu_mem_usage=True, trust_remote_code=True)
 
     hparams = config.to_dict()
 
diff --git a/neural_speed/convert/convert_dolly.py b/neural_speed/convert/convert_dolly.py
index 47335f94d..ac11f332f 100644
--- a/neural_speed/convert/convert_dolly.py
+++ b/neural_speed/convert/convert_dolly.py
@@ -78,9 +78,9 @@ def main(args_in: Optional[List[str]] = None) -> None:
         from modelscope import AutoModelForCausalLM, AutoTokenizer
     else:
         from transformers import AutoModelForCausalLM, AutoTokenizer
-    tokenizer = AutoTokenizer.from_pretrained(dir_model)
     print("Loading model: ", dir_model)
     model = AutoModelForCausalLM.from_pretrained(dir_model, torch_dtype=torch.float16 if ftype == 1 else torch.float32)
+    tokenizer = AutoTokenizer.from_pretrained(dir_model)
     model.eval()
     for p in model.parameters():
         p.requires_grad = False
diff --git a/neural_speed/convert/convert_falcon.py b/neural_speed/convert/convert_falcon.py
index cf4f15f2e..6c974008c 100644
--- a/neural_speed/convert/convert_falcon.py
+++ b/neural_speed/convert/convert_falcon.py
@@ -70,13 +70,12 @@ def main(args_in: Optional[List[str]] = None) -> None:
         from modelscope import AutoConfig, AutoModelForCausalLM, AutoTokenizer
     else:
         from transformers import AutoConfig, AutoModelForCausalLM, AutoTokenizer
-    tokenizer = AutoTokenizer.from_pretrained(dir_model, trust_remote_code=True)
     config = AutoConfig.from_pretrained(dir_model, trust_remote_code=True)
     with open(os.path.join(dir_model, "config.json"), "r", encoding="utf-8") as f:
         hparams = json.load(f)
     if hparams["architectures"][0] != "FalconForCausalLM":
         print("Model architecture not supported: " + hparams["architectures"][0])
         sys.exit(1)
     print("Loading model: ", dir_model)
     model = AutoModelForCausalLM.from_pretrained(dir_model,
                                                  config=config,
@@ -84,6 +83,7 @@ def main(args_in: Optional[List[str]] = None) -> None:
                                                  low_cpu_mem_usage=True,
                                                  trust_remote_code=True)
     print("Model loaded: ", dir_model)
+    tokenizer = AutoTokenizer.from_pretrained(dir_model, trust_remote_code=True)
 
     n_head_kv = hparams.get("num_kv_heads", 1)
     n_head = hparams["num_attention_heads"]
diff --git a/neural_speed/convert/convert_gptj.py b/neural_speed/convert/convert_gptj.py
index 8c67bbec1..48acf3648 100644
--- a/neural_speed/convert/convert_gptj.py
+++ b/neural_speed/convert/convert_gptj.py
@@ -73,8 +73,8 @@ def main(args_in: Optional[List[str]] = None) -> None:
     else:
         from transformers import AutoModelForCausalLM, AutoTokenizer
     print("Loading model: ", dir_model)
-    tokenizer = AutoTokenizer.from_pretrained(dir_model, trust_remote_code=True)
     model = AutoModelForCausalLM.from_pretrained(dir_model, low_cpu_mem_usage=True)
+    tokenizer = AutoTokenizer.from_pretrained(dir_model, trust_remote_code=True)
     hparams = model.config.to_dict()
     list_vars = model.state_dict()
     fout = open(fname_out, "wb")
diff --git a/neural_speed/convert/convert_gptneox.py b/neural_speed/convert/convert_gptneox.py
index cee3c7bbd..8b491a840 100644
--- a/neural_speed/convert/convert_gptneox.py
+++ b/neural_speed/convert/convert_gptneox.py
@@ -79,9 +79,9 @@ def main(args_in: Optional[List[str]] = None) -> None:
         from modelscope import AutoModelForCausalLM, AutoTokenizer
     else:
         from transformers import AutoModelForCausalLM, AutoTokenizer
-    tokenizer = AutoTokenizer.from_pretrained(dir_model)
     print("Loading model: ", dir_model)
     model = AutoModelForCausalLM.from_pretrained(dir_model, torch_dtype=torch.float16 if ftype == 1 else torch.float32)
+    tokenizer = AutoTokenizer.from_pretrained(dir_model)
     model.eval()
     for p in model.parameters():
         p.requires_grad = False
diff --git a/neural_speed/convert/convert_mpt.py b/neural_speed/convert/convert_mpt.py
index 0ad152141..e7ffe6929 100644
--- a/neural_speed/convert/convert_mpt.py
+++ b/neural_speed/convert/convert_mpt.py
@@ -67,8 +67,8 @@ def main(args_in: Optional[List[str]] = None) -> None:
         from modelscope import AutoModelForCausalLM, AutoTokenizer
     else:
         from transformers import AutoModelForCausalLM, AutoTokenizer
-    tokenizer = AutoTokenizer.from_pretrained(dir_model, trust_remote_code=True)
     model = AutoModelForCausalLM.from_pretrained(dir_model, low_cpu_mem_usage=True, trust_remote_code=True)
+    tokenizer = AutoTokenizer.from_pretrained(dir_model, trust_remote_code=True)
     hparams = model.config.to_dict()
 
     list_vars = model.state_dict()
diff --git a/neural_speed/convert/convert_opt.py b/neural_speed/convert/convert_opt.py
index 7dcb8620d..182f00c69 100644
--- a/neural_speed/convert/convert_opt.py
+++ b/neural_speed/convert/convert_opt.py
@@ -77,9 +77,9 @@ def main(args_in: Optional[List[str]] = None) -> None:
         from modelscope import AutoModelForCausalLM, AutoTokenizer
     else:
         from transformers import AutoModelForCausalLM, AutoTokenizer
-    tokenizer = AutoTokenizer.from_pretrained(dir_model)
     print("Loading model: ", dir_model)
     model = AutoModelForCausalLM.from_pretrained(dir_model, torch_dtype=torch.float16 if ftype == 1 else torch.float32)
+    tokenizer = AutoTokenizer.from_pretrained(dir_model)
     model.eval()
 
     hparams = model.config.to_dict()
diff --git a/neural_speed/convert/convert_phi.py b/neural_speed/convert/convert_phi.py
index dad0b9574..3e45ad031 100644
--- a/neural_speed/convert/convert_phi.py
+++ b/neural_speed/convert/convert_phi.py
@@ -289,9 +289,9 @@ def main(args_in: Optional[List[str]] = None) -> None:
         from modelscope import AutoModelForCausalLM, AutoTokenizer
     else:
         from transformers import AutoModelForCausalLM, AutoTokenizer
-    tokenizer = AutoTokenizer.from_pretrained(dir_model, trust_remote_code=True)
     print("Loading model: ", dir_model)
     model = AutoModelForCausalLM.from_pretrained(dir_model, trust_remote_code=True)
+    tokenizer = AutoTokenizer.from_pretrained(dir_model, trust_remote_code=True)
     hparams = model.config.to_dict()
     if args.format == "GGUF":
         phi_convert_gguf(model, tokenizer, dir_model, fname_out, ftype, hparams)
diff --git a/neural_speed/convert/convert_qwen.py b/neural_speed/convert/convert_qwen.py
index 329b03245..7c6eb159b 100644
--- a/neural_speed/convert/convert_qwen.py
+++ b/neural_speed/convert/convert_qwen.py
@@ -73,8 +73,6 @@ def main(args_in: Optional[List[str]] = None) -> None:
     # ftype == 0 -> float32
     # ftype == 1 -> float16
     ftype = 0
-    import pdb
-    pdb.set_trace()
     if args.outtype == "f16":
         ftype = 1
     if args.model_hub == "modelscope":
@@ -82,8 +80,8 @@ def main(args_in: Optional[List[str]] = None) -> None:
     else:
         from transformers import AutoModelForCausalLM, AutoTokenizer
     print("Loading model: ", dir_model)
-    tokenizer = AutoTokenizer.from_pretrained(dir_model)
     model = AutoModelForCausalLM.from_pretrained(dir_model)
+    tokenizer = AutoTokenizer.from_pretrained(dir_model)
     model.eval()
     for p in model.parameters():
         p.requires_grad = False
diff --git a/neural_speed/convert/convert_starcoder.py b/neural_speed/convert/convert_starcoder.py
index 628b431b9..fd32d2109 100644
--- a/neural_speed/convert/convert_starcoder.py
+++ b/neural_speed/convert/convert_starcoder.py
@@ -74,15 +74,15 @@ def main(args_in: Optional[List[str]] = None) -> None:
     else:
         from transformers import AutoConfig, AutoModelForCausalLM, AutoTokenizer
     print("Loading model: ", dir_model)
-    tokenizer = AutoTokenizer.from_pretrained(dir_model)
     config = AutoConfig.from_pretrained(dir_model, trust_remote_code=True)
     hparams = config.to_dict()
     model = AutoModelForCausalLM.from_pretrained(dir_model,
                                                  config=config,
                                                  torch_dtype=torch.float16 \
                                                      if use_f16 else torch.float32,
                                                  low_cpu_mem_usage=True,
                                                  trust_remote_code=True)
     print("Model loaded: ", dir_model)
+    tokenizer = AutoTokenizer.from_pretrained(dir_model)
 
     list_vars = model.state_dict()