
update README
Signed-off-by: intellinjun <[email protected]>
intellinjun committed Mar 6, 2024
1 parent d693673 commit dc07b67
Showing 13 changed files with 39 additions and 24 deletions.
15 changes: 15 additions & 0 deletions README.md
@@ -60,7 +60,22 @@ streamer = TextStreamer(tokenizer)
model = AutoModelForCausalLM.from_pretrained(model_name, model_file = model_file)
outputs = model.generate(inputs, streamer=streamer, max_new_tokens=300)
```
PyTorch-format ModelScope model (pass `model_hub="modelscope"` when initializing the model to load a checkpoint hosted on ModelScope):
```python
import sys
from modelscope import AutoTokenizer
from transformers import TextStreamer
from neural_speed import Model

model_name = "qwen/Qwen1.5-7B-Chat" # Hugging Face model_id or local model
prompt = "Once upon a time, there existed a little girl,"
tokenizer = AutoTokenizer.from_pretrained(model_name, trust_remote_code=True)
inputs = tokenizer(prompt, return_tensors="pt").input_ids
streamer = TextStreamer(tokenizer)
model = Model()
model.init(model_name, weight_dtype="int4", compute_dtype="int8", model_hub="modelscope")
outputs = model.generate(inputs, streamer=streamer, max_new_tokens=300)
```
Please refer to [this link](./docs/supported_models.md) to check the supported models.

If you want to use the [Transformer-based API](https://github.com/intel/intel-extension-for-transformers/blob/main/docs/weightonlyquant.md#llm-runtime-example-code) in [ITREX (Intel Extension for Transformers)](https://github.com/intel/intel-extension-for-transformers), please refer to the [ITREX Installation Page](https://github.com/intel/intel-extension-for-transformers/blob/main/docs/installation.md).
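For readers following those links, the ITREX Transformer-based API looks roughly like the sketch below. This is an illustration based on the linked ITREX weight-only quantization docs, not part of this commit; the model id is only an example, and `load_in_4bit=True` selects INT4 weight-only quantization.
```python
# Illustrative sketch of the ITREX Transformer-based API (see the links above).
# Assumes `pip install intel-extension-for-transformers`; not part of this repository's code.
from transformers import AutoTokenizer, TextStreamer
from intel_extension_for_transformers.transformers import AutoModelForCausalLM

model_name = "Intel/neural-chat-7b-v3-1"  # example model id; any supported model works
prompt = "Once upon a time, there existed a little girl,"

tokenizer = AutoTokenizer.from_pretrained(model_name, trust_remote_code=True)
inputs = tokenizer(prompt, return_tensors="pt").input_ids
streamer = TextStreamer(tokenizer)

# INT4 weight-only quantization; generation runs on the LLM runtime backend
model = AutoModelForCausalLM.from_pretrained(model_name, load_in_4bit=True)
outputs = model.generate(inputs, streamer=streamer, max_new_tokens=300)
```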
9 changes: 5 additions & 4 deletions neural_speed/convert/convert_bloom.py
@@ -70,15 +70,16 @@ def main(args_in: Optional[List[str]] = None) -> None:
from modelscope import AutoConfig, AutoModelForCausalLM, AutoTokenizer
else:
from transformers import AutoConfig, AutoModelForCausalLM, AutoTokenizer
tokenizer = AutoTokenizer.from_pretrained(dir_model)
config = AutoConfig.from_pretrained(dir_model, trust_remote_code=True)
hparams = config.to_dict()
print("Loading model: ", dir_model)
model = AutoModelForCausalLM.from_pretrained(dir_model,
config=config,
torch_dtype=torch.float16 if ftype == 1 else torch.float32,
low_cpu_mem_usage=True,
trust_remote_code=True)
tokenizer = AutoTokenizer.from_pretrained(dir_model)
config = AutoConfig.from_pretrained(dir_model, trust_remote_code=True)
hparams = config.to_dict()
print("Loading model: ", dir_model)

print("Model loaded: ", dir_model)

fout = open(fname_out, "wb")
2 changes: 1 addition & 1 deletion neural_speed/convert/convert_chatglm.py
@@ -633,9 +633,9 @@ def main(args_in: Optional[List[str]] = None) -> None:
from modelscope import AutoConfig, AutoModel, AutoTokenizer
else:
from transformers import AutoConfig, AutoModel, AutoTokenizer
model = AutoModel.from_pretrained(dir_model, low_cpu_mem_usage=True, trust_remote_code=True)
config = AutoConfig.from_pretrained(dir_model, trust_remote_code=True)
tokenizer = AutoTokenizer.from_pretrained(dir_model, trust_remote_code=True)
model = AutoModel.from_pretrained(dir_model, low_cpu_mem_usage=True, trust_remote_code=True)

hparams = config.to_dict()

2 changes: 1 addition & 1 deletion neural_speed/convert/convert_dolly.py
@@ -78,9 +78,9 @@ def main(args_in: Optional[List[str]] = None) -> None:
from modelscope import AutoModelForCausalLM, AutoTokenizer
else:
from transformers import AutoModelForCausalLM, AutoTokenizer
model = AutoModelForCausalLM.from_pretrained(dir_model, torch_dtype=torch.float16 if ftype == 1 else torch.float32)
tokenizer = AutoTokenizer.from_pretrained(dir_model)
print("Loading model: ", dir_model)
model = AutoModelForCausalLM.from_pretrained(dir_model, torch_dtype=torch.float16 if ftype == 1 else torch.float32)
model.eval()
for p in model.parameters():
p.requires_grad = False
14 changes: 7 additions & 7 deletions neural_speed/convert/convert_falcon.py
@@ -70,20 +70,20 @@ def main(args_in: Optional[List[str]] = None) -> None:
from modelscope import AutoConfig, AutoModelForCausalLM, AutoTokenizer
else:
from transformers import AutoConfig, AutoModelForCausalLM, AutoTokenizer
tokenizer = AutoTokenizer.from_pretrained(dir_model, trust_remote_code=True)
config = AutoConfig.from_pretrained(dir_model, trust_remote_code=True)
with open(os.path.join(dir_model, "config.json"), "r", encoding="utf-8") as f:
hparams = json.load(f)
if hparams["architectures"][0] != "FalconForCausalLM":
print("Model architecture not supported: " + hparams["architectures"][0])
sys.exit(1)
print("Loading model: ", dir_model)
model = AutoModelForCausalLM.from_pretrained(dir_model,
config=config,
torch_dtype=torch.float16 if ftype == 1 else torch.float32,
low_cpu_mem_usage=True,
trust_remote_code=True)
print("Model loaded: ", dir_model)
tokenizer = AutoTokenizer.from_pretrained(dir_model, trust_remote_code=True)
config = AutoConfig.from_pretrained(dir_model, trust_remote_code=True)
with open(os.path.join(dir_model, "config.json"), "r", encoding="utf-8") as f:
hparams = json.load(f)
if hparams["architectures"][0] != "FalconForCausalLM":
print("Model architecture not supported: " + hparams["architectures"][0])
sys.exit(1)

n_head_kv = hparams.get("num_kv_heads", 1)
n_head = hparams["num_attention_heads"]
2 changes: 1 addition & 1 deletion neural_speed/convert/convert_gptj.py
@@ -73,8 +73,8 @@ def main(args_in: Optional[List[str]] = None) -> None:
else:
from transformers import AutoModelForCausalLM, AutoTokenizer
print("Loading model: ", dir_model)
tokenizer = AutoTokenizer.from_pretrained(dir_model, trust_remote_code=True)
model = AutoModelForCausalLM.from_pretrained(dir_model, low_cpu_mem_usage=True)
tokenizer = AutoTokenizer.from_pretrained(dir_model, trust_remote_code=True)
hparams = model.config.to_dict()
list_vars = model.state_dict()
fout = open(fname_out, "wb")
2 changes: 1 addition & 1 deletion neural_speed/convert/convert_gptneox.py
@@ -79,9 +79,9 @@ def main(args_in: Optional[List[str]] = None) -> None:
from modelscope import AutoModelForCausalLM, AutoTokenizer
else:
from transformers import AutoModelForCausalLM, AutoTokenizer
tokenizer = AutoTokenizer.from_pretrained(dir_model)
print("Loading model: ", dir_model)
model = AutoModelForCausalLM.from_pretrained(dir_model, torch_dtype=torch.float16 if ftype == 1 else torch.float32)
tokenizer = AutoTokenizer.from_pretrained(dir_model)
model.eval()
for p in model.parameters():
p.requires_grad = False
1 change: 1 addition & 0 deletions neural_speed/convert/convert_llama.py
@@ -1453,6 +1453,7 @@ def main(args_in: Optional[List[str]] = None) -> None:
else:
from transformers import AutoModelForCausalLM, AutoTokenizer
model = AutoModelForCausalLM.from_pretrained(str(args.model), low_cpu_mem_usage=True, trust_remote_code=True)
import pdb;pdb.set_trace()
tokenizer = AutoTokenizer.from_pretrained(str(args.model), trust_remote_code=True)
cache_path = Path(tokenizer.vocab_file).parent
args.model = cache_path
2 changes: 1 addition & 1 deletion neural_speed/convert/convert_mpt.py
@@ -67,8 +67,8 @@ def main(args_in: Optional[List[str]] = None) -> None:
from modelscope import AutoModelForCausalLM, AutoTokenizer
else:
from transformers import AutoModelForCausalLM, AutoTokenizer
tokenizer = AutoTokenizer.from_pretrained(dir_model, trust_remote_code=True)
model = AutoModelForCausalLM.from_pretrained(dir_model, low_cpu_mem_usage=True, trust_remote_code=True)
tokenizer = AutoTokenizer.from_pretrained(dir_model, trust_remote_code=True)
hparams = model.config.to_dict()

list_vars = model.state_dict()
2 changes: 1 addition & 1 deletion neural_speed/convert/convert_opt.py
@@ -77,9 +77,9 @@ def main(args_in: Optional[List[str]] = None) -> None:
from modelscope import AutoModelForCausalLM, AutoTokenizer
else:
from transformers import AutoModelForCausalLM, AutoTokenizer
tokenizer = AutoTokenizer.from_pretrained(dir_model)
print("Loading model: ", dir_model)
model = AutoModelForCausalLM.from_pretrained(dir_model, torch_dtype=torch.float16 if ftype == 1 else torch.float32)
tokenizer = AutoTokenizer.from_pretrained(dir_model)
model.eval()
hparams = model.config.to_dict()

2 changes: 1 addition & 1 deletion neural_speed/convert/convert_phi.py
@@ -289,9 +289,9 @@ def main(args_in: Optional[List[str]] = None) -> None:
from modelscope import AutoModelForCausalLM, AutoTokenizer
else:
from transformers import AutoModelForCausalLM, AutoTokenizer
tokenizer = AutoTokenizer.from_pretrained(dir_model, trust_remote_code=True)
print("Loading model: ", dir_model)
model = AutoModelForCausalLM.from_pretrained(dir_model, trust_remote_code=True)
tokenizer = AutoTokenizer.from_pretrained(dir_model, trust_remote_code=True)
hparams = model.config.to_dict()
if args.format == "GGUF":
phi_convert_gguf(model, tokenizer, dir_model, fname_out, ftype, hparams)
4 changes: 1 addition & 3 deletions neural_speed/convert/convert_qwen.py
@@ -73,17 +73,15 @@ def main(args_in: Optional[List[str]] = None) -> None:
# ftype == 0 -> float32
# ftype == 1 -> float16
ftype = 0
import pdb
pdb.set_trace()
if args.outtype == "f16":
ftype = 1
if args.model_hub == "modelscope":
from modelscope import AutoModelForCausalLM, AutoTokenizer
else:
from transformers import AutoModelForCausalLM, AutoTokenizer
print("Loading model: ", dir_model)
tokenizer = AutoTokenizer.from_pretrained(dir_model)
model = AutoModelForCausalLM.from_pretrained(dir_model)
tokenizer = AutoTokenizer.from_pretrained(dir_model)
model.eval()
for p in model.parameters():
p.requires_grad = False
6 changes: 3 additions & 3 deletions neural_speed/convert/convert_starcoder.py
@@ -74,15 +74,15 @@ def main(args_in: Optional[List[str]] = None) -> None:
else:
from transformers import AutoConfig, AutoModelForCausalLM, AutoTokenizer
print("Loading model: ", dir_model)
tokenizer = AutoTokenizer.from_pretrained(dir_model)
config = AutoConfig.from_pretrained(dir_model, trust_remote_code=True)
hparams = config.to_dict()
model = AutoModelForCausalLM.from_pretrained(dir_model, config=config,
torch_dtype=torch.float16 \
if use_f16 else torch.float32,
low_cpu_mem_usage=True,
trust_remote_code=True)
print("Model loaded: ", dir_model)
tokenizer = AutoTokenizer.from_pretrained(dir_model)
config = AutoConfig.from_pretrained(dir_model, trust_remote_code=True)
hparams = config.to_dict()

list_vars = model.state_dict()
