This repository has been archived by the owner on Aug 30, 2024. It is now read-only.

enable load model from modelscope #154

Merged: 21 commits, Mar 8, 2024
Changes from 17 of 21 commits

Commits
51fb111
enable load model from modelscope
intellinjun Mar 6, 2024
ce5a691
[pre-commit.ci] auto fixes from pre-commit.com hooks
pre-commit-ci[bot] Mar 6, 2024
70cb61d
add load modelscope example
intellinjun Mar 6, 2024
d693673
Merge branch 'modelscope' of https://github.com/intel/neural-speed in…
intellinjun Mar 6, 2024
dc07b67
update README
intellinjun Mar 6, 2024
c18ea8a
[pre-commit.ci] auto fixes from pre-commit.com hooks
pre-commit-ci[bot] Mar 6, 2024
ab51edd
fix convert error
intellinjun Mar 6, 2024
0c4de62
Merge branch 'modelscope' of https://github.com/intel/neural-speed in…
intellinjun Mar 6, 2024
18af765
fix format error
intellinjun Mar 6, 2024
b4ffc40
[pre-commit.ci] auto fixes from pre-commit.com hooks
pre-commit-ci[bot] Mar 6, 2024
34b2829
Update convert_llama.py
intellinjun Mar 6, 2024
9ce0c6c
fix format error
intellinjun Mar 6, 2024
74def5d
Merge branch 'modelscope' of https://github.com/intel/neural-speed in…
intellinjun Mar 6, 2024
21ce1b4
[pre-commit.ci] auto fixes from pre-commit.com hooks
pre-commit-ci[bot] Mar 6, 2024
9024130
fix ut error
intellinjun Mar 6, 2024
d5f96be
Merge branch 'modelscope' of https://github.com/intel/neural-speed in…
intellinjun Mar 6, 2024
4732e28
Update README.md
intellinjun Mar 7, 2024
c01046c
Merge branch 'main' of https://github.com/intel/neural-speed into mod…
intellinjun Mar 7, 2024
6c86248
fix qwen convert error
intellinjun Mar 7, 2024
26eadba
fix convert qwen error
intellinjun Mar 8, 2024
7fe34c3
fix convert qwen error
intellinjun Mar 8, 2024
15 changes: 15 additions & 0 deletions README.md
@@ -60,7 +60,22 @@ streamer = TextStreamer(tokenizer)
model = AutoModelForCausalLM.from_pretrained(model_name, model_file = model_file)
outputs = model.generate(inputs, streamer=streamer, max_new_tokens=300)
```
Pytorch format modelscope model
```python
import sys
from modelscope import AutoTokenizer
from transformers import TextStreamer
from neural_speed import Model

model_name = "qwen/Qwen1.5-7B-Chat" # modelscope model_id or local model
prompt = "Once upon a time, there existed a little girl,"
tokenizer = AutoTokenizer.from_pretrained(model_name, trust_remote_code=True)
inputs = tokenizer(prompt, return_tensors="pt").input_ids
streamer = TextStreamer(tokenizer)
model = Model()
model.init(model_name, weight_dtype="int4", compute_dtype="int8", model_hub="modelscope")
outputs = model.generate(inputs, streamer=streamer, max_new_tokens=300)
```
Please refer to [this link](./docs/supported_models.md) to check the supported models.

If you want to use the [Transformer-based API](https://github.com/intel/intel-extension-for-transformers/blob/main/docs/weightonlyquant.md#llm-runtime-example-code) in [ITREX (Intel Extension for Transformers)](https://github.com/intel/intel-extension-for-transformers), please refer to the [ITREX Installation Page](https://github.com/intel/intel-extension-for-transformers/blob/main/docs/installation.md).
14 changes: 9 additions & 5 deletions neural_speed/__init__.py
@@ -18,7 +18,6 @@

import torch
from neural_speed.convert import convert_model
from transformers import AutoConfig, AutoTokenizer

model_maps = {"gpt_neox": "gptneox", "gpt_bigcode": "starcoder"}
max_request_num_default = 8
@@ -66,7 +65,7 @@ def __import_package(self, model_type):
import neural_speed.qwen_cpp as cpp_model
elif model_type == "mistral":
import neural_speed.mistral_cpp as cpp_model
elif model_type == "qwen":
elif model_type == "qwen2":
import neural_speed.qwen_cpp as cpp_model
elif model_type == "phi":
import neural_speed.phi_cpp as cpp_model
@@ -87,8 +86,13 @@ def get_model_type(model_config):

def init(self, model_name, use_quant=True, use_gptq=False, use_awq=False, use_autoround=False,
weight_dtype="int4", alg="sym", group_size=32,
scale_dtype="fp32", compute_dtype="int8", use_ggml=False):
self.config = AutoConfig.from_pretrained(model_name, trust_remote_code=True)
scale_dtype="fp32", compute_dtype="int8", use_ggml=False, model_hub="huggingface"):
if model_hub == "modelscope":
from modelscope import AutoConfig
self.config = AutoConfig.from_pretrained(model_name, trust_remote_code=True)
else:
from transformers import AutoConfig
self.config = AutoConfig.from_pretrained(model_name, trust_remote_code=True)
model_type = Model.get_model_type(self.config)
self.model_type = model_type
self.__import_package(model_type)
@@ -129,7 +133,7 @@ def init(self, model_name, use_quant=True, use_gptq=False, use_awq=False, use_au
return

if not os.path.exists(fp32_bin):
convert_model(model_name, fp32_bin, "f32")
convert_model(model_name, fp32_bin, "f32", model_hub = model_hub)
assert os.path.exists(fp32_bin), "Fail to convert pytorch model"

if not use_quant:
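Note that the hub-specific `AutoConfig` classes are imported lazily inside `init`, so the `modelscope` package is only needed when `model_hub="modelscope"` is requested. A minimal sketch, not part of this PR, of how that optional dependency could be surfaced with a clearer error message:

```python
# Illustrative only, assuming the lazy-import pattern shown in the diff above.
def _load_auto_config(model_name, model_hub="huggingface"):
    if model_hub == "modelscope":
        try:
            from modelscope import AutoConfig  # needs `pip install modelscope`
        except ImportError as e:
            raise ImportError(
                'model_hub="modelscope" requires the modelscope package') from e
    else:
        from transformers import AutoConfig
    return AutoConfig.from_pretrained(model_name, trust_remote_code=True)
```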
13 changes: 9 additions & 4 deletions neural_speed/convert/__init__.py
@@ -16,14 +16,18 @@
# limitations under the License.

from pathlib import Path
from transformers import AutoConfig
import subprocess

model_maps = {"gpt_neox": "gptneox", "gpt_bigcode": "starcoder", "whisper": "whisper"}
model_maps = {"gpt_neox": "gptneox", "gpt_bigcode": "starcoder", "whisper": "whisper", "qwen2": "qwen"}


def convert_model(model, outfile, outtype="f32", whisper_repo_path=None, use_quantized_model=False):
config = AutoConfig.from_pretrained(model, trust_remote_code=True)
def convert_model(model, outfile, outtype="f32", model_hub="huggingface", use_quantized_model=False):
if model_hub == "modelscope":
from modelscope import AutoConfig
config = AutoConfig.from_pretrained(model, trust_remote_code=True)
else:
from transformers import AutoConfig
config = AutoConfig.from_pretrained(model, trust_remote_code=True)
model_type = model_maps.get(config.model_type, config.model_type)

if use_quantized_model:
@@ -34,6 +38,7 @@ def convert_model(model, outfile, outtype="f32", model_hub="huggingface", use_qua
cmd.extend(["python", path])
cmd.extend(["--outfile", outfile])
cmd.extend(["--outtype", outtype])
cmd.extend(["--model_hub", model_hub])
cmd.extend([model])

print("cmd:", cmd)
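For reference, a minimal sketch of calling the updated `convert_model` entry point directly with the new `model_hub` argument; the model id is taken from the README example above, and the output filename is a placeholder:

```python
# Sketch only: model id from the README example, output path is hypothetical.
from neural_speed.convert import convert_model

convert_model(
    "qwen/Qwen1.5-7B-Chat",   # ModelScope model id or a local directory
    "ne_qwen_f32.bin",        # fp32 output file consumed by later quantization
    "f32",
    model_hub="modelscope",   # defaults to "huggingface" when omitted
)
```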
11 changes: 8 additions & 3 deletions neural_speed/convert/convert_baichuan.py
@@ -19,7 +19,6 @@
import argparse
from typing import (IO, TYPE_CHECKING, Any, Callable, Dict, Iterable, List, Literal, Optional, Sequence, Tuple, TypeVar,
Union)
from transformers import AutoModel, AutoConfig, AutoModelForCausalLM, AutoTokenizer
from sentencepiece import SentencePieceProcessor # type: ignore


@@ -231,6 +230,8 @@ def main(args_in: Optional[List[str]] = None) -> None:
parser = argparse.ArgumentParser(description="Convert a model to a NE compatible file")
parser.add_argument("--outtype", choices=["f32", "f16"], help="output format (default: based on input)")
parser.add_argument("--outfile", type=Path, help="path to write to; default: based on input")
parser.add_argument("--model_hub", choices=["huggingface","modelscope"],
default="huggingface", help="hub to load model")
parser.add_argument("model", type=Path, help="directory containing model file")
args = parser.parse_args(args_in)

@@ -243,10 +244,14 @@ def main(args_in: Optional[List[str]] = None) -> None:
ftype = 0
if args.outtype == "f16":
ftype = 1

if args.model_hub == "modelscope":
from modelscope import AutoConfig, AutoModelForCausalLM, AutoTokenizer
else:
from transformers import AutoConfig, AutoModelForCausalLM, AutoTokenizer
print("Loading model: ", dir_model)
model = AutoModelForCausalLM.from_pretrained(dir_model, trust_remote_code=True)
config = AutoConfig.from_pretrained(dir_model, trust_remote_code=True)
tokenizer = AutoTokenizer.from_pretrained(dir_model, trust_remote_code=True)
model = AutoModelForCausalLM.from_pretrained(dir_model, trust_remote_code=True)

hparams = config.to_dict()

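The same `--model_hub` flag is added to each standalone converter script. A hedged usage sketch for the Baichuan converter, relying on `main` accepting an argument list as shown in the diff; the model path and output name are placeholders:

```python
# Illustrative invocation of the converter's argument parser; paths are placeholders.
from neural_speed.convert.convert_baichuan import main

main([
    "--outtype", "f32",
    "--outfile", "ne_baichuan_f32.bin",
    "--model_hub", "modelscope",
    "/path/to/baichuan-model",   # directory containing the model (or a hub model id)
])
```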
15 changes: 10 additions & 5 deletions neural_speed/convert/convert_bloom.py
@@ -24,7 +24,6 @@
import argparse
from typing import (IO, TYPE_CHECKING, Any, Callable, Dict, Iterable, List, Literal, Optional, Sequence, Tuple, TypeVar,
Union)
from transformers import AutoTokenizer, AutoModelForCausalLM, AutoConfig


# ref: https://github.com/openai/gpt-2/blob/master/src/encoder.py
@@ -54,6 +53,8 @@ def main(args_in: Optional[List[str]] = None) -> None:
parser = argparse.ArgumentParser(description="Convert a model to a NE compatible file")
parser.add_argument("--outtype", choices=["f32", "f16"], help="output format (default: based on input)")
parser.add_argument("--outfile", type=Path, help="path to write to; default: based on input")
parser.add_argument("--model_hub", choices=["huggingface","modelscope"],
default="huggingface", help="hub to load model")
parser.add_argument("model", type=Path, help="directory containing model file")
args = parser.parse_args(args_in)

@@ -66,16 +67,20 @@ def main(args_in: Optional[List[str]] = None) -> None:
ftype = 0
if args.outtype == "f16":
ftype = 1

tokenizer = AutoTokenizer.from_pretrained(dir_model)
if args.model_hub == "modelscope":
from modelscope import AutoConfig, AutoModelForCausalLM, AutoTokenizer
else:
from transformers import AutoConfig, AutoModelForCausalLM, AutoTokenizer
config = AutoConfig.from_pretrained(dir_model, trust_remote_code=True)
hparams = config.to_dict()
print("Loading model: ", dir_model)
model = AutoModelForCausalLM.from_pretrained(dir_model,
config=config,
torch_dtype=torch.float16 if ftype == 1 else torch.float32,
low_cpu_mem_usage=True,
trust_remote_code=True)
tokenizer = AutoTokenizer.from_pretrained(dir_model)
hparams = config.to_dict()
print("Loading model: ", dir_model)

print("Model loaded: ", dir_model)

fout = open(fname_out, "wb")
10 changes: 7 additions & 3 deletions neural_speed/convert/convert_chatglm.py
@@ -19,7 +19,6 @@
import argparse
from typing import (IO, TYPE_CHECKING, Any, Callable, Dict, Iterable, List, Literal, Optional, Sequence, Tuple, TypeVar,
Union)
from transformers import AutoModel, AutoModelForCausalLM, AutoTokenizer, AutoConfig
from sentencepiece import SentencePieceProcessor # type: ignore
import gguf

@@ -612,6 +611,8 @@ def main(args_in: Optional[List[str]] = None) -> None:
parser = argparse.ArgumentParser(description="Convert a model to a NE compatible file")
parser.add_argument("--outtype", choices=["f32", "f16"], help="output format (default: based on input)")
parser.add_argument("--outfile", type=Path, help="path to write to; default: based on input")
parser.add_argument("--model_hub", choices=["huggingface","modelscope"],
default="huggingface", help="hub to load model")
parser.add_argument("model", type=Path, help="directory containing model file")
parser.add_argument("--format",
type=str,
@@ -629,10 +630,13 @@ def main(args_in: Optional[List[str]] = None) -> None:
ftype = 0
if args.outtype == "f16":
ftype = 1

if args.model_hub == "modelscope":
from modelscope import AutoConfig, AutoModel, AutoTokenizer
else:
from transformers import AutoConfig, AutoModel, AutoTokenizer
model = AutoModel.from_pretrained(dir_model, low_cpu_mem_usage=True, trust_remote_code=True)
config = AutoConfig.from_pretrained(dir_model, trust_remote_code=True)
tokenizer = AutoTokenizer.from_pretrained(dir_model, trust_remote_code=True)
model = AutoModel.from_pretrained(dir_model, low_cpu_mem_usage=True, trust_remote_code=True)

hparams = config.to_dict()

10 changes: 7 additions & 3 deletions neural_speed/convert/convert_dolly.py
@@ -32,7 +32,6 @@
import argparse
from typing import (IO, TYPE_CHECKING, Any, Callable, Dict, Iterable, List, Literal, Optional, Sequence, Tuple, TypeVar,
Union)
from transformers import AutoModelForCausalLM, AutoTokenizer


# ref: https://github.com/openai/gpt-2/blob/master/src/encoder.py
@@ -62,6 +61,8 @@ def main(args_in: Optional[List[str]] = None) -> None:
parser = argparse.ArgumentParser(description="Convert a model to a NE compatible file")
parser.add_argument("--outtype", choices=["f32", "f16"], help="output format (default: based on input)")
parser.add_argument("--outfile", type=Path, help="path to write to; default: based on input")
parser.add_argument("--model_hub", choices=["huggingface","modelscope"],
default="huggingface", help="hub to load model")
parser.add_argument("model", type=Path, help="directory containing model file")
args = parser.parse_args(args_in)

@@ -74,10 +75,13 @@ def main(args_in: Optional[List[str]] = None) -> None:
ftype = 0
if args.outtype == "f16":
ftype = 1

if args.model_hub == "modelscope":
from modelscope import AutoModelForCausalLM, AutoTokenizer
else:
from transformers import AutoModelForCausalLM, AutoTokenizer
model = AutoModelForCausalLM.from_pretrained(dir_model, torch_dtype=torch.float16 if ftype == 1 else torch.float32)
tokenizer = AutoTokenizer.from_pretrained(dir_model)
print("Loading model: ", dir_model)
model = AutoModelForCausalLM.from_pretrained(dir_model, torch_dtype=torch.float16 if ftype == 1 else torch.float32)
model.eval()
for p in model.parameters():
p.requires_grad = False
22 changes: 13 additions & 9 deletions neural_speed/convert/convert_falcon.py
@@ -24,7 +24,6 @@
import argparse
from typing import (IO, TYPE_CHECKING, Any, Callable, Dict, Iterable, List, Literal, Optional, Sequence, Tuple, TypeVar,
Union)
from transformers import AutoTokenizer, AutoModelForCausalLM, AutoConfig


# ref: https://github.com/openai/gpt-2/blob/master/src/encoder.py
@@ -54,6 +53,8 @@ def main(args_in: Optional[List[str]] = None) -> None:
parser = argparse.ArgumentParser(description="Convert a model to a NE compatible file")
parser.add_argument("--outtype", choices=["f32", "f16"], help="output format (default: based on input)")
parser.add_argument("--outfile", type=Path, help="path to write to; default: based on input")
parser.add_argument("--model_hub", choices=["huggingface","modelscope"],
default="huggingface", help="hub to load model")
parser.add_argument("model", type=Path, help="directory containing model file")
args = parser.parse_args(args_in)

@@ -66,21 +67,24 @@ def main(args_in: Optional[List[str]] = None) -> None:
ftype = 0
if args.outtype == "f16":
ftype = 1

tokenizer = AutoTokenizer.from_pretrained(dir_model, trust_remote_code=True)
config = AutoConfig.from_pretrained(dir_model, trust_remote_code=True)
with open(os.path.join(dir_model, "config.json"), "r", encoding="utf-8") as f:
hparams = json.load(f)
if hparams["architectures"][0] != "FalconForCausalLM":
print("Model architecture not supported: " + hparams["architectures"][0])
sys.exit(1)
if args.model_hub == "modelscope":
from modelscope import AutoConfig, AutoModelForCausalLM, AutoTokenizer
else:
from transformers import AutoConfig, AutoModelForCausalLM, AutoTokenizer
print("Loading model: ", dir_model)
config = AutoConfig.from_pretrained(dir_model, trust_remote_code=True)
model = AutoModelForCausalLM.from_pretrained(dir_model,
config=config,
torch_dtype=torch.float16 if ftype == 1 else torch.float32,
low_cpu_mem_usage=True,
trust_remote_code=True)
print("Model loaded: ", dir_model)
tokenizer = AutoTokenizer.from_pretrained(dir_model, trust_remote_code=True)
with open(os.path.join(dir_model, "config.json"), "r", encoding="utf-8") as f:
hparams = json.load(f)
if hparams["architectures"][0] != "FalconForCausalLM":
print("Model architecture not supported: " + hparams["architectures"][0])
sys.exit(1)

n_head_kv = hparams.get("num_kv_heads", 1)
n_head = hparams["num_attention_heads"]
10 changes: 7 additions & 3 deletions neural_speed/convert/convert_gptj.py
@@ -29,7 +29,6 @@
import argparse
from typing import (IO, TYPE_CHECKING, Any, Callable, Dict, Iterable, List, Literal, Optional, Sequence, Tuple, TypeVar,
Union)
from transformers import AutoModelForCausalLM, AutoTokenizer


# ref: https://github.com/openai/gpt-2/blob/master/src/encoder.py
@@ -59,6 +58,8 @@ def main(args_in: Optional[List[str]] = None) -> None:
parser = argparse.ArgumentParser(description="Convert a model to a NE compatible file")
parser.add_argument("--outtype", choices=["f32", "f16"], help="output format (default: based on input)")
parser.add_argument("--outfile", type=Path, help="path to write to; default: based on input")
parser.add_argument("--model_hub", choices=["huggingface","modelscope"],
default="huggingface", help="hub to load model")
parser.add_argument("model", type=Path, help="directory containing model file")
args = parser.parse_args(args_in)

@@ -68,10 +69,13 @@ def main(args_in: Optional[List[str]] = None) -> None:
ftype = 0
if args.outtype == "f16":
ftype = 1

if args.model_hub == "modelscope":
from modelscope import AutoModelForCausalLM, AutoTokenizer
else:
from transformers import AutoModelForCausalLM, AutoTokenizer
print("Loading model: ", dir_model)
tokenizer = AutoTokenizer.from_pretrained(dir_model, trust_remote_code=True)
model = AutoModelForCausalLM.from_pretrained(dir_model, low_cpu_mem_usage=True)
tokenizer = AutoTokenizer.from_pretrained(dir_model, trust_remote_code=True)
hparams = model.config.to_dict()
list_vars = model.state_dict()
fout = open(fname_out, "wb")
9 changes: 7 additions & 2 deletions neural_speed/convert/convert_gptneox.py
@@ -62,6 +62,8 @@ def main(args_in: Optional[List[str]] = None) -> None:
parser = argparse.ArgumentParser(description="Convert a model to a NE compatible file")
parser.add_argument("--outtype", choices=["f32", "f16"], help="output format (default: based on input)")
parser.add_argument("--outfile", type=Path, help="path to write to; default: based on input")
parser.add_argument("--model_hub", choices=["huggingface","modelscope"],
default="huggingface", help="hub to load model")
parser.add_argument("model", type=Path, help="directory containing model file")
args = parser.parse_args(args_in)

@@ -74,10 +76,13 @@ def main(args_in: Optional[List[str]] = None) -> None:
ftype = 0
if args.outtype == "f16":
ftype = 1

tokenizer = AutoTokenizer.from_pretrained(dir_model)
if args.model_hub == "modelscope":
from modelscope import AutoModelForCausalLM, AutoTokenizer
else:
from transformers import AutoModelForCausalLM, AutoTokenizer
print("Loading model: ", dir_model)
model = AutoModelForCausalLM.from_pretrained(dir_model, torch_dtype=torch.float16 if ftype == 1 else torch.float32)
tokenizer = AutoTokenizer.from_pretrained(dir_model)
model.eval()
for p in model.parameters():
p.requires_grad = False
13 changes: 9 additions & 4 deletions neural_speed/convert/convert_llama.py
@@ -35,7 +35,6 @@
Union)
import numpy as np
from sentencepiece import SentencePieceProcessor # type: ignore
from transformers import AutoModel, AutoModelForCausalLM, AutoTokenizer, AutoConfig
import gguf

if TYPE_CHECKING:
@@ -1423,6 +1422,8 @@ def main(args_in: Optional[List[str]] = None) -> None:
type=Path,
help="directory containing tokenizer.model, if separate from model file")
parser.add_argument("--outfile", type=Path, help="path to write to; default: based on input")
parser.add_argument("--model_hub", choices=["huggingface","modelscope"],
default="huggingface", help="hub to load model")
parser.add_argument("model",
type=Path,
help="directory containing model file, or model file itself (*.pth, *.pt, *.bin)")
@@ -1432,7 +1433,6 @@ def main(args_in: Optional[List[str]] = None) -> None:
choices=["NE", "GGUF"],
help="convert to the GGUF or NE format")
args = parser.parse_args(args_in)

vocab: Vocab
if args.dump_single:
model_plus = lazy_load_file(args.model)
@@ -1449,8 +1449,13 @@ def main(args_in: Optional[List[str]] = None) -> None:
model_plus = load_some_model(args.model)
else:
print("Loadding the model from HF.")
model = AutoModel.from_pretrained(args.model, low_cpu_mem_usage=True, trust_remote_code=True)
tokenizer = AutoTokenizer.from_pretrained(args.model, trust_remote_code=True)
if args.model_hub == "modelscope":
from modelscope import AutoModelForCausalLM, AutoTokenizer
else:
from transformers import AutoModelForCausalLM, AutoTokenizer
model = AutoModelForCausalLM.from_pretrained(str(args.model), low_cpu_mem_usage=True,
trust_remote_code=True)
tokenizer = AutoTokenizer.from_pretrained(str(args.model), trust_remote_code=True)
cache_path = Path(tokenizer.vocab_file).parent
args.model = cache_path
