From 51fb11143cc5a6759caca0b83da846df2d6af0f4 Mon Sep 17 00:00:00 2001 From: intellinjun Date: Wed, 6 Mar 2024 16:02:23 +0800 Subject: [PATCH 01/16] enable load model from modelscope Signed-off-by: intellinjun --- neural_speed/__init__.py | 12 ++++++++---- neural_speed/convert/__init__.py | 11 ++++++++--- neural_speed/convert/convert_baichuan.py | 8 ++++++-- neural_speed/convert/convert_bloom.py | 7 +++++-- neural_speed/convert/convert_chatglm.py | 7 +++++-- neural_speed/convert/convert_dolly.py | 7 +++++-- neural_speed/convert/convert_falcon.py | 7 +++++-- neural_speed/convert/convert_gptj.py | 7 +++++-- neural_speed/convert/convert_gptneox.py | 6 +++++- neural_speed/convert/convert_llama.py | 11 +++++++---- neural_speed/convert/convert_mistral.py | 11 +++++++---- neural_speed/convert/convert_mixtral.py | 10 +++++++--- neural_speed/convert/convert_mpt.py | 6 +++++- neural_speed/convert/convert_opt.py | 6 +++++- neural_speed/convert/convert_phi.py | 6 +++++- neural_speed/convert/convert_qwen.py | 14 ++++++++++---- neural_speed/convert/convert_starcoder.py | 6 +++++- neural_speed/convert/convert_whisper.py | 7 +++++-- 18 files changed, 108 insertions(+), 41 deletions(-) diff --git a/neural_speed/__init__.py b/neural_speed/__init__.py index 7bb39ce16..4edd465af 100644 --- a/neural_speed/__init__.py +++ b/neural_speed/__init__.py @@ -66,7 +66,7 @@ def __import_package(self, model_type): import neural_speed.qwen_cpp as cpp_model elif model_type == "mistral": import neural_speed.mistral_cpp as cpp_model - elif model_type == "qwen": + elif model_type == "qwen2": import neural_speed.qwen_cpp as cpp_model elif model_type == "phi": import neural_speed.phi_cpp as cpp_model @@ -87,8 +87,12 @@ def get_model_type(model_config): def init(self, model_name, use_quant=True, use_gptq=False, use_awq=False, use_autoround=False, weight_dtype="int4", alg="sym", group_size=32, - scale_dtype="fp32", compute_dtype="int8", use_ggml=False): - self.config = AutoConfig.from_pretrained(model_name, trust_remote_code=True) + scale_dtype="fp32", compute_dtype="int8", use_ggml=False, model_hub="huggingface"): + if model_hub == "modelscope": + from modelscope import AutoConfig + self.config = AutoConfig.from_pretrained(model_name, trust_remote_code=True) + else: + self.config = AutoConfig.from_pretrained(model_name, trust_remote_code=True) model_type = Model.get_model_type(self.config) self.model_type = model_type self.__import_package(model_type) @@ -129,7 +133,7 @@ def init(self, model_name, use_quant=True, use_gptq=False, use_awq=False, use_au return if not os.path.exists(fp32_bin): - convert_model(model_name, fp32_bin, "f32") + convert_model(model_name, fp32_bin, "f32", model_hub = model_hub) assert os.path.exists(fp32_bin), "Fail to convert pytorch model" if not use_quant: diff --git a/neural_speed/convert/__init__.py b/neural_speed/convert/__init__.py index 9f063a5ec..025df9e2e 100644 --- a/neural_speed/convert/__init__.py +++ b/neural_speed/convert/__init__.py @@ -19,11 +19,15 @@ from transformers import AutoConfig import subprocess -model_maps = {"gpt_neox": "gptneox", "gpt_bigcode": "starcoder", "whisper": "whisper"} +model_maps = {"gpt_neox": "gptneox", "gpt_bigcode": "starcoder", "whisper": "whisper", "qwen2": "qwen"} -def convert_model(model, outfile, outtype="f32", whisper_repo_path=None, use_quantized_model=False): - config = AutoConfig.from_pretrained(model, trust_remote_code=True) +def convert_model(model, outfile, outtype="f32", model_hub="huggingface", use_quantized_model=False): + if model_hub == 
"modelscope": + from modelscope import AutoConfig + config = AutoConfig.from_pretrained(model, trust_remote_code=True) + else: + config = AutoConfig.from_pretrained(model, trust_remote_code=True) model_type = model_maps.get(config.model_type, config.model_type) if use_quantized_model: @@ -34,6 +38,7 @@ def convert_model(model, outfile, outtype="f32", whisper_repo_path=None, use_qua cmd.extend(["python", path]) cmd.extend(["--outfile", outfile]) cmd.extend(["--outtype", outtype]) + cmd.extend(["--model_hub", model_hub]) cmd.extend([model]) print("cmd:", cmd) diff --git a/neural_speed/convert/convert_baichuan.py b/neural_speed/convert/convert_baichuan.py index 9303df2c1..24cf8207b 100644 --- a/neural_speed/convert/convert_baichuan.py +++ b/neural_speed/convert/convert_baichuan.py @@ -19,7 +19,6 @@ import argparse from typing import (IO, TYPE_CHECKING, Any, Callable, Dict, Iterable, List, Literal, Optional, Sequence, Tuple, TypeVar, Union) -from transformers import AutoModel, AutoConfig, AutoModelForCausalLM, AutoTokenizer from sentencepiece import SentencePieceProcessor # type: ignore @@ -231,6 +230,7 @@ def main(args_in: Optional[List[str]] = None) -> None: parser = argparse.ArgumentParser(description="Convert a model to a NE compatible file") parser.add_argument("--outtype", choices=["f32", "f16"], help="output format (default: based on input)") parser.add_argument("--outfile", type=Path, help="path to write to; default: based on input") + parser.add_argument("--model_hub", choices=["huggingface","modelscope"], default = "huggingface", help="hub to load model") parser.add_argument("model", type=Path, help="directory containing model file") args = parser.parse_args(args_in) @@ -243,7 +243,11 @@ def main(args_in: Optional[List[str]] = None) -> None: ftype = 0 if args.outtype == "f16": ftype = 1 - + if args.model_hub == "modelscope": + from modelscope import AutoConfig, AutoModelForCausalLM, AutoTokenizer + else: + from transformers import AutoConfig, AutoModelForCausalLM, AutoTokenizer + print("Loading model: ", dir_model) config = AutoConfig.from_pretrained(dir_model, trust_remote_code=True) tokenizer = AutoTokenizer.from_pretrained(dir_model, trust_remote_code=True) model = AutoModelForCausalLM.from_pretrained(dir_model, trust_remote_code=True) diff --git a/neural_speed/convert/convert_bloom.py b/neural_speed/convert/convert_bloom.py index 7bd263d52..d68c5f56b 100644 --- a/neural_speed/convert/convert_bloom.py +++ b/neural_speed/convert/convert_bloom.py @@ -24,7 +24,6 @@ import argparse from typing import (IO, TYPE_CHECKING, Any, Callable, Dict, Iterable, List, Literal, Optional, Sequence, Tuple, TypeVar, Union) -from transformers import AutoTokenizer, AutoModelForCausalLM, AutoConfig # ref: https://github.com/openai/gpt-2/blob/master/src/encoder.py @@ -54,6 +53,7 @@ def main(args_in: Optional[List[str]] = None) -> None: parser = argparse.ArgumentParser(description="Convert a model to a NE compatible file") parser.add_argument("--outtype", choices=["f32", "f16"], help="output format (default: based on input)") parser.add_argument("--outfile", type=Path, help="path to write to; default: based on input") + parser.add_argument("--model_hub", choices=["huggingface","modelscope"], default = "huggingface", help="hub to load model") parser.add_argument("model", type=Path, help="directory containing model file") args = parser.parse_args(args_in) @@ -66,7 +66,10 @@ def main(args_in: Optional[List[str]] = None) -> None: ftype = 0 if args.outtype == "f16": ftype = 1 - + if args.model_hub == 
"modelscope": + from modelscope import AutoConfig, AutoModelForCausalLM, AutoTokenizer + else: + from transformers import AutoConfig, AutoModelForCausalLM, AutoTokenizer tokenizer = AutoTokenizer.from_pretrained(dir_model) config = AutoConfig.from_pretrained(dir_model, trust_remote_code=True) hparams = config.to_dict() diff --git a/neural_speed/convert/convert_chatglm.py b/neural_speed/convert/convert_chatglm.py index 124d19ec3..171ffce6a 100644 --- a/neural_speed/convert/convert_chatglm.py +++ b/neural_speed/convert/convert_chatglm.py @@ -19,7 +19,6 @@ import argparse from typing import (IO, TYPE_CHECKING, Any, Callable, Dict, Iterable, List, Literal, Optional, Sequence, Tuple, TypeVar, Union) -from transformers import AutoModel, AutoModelForCausalLM, AutoTokenizer, AutoConfig from sentencepiece import SentencePieceProcessor # type: ignore import gguf @@ -612,6 +611,7 @@ def main(args_in: Optional[List[str]] = None) -> None: parser = argparse.ArgumentParser(description="Convert a model to a NE compatible file") parser.add_argument("--outtype", choices=["f32", "f16"], help="output format (default: based on input)") parser.add_argument("--outfile", type=Path, help="path to write to; default: based on input") + parser.add_argument("--model_hub", choices=["huggingface","modelscope"], default = "huggingface", help="hub to load model") parser.add_argument("model", type=Path, help="directory containing model file") parser.add_argument("--format", type=str, @@ -629,7 +629,10 @@ def main(args_in: Optional[List[str]] = None) -> None: ftype = 0 if args.outtype == "f16": ftype = 1 - + if args.model_hub == "modelscope": + from modelscope import AutoConfig, AutoModel, AutoTokenizer + else: + from transformers import AutoConfig, AutoModel, AutoTokenizer config = AutoConfig.from_pretrained(dir_model, trust_remote_code=True) tokenizer = AutoTokenizer.from_pretrained(dir_model, trust_remote_code=True) model = AutoModel.from_pretrained(dir_model, low_cpu_mem_usage=True, trust_remote_code=True) diff --git a/neural_speed/convert/convert_dolly.py b/neural_speed/convert/convert_dolly.py index 0d90c29fb..47335f94d 100644 --- a/neural_speed/convert/convert_dolly.py +++ b/neural_speed/convert/convert_dolly.py @@ -32,7 +32,6 @@ import argparse from typing import (IO, TYPE_CHECKING, Any, Callable, Dict, Iterable, List, Literal, Optional, Sequence, Tuple, TypeVar, Union) -from transformers import AutoModelForCausalLM, AutoTokenizer # ref: https://github.com/openai/gpt-2/blob/master/src/encoder.py @@ -62,6 +61,7 @@ def main(args_in: Optional[List[str]] = None) -> None: parser = argparse.ArgumentParser(description="Convert a model to a NE compatible file") parser.add_argument("--outtype", choices=["f32", "f16"], help="output format (default: based on input)") parser.add_argument("--outfile", type=Path, help="path to write to; default: based on input") + parser.add_argument("--model_hub", choices=["huggingface","modelscope"], default = "huggingface", help="hub to load model") parser.add_argument("model", type=Path, help="directory containing model file") args = parser.parse_args(args_in) @@ -74,7 +74,10 @@ def main(args_in: Optional[List[str]] = None) -> None: ftype = 0 if args.outtype == "f16": ftype = 1 - + if args.model_hub == "modelscope": + from modelscope import AutoModelForCausalLM, AutoTokenizer + else: + from transformers import AutoModelForCausalLM, AutoTokenizer tokenizer = AutoTokenizer.from_pretrained(dir_model) print("Loading model: ", dir_model) model = AutoModelForCausalLM.from_pretrained(dir_model, 
torch_dtype=torch.float16 if ftype == 1 else torch.float32) diff --git a/neural_speed/convert/convert_falcon.py b/neural_speed/convert/convert_falcon.py index 3c88850f5..cf4f15f2e 100644 --- a/neural_speed/convert/convert_falcon.py +++ b/neural_speed/convert/convert_falcon.py @@ -24,7 +24,6 @@ import argparse from typing import (IO, TYPE_CHECKING, Any, Callable, Dict, Iterable, List, Literal, Optional, Sequence, Tuple, TypeVar, Union) -from transformers import AutoTokenizer, AutoModelForCausalLM, AutoConfig # ref: https://github.com/openai/gpt-2/blob/master/src/encoder.py @@ -54,6 +53,7 @@ def main(args_in: Optional[List[str]] = None) -> None: parser = argparse.ArgumentParser(description="Convert a model to a NE compatible file") parser.add_argument("--outtype", choices=["f32", "f16"], help="output format (default: based on input)") parser.add_argument("--outfile", type=Path, help="path to write to; default: based on input") + parser.add_argument("--model_hub", choices=["huggingface","modelscope"], default = "huggingface", help="hub to load model") parser.add_argument("model", type=Path, help="directory containing model file") args = parser.parse_args(args_in) @@ -66,7 +66,10 @@ def main(args_in: Optional[List[str]] = None) -> None: ftype = 0 if args.outtype == "f16": ftype = 1 - + if args.model_hub == "modelscope": + from modelscope import AutoConfig, AutoModelForCausalLM, AutoTokenizer + else: + from transformers import AutoConfig, AutoModelForCausalLM, AutoTokenizer tokenizer = AutoTokenizer.from_pretrained(dir_model, trust_remote_code=True) config = AutoConfig.from_pretrained(dir_model, trust_remote_code=True) with open(os.path.join(dir_model, "config.json"), "r", encoding="utf-8") as f: diff --git a/neural_speed/convert/convert_gptj.py b/neural_speed/convert/convert_gptj.py index d41b0cf5f..8c67bbec1 100644 --- a/neural_speed/convert/convert_gptj.py +++ b/neural_speed/convert/convert_gptj.py @@ -29,7 +29,6 @@ import argparse from typing import (IO, TYPE_CHECKING, Any, Callable, Dict, Iterable, List, Literal, Optional, Sequence, Tuple, TypeVar, Union) -from transformers import AutoModelForCausalLM, AutoTokenizer # ref: https://github.com/openai/gpt-2/blob/master/src/encoder.py @@ -59,6 +58,7 @@ def main(args_in: Optional[List[str]] = None) -> None: parser = argparse.ArgumentParser(description="Convert a model to a NE compatible file") parser.add_argument("--outtype", choices=["f32", "f16"], help="output format (default: based on input)") parser.add_argument("--outfile", type=Path, help="path to write to; default: based on input") + parser.add_argument("--model_hub", choices=["huggingface","modelscope"], default = "huggingface", help="hub to load model") parser.add_argument("model", type=Path, help="directory containing model file") args = parser.parse_args(args_in) @@ -68,7 +68,10 @@ def main(args_in: Optional[List[str]] = None) -> None: ftype = 0 if args.outtype == "f16": ftype = 1 - + if args.model_hub == "modelscope": + from modelscope import AutoModelForCausalLM, AutoTokenizer + else: + from transformers import AutoModelForCausalLM, AutoTokenizer print("Loading model: ", dir_model) tokenizer = AutoTokenizer.from_pretrained(dir_model, trust_remote_code=True) model = AutoModelForCausalLM.from_pretrained(dir_model, low_cpu_mem_usage=True) diff --git a/neural_speed/convert/convert_gptneox.py b/neural_speed/convert/convert_gptneox.py index 0dade0563..cee3c7bbd 100644 --- a/neural_speed/convert/convert_gptneox.py +++ b/neural_speed/convert/convert_gptneox.py @@ -62,6 +62,7 @@ def 
main(args_in: Optional[List[str]] = None) -> None: parser = argparse.ArgumentParser(description="Convert a model to a NE compatible file") parser.add_argument("--outtype", choices=["f32", "f16"], help="output format (default: based on input)") parser.add_argument("--outfile", type=Path, help="path to write to; default: based on input") + parser.add_argument("--model_hub", choices=["huggingface","modelscope"], default = "huggingface", help="hub to load model") parser.add_argument("model", type=Path, help="directory containing model file") args = parser.parse_args(args_in) @@ -74,7 +75,10 @@ def main(args_in: Optional[List[str]] = None) -> None: ftype = 0 if args.outtype == "f16": ftype = 1 - + if args.model_hub == "modelscope": + from modelscope import AutoModelForCausalLM, AutoTokenizer + else: + from transformers import AutoModelForCausalLM, AutoTokenizer tokenizer = AutoTokenizer.from_pretrained(dir_model) print("Loading model: ", dir_model) model = AutoModelForCausalLM.from_pretrained(dir_model, torch_dtype=torch.float16 if ftype == 1 else torch.float32) diff --git a/neural_speed/convert/convert_llama.py b/neural_speed/convert/convert_llama.py index 5470035db..70b1b9384 100644 --- a/neural_speed/convert/convert_llama.py +++ b/neural_speed/convert/convert_llama.py @@ -35,7 +35,6 @@ Union) import numpy as np from sentencepiece import SentencePieceProcessor # type: ignore -from transformers import AutoModel, AutoModelForCausalLM, AutoTokenizer, AutoConfig import gguf if TYPE_CHECKING: @@ -1423,6 +1422,7 @@ def main(args_in: Optional[List[str]] = None) -> None: type=Path, help="directory containing tokenizer.model, if separate from model file") parser.add_argument("--outfile", type=Path, help="path to write to; default: based on input") + parser.add_argument("--model_hub", choices=["huggingface","modelscope"], default = "huggingface", help="hub to load model") parser.add_argument("model", type=Path, help="directory containing model file, or model file itself (*.pth, *.pt, *.bin)") @@ -1432,7 +1432,6 @@ def main(args_in: Optional[List[str]] = None) -> None: choices=["NE", "GGUF"], help="convert to the GGUF or NE format") args = parser.parse_args(args_in) - vocab: Vocab if args.dump_single: model_plus = lazy_load_file(args.model) @@ -1449,8 +1448,12 @@ def main(args_in: Optional[List[str]] = None) -> None: model_plus = load_some_model(args.model) else: print("Loadding the model from HF.") - model = AutoModel.from_pretrained(args.model, low_cpu_mem_usage=True, trust_remote_code=True) - tokenizer = AutoTokenizer.from_pretrained(args.model, trust_remote_code=True) + if args.model_hub == "modelscope": + from modelscope import AutoModelForCausalLM, AutoTokenizer + else: + from transformers import AutoModelForCausalLM, AutoTokenizer + model = AutoModelForCausalLM.from_pretrained(str(args.model), low_cpu_mem_usage=True, trust_remote_code=True) + tokenizer = AutoTokenizer.from_pretrained(str(args.model), trust_remote_code=True) cache_path = Path(tokenizer.vocab_file).parent args.model = cache_path diff --git a/neural_speed/convert/convert_mistral.py b/neural_speed/convert/convert_mistral.py index a440f529d..283a01562 100644 --- a/neural_speed/convert/convert_mistral.py +++ b/neural_speed/convert/convert_mistral.py @@ -36,7 +36,6 @@ import numpy as np from sentencepiece import SentencePieceProcessor # type: ignore -from transformers import AutoModel, AutoModelForCausalLM, AutoTokenizer, AutoConfig if TYPE_CHECKING: from typing_extensions import TypeAlias @@ -1298,11 +1297,11 @@ def main(args_in: 
Optional[List[str]] = None) -> None: type=Path, help="directory containing tokenizer.model, if separate from model file") parser.add_argument("--outfile", type=Path, help="path to write to; default: based on input") + parser.add_argument("--model_hub", choices=["huggingface","modelscope"], default = "huggingface", help="hub to load model") parser.add_argument("model", type=Path, help="directory containing model file, or model file itself (*.pth, *.pt, *.bin)") args = parser.parse_args(args_in) - vocab: Vocab if args.dump_single: model_plus = lazy_load_file(args.model) @@ -1318,8 +1317,12 @@ def main(args_in: Optional[List[str]] = None) -> None: print("Loadding the model from the local path.") else: print("Loadding the model from HF.") - model = AutoModel.from_pretrained(args.model, low_cpu_mem_usage=True, trust_remote_code=True) - tokenizer = AutoTokenizer.from_pretrained(args.model, trust_remote_code=True) + if args.model_hub == "modelscope": + from modelscope import AutoConfig, AutoModel, AutoTokenizer + else: + from transformers import AutoConfig, AutoModel, AutoTokenizer + model = AutoModel.from_pretrained(str(args.model), low_cpu_mem_usage=True, trust_remote_code=True) + tokenizer = AutoTokenizer.from_pretrained(str(args.model), trust_remote_code=True) cache_path = Path(tokenizer.vocab_file).parent args.model = cache_path diff --git a/neural_speed/convert/convert_mixtral.py b/neural_speed/convert/convert_mixtral.py index 1c8eded16..19ccb3d49 100644 --- a/neural_speed/convert/convert_mixtral.py +++ b/neural_speed/convert/convert_mixtral.py @@ -36,7 +36,6 @@ import numpy as np from sentencepiece import SentencePieceProcessor # type: ignore -from transformers import AutoModel, AutoModelForCausalLM, AutoTokenizer, AutoConfig if TYPE_CHECKING: from typing_extensions import TypeAlias @@ -1300,6 +1299,7 @@ def main(args_in: Optional[List[str]] = None) -> None: type=Path, help="directory containing tokenizer.model, if separate from model file") parser.add_argument("--outfile", type=Path, help="path to write to; default: based on input") + parser.add_argument("--model_hub", choices=["huggingface","modelscope"], default = "huggingface", help="hub to load model") parser.add_argument("model", type=Path, help="directory containing model file, or model file itself (*.pth, *.pt, *.bin)") @@ -1320,8 +1320,12 @@ def main(args_in: Optional[List[str]] = None) -> None: print("Loadding the model from the local path.") else: print("Loadding the model from HF.") - model = AutoModel.from_pretrained(args.model, low_cpu_mem_usage=True, trust_remote_code=True) - tokenizer = AutoTokenizer.from_pretrained(args.model, trust_remote_code=True) + if args.model_hub == "modelscope": + from modelscope import AutoConfig, AutoModel, AutoTokenizer + else: + from transformers import AutoConfig, AutoModel, AutoTokenizer + model = AutoModel.from_pretrained(str(args.model), low_cpu_mem_usage=True, trust_remote_code=True) + tokenizer = AutoTokenizer.from_pretrained(str(args.model), trust_remote_code=True) cache_path = Path(tokenizer.vocab_file).parent args.model = cache_path diff --git a/neural_speed/convert/convert_mpt.py b/neural_speed/convert/convert_mpt.py index cd56af41d..0ad152141 100644 --- a/neural_speed/convert/convert_mpt.py +++ b/neural_speed/convert/convert_mpt.py @@ -51,6 +51,7 @@ def main(args_in: Optional[List[str]] = None) -> None: parser = argparse.ArgumentParser(description="Convert a model to a NE compatible file") parser.add_argument("--outtype", choices=["f32", "f16"], help="output format (default: based on 
input)") parser.add_argument("--outfile", type=Path, help="path to write to; default: based on input") + parser.add_argument("--model_hub", choices=["huggingface","modelscope"], default = "huggingface", help="hub to load model") parser.add_argument("model", type=Path, help="directory containing model file") args = parser.parse_args(args_in) @@ -62,7 +63,10 @@ def main(args_in: Optional[List[str]] = None) -> None: ftype = 0 if args.outtype == "f16": ftype = 1 - + if args.model_hub == "modelscope": + from modelscope import AutoModelForCausalLM, AutoTokenizer + else: + from transformers import AutoModelForCausalLM, AutoTokenizer tokenizer = AutoTokenizer.from_pretrained(dir_model, trust_remote_code=True) model = AutoModelForCausalLM.from_pretrained(dir_model, low_cpu_mem_usage=True, trust_remote_code=True) hparams = model.config.to_dict() diff --git a/neural_speed/convert/convert_opt.py b/neural_speed/convert/convert_opt.py index 07b7a632a..7dcb8620d 100644 --- a/neural_speed/convert/convert_opt.py +++ b/neural_speed/convert/convert_opt.py @@ -60,6 +60,7 @@ def main(args_in: Optional[List[str]] = None) -> None: parser = argparse.ArgumentParser(description="Convert a model to a NE compatible file") parser.add_argument("--outtype", choices=["f32", "f16"], help="output format (default: based on input)") parser.add_argument("--outfile", type=Path, help="path to write to; default: based on input") + parser.add_argument("--model_hub", choices=["huggingface","modelscope"], default = "huggingface", help="hub to load model") parser.add_argument("model", type=Path, help="directory containing model file") args = parser.parse_args(args_in) @@ -72,7 +73,10 @@ def main(args_in: Optional[List[str]] = None) -> None: ftype = 0 if args.outtype == "f16": ftype = 1 - + if args.model_hub == "modelscope": + from modelscope import AutoModelForCausalLM, AutoTokenizer + else: + from transformers import AutoModelForCausalLM, AutoTokenizer tokenizer = AutoTokenizer.from_pretrained(dir_model) print("Loading model: ", dir_model) model = AutoModelForCausalLM.from_pretrained(dir_model, torch_dtype=torch.float16 if ftype == 1 else torch.float32) diff --git a/neural_speed/convert/convert_phi.py b/neural_speed/convert/convert_phi.py index 6e02b0b55..dad0b9574 100644 --- a/neural_speed/convert/convert_phi.py +++ b/neural_speed/convert/convert_phi.py @@ -267,6 +267,7 @@ def main(args_in: Optional[List[str]] = None) -> None: parser = argparse.ArgumentParser(description="Convert a model to a NE compatible file") parser.add_argument("--outtype", choices=["f32", "f16"], help="output format (default: based on input)") parser.add_argument("--outfile", type=Path, help="path to write to; default: based on input") + parser.add_argument("--model_hub", choices=["huggingface","modelscope"], default = "huggingface", help="hub to load model") parser.add_argument("model", type=Path, help="directory containing model file") parser.add_argument("--format", type=str, @@ -284,7 +285,10 @@ def main(args_in: Optional[List[str]] = None) -> None: ftype = 0 if args.outtype == "f16": ftype = 1 - + if args.model_hub == "modelscope": + from modelscope import AutoModelForCausalLM, AutoTokenizer + else: + from transformers import AutoModelForCausalLM, AutoTokenizer tokenizer = AutoTokenizer.from_pretrained(dir_model, trust_remote_code=True) print("Loading model: ", dir_model) model = AutoModelForCausalLM.from_pretrained(dir_model, trust_remote_code=True) diff --git a/neural_speed/convert/convert_qwen.py b/neural_speed/convert/convert_qwen.py index 
7966717b2..329b03245 100644 --- a/neural_speed/convert/convert_qwen.py +++ b/neural_speed/convert/convert_qwen.py @@ -32,7 +32,7 @@ import argparse from typing import (IO, TYPE_CHECKING, Any, Callable, Dict, Iterable, List, Literal, Optional, Sequence, Tuple, TypeVar, Union) -from transformers import AutoModelForCausalLM, AutoTokenizer + # ref: https://github.com/openai/gpt-2/blob/master/src/encoder.py @@ -62,6 +62,7 @@ def main(args_in: Optional[List[str]] = None) -> None: parser = argparse.ArgumentParser(description="Convert a model to a NE compatible file") parser.add_argument("--outtype", choices=["f32", "f16"], help="output format (default: based on input)") parser.add_argument("--outfile", type=Path, help="path to write to; default: based on input") + parser.add_argument("--model_hub", choices=["huggingface","modelscope"], default = "huggingface", help="hub to load model") parser.add_argument("model", type=Path, help="directory containing model file") args = parser.parse_args(args_in) @@ -72,12 +73,17 @@ def main(args_in: Optional[List[str]] = None) -> None: # ftype == 0 -> float32 # ftype == 1 -> float16 ftype = 0 + import pdb + pdb.set_trace() if args.outtype == "f16": ftype = 1 - - tokenizer = AutoTokenizer.from_pretrained(dir_model, trust_remote_code=True) + if args.model_hub == "modelscope": + from modelscope import AutoModelForCausalLM, AutoTokenizer + else: + from transformers import AutoModelForCausalLM, AutoTokenizer print("Loading model: ", dir_model) - model = AutoModelForCausalLM.from_pretrained(dir_model, trust_remote_code=True) + tokenizer = AutoTokenizer.from_pretrained(dir_model) + model = AutoModelForCausalLM.from_pretrained(dir_model) model.eval() for p in model.parameters(): p.requires_grad = False diff --git a/neural_speed/convert/convert_starcoder.py b/neural_speed/convert/convert_starcoder.py index f176ef8d9..628b431b9 100644 --- a/neural_speed/convert/convert_starcoder.py +++ b/neural_speed/convert/convert_starcoder.py @@ -56,6 +56,7 @@ def main(args_in: Optional[List[str]] = None) -> None: default="fp32", help="output format (default: based on input)") parser.add_argument("--outfile", type=Path, help="path to write to; default: based on input") + parser.add_argument("--model_hub", choices=["huggingface","modelscope"], default = "huggingface", help="hub to load model") parser.add_argument("model", type=Path, help="directory containing model file") args = parser.parse_args(args_in) @@ -68,7 +69,10 @@ def main(args_in: Optional[List[str]] = None) -> None: use_f16 = False if args.outtype == "f16": use_f16 = True - + if args.model_hub == "modelscope": + from modelscope import AutoConfig, AutoModelForCausalLM, AutoTokenizer + else: + from transformers import AutoConfig, AutoModelForCausalLM, AutoTokenizer print("Loading model: ", dir_model) tokenizer = AutoTokenizer.from_pretrained(dir_model) config = AutoConfig.from_pretrained(dir_model, trust_remote_code=True) diff --git a/neural_speed/convert/convert_whisper.py b/neural_speed/convert/convert_whisper.py index 157f34175..397cce022 100644 --- a/neural_speed/convert/convert_whisper.py +++ b/neural_speed/convert/convert_whisper.py @@ -42,7 +42,6 @@ from typing import (IO, TYPE_CHECKING, Any, Callable, Dict, Iterable, List, Literal, Optional, Sequence, Tuple, TypeVar, Union) -from transformers import WhisperForConditionalGeneration conv_map = { 'self_attn.k_proj': 'attn.key', @@ -98,6 +97,7 @@ def main(args_in: Optional[List[str]] = None) -> None: default="fp32", help="output format (default: based on input)") 
parser.add_argument("--outfile", type=Path, help="path to write to; default: based on input") + parser.add_argument("--model_hub", choices=["huggingface","modelscope"], default = "huggingface", help="hub to load model") parser.add_argument("model", type=Path, help="directory containing model file") args = parser.parse_args(args_in) dir_model = args.model @@ -108,7 +108,10 @@ def main(args_in: Optional[List[str]] = None) -> None: encoder = json.load((dir_model / "vocab.json").open("r", encoding="utf8")) encoder_added = json.load((dir_model / "added_tokens.json").open("r", encoding="utf8")) hparams = json.load((dir_model / "config.json").open("r", encoding="utf8")) - + if args.model_hub == "modelscope": + from modelscope import WhisperForConditionalGeneration + else: + from transformers import WhisperForConditionalGeneration model = WhisperForConditionalGeneration.from_pretrained(dir_model) #code.interact(local=locals()) From ce5a691972afd9099858c81103f757f486fa916a Mon Sep 17 00:00:00 2001 From: "pre-commit-ci[bot]" <66853113+pre-commit-ci[bot]@users.noreply.github.com> Date: Wed, 6 Mar 2024 08:03:16 +0000 Subject: [PATCH 02/16] [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci --- neural_speed/__init__.py | 2 +- neural_speed/convert/__init__.py | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/neural_speed/__init__.py b/neural_speed/__init__.py index 4edd465af..65e71e937 100644 --- a/neural_speed/__init__.py +++ b/neural_speed/__init__.py @@ -91,7 +91,7 @@ def init(self, model_name, use_quant=True, use_gptq=False, use_awq=False, use_au if model_hub == "modelscope": from modelscope import AutoConfig self.config = AutoConfig.from_pretrained(model_name, trust_remote_code=True) - else: + else: self.config = AutoConfig.from_pretrained(model_name, trust_remote_code=True) model_type = Model.get_model_type(self.config) self.model_type = model_type diff --git a/neural_speed/convert/__init__.py b/neural_speed/convert/__init__.py index 025df9e2e..ed086badd 100644 --- a/neural_speed/convert/__init__.py +++ b/neural_speed/convert/__init__.py @@ -26,7 +26,7 @@ def convert_model(model, outfile, outtype="f32", model_hub="huggingface", use_qu if model_hub == "modelscope": from modelscope import AutoConfig config = AutoConfig.from_pretrained(model, trust_remote_code=True) - else: + else: config = AutoConfig.from_pretrained(model, trust_remote_code=True) model_type = model_maps.get(config.model_type, config.model_type) From 70cb61deab511e522b475554b479da62b7acd0c9 Mon Sep 17 00:00:00 2001 From: intellinjun Date: Wed, 6 Mar 2024 16:49:08 +0800 Subject: [PATCH 03/16] add load modelscope example Signed-off-by: intellinjun --- scripts/python_api_example_for_modelscope.py | 34 ++++++++++++++++++++ 1 file changed, 34 insertions(+) create mode 100644 scripts/python_api_example_for_modelscope.py diff --git a/scripts/python_api_example_for_modelscope.py b/scripts/python_api_example_for_modelscope.py new file mode 100644 index 000000000..7dd311afe --- /dev/null +++ b/scripts/python_api_example_for_modelscope.py @@ -0,0 +1,34 @@ +#!/usr/bin/env python +# -*- coding: utf-8 -*- +# +# Copyright (c) 2023 Intel Corporation +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +import sys +from modelscope import AutoTokenizer +from transformers import TextStreamer +from neural_speed import Model + +if len(sys.argv) != 2: + print("Usage: python python_api_example.py model_path") +model_name = sys.argv[1] + +prompt = "Once upon a time, a little girl" +tokenizer = AutoTokenizer.from_pretrained(model_name, trust_remote_code=True) +inputs = tokenizer(prompt, return_tensors="pt").input_ids +streamer = TextStreamer(tokenizer) + +model = Model() +# If you want to run GPTQ or AWQ models, just set use_gptq = True or use_awq = True. +model.init(model_name, weight_dtype="int4", compute_dtype="int8", model_hub="modelscope") +outputs = model.generate(inputs, streamer=streamer, max_new_tokens=300, do_sample=True) From dc07b67795e03d73c9bc2aa35dff6a5d3e18f2a3 Mon Sep 17 00:00:00 2001 From: intellinjun Date: Wed, 6 Mar 2024 17:10:16 +0800 Subject: [PATCH 04/16] update README Signed-off-by: intellinjun --- README.md | 15 +++++++++++++++ neural_speed/convert/convert_bloom.py | 9 +++++---- neural_speed/convert/convert_chatglm.py | 2 +- neural_speed/convert/convert_dolly.py | 2 +- neural_speed/convert/convert_falcon.py | 14 +++++++------- neural_speed/convert/convert_gptj.py | 2 +- neural_speed/convert/convert_gptneox.py | 2 +- neural_speed/convert/convert_llama.py | 1 + neural_speed/convert/convert_mpt.py | 2 +- neural_speed/convert/convert_opt.py | 2 +- neural_speed/convert/convert_phi.py | 2 +- neural_speed/convert/convert_qwen.py | 4 +--- neural_speed/convert/convert_starcoder.py | 6 +++--- 13 files changed, 39 insertions(+), 24 deletions(-) diff --git a/README.md b/README.md index 57121ab41..563274fbb 100644 --- a/README.md +++ b/README.md @@ -60,7 +60,22 @@ streamer = TextStreamer(tokenizer) model = AutoModelForCausalLM.from_pretrained(model_name, model_file = model_file) outputs = model.generate(inputs, streamer=streamer, max_new_tokens=300) ``` +Pytorch format modelscpoe model +```python +import sys +from modelscope import AutoTokenizer +from transformers import TextStreamer +from neural_speed import Model +model_name = "qwen/Qwen1.5-7B-Chat" # Hugging Face model_id or local model +prompt = "Once upon a time, there existed a little girl," +tokenizer = AutoTokenizer.from_pretrained(model_name, trust_remote_code=True) +inputs = tokenizer(prompt, return_tensors="pt").input_ids +streamer = TextStreamer(tokenizer) +model = Model() +model.init(model_name, weight_dtype="int4", compute_dtype="int8", model_hub="modelscope") +outputs = model.generate(inputs, streamer=streamer, max_new_tokens=300) +``` Please refer [this link](./docs/supported_models.md) to check supported models. If you want to use [Transformer-based API](https://github.com/intel/intel-extension-for-transformers/blob/main/docs/weightonlyquant.md#llm-runtime-example-code) in [ITREX(Intel extension for transformers)](https://github.com/intel/intel-extension-for-transformers). Please refer to [ITREX Installation Page](https://github.com/intel/intel-extension-for-transformers/blob/main/docs/installation.md). 
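As a companion to the README snippet above, the same ModelScope path can be exercised one level lower through `neural_speed.convert.convert_model`, which this series extends with a `model_hub` keyword. The sketch below is illustrative only, not part of the patches: the model id is the one used in the README example, and the output filename is a placeholder.

```python
# Minimal sketch (assumptions: qwen/Qwen1.5-7B-Chat as in the README example,
# placeholder output path). Converts a ModelScope checkpoint to the NE f32 format
# by calling convert_model directly instead of going through Model.init().
from neural_speed.convert import convert_model

convert_model(
    "qwen/Qwen1.5-7B-Chat",   # ModelScope model id or local directory
    "ne_qwen1.5_f32.bin",     # output file; placeholder name
    "f32",                    # outtype
    model_hub="modelscope",   # route AutoConfig/AutoTokenizer imports through modelscope
)
```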
diff --git a/neural_speed/convert/convert_bloom.py b/neural_speed/convert/convert_bloom.py index d68c5f56b..6b9415f96 100644 --- a/neural_speed/convert/convert_bloom.py +++ b/neural_speed/convert/convert_bloom.py @@ -70,15 +70,16 @@ def main(args_in: Optional[List[str]] = None) -> None: from modelscope import AutoConfig, AutoModelForCausalLM, AutoTokenizer else: from transformers import AutoConfig, AutoModelForCausalLM, AutoTokenizer - tokenizer = AutoTokenizer.from_pretrained(dir_model) - config = AutoConfig.from_pretrained(dir_model, trust_remote_code=True) - hparams = config.to_dict() - print("Loading model: ", dir_model) model = AutoModelForCausalLM.from_pretrained(dir_model, config=config, torch_dtype=torch.float16 if ftype == 1 else torch.float32, low_cpu_mem_usage=True, trust_remote_code=True) + tokenizer = AutoTokenizer.from_pretrained(dir_model) + config = AutoConfig.from_pretrained(dir_model, trust_remote_code=True) + hparams = config.to_dict() + print("Loading model: ", dir_model) + print("Model loaded: ", dir_model) fout = open(fname_out, "wb") diff --git a/neural_speed/convert/convert_chatglm.py b/neural_speed/convert/convert_chatglm.py index 171ffce6a..562cd7763 100644 --- a/neural_speed/convert/convert_chatglm.py +++ b/neural_speed/convert/convert_chatglm.py @@ -633,9 +633,9 @@ def main(args_in: Optional[List[str]] = None) -> None: from modelscope import AutoConfig, AutoModel, AutoTokenizer else: from transformers import AutoConfig, AutoModel, AutoTokenizer + model = AutoModel.from_pretrained(dir_model, low_cpu_mem_usage=True, trust_remote_code=True) config = AutoConfig.from_pretrained(dir_model, trust_remote_code=True) tokenizer = AutoTokenizer.from_pretrained(dir_model, trust_remote_code=True) - model = AutoModel.from_pretrained(dir_model, low_cpu_mem_usage=True, trust_remote_code=True) hparams = config.to_dict() diff --git a/neural_speed/convert/convert_dolly.py b/neural_speed/convert/convert_dolly.py index 47335f94d..ac11f332f 100644 --- a/neural_speed/convert/convert_dolly.py +++ b/neural_speed/convert/convert_dolly.py @@ -78,9 +78,9 @@ def main(args_in: Optional[List[str]] = None) -> None: from modelscope import AutoModelForCausalLM, AutoTokenizer else: from transformers import AutoModelForCausalLM, AutoTokenizer + model = AutoModelForCausalLM.from_pretrained(dir_model, torch_dtype=torch.float16 if ftype == 1 else torch.float32) tokenizer = AutoTokenizer.from_pretrained(dir_model) print("Loading model: ", dir_model) - model = AutoModelForCausalLM.from_pretrained(dir_model, torch_dtype=torch.float16 if ftype == 1 else torch.float32) model.eval() for p in model.parameters(): p.requires_grad = False diff --git a/neural_speed/convert/convert_falcon.py b/neural_speed/convert/convert_falcon.py index cf4f15f2e..6c974008c 100644 --- a/neural_speed/convert/convert_falcon.py +++ b/neural_speed/convert/convert_falcon.py @@ -70,13 +70,6 @@ def main(args_in: Optional[List[str]] = None) -> None: from modelscope import AutoConfig, AutoModelForCausalLM, AutoTokenizer else: from transformers import AutoConfig, AutoModelForCausalLM, AutoTokenizer - tokenizer = AutoTokenizer.from_pretrained(dir_model, trust_remote_code=True) - config = AutoConfig.from_pretrained(dir_model, trust_remote_code=True) - with open(os.path.join(dir_model, "config.json"), "r", encoding="utf-8") as f: - hparams = json.load(f) - if hparams["architectures"][0] != "FalconForCausalLM": - print("Model architecture not supported: " + hparams["architectures"][0]) - sys.exit(1) print("Loading model: ", dir_model) model = 
AutoModelForCausalLM.from_pretrained(dir_model, config=config, @@ -84,6 +77,13 @@ def main(args_in: Optional[List[str]] = None) -> None: low_cpu_mem_usage=True, trust_remote_code=True) print("Model loaded: ", dir_model) + tokenizer = AutoTokenizer.from_pretrained(dir_model, trust_remote_code=True) + config = AutoConfig.from_pretrained(dir_model, trust_remote_code=True) + with open(os.path.join(dir_model, "config.json"), "r", encoding="utf-8") as f: + hparams = json.load(f) + if hparams["architectures"][0] != "FalconForCausalLM": + print("Model architecture not supported: " + hparams["architectures"][0]) + sys.exit(1) n_head_kv = hparams.get("num_kv_heads", 1) n_head = hparams["num_attention_heads"] diff --git a/neural_speed/convert/convert_gptj.py b/neural_speed/convert/convert_gptj.py index 8c67bbec1..48acf3648 100644 --- a/neural_speed/convert/convert_gptj.py +++ b/neural_speed/convert/convert_gptj.py @@ -73,8 +73,8 @@ def main(args_in: Optional[List[str]] = None) -> None: else: from transformers import AutoModelForCausalLM, AutoTokenizer print("Loading model: ", dir_model) - tokenizer = AutoTokenizer.from_pretrained(dir_model, trust_remote_code=True) model = AutoModelForCausalLM.from_pretrained(dir_model, low_cpu_mem_usage=True) + tokenizer = AutoTokenizer.from_pretrained(dir_model, trust_remote_code=True) hparams = model.config.to_dict() list_vars = model.state_dict() fout = open(fname_out, "wb") diff --git a/neural_speed/convert/convert_gptneox.py b/neural_speed/convert/convert_gptneox.py index cee3c7bbd..8b491a840 100644 --- a/neural_speed/convert/convert_gptneox.py +++ b/neural_speed/convert/convert_gptneox.py @@ -79,9 +79,9 @@ def main(args_in: Optional[List[str]] = None) -> None: from modelscope import AutoModelForCausalLM, AutoTokenizer else: from transformers import AutoModelForCausalLM, AutoTokenizer - tokenizer = AutoTokenizer.from_pretrained(dir_model) print("Loading model: ", dir_model) model = AutoModelForCausalLM.from_pretrained(dir_model, torch_dtype=torch.float16 if ftype == 1 else torch.float32) + tokenizer = AutoTokenizer.from_pretrained(dir_model) model.eval() for p in model.parameters(): p.requires_grad = False diff --git a/neural_speed/convert/convert_llama.py b/neural_speed/convert/convert_llama.py index 70b1b9384..cbf83f2c8 100644 --- a/neural_speed/convert/convert_llama.py +++ b/neural_speed/convert/convert_llama.py @@ -1453,6 +1453,7 @@ def main(args_in: Optional[List[str]] = None) -> None: else: from transformers import AutoModelForCausalLM, AutoTokenizer model = AutoModelForCausalLM.from_pretrained(str(args.model), low_cpu_mem_usage=True, trust_remote_code=True) + import pdb;pdb.set_trace() tokenizer = AutoTokenizer.from_pretrained(str(args.model), trust_remote_code=True) cache_path = Path(tokenizer.vocab_file).parent args.model = cache_path diff --git a/neural_speed/convert/convert_mpt.py b/neural_speed/convert/convert_mpt.py index 0ad152141..e7ffe6929 100644 --- a/neural_speed/convert/convert_mpt.py +++ b/neural_speed/convert/convert_mpt.py @@ -67,8 +67,8 @@ def main(args_in: Optional[List[str]] = None) -> None: from modelscope import AutoModelForCausalLM, AutoTokenizer else: from transformers import AutoModelForCausalLM, AutoTokenizer - tokenizer = AutoTokenizer.from_pretrained(dir_model, trust_remote_code=True) model = AutoModelForCausalLM.from_pretrained(dir_model, low_cpu_mem_usage=True, trust_remote_code=True) + tokenizer = AutoTokenizer.from_pretrained(dir_model, trust_remote_code=True) hparams = model.config.to_dict() list_vars = model.state_dict() 
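Under the hood, `convert_model` dispatches to one of the per-architecture scripts patched above and forwards the hub choice via the new `--model_hub` flag. A rough, hedged equivalent of that subprocess call is sketched below; the script name, model id, and output path are illustrative placeholders rather than values taken from the patches.

```python
# Rough equivalent of the command convert_model assembles (see convert/__init__.py):
# python <convert_script> --outfile <out> --outtype f32 --model_hub modelscope <model>
import subprocess

subprocess.run(
    [
        "python", "neural_speed/convert/convert_qwen.py",  # script chosen via model_maps
        "--outfile", "ne_qwen1.5_f32.bin",                 # placeholder output path
        "--outtype", "f32",
        "--model_hub", "modelscope",
        "qwen/Qwen1.5-7B-Chat",                            # positional model argument
    ],
    check=True,
)
```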
diff --git a/neural_speed/convert/convert_opt.py b/neural_speed/convert/convert_opt.py index 7dcb8620d..182f00c69 100644 --- a/neural_speed/convert/convert_opt.py +++ b/neural_speed/convert/convert_opt.py @@ -77,9 +77,9 @@ def main(args_in: Optional[List[str]] = None) -> None: from modelscope import AutoModelForCausalLM, AutoTokenizer else: from transformers import AutoModelForCausalLM, AutoTokenizer - tokenizer = AutoTokenizer.from_pretrained(dir_model) print("Loading model: ", dir_model) model = AutoModelForCausalLM.from_pretrained(dir_model, torch_dtype=torch.float16 if ftype == 1 else torch.float32) + tokenizer = AutoTokenizer.from_pretrained(dir_model) model.eval() hparams = model.config.to_dict() diff --git a/neural_speed/convert/convert_phi.py b/neural_speed/convert/convert_phi.py index dad0b9574..3e45ad031 100644 --- a/neural_speed/convert/convert_phi.py +++ b/neural_speed/convert/convert_phi.py @@ -289,9 +289,9 @@ def main(args_in: Optional[List[str]] = None) -> None: from modelscope import AutoModelForCausalLM, AutoTokenizer else: from transformers import AutoModelForCausalLM, AutoTokenizer - tokenizer = AutoTokenizer.from_pretrained(dir_model, trust_remote_code=True) print("Loading model: ", dir_model) model = AutoModelForCausalLM.from_pretrained(dir_model, trust_remote_code=True) + tokenizer = AutoTokenizer.from_pretrained(dir_model, trust_remote_code=True) hparams = model.config.to_dict() if args.format == "GGUF": phi_convert_gguf(model, tokenizer, dir_model, fname_out, ftype, hparams) diff --git a/neural_speed/convert/convert_qwen.py b/neural_speed/convert/convert_qwen.py index 329b03245..7c6eb159b 100644 --- a/neural_speed/convert/convert_qwen.py +++ b/neural_speed/convert/convert_qwen.py @@ -73,8 +73,6 @@ def main(args_in: Optional[List[str]] = None) -> None: # ftype == 0 -> float32 # ftype == 1 -> float16 ftype = 0 - import pdb - pdb.set_trace() if args.outtype == "f16": ftype = 1 if args.model_hub == "modelscope": @@ -82,8 +80,8 @@ def main(args_in: Optional[List[str]] = None) -> None: else: from transformers import AutoModelForCausalLM, AutoTokenizer print("Loading model: ", dir_model) - tokenizer = AutoTokenizer.from_pretrained(dir_model) model = AutoModelForCausalLM.from_pretrained(dir_model) + tokenizer = AutoTokenizer.from_pretrained(dir_model) model.eval() for p in model.parameters(): p.requires_grad = False diff --git a/neural_speed/convert/convert_starcoder.py b/neural_speed/convert/convert_starcoder.py index 628b431b9..fd32d2109 100644 --- a/neural_speed/convert/convert_starcoder.py +++ b/neural_speed/convert/convert_starcoder.py @@ -74,15 +74,15 @@ def main(args_in: Optional[List[str]] = None) -> None: else: from transformers import AutoConfig, AutoModelForCausalLM, AutoTokenizer print("Loading model: ", dir_model) - tokenizer = AutoTokenizer.from_pretrained(dir_model) - config = AutoConfig.from_pretrained(dir_model, trust_remote_code=True) - hparams = config.to_dict() model = AutoModelForCausalLM.from_pretrained(dir_model, config=config, torch_dtype=torch.float16 \ if use_f16 else torch.float32, low_cpu_mem_usage=True, trust_remote_code=True) print("Model loaded: ", dir_model) + tokenizer = AutoTokenizer.from_pretrained(dir_model) + config = AutoConfig.from_pretrained(dir_model, trust_remote_code=True) + hparams = config.to_dict() list_vars = model.state_dict() From c18ea8ae0c91bb388fd79b22de123d763edb2113 Mon Sep 17 00:00:00 2001 From: "pre-commit-ci[bot]" <66853113+pre-commit-ci[bot]@users.noreply.github.com> Date: Wed, 6 Mar 2024 09:10:32 +0000 Subject: 
[PATCH 05/16] [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci --- neural_speed/convert/convert_bloom.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/neural_speed/convert/convert_bloom.py b/neural_speed/convert/convert_bloom.py index 6b9415f96..f42e47169 100644 --- a/neural_speed/convert/convert_bloom.py +++ b/neural_speed/convert/convert_bloom.py @@ -79,7 +79,7 @@ def main(args_in: Optional[List[str]] = None) -> None: config = AutoConfig.from_pretrained(dir_model, trust_remote_code=True) hparams = config.to_dict() print("Loading model: ", dir_model) - + print("Model loaded: ", dir_model) fout = open(fname_out, "wb") From ab51eddea540c486a834511f25cc2e4e4d525c27 Mon Sep 17 00:00:00 2001 From: intellinjun Date: Wed, 6 Mar 2024 17:12:40 +0800 Subject: [PATCH 06/16] fix convert error Signed-off-by: intellinjun --- neural_speed/convert/convert_baichuan.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/neural_speed/convert/convert_baichuan.py b/neural_speed/convert/convert_baichuan.py index 24cf8207b..dc0d36593 100644 --- a/neural_speed/convert/convert_baichuan.py +++ b/neural_speed/convert/convert_baichuan.py @@ -248,9 +248,9 @@ def main(args_in: Optional[List[str]] = None) -> None: else: from transformers import AutoConfig, AutoModelForCausalLM, AutoTokenizer print("Loading model: ", dir_model) + model = AutoModelForCausalLM.from_pretrained(dir_model, trust_remote_code=True) config = AutoConfig.from_pretrained(dir_model, trust_remote_code=True) tokenizer = AutoTokenizer.from_pretrained(dir_model, trust_remote_code=True) - model = AutoModelForCausalLM.from_pretrained(dir_model, trust_remote_code=True) hparams = config.to_dict() From 18af765f50e547e0c8557464d451d6ab1b7646ba Mon Sep 17 00:00:00 2001 From: intellinjun Date: Wed, 6 Mar 2024 18:40:28 +0800 Subject: [PATCH 07/16] fix format error Signed-off-by: intellinjun --- neural_speed/__init__.py | 2 +- neural_speed/convert/convert_baichuan.py | 3 ++- neural_speed/convert/convert_bloom.py | 5 +++-- neural_speed/convert/convert_chatglm.py | 3 ++- neural_speed/convert/convert_dolly.py | 3 ++- neural_speed/convert/convert_falcon.py | 2 +- neural_speed/convert/convert_gptj.py | 3 ++- neural_speed/convert/convert_gptneox.py | 3 ++- neural_speed/convert/convert_llama.py | 6 ++++-- neural_speed/convert/convert_mistral.py | 3 ++- neural_speed/convert/convert_mixtral.py | 3 ++- neural_speed/convert/convert_mpt.py | 3 ++- neural_speed/convert/convert_opt.py | 3 ++- neural_speed/convert/convert_phi.py | 3 ++- neural_speed/convert/convert_qwen.py | 3 ++- neural_speed/convert/convert_starcoder.py | 5 +++-- neural_speed/convert/convert_whisper.py | 3 ++- 17 files changed, 36 insertions(+), 20 deletions(-) diff --git a/neural_speed/__init__.py b/neural_speed/__init__.py index 65e71e937..12229812e 100644 --- a/neural_speed/__init__.py +++ b/neural_speed/__init__.py @@ -18,7 +18,6 @@ import torch from neural_speed.convert import convert_model -from transformers import AutoConfig, AutoTokenizer model_maps = {"gpt_neox": "gptneox", "gpt_bigcode": "starcoder"} max_request_num_default = 8 @@ -92,6 +91,7 @@ def init(self, model_name, use_quant=True, use_gptq=False, use_awq=False, use_au from modelscope import AutoConfig self.config = AutoConfig.from_pretrained(model_name, trust_remote_code=True) else: + from transformers import AutoConfig self.config = AutoConfig.from_pretrained(model_name, trust_remote_code=True) model_type = Model.get_model_type(self.config) self.model_type 
= model_type diff --git a/neural_speed/convert/convert_baichuan.py b/neural_speed/convert/convert_baichuan.py index dc0d36593..bcbf6c74c 100644 --- a/neural_speed/convert/convert_baichuan.py +++ b/neural_speed/convert/convert_baichuan.py @@ -230,7 +230,8 @@ def main(args_in: Optional[List[str]] = None) -> None: parser = argparse.ArgumentParser(description="Convert a model to a NE compatible file") parser.add_argument("--outtype", choices=["f32", "f16"], help="output format (default: based on input)") parser.add_argument("--outfile", type=Path, help="path to write to; default: based on input") - parser.add_argument("--model_hub", choices=["huggingface","modelscope"], default = "huggingface", help="hub to load model") + parser.add_argument("--model_hub", choices=["huggingface","modelscope"], + default="huggingface", help="hub to load model") parser.add_argument("model", type=Path, help="directory containing model file") args = parser.parse_args(args_in) diff --git a/neural_speed/convert/convert_bloom.py b/neural_speed/convert/convert_bloom.py index f42e47169..b26fee236 100644 --- a/neural_speed/convert/convert_bloom.py +++ b/neural_speed/convert/convert_bloom.py @@ -53,7 +53,8 @@ def main(args_in: Optional[List[str]] = None) -> None: parser = argparse.ArgumentParser(description="Convert a model to a NE compatible file") parser.add_argument("--outtype", choices=["f32", "f16"], help="output format (default: based on input)") parser.add_argument("--outfile", type=Path, help="path to write to; default: based on input") - parser.add_argument("--model_hub", choices=["huggingface","modelscope"], default = "huggingface", help="hub to load model") + parser.add_argument("--model_hub", choices=["huggingface","modelscope"], + default="huggingface", help="hub to load model") parser.add_argument("model", type=Path, help="directory containing model file") args = parser.parse_args(args_in) @@ -70,13 +71,13 @@ def main(args_in: Optional[List[str]] = None) -> None: from modelscope import AutoConfig, AutoModelForCausalLM, AutoTokenizer else: from transformers import AutoConfig, AutoModelForCausalLM, AutoTokenizer + config = AutoConfig.from_pretrained(dir_model, trust_remote_code=True) model = AutoModelForCausalLM.from_pretrained(dir_model, config=config, torch_dtype=torch.float16 if ftype == 1 else torch.float32, low_cpu_mem_usage=True, trust_remote_code=True) tokenizer = AutoTokenizer.from_pretrained(dir_model) - config = AutoConfig.from_pretrained(dir_model, trust_remote_code=True) hparams = config.to_dict() print("Loading model: ", dir_model) diff --git a/neural_speed/convert/convert_chatglm.py b/neural_speed/convert/convert_chatglm.py index 562cd7763..bf85999a9 100644 --- a/neural_speed/convert/convert_chatglm.py +++ b/neural_speed/convert/convert_chatglm.py @@ -611,7 +611,8 @@ def main(args_in: Optional[List[str]] = None) -> None: parser = argparse.ArgumentParser(description="Convert a model to a NE compatible file") parser.add_argument("--outtype", choices=["f32", "f16"], help="output format (default: based on input)") parser.add_argument("--outfile", type=Path, help="path to write to; default: based on input") - parser.add_argument("--model_hub", choices=["huggingface","modelscope"], default = "huggingface", help="hub to load model") + parser.add_argument("--model_hub", choices=["huggingface","modelscope"], + default="huggingface", help="hub to load model") parser.add_argument("model", type=Path, help="directory containing model file") parser.add_argument("--format", type=str, diff --git 
a/neural_speed/convert/convert_dolly.py b/neural_speed/convert/convert_dolly.py index ac11f332f..828cd02ca 100644 --- a/neural_speed/convert/convert_dolly.py +++ b/neural_speed/convert/convert_dolly.py @@ -61,7 +61,8 @@ def main(args_in: Optional[List[str]] = None) -> None: parser = argparse.ArgumentParser(description="Convert a model to a NE compatible file") parser.add_argument("--outtype", choices=["f32", "f16"], help="output format (default: based on input)") parser.add_argument("--outfile", type=Path, help="path to write to; default: based on input") - parser.add_argument("--model_hub", choices=["huggingface","modelscope"], default = "huggingface", help="hub to load model") + parser.add_argument("--model_hub", choices=["huggingface","modelscope"], + default="huggingface", help="hub to load model") parser.add_argument("model", type=Path, help="directory containing model file") args = parser.parse_args(args_in) diff --git a/neural_speed/convert/convert_falcon.py b/neural_speed/convert/convert_falcon.py index 6c974008c..8bc124e21 100644 --- a/neural_speed/convert/convert_falcon.py +++ b/neural_speed/convert/convert_falcon.py @@ -66,7 +66,7 @@ def main(args_in: Optional[List[str]] = None) -> None: ftype = 0 if args.outtype == "f16": ftype = 1 - if args.model_hub == "modelscope": + if args.model_hub == "modelscope": from modelscope import AutoConfig, AutoModelForCausalLM, AutoTokenizer else: from transformers import AutoConfig, AutoModelForCausalLM, AutoTokenizer diff --git a/neural_speed/convert/convert_gptj.py b/neural_speed/convert/convert_gptj.py index 48acf3648..5afea53d1 100644 --- a/neural_speed/convert/convert_gptj.py +++ b/neural_speed/convert/convert_gptj.py @@ -58,7 +58,8 @@ def main(args_in: Optional[List[str]] = None) -> None: parser = argparse.ArgumentParser(description="Convert a model to a NE compatible file") parser.add_argument("--outtype", choices=["f32", "f16"], help="output format (default: based on input)") parser.add_argument("--outfile", type=Path, help="path to write to; default: based on input") - parser.add_argument("--model_hub", choices=["huggingface","modelscope"], default = "huggingface", help="hub to load model") + parser.add_argument("--model_hub", choices=["huggingface","modelscope"], + default="huggingface", help="hub to load model") parser.add_argument("model", type=Path, help="directory containing model file") args = parser.parse_args(args_in) diff --git a/neural_speed/convert/convert_gptneox.py b/neural_speed/convert/convert_gptneox.py index 8b491a840..6f94ac7cc 100644 --- a/neural_speed/convert/convert_gptneox.py +++ b/neural_speed/convert/convert_gptneox.py @@ -62,7 +62,8 @@ def main(args_in: Optional[List[str]] = None) -> None: parser = argparse.ArgumentParser(description="Convert a model to a NE compatible file") parser.add_argument("--outtype", choices=["f32", "f16"], help="output format (default: based on input)") parser.add_argument("--outfile", type=Path, help="path to write to; default: based on input") - parser.add_argument("--model_hub", choices=["huggingface","modelscope"], default = "huggingface", help="hub to load model") + parser.add_argument("--model_hub", choices=["huggingface","modelscope"], + default="huggingface", help="hub to load model") parser.add_argument("model", type=Path, help="directory containing model file") args = parser.parse_args(args_in) diff --git a/neural_speed/convert/convert_llama.py b/neural_speed/convert/convert_llama.py index cbf83f2c8..239fce45f 100644 --- a/neural_speed/convert/convert_llama.py +++ 
b/neural_speed/convert/convert_llama.py @@ -1422,7 +1422,8 @@ def main(args_in: Optional[List[str]] = None) -> None: type=Path, help="directory containing tokenizer.model, if separate from model file") parser.add_argument("--outfile", type=Path, help="path to write to; default: based on input") - parser.add_argument("--model_hub", choices=["huggingface","modelscope"], default = "huggingface", help="hub to load model") + parser.add_argument("--model_hub", choices=["huggingface","modelscope"], + default="huggingface", help="hub to load model") parser.add_argument("model", type=Path, help="directory containing model file, or model file itself (*.pth, *.pt, *.bin)") @@ -1452,7 +1453,8 @@ def main(args_in: Optional[List[str]] = None) -> None: from modelscope import AutoModelForCausalLM, AutoTokenizer else: from transformers import AutoModelForCausalLM, AutoTokenizer - model = AutoModelForCausalLM.from_pretrained(str(args.model), low_cpu_mem_usage=True, trust_remote_code=True) + model = AutoModelForCausalLM.from_pretrained(str(args.model), low_cpu_mem_usage=True, + trust_remote_code=True) import pdb;pdb.set_trace() tokenizer = AutoTokenizer.from_pretrained(str(args.model), trust_remote_code=True) cache_path = Path(tokenizer.vocab_file).parent diff --git a/neural_speed/convert/convert_mistral.py b/neural_speed/convert/convert_mistral.py index 283a01562..308329a30 100644 --- a/neural_speed/convert/convert_mistral.py +++ b/neural_speed/convert/convert_mistral.py @@ -1297,7 +1297,8 @@ def main(args_in: Optional[List[str]] = None) -> None: type=Path, help="directory containing tokenizer.model, if separate from model file") parser.add_argument("--outfile", type=Path, help="path to write to; default: based on input") - parser.add_argument("--model_hub", choices=["huggingface","modelscope"], default = "huggingface", help="hub to load model") + parser.add_argument("--model_hub", choices=["huggingface","modelscope"], + default="huggingface", help="hub to load model") parser.add_argument("model", type=Path, help="directory containing model file, or model file itself (*.pth, *.pt, *.bin)") diff --git a/neural_speed/convert/convert_mixtral.py b/neural_speed/convert/convert_mixtral.py index 19ccb3d49..b9745106e 100644 --- a/neural_speed/convert/convert_mixtral.py +++ b/neural_speed/convert/convert_mixtral.py @@ -1299,7 +1299,8 @@ def main(args_in: Optional[List[str]] = None) -> None: type=Path, help="directory containing tokenizer.model, if separate from model file") parser.add_argument("--outfile", type=Path, help="path to write to; default: based on input") - parser.add_argument("--model_hub", choices=["huggingface","modelscope"], default = "huggingface", help="hub to load model") + parser.add_argument("--model_hub", choices=["huggingface","modelscope"], + default="huggingface", help="hub to load model") parser.add_argument("model", type=Path, help="directory containing model file, or model file itself (*.pth, *.pt, *.bin)") diff --git a/neural_speed/convert/convert_mpt.py b/neural_speed/convert/convert_mpt.py index e7ffe6929..929b41818 100644 --- a/neural_speed/convert/convert_mpt.py +++ b/neural_speed/convert/convert_mpt.py @@ -51,7 +51,8 @@ def main(args_in: Optional[List[str]] = None) -> None: parser = argparse.ArgumentParser(description="Convert a model to a NE compatible file") parser.add_argument("--outtype", choices=["f32", "f16"], help="output format (default: based on input)") parser.add_argument("--outfile", type=Path, help="path to write to; default: based on input") - 
parser.add_argument("--model_hub", choices=["huggingface","modelscope"], default = "huggingface", help="hub to load model") + parser.add_argument("--model_hub", choices=["huggingface","modelscope"], + default="huggingface", help="hub to load model") parser.add_argument("model", type=Path, help="directory containing model file") args = parser.parse_args(args_in) diff --git a/neural_speed/convert/convert_opt.py b/neural_speed/convert/convert_opt.py index 182f00c69..f95b433f9 100644 --- a/neural_speed/convert/convert_opt.py +++ b/neural_speed/convert/convert_opt.py @@ -60,7 +60,8 @@ def main(args_in: Optional[List[str]] = None) -> None: parser = argparse.ArgumentParser(description="Convert a model to a NE compatible file") parser.add_argument("--outtype", choices=["f32", "f16"], help="output format (default: based on input)") parser.add_argument("--outfile", type=Path, help="path to write to; default: based on input") - parser.add_argument("--model_hub", choices=["huggingface","modelscope"], default = "huggingface", help="hub to load model") + parser.add_argument("--model_hub", choices=["huggingface","modelscope"], + default="huggingface", help="hub to load model") parser.add_argument("model", type=Path, help="directory containing model file") args = parser.parse_args(args_in) diff --git a/neural_speed/convert/convert_phi.py b/neural_speed/convert/convert_phi.py index 3e45ad031..a81069afc 100644 --- a/neural_speed/convert/convert_phi.py +++ b/neural_speed/convert/convert_phi.py @@ -267,7 +267,8 @@ def main(args_in: Optional[List[str]] = None) -> None: parser = argparse.ArgumentParser(description="Convert a model to a NE compatible file") parser.add_argument("--outtype", choices=["f32", "f16"], help="output format (default: based on input)") parser.add_argument("--outfile", type=Path, help="path to write to; default: based on input") - parser.add_argument("--model_hub", choices=["huggingface","modelscope"], default = "huggingface", help="hub to load model") + parser.add_argument("--model_hub", choices=["huggingface","modelscope"], default="huggingface", + help="hub to load model") parser.add_argument("model", type=Path, help="directory containing model file") parser.add_argument("--format", type=str, diff --git a/neural_speed/convert/convert_qwen.py b/neural_speed/convert/convert_qwen.py index 7c6eb159b..d4fabc386 100644 --- a/neural_speed/convert/convert_qwen.py +++ b/neural_speed/convert/convert_qwen.py @@ -62,7 +62,8 @@ def main(args_in: Optional[List[str]] = None) -> None: parser = argparse.ArgumentParser(description="Convert a model to a NE compatible file") parser.add_argument("--outtype", choices=["f32", "f16"], help="output format (default: based on input)") parser.add_argument("--outfile", type=Path, help="path to write to; default: based on input") - parser.add_argument("--model_hub", choices=["huggingface","modelscope"], default = "huggingface", help="hub to load model") + parser.add_argument("--model_hub", choices=["huggingface","modelscope"], + default="huggingface", help="hub to load model") parser.add_argument("model", type=Path, help="directory containing model file") args = parser.parse_args(args_in) diff --git a/neural_speed/convert/convert_starcoder.py b/neural_speed/convert/convert_starcoder.py index fd32d2109..b7febcf82 100644 --- a/neural_speed/convert/convert_starcoder.py +++ b/neural_speed/convert/convert_starcoder.py @@ -56,7 +56,8 @@ def main(args_in: Optional[List[str]] = None) -> None: default="fp32", help="output format (default: based on input)") 
parser.add_argument("--outfile", type=Path, help="path to write to; default: based on input") - parser.add_argument("--model_hub", choices=["huggingface","modelscope"], default = "huggingface", help="hub to load model") + parser.add_argument("--model_hub", choices=["huggingface","modelscope"], + default="huggingface", help="hub to load model") parser.add_argument("model", type=Path, help="directory containing model file") args = parser.parse_args(args_in) @@ -74,6 +75,7 @@ def main(args_in: Optional[List[str]] = None) -> None: else: from transformers import AutoConfig, AutoModelForCausalLM, AutoTokenizer print("Loading model: ", dir_model) + config = AutoConfig.from_pretrained(dir_model, trust_remote_code=True) model = AutoModelForCausalLM.from_pretrained(dir_model, config=config, torch_dtype=torch.float16 \ if use_f16 else torch.float32, @@ -81,7 +83,6 @@ def main(args_in: Optional[List[str]] = None) -> None: trust_remote_code=True) print("Model loaded: ", dir_model) tokenizer = AutoTokenizer.from_pretrained(dir_model) - config = AutoConfig.from_pretrained(dir_model, trust_remote_code=True) hparams = config.to_dict() list_vars = model.state_dict() diff --git a/neural_speed/convert/convert_whisper.py b/neural_speed/convert/convert_whisper.py index 397cce022..36472bc1a 100644 --- a/neural_speed/convert/convert_whisper.py +++ b/neural_speed/convert/convert_whisper.py @@ -97,7 +97,8 @@ def main(args_in: Optional[List[str]] = None) -> None: default="fp32", help="output format (default: based on input)") parser.add_argument("--outfile", type=Path, help="path to write to; default: based on input") - parser.add_argument("--model_hub", choices=["huggingface","modelscope"], default = "huggingface", help="hub to load model") + parser.add_argument("--model_hub", choices=["huggingface","modelscope"], + default="huggingface", help="hub to load model") parser.add_argument("model", type=Path, help="directory containing model file") args = parser.parse_args(args_in) dir_model = args.model From b4ffc40a292001d4d50b6733d0f730c9b8eb4046 Mon Sep 17 00:00:00 2001 From: "pre-commit-ci[bot]" <66853113+pre-commit-ci[bot]@users.noreply.github.com> Date: Wed, 6 Mar 2024 10:40:46 +0000 Subject: [PATCH 08/16] [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci --- neural_speed/convert/convert_baichuan.py | 2 +- neural_speed/convert/convert_chatglm.py | 2 +- neural_speed/convert/convert_dolly.py | 2 +- neural_speed/convert/convert_gptj.py | 2 +- neural_speed/convert/convert_gptneox.py | 2 +- neural_speed/convert/convert_llama.py | 2 +- neural_speed/convert/convert_mistral.py | 2 +- neural_speed/convert/convert_mixtral.py | 2 +- neural_speed/convert/convert_opt.py | 2 +- neural_speed/convert/convert_qwen.py | 2 +- neural_speed/convert/convert_starcoder.py | 2 +- neural_speed/convert/convert_whisper.py | 2 +- 12 files changed, 12 insertions(+), 12 deletions(-) diff --git a/neural_speed/convert/convert_baichuan.py b/neural_speed/convert/convert_baichuan.py index bcbf6c74c..3fddfe29f 100644 --- a/neural_speed/convert/convert_baichuan.py +++ b/neural_speed/convert/convert_baichuan.py @@ -230,7 +230,7 @@ def main(args_in: Optional[List[str]] = None) -> None: parser = argparse.ArgumentParser(description="Convert a model to a NE compatible file") parser.add_argument("--outtype", choices=["f32", "f16"], help="output format (default: based on input)") parser.add_argument("--outfile", type=Path, help="path to write to; default: based on input") - parser.add_argument("--model_hub", 
choices=["huggingface","modelscope"], + parser.add_argument("--model_hub", choices=["huggingface","modelscope"], default="huggingface", help="hub to load model") parser.add_argument("model", type=Path, help="directory containing model file") args = parser.parse_args(args_in) diff --git a/neural_speed/convert/convert_chatglm.py b/neural_speed/convert/convert_chatglm.py index bf85999a9..47db7f569 100644 --- a/neural_speed/convert/convert_chatglm.py +++ b/neural_speed/convert/convert_chatglm.py @@ -611,7 +611,7 @@ def main(args_in: Optional[List[str]] = None) -> None: parser = argparse.ArgumentParser(description="Convert a model to a NE compatible file") parser.add_argument("--outtype", choices=["f32", "f16"], help="output format (default: based on input)") parser.add_argument("--outfile", type=Path, help="path to write to; default: based on input") - parser.add_argument("--model_hub", choices=["huggingface","modelscope"], + parser.add_argument("--model_hub", choices=["huggingface","modelscope"], default="huggingface", help="hub to load model") parser.add_argument("model", type=Path, help="directory containing model file") parser.add_argument("--format", diff --git a/neural_speed/convert/convert_dolly.py b/neural_speed/convert/convert_dolly.py index 828cd02ca..28d477297 100644 --- a/neural_speed/convert/convert_dolly.py +++ b/neural_speed/convert/convert_dolly.py @@ -61,7 +61,7 @@ def main(args_in: Optional[List[str]] = None) -> None: parser = argparse.ArgumentParser(description="Convert a model to a NE compatible file") parser.add_argument("--outtype", choices=["f32", "f16"], help="output format (default: based on input)") parser.add_argument("--outfile", type=Path, help="path to write to; default: based on input") - parser.add_argument("--model_hub", choices=["huggingface","modelscope"], + parser.add_argument("--model_hub", choices=["huggingface","modelscope"], default="huggingface", help="hub to load model") parser.add_argument("model", type=Path, help="directory containing model file") args = parser.parse_args(args_in) diff --git a/neural_speed/convert/convert_gptj.py b/neural_speed/convert/convert_gptj.py index 5afea53d1..0670cb3de 100644 --- a/neural_speed/convert/convert_gptj.py +++ b/neural_speed/convert/convert_gptj.py @@ -58,7 +58,7 @@ def main(args_in: Optional[List[str]] = None) -> None: parser = argparse.ArgumentParser(description="Convert a model to a NE compatible file") parser.add_argument("--outtype", choices=["f32", "f16"], help="output format (default: based on input)") parser.add_argument("--outfile", type=Path, help="path to write to; default: based on input") - parser.add_argument("--model_hub", choices=["huggingface","modelscope"], + parser.add_argument("--model_hub", choices=["huggingface","modelscope"], default="huggingface", help="hub to load model") parser.add_argument("model", type=Path, help="directory containing model file") args = parser.parse_args(args_in) diff --git a/neural_speed/convert/convert_gptneox.py b/neural_speed/convert/convert_gptneox.py index 6f94ac7cc..da3937451 100644 --- a/neural_speed/convert/convert_gptneox.py +++ b/neural_speed/convert/convert_gptneox.py @@ -62,7 +62,7 @@ def main(args_in: Optional[List[str]] = None) -> None: parser = argparse.ArgumentParser(description="Convert a model to a NE compatible file") parser.add_argument("--outtype", choices=["f32", "f16"], help="output format (default: based on input)") parser.add_argument("--outfile", type=Path, help="path to write to; default: based on input") - 
parser.add_argument("--model_hub", choices=["huggingface","modelscope"], + parser.add_argument("--model_hub", choices=["huggingface","modelscope"], default="huggingface", help="hub to load model") parser.add_argument("model", type=Path, help="directory containing model file") args = parser.parse_args(args_in) diff --git a/neural_speed/convert/convert_llama.py b/neural_speed/convert/convert_llama.py index 239fce45f..1f2ca4460 100644 --- a/neural_speed/convert/convert_llama.py +++ b/neural_speed/convert/convert_llama.py @@ -1422,7 +1422,7 @@ def main(args_in: Optional[List[str]] = None) -> None: type=Path, help="directory containing tokenizer.model, if separate from model file") parser.add_argument("--outfile", type=Path, help="path to write to; default: based on input") - parser.add_argument("--model_hub", choices=["huggingface","modelscope"], + parser.add_argument("--model_hub", choices=["huggingface","modelscope"], default="huggingface", help="hub to load model") parser.add_argument("model", type=Path, diff --git a/neural_speed/convert/convert_mistral.py b/neural_speed/convert/convert_mistral.py index 308329a30..1889a9860 100644 --- a/neural_speed/convert/convert_mistral.py +++ b/neural_speed/convert/convert_mistral.py @@ -1297,7 +1297,7 @@ def main(args_in: Optional[List[str]] = None) -> None: type=Path, help="directory containing tokenizer.model, if separate from model file") parser.add_argument("--outfile", type=Path, help="path to write to; default: based on input") - parser.add_argument("--model_hub", choices=["huggingface","modelscope"], + parser.add_argument("--model_hub", choices=["huggingface","modelscope"], default="huggingface", help="hub to load model") parser.add_argument("model", type=Path, diff --git a/neural_speed/convert/convert_mixtral.py b/neural_speed/convert/convert_mixtral.py index b9745106e..4166d94be 100644 --- a/neural_speed/convert/convert_mixtral.py +++ b/neural_speed/convert/convert_mixtral.py @@ -1299,7 +1299,7 @@ def main(args_in: Optional[List[str]] = None) -> None: type=Path, help="directory containing tokenizer.model, if separate from model file") parser.add_argument("--outfile", type=Path, help="path to write to; default: based on input") - parser.add_argument("--model_hub", choices=["huggingface","modelscope"], + parser.add_argument("--model_hub", choices=["huggingface","modelscope"], default="huggingface", help="hub to load model") parser.add_argument("model", type=Path, diff --git a/neural_speed/convert/convert_opt.py b/neural_speed/convert/convert_opt.py index f95b433f9..10fb9e4c7 100644 --- a/neural_speed/convert/convert_opt.py +++ b/neural_speed/convert/convert_opt.py @@ -60,7 +60,7 @@ def main(args_in: Optional[List[str]] = None) -> None: parser = argparse.ArgumentParser(description="Convert a model to a NE compatible file") parser.add_argument("--outtype", choices=["f32", "f16"], help="output format (default: based on input)") parser.add_argument("--outfile", type=Path, help="path to write to; default: based on input") - parser.add_argument("--model_hub", choices=["huggingface","modelscope"], + parser.add_argument("--model_hub", choices=["huggingface","modelscope"], default="huggingface", help="hub to load model") parser.add_argument("model", type=Path, help="directory containing model file") args = parser.parse_args(args_in) diff --git a/neural_speed/convert/convert_qwen.py b/neural_speed/convert/convert_qwen.py index d4fabc386..68c06584a 100644 --- a/neural_speed/convert/convert_qwen.py +++ b/neural_speed/convert/convert_qwen.py @@ -62,7 +62,7 
@@ def main(args_in: Optional[List[str]] = None) -> None: parser = argparse.ArgumentParser(description="Convert a model to a NE compatible file") parser.add_argument("--outtype", choices=["f32", "f16"], help="output format (default: based on input)") parser.add_argument("--outfile", type=Path, help="path to write to; default: based on input") - parser.add_argument("--model_hub", choices=["huggingface","modelscope"], + parser.add_argument("--model_hub", choices=["huggingface","modelscope"], default="huggingface", help="hub to load model") parser.add_argument("model", type=Path, help="directory containing model file") args = parser.parse_args(args_in) diff --git a/neural_speed/convert/convert_starcoder.py b/neural_speed/convert/convert_starcoder.py index b7febcf82..105f58e4f 100644 --- a/neural_speed/convert/convert_starcoder.py +++ b/neural_speed/convert/convert_starcoder.py @@ -56,7 +56,7 @@ def main(args_in: Optional[List[str]] = None) -> None: default="fp32", help="output format (default: based on input)") parser.add_argument("--outfile", type=Path, help="path to write to; default: based on input") - parser.add_argument("--model_hub", choices=["huggingface","modelscope"], + parser.add_argument("--model_hub", choices=["huggingface","modelscope"], default="huggingface", help="hub to load model") parser.add_argument("model", type=Path, help="directory containing model file") args = parser.parse_args(args_in) diff --git a/neural_speed/convert/convert_whisper.py b/neural_speed/convert/convert_whisper.py index 36472bc1a..b9718d9e7 100644 --- a/neural_speed/convert/convert_whisper.py +++ b/neural_speed/convert/convert_whisper.py @@ -97,7 +97,7 @@ def main(args_in: Optional[List[str]] = None) -> None: default="fp32", help="output format (default: based on input)") parser.add_argument("--outfile", type=Path, help="path to write to; default: based on input") - parser.add_argument("--model_hub", choices=["huggingface","modelscope"], + parser.add_argument("--model_hub", choices=["huggingface","modelscope"], default="huggingface", help="hub to load model") parser.add_argument("model", type=Path, help="directory containing model file") args = parser.parse_args(args_in) From 34b2829b244c229c1dd2c1553207a489fc5df50a Mon Sep 17 00:00:00 2001 From: intellinjun <105184542+intellinjun@users.noreply.github.com> Date: Wed, 6 Mar 2024 18:42:25 +0800 Subject: [PATCH 09/16] Update convert_llama.py --- neural_speed/convert/convert_llama.py | 1 - 1 file changed, 1 deletion(-) diff --git a/neural_speed/convert/convert_llama.py b/neural_speed/convert/convert_llama.py index 1f2ca4460..37af73ab4 100644 --- a/neural_speed/convert/convert_llama.py +++ b/neural_speed/convert/convert_llama.py @@ -1455,7 +1455,6 @@ def main(args_in: Optional[List[str]] = None) -> None: from transformers import AutoModelForCausalLM, AutoTokenizer model = AutoModelForCausalLM.from_pretrained(str(args.model), low_cpu_mem_usage=True, trust_remote_code=True) - import pdb;pdb.set_trace() tokenizer = AutoTokenizer.from_pretrained(str(args.model), trust_remote_code=True) cache_path = Path(tokenizer.vocab_file).parent args.model = cache_path From 9ce0c6c174082590b20854ff730491c6b3814322 Mon Sep 17 00:00:00 2001 From: intellinjun Date: Wed, 6 Mar 2024 23:01:58 +0800 Subject: [PATCH 10/16] fix format error Signed-off-by: intellinjun --- neural_speed/convert/convert_falcon.py | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/neural_speed/convert/convert_falcon.py b/neural_speed/convert/convert_falcon.py index 8bc124e21..fc519cfac 
100644 --- a/neural_speed/convert/convert_falcon.py +++ b/neural_speed/convert/convert_falcon.py @@ -53,7 +53,8 @@ def main(args_in: Optional[List[str]] = None) -> None: parser = argparse.ArgumentParser(description="Convert a model to a NE compatible file") parser.add_argument("--outtype", choices=["f32", "f16"], help="output format (default: based on input)") parser.add_argument("--outfile", type=Path, help="path to write to; default: based on input") - parser.add_argument("--model_hub", choices=["huggingface","modelscope"], default = "huggingface", help="hub to load model") + parser.add_argument("--model_hub", choices=["huggingface","modelscope"], + default="huggingface", help="hub to load model") parser.add_argument("model", type=Path, help="directory containing model file") args = parser.parse_args(args_in) @@ -71,6 +72,7 @@ def main(args_in: Optional[List[str]] = None) -> None: else: from transformers import AutoConfig, AutoModelForCausalLM, AutoTokenizer print("Loading model: ", dir_model) + config = AutoConfig.from_pretrained(dir_model, trust_remote_code=True) model = AutoModelForCausalLM.from_pretrained(dir_model, config=config, torch_dtype=torch.float16 if ftype == 1 else torch.float32, @@ -78,7 +80,6 @@ def main(args_in: Optional[List[str]] = None) -> None: trust_remote_code=True) print("Model loaded: ", dir_model) tokenizer = AutoTokenizer.from_pretrained(dir_model, trust_remote_code=True) - config = AutoConfig.from_pretrained(dir_model, trust_remote_code=True) with open(os.path.join(dir_model, "config.json"), "r", encoding="utf-8") as f: hparams = json.load(f) if hparams["architectures"][0] != "FalconForCausalLM": From 21ce1b47672b9d58729fd07051deb067a94a40b0 Mon Sep 17 00:00:00 2001 From: "pre-commit-ci[bot]" <66853113+pre-commit-ci[bot]@users.noreply.github.com> Date: Wed, 6 Mar 2024 15:04:47 +0000 Subject: [PATCH 11/16] [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci --- neural_speed/convert/convert_falcon.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/neural_speed/convert/convert_falcon.py b/neural_speed/convert/convert_falcon.py index fc519cfac..d0ac575a2 100644 --- a/neural_speed/convert/convert_falcon.py +++ b/neural_speed/convert/convert_falcon.py @@ -53,7 +53,7 @@ def main(args_in: Optional[List[str]] = None) -> None: parser = argparse.ArgumentParser(description="Convert a model to a NE compatible file") parser.add_argument("--outtype", choices=["f32", "f16"], help="output format (default: based on input)") parser.add_argument("--outfile", type=Path, help="path to write to; default: based on input") - parser.add_argument("--model_hub", choices=["huggingface","modelscope"], + parser.add_argument("--model_hub", choices=["huggingface","modelscope"], default="huggingface", help="hub to load model") parser.add_argument("model", type=Path, help="directory containing model file") args = parser.parse_args(args_in) From 9024130f366dbfae8b6afcb0bcb57382a1fdddc1 Mon Sep 17 00:00:00 2001 From: intellinjun Date: Wed, 6 Mar 2024 23:37:19 +0800 Subject: [PATCH 12/16] fix ut error Signed-off-by: intellinjun --- neural_speed/convert/__init__.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/neural_speed/convert/__init__.py b/neural_speed/convert/__init__.py index ed086badd..18ce11490 100644 --- a/neural_speed/convert/__init__.py +++ b/neural_speed/convert/__init__.py @@ -16,7 +16,6 @@ # limitations under the License. 
from pathlib import Path -from transformers import AutoConfig import subprocess model_maps = {"gpt_neox": "gptneox", "gpt_bigcode": "starcoder", "whisper": "whisper", "qwen2": "qwen"} @@ -27,6 +26,7 @@ def convert_model(model, outfile, outtype="f32", model_hub="huggingface", use_qu from modelscope import AutoConfig config = AutoConfig.from_pretrained(model, trust_remote_code=True) else: + from transformers import AutoConfig config = AutoConfig.from_pretrained(model, trust_remote_code=True) model_type = model_maps.get(config.model_type, config.model_type) From 4732e28b47292ae906b0db4d5927deb4f23c3a65 Mon Sep 17 00:00:00 2001 From: intellinjun <105184542+intellinjun@users.noreply.github.com> Date: Thu, 7 Mar 2024 08:56:28 +0800 Subject: [PATCH 13/16] Update README.md --- README.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/README.md b/README.md index 563274fbb..4a4102186 100644 --- a/README.md +++ b/README.md @@ -67,7 +67,7 @@ from modelscope import AutoTokenizer from transformers import TextStreamer from neural_speed import Model -model_name = "qwen/Qwen1.5-7B-Chat" # Hugging Face model_id or local model +model_name = "qwen/Qwen1.5-7B-Chat" # modelscope model_id or local model prompt = "Once upon a time, there existed a little girl," tokenizer = AutoTokenizer.from_pretrained(model_name, trust_remote_code=True) inputs = tokenizer(prompt, return_tensors="pt").input_ids From 6c8624827b434c61573a02026b9d20f3490ab97e Mon Sep 17 00:00:00 2001 From: intellinjun Date: Thu, 7 Mar 2024 15:14:43 +0800 Subject: [PATCH 14/16] fix qwen convert error Signed-off-by: intellinjun --- neural_speed/convert/convert_qwen.py | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/neural_speed/convert/convert_qwen.py b/neural_speed/convert/convert_qwen.py index 648c8ff16..5f694d5ce 100644 --- a/neural_speed/convert/convert_qwen.py +++ b/neural_speed/convert/convert_qwen.py @@ -130,12 +130,12 @@ def main(args_in: Optional[List[str]] = None) -> None: fout.write(struct.pack("f", 0.0)) # config.json "rope_scaling.factor", not enabled fout.write(struct.pack("i", 0)) # rope_scaling.original_max_position_embeddings fout.write(struct.pack("i", 0)) # params["rope_scaling"]["type"] =="yarn" else 0)) - fout.write( - struct.pack( - "i", hparams["bos_token_id"] if "bos_token_id" in hparams else tokenizer.special_tokens['<|endoftext|>'])) - fout.write( - struct.pack( - "i", hparams["eos_token_id"] if "eos_token_id" in hparams else tokenizer.special_tokens['<|endoftext|>'])) + if hparams['model_type']=='qwen2': + fout.write(struct.pack("i", hparams["bos_token_id"])) + fout.write(struct.pack("i", hparams["eos_token_id"])) + else: + fout.write(struct.pack("i", tokenizer.special_tokens['<|endoftext|>'])) + fout.write(struct.pack("i", tokenizer.special_tokens['<|endoftext|>'])) fout.write(struct.pack("i", tokenizer.pad_token_id if tokenizer.pad_token_id is not None else -1)) fout.write(struct.pack("i", tokenizer.sep_token_id if tokenizer.sep_token_id is not None else -1)) From 26eadba7fa678b2da2e376a63fd54826c97ad9dd Mon Sep 17 00:00:00 2001 From: intellinjun Date: Fri, 8 Mar 2024 13:47:12 +0800 Subject: [PATCH 15/16] fix convert qwen error Signed-off-by: intellinjun --- neural_speed/convert/convert_quantized_qwen.py | 11 +++++------ 1 file changed, 5 insertions(+), 6 deletions(-) diff --git a/neural_speed/convert/convert_quantized_qwen.py b/neural_speed/convert/convert_quantized_qwen.py index fc0b87ed2..149a62660 100644 --- a/neural_speed/convert/convert_quantized_qwen.py +++ 
b/neural_speed/convert/convert_quantized_qwen.py @@ -177,12 +177,11 @@ def main(args_in: Optional[List[str]] = None) -> None: f.write(struct.pack("i", 0)) # rope_scaling.original_max_position_embeddings f.write(struct.pack("i", 0)) # params["rope_scaling"]["type"] =="yarn" else 0)) - f.write( - struct.pack( - "i", hparams["bos_token_id"] if "bos_token_id" in hparams else tokenizer.special_tokens['<|endoftext|>'])) - f.write( - struct.pack( - "i", hparams["eos_token_id"] if "eos_token_id" in hparams else tokenizer.special_tokens['<|endoftext|>'])) + if hparams['model_type']=='qwen2': + f.write(struct.pack("i", hparams["bos_token_id"])) + f.write(struct.pack("i", hparams["eos_token_id"])) + else: + f.write(struct.pack("i", tokenizer.special_tokens['<|endoftext|>'])) f.write(struct.pack("i", tokenizer.pad_token_id if tokenizer.pad_token_id is not None else -1)) f.write(struct.pack("i", tokenizer.sep_token_id if tokenizer.sep_token_id is not None else -1)) From 7fe34c3354bd80f878e077eb182b81f78c06ae57 Mon Sep 17 00:00:00 2001 From: intellinjun Date: Fri, 8 Mar 2024 13:47:54 +0800 Subject: [PATCH 16/16] fix convert qwen error Signed-off-by: intellinjun --- neural_speed/convert/convert_quantized_qwen.py | 1 + 1 file changed, 1 insertion(+) diff --git a/neural_speed/convert/convert_quantized_qwen.py b/neural_speed/convert/convert_quantized_qwen.py index 149a62660..d5444b25d 100644 --- a/neural_speed/convert/convert_quantized_qwen.py +++ b/neural_speed/convert/convert_quantized_qwen.py @@ -182,6 +182,7 @@ def main(args_in: Optional[List[str]] = None) -> None: f.write(struct.pack("i", hparams["eos_token_id"])) else: f.write(struct.pack("i", tokenizer.special_tokens['<|endoftext|>'])) + f.write(struct.pack("i", tokenizer.special_tokens['<|endoftext|>'])) f.write(struct.pack("i", tokenizer.pad_token_id if tokenizer.pad_token_id is not None else -1)) f.write(struct.pack("i", tokenizer.sep_token_id if tokenizer.sep_token_id is not None else -1))
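Usage note: with the series applied end to end, a ModelScope-hosted model can be loaded by passing model_hub="modelscope", and every convert_*.py script now exposes the matching --model_hub flag, so no Hugging Face download is needed. The sketch below extends the README snippet updated in PATCH 13/16; the TextStreamer wiring, the quantization arguments, and the generate() call are illustrative assumptions rather than part of this diff.

    from modelscope import AutoTokenizer   # tokenizer is fetched from the ModelScope hub
    from transformers import TextStreamer
    from neural_speed import Model

    model_name = "qwen/Qwen1.5-7B-Chat"     # modelscope model_id or local model
    prompt = "Once upon a time, there existed a little girl,"

    tokenizer = AutoTokenizer.from_pretrained(model_name, trust_remote_code=True)
    inputs = tokenizer(prompt, return_tensors="pt").input_ids
    streamer = TextStreamer(tokenizer)

    model = Model()
    # model_hub="modelscope" routes config/weight loading through modelscope and is
    # forwarded to the conversion step as "--model_hub modelscope"; the dtype values
    # shown here are illustrative defaults, not something this series changes.
    model.init(model_name, weight_dtype="int4", compute_dtype="int8", model_hub="modelscope")
    outputs = model.generate(inputs, streamer=streamer, max_new_tokens=300)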