check actual value of vocab_file (#11228)
* check actual value of vocab_file

Signed-off-by: Alexandros Koumparoulis <[email protected]>

* pylint

Signed-off-by: Alexandros Koumparoulis <[email protected]>

* Apply isort and black reformatting

Signed-off-by: akoumpa <[email protected]>

---------

Signed-off-by: Alexandros Koumparoulis <[email protected]>
Signed-off-by: akoumpa <[email protected]>
Co-authored-by: akoumpa <[email protected]>
akoumpa authored Nov 8, 2024
1 parent 2b8a160 commit 8a191f3
Showing 1 changed file with 12 additions and 1 deletion.
scripts/checkpoint_converters/convert_mistral_7b_hf_to_nemo.py
@@ -46,6 +46,7 @@


 def get_args():
+    """parses cli args"""
     parser = ArgumentParser()
     parser.add_argument(
         "--input_name_or_path",
@@ -63,6 +64,7 @@ def get_args():


 def restore_model_from_checkpoint(cls, checkpoint, strict, **kwargs):
+    """Loads mcore ckpt"""
     try:
         if 'cfg' in kwargs:
             model = ptl_load_state(cls, checkpoint, strict=strict, **kwargs)
@@ -103,6 +105,7 @@ def restore_model_from_checkpoint(cls, checkpoint, strict, **kwargs):


 def load_config(mistral_config, tokenizer, config_path):
+    """Creates mcore config"""
     nemo_config = OmegaConf.load(
         os.path.join(os.path.dirname(__file__), '../../examples/nlp/language_modeling/conf/megatron_llama_config.yaml')
     ).model
@@ -130,7 +133,7 @@ def load_config(mistral_config, tokenizer, config_path):
     nemo_config.activation = 'fast-swiglu'

     # Tokenizer config
-    if hasattr(tokenizer, 'vocab_file'):
+    if getattr(tokenizer, 'vocab_file', None) is not None:
         nemo_config.tokenizer.model = tokenizer.vocab_file
     elif os.path.exists(os.path.join(config_path, 'tekken.json')):
         # Load tekken.json, extract the 'vocab' field & write it to file.
@@ -177,6 +180,8 @@


 class LazyStateDict:
+    """Lazy"""
+
     def __init__(self, ckpt_index, root):
         self.map = ckpt_index
         self.root = root
@@ -192,6 +197,7 @@ def __getitem__(self, key):


 def load_mistral_ckpt(in_dir, load_model=True):
+    """loads mistral hf ckpt"""
     params_file = os.path.join(in_dir, 'config.json')
     assert os.path.exists(params_file)
     with open(params_file, 'r') as fp:
@@ -217,6 +223,7 @@


 def parse_precision(precision):
+    """parses precision string"""
     if precision in ["32", "16"]:
         return int(float(precision))
     elif precision in ["bf16", "bf16-mixed"]:
@@ -230,6 +237,7 @@


 def make_trainer(args, nemo_config):
+    """creates PTL trainer"""
     model_args, ckpt, tokenizer = load_mistral_ckpt(args.input_name_or_path, load_model=False)
     nemo_config = load_config(model_args, tokenizer, args.input_name_or_path)
     precision = parse_precision(args.precision)
@@ -269,6 +277,7 @@ def make_trainer(args, nemo_config):


 def convert(args):
+    """converts checkpoint from hf to nemo"""
     logging.info(f"loading checkpoint {args.input_name_or_path}")

     model_args, ckpt, tokenizer = load_mistral_ckpt(args.input_name_or_path)
@@ -408,6 +417,7 @@ def convert(args):


 def merge(a: dict, b: dict, path=[]):
+    """merges two state dicts"""
     is_dict = lambda x: isinstance(x, OrderedDict) or isinstance(x, dict)
     for key in b:
         if key in a:
@@ -421,6 +431,7 @@ def merge(a: dict, b: dict, path=[]):


 def save_to_nemo(args, checkpoint):
+    """saves checkpoint to nemo format"""

     logging.info(f"loading checkpoint {args.input_name_or_path}")
     model_args, ckpt, tokenizer = load_mistral_ckpt(args.input_name_or_path, load_model=False)
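The substantive change is the tokenizer check in load_config: hasattr(tokenizer, 'vocab_file') is true even when the attribute exists but holds None, so the converter could set nemo_config.tokenizer.model to None instead of falling through to the tekken.json branch. A minimal sketch of the distinction, using a hypothetical stand-in class that is not part of the commit:

# Hypothetical stand-in: HF tokenizers may define vocab_file but leave it None.
class FakeTokenizer:
    def __init__(self, vocab_file=None):
        self.vocab_file = vocab_file  # attribute always present, value may be None

tok = FakeTokenizer()

# Old check: passes even though there is no usable vocab file.
print(hasattr(tok, 'vocab_file'))                    # True

# New check: passes only when vocab_file holds an actual value.
print(getattr(tok, 'vocab_file', None) is not None)  # False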
