From 4c9169f0a340454739156b7e9ad76dd1041ddc2b Mon Sep 17 00:00:00 2001 From: lukex Date: Wed, 30 Oct 2024 20:36:59 +0800 Subject: [PATCH 1/2] updated qwen tokenizer config when converting to nemo format --- .../convert_qwen2_hf_to_nemo.py | 12 ++++++++++-- 1 file changed, 10 insertions(+), 2 deletions(-) diff --git a/scripts/checkpoint_converters/convert_qwen2_hf_to_nemo.py b/scripts/checkpoint_converters/convert_qwen2_hf_to_nemo.py index 223c7af50843..b6974acca0d8 100644 --- a/scripts/checkpoint_converters/convert_qwen2_hf_to_nemo.py +++ b/scripts/checkpoint_converters/convert_qwen2_hf_to_nemo.py @@ -81,8 +81,16 @@ def load_config(args, qwen_config): nemo_config.num_query_groups = qwen_config['num_key_value_heads'] nemo_config.use_cpu_initialization = True nemo_config.activation = 'fast-swiglu' - nemo_config.tokenizer.type = str(args.input_name_or_path) - nemo_config.tokenizer.model = str(args.input_name_or_path) + '/vocab.json' + + # use HF tokenizer + tokenizer_dict = { + 'library': 'huggingface', + 'type': args.input_name_or_path, + 'use_fast': True, + 'trust_remote_code': True, + } + nemo_config.tokenizer = tokenizer_dict + nemo_config.override_vocab_size = qwen_config['vocab_size'] base = 128 From 972cfb801b6c0fdf7686588bffec3599cba10a8c Mon Sep 17 00:00:00 2001 From: chrjxj Date: Wed, 30 Oct 2024 12:52:18 +0000 Subject: [PATCH 2/2] Apply isort and black reformatting Signed-off-by: chrjxj --- scripts/checkpoint_converters/convert_qwen2_hf_to_nemo.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/scripts/checkpoint_converters/convert_qwen2_hf_to_nemo.py b/scripts/checkpoint_converters/convert_qwen2_hf_to_nemo.py index b6974acca0d8..a29a58557c0c 100644 --- a/scripts/checkpoint_converters/convert_qwen2_hf_to_nemo.py +++ b/scripts/checkpoint_converters/convert_qwen2_hf_to_nemo.py @@ -90,7 +90,7 @@ def load_config(args, qwen_config): 'trust_remote_code': True, } nemo_config.tokenizer = tokenizer_dict - + nemo_config.override_vocab_size = qwen_config['vocab_size'] base = 128