From 3a0704482c586acbbb66fe9c3b076b7fee911e60 Mon Sep 17 00:00:00 2001
From: Keval Morabia <28916987+kevalmorabia97@users.noreply.github.com>
Date: Fri, 25 Oct 2024 05:27:20 -0700
Subject: [PATCH 1/3] Set TE spec name for NeMo to HF checkpoint converters

Signed-off-by: Keval Morabia <28916987+kevalmorabia97@users.noreply.github.com>
---
 scripts/checkpoint_converters/convert_baichuan2_nemo_to_hf.py  | 1 +
 scripts/checkpoint_converters/convert_chatglm_nemo_to_hf.py    | 1 +
 scripts/checkpoint_converters/convert_falcon_nemo_to_hf.py     | 1 +
 scripts/checkpoint_converters/convert_llama_nemo_to_hf.py      | 1 +
 scripts/checkpoint_converters/convert_mistral_7b_nemo_to_hf.py | 1 +
 scripts/checkpoint_converters/convert_mixtral_nemo_to_hf.py    | 1 +
 scripts/checkpoint_converters/convert_nemotron_nemo_to_hf.py   | 1 +
 scripts/checkpoint_converters/convert_qwen2_nemo_to_hf.py      | 1 +
 scripts/checkpoint_converters/convert_starcoder2_nemo_to_hf.py | 1 +
 9 files changed, 9 insertions(+)

diff --git a/scripts/checkpoint_converters/convert_baichuan2_nemo_to_hf.py b/scripts/checkpoint_converters/convert_baichuan2_nemo_to_hf.py
index 18ddb8935942..41676b5b34eb 100644
--- a/scripts/checkpoint_converters/convert_baichuan2_nemo_to_hf.py
+++ b/scripts/checkpoint_converters/convert_baichuan2_nemo_to_hf.py
@@ -94,6 +94,7 @@ def convert(input_nemo_file, output_hf_file, precision=None, cpu_only=False) ->
         model_config = MegatronGPTModel.restore_from(input_nemo_file, trainer=dummy_trainer, return_config=True)
         model_config.use_cpu_initialization = True
         model_config.tensor_model_parallel_size = 1
+        model_config.name = "te_gpt"
     else:
         map_location, model_config = None, None

diff --git a/scripts/checkpoint_converters/convert_chatglm_nemo_to_hf.py b/scripts/checkpoint_converters/convert_chatglm_nemo_to_hf.py
index 59bc0a64bbe9..365865b7d18c 100644
--- a/scripts/checkpoint_converters/convert_chatglm_nemo_to_hf.py
+++ b/scripts/checkpoint_converters/convert_chatglm_nemo_to_hf.py
@@ -90,6 +90,7 @@ def convert(input_nemo_file, output_hf_file, precision=None, cpu_only=False) ->
     model_config = MegatronGPTModel.restore_from(input_nemo_file, trainer=dummy_trainer, return_config=True)
     model_config.tensor_model_parallel_size = 1
     model_config.pipeline_model_parallel_size = 1
+    model_config.name = "te_gpt"
     if cpu_only:
         map_location = torch.device('cpu')
         model_config.use_cpu_initialization = True
diff --git a/scripts/checkpoint_converters/convert_falcon_nemo_to_hf.py b/scripts/checkpoint_converters/convert_falcon_nemo_to_hf.py
index 997f0ac23835..d14ac61ed7d7 100644
--- a/scripts/checkpoint_converters/convert_falcon_nemo_to_hf.py
+++ b/scripts/checkpoint_converters/convert_falcon_nemo_to_hf.py
@@ -94,6 +94,7 @@ def convert(input_nemo_file, output_hf_file, precision=None, cpu_only=False) ->
         model_config = MegatronGPTModel.restore_from(input_nemo_file, trainer=dummy_trainer, return_config=True)
         model_config.use_cpu_initialization = True
         model_config.tensor_model_parallel_size = 1
+        model_config.name = "te_gpt"
     else:
         map_location, model_config = None, None

diff --git a/scripts/checkpoint_converters/convert_llama_nemo_to_hf.py b/scripts/checkpoint_converters/convert_llama_nemo_to_hf.py
index 8da15148dfd8..69875148c2e1 100644
--- a/scripts/checkpoint_converters/convert_llama_nemo_to_hf.py
+++ b/scripts/checkpoint_converters/convert_llama_nemo_to_hf.py
@@ -105,6 +105,7 @@ def convert(input_nemo_file, output_hf_file, precision=None, cpu_only=False) ->
     model_config = MegatronGPTModel.restore_from(input_nemo_file, trainer=dummy_trainer, return_config=True)
     model_config.tensor_model_parallel_size = 1
     model_config.pipeline_model_parallel_size = 1
+    model_config.name = "te_gpt"
     if cpu_only:
         map_location = torch.device('cpu')
         model_config.use_cpu_initialization = True
diff --git a/scripts/checkpoint_converters/convert_mistral_7b_nemo_to_hf.py b/scripts/checkpoint_converters/convert_mistral_7b_nemo_to_hf.py
index 796819c38ba4..b8c30a1b929d 100644
--- a/scripts/checkpoint_converters/convert_mistral_7b_nemo_to_hf.py
+++ b/scripts/checkpoint_converters/convert_mistral_7b_nemo_to_hf.py
@@ -81,6 +81,7 @@ def convert(in_file, precision=None, cpu_only=True) -> None:
     model_config.tensor_model_parallel_size = 1
     model_config.pipeline_model_parallel_size = 1
     model_config.sequence_parallel = False
+    model_config.name = "te_gpt"
     if cpu_only:
         map_location = torch.device('cpu')
         model_config.use_cpu_initialization = True
diff --git a/scripts/checkpoint_converters/convert_mixtral_nemo_to_hf.py b/scripts/checkpoint_converters/convert_mixtral_nemo_to_hf.py
index 58311d0324c2..2bac2eaad616 100644
--- a/scripts/checkpoint_converters/convert_mixtral_nemo_to_hf.py
+++ b/scripts/checkpoint_converters/convert_mixtral_nemo_to_hf.py
@@ -83,6 +83,7 @@ def convert(in_file, precision=None) -> None:
     model_config = MegatronGPTModel.restore_from(in_file, trainer=dummy_trainer, return_config=True)
     model_config.tensor_model_parallel_size = 1
     model_config.pipeline_model_parallel_size = 1
+    model_config.name = "te_gpt"
     cpu_only = True
     if cpu_only:
         map_location = torch.device('cpu')
diff --git a/scripts/checkpoint_converters/convert_nemotron_nemo_to_hf.py b/scripts/checkpoint_converters/convert_nemotron_nemo_to_hf.py
index 7a58573278af..fc0f660cbd42 100644
--- a/scripts/checkpoint_converters/convert_nemotron_nemo_to_hf.py
+++ b/scripts/checkpoint_converters/convert_nemotron_nemo_to_hf.py
@@ -140,6 +140,7 @@ def convert(input_nemo_file, output_hf_file, precision=None, cpu_only=False) ->
     model_config.pipeline_model_parallel_size = 1
     model_config.sequence_parallel = False
     model_config.transformer_engine = True
+    model_config.name = "te_gpt"
     if cpu_only:
         map_location = torch.device("cpu")
         model_config.use_cpu_initialization = True
diff --git a/scripts/checkpoint_converters/convert_qwen2_nemo_to_hf.py b/scripts/checkpoint_converters/convert_qwen2_nemo_to_hf.py
index c6a218020c21..6080499ffdf8 100644
--- a/scripts/checkpoint_converters/convert_qwen2_nemo_to_hf.py
+++ b/scripts/checkpoint_converters/convert_qwen2_nemo_to_hf.py
@@ -108,6 +108,7 @@ def convert(input_nemo_file, output_hf_file, precision=None, cpu_only=False) ->
     model_config = MegatronGPTModel.restore_from(input_nemo_file, trainer=dummy_trainer, return_config=True)
     model_config.tensor_model_parallel_size = 1
     model_config.pipeline_model_parallel_size = 1
+    model_config.name = "te_gpt"
     if cpu_only:
         map_location = torch.device('cpu')
         model_config.use_cpu_initialization = True
diff --git a/scripts/checkpoint_converters/convert_starcoder2_nemo_to_hf.py b/scripts/checkpoint_converters/convert_starcoder2_nemo_to_hf.py
index 043d1fd35261..4b65533b74ec 100644
--- a/scripts/checkpoint_converters/convert_starcoder2_nemo_to_hf.py
+++ b/scripts/checkpoint_converters/convert_starcoder2_nemo_to_hf.py
@@ -89,6 +89,7 @@ def convert(in_file, precision=None, cpu_only=True) -> None:
     model_config = MegatronGPTModel.restore_from(in_file, trainer=dummy_trainer, return_config=True)
     model_config.tensor_model_parallel_size = 1
     model_config.pipeline_model_parallel_size = 1
+    model_config.name = "te_gpt"
     if cpu_only:
         map_location = torch.device('cpu')
         model_config.use_cpu_initialization = True

From 035ecedde12054c6f3b16bd46cf9ed28c4aef446 Mon Sep 17 00:00:00 2001
From: kevalmorabia97
Date: Fri, 25 Oct 2024 12:32:02 +0000
Subject: [PATCH 2/3] Apply isort and black reformatting

Signed-off-by: kevalmorabia97
---
 .../convert_baichuan2_nemo_to_hf.py |  6 ++++-
 .../convert_chatglm_nemo_to_hf.py   | 24 +++++++++++++++---
 .../convert_falcon_nemo_to_hf.py    |  5 +++-
 .../convert_llama_nemo_to_hf.py     | 25 ++++++++++++++++---
 4 files changed, 50 insertions(+), 10 deletions(-)

diff --git a/scripts/checkpoint_converters/convert_baichuan2_nemo_to_hf.py b/scripts/checkpoint_converters/convert_baichuan2_nemo_to_hf.py
index 41676b5b34eb..ec048e4b6f19 100644
--- a/scripts/checkpoint_converters/convert_baichuan2_nemo_to_hf.py
+++ b/scripts/checkpoint_converters/convert_baichuan2_nemo_to_hf.py
@@ -50,7 +50,11 @@ def get_args():

     parser = ArgumentParser()
     parser.add_argument(
-        "--input_name_or_path", type=str, default=None, required=True, help="Path to .nemo file",
+        "--input_name_or_path",
+        type=str,
+        default=None,
+        required=True,
+        help="Path to .nemo file",
     )
     parser.add_argument("--output_path", type=str, default=None, required=True, help="Path to HF .bin file")
     parser.add_argument(
diff --git a/scripts/checkpoint_converters/convert_chatglm_nemo_to_hf.py b/scripts/checkpoint_converters/convert_chatglm_nemo_to_hf.py
index 365865b7d18c..5a8e52ee8be5 100644
--- a/scripts/checkpoint_converters/convert_chatglm_nemo_to_hf.py
+++ b/scripts/checkpoint_converters/convert_chatglm_nemo_to_hf.py
@@ -50,7 +50,11 @@ def get_args():

     parser = ArgumentParser()
     parser.add_argument(
-        "--input_name_or_path", type=str, default=None, required=True, help="Path to .nemo file",
+        "--input_name_or_path",
+        type=str,
+        default=None,
+        required=True,
+        help="Path to .nemo file",
     )
     parser.add_argument("--output_path", type=str, default=None, required=True, help="Path to HF .bin file")
     parser.add_argument(
@@ -169,9 +173,21 @@ def convert(input_nemo_file, output_hf_file, precision=None, cpu_only=False) ->
         v_slice = torch.arange(heads_per_group + 1, qkv_total_dim, (heads_per_group + 2))

         qkv_bias_base_name = f'transformer.encoder.layers.{l}.self_attention.query_key_value.bias'
-        q_bias = param_to_weights(qkv_bias[q_slice].reshape(-1,))
-        k_bias = param_to_weights(qkv_bias[k_slice].reshape(-1,))
-        v_bias = param_to_weights(qkv_bias[v_slice].reshape(-1,))
+        q_bias = param_to_weights(
+            qkv_bias[q_slice].reshape(
+                -1,
+            )
+        )
+        k_bias = param_to_weights(
+            qkv_bias[k_slice].reshape(
+                -1,
+            )
+        )
+        v_bias = param_to_weights(
+            qkv_bias[v_slice].reshape(
+                -1,
+            )
+        )
         checkpoint[qkv_bias_base_name] = torch.cat((q_bias, k_bias, v_bias))

         # attention dense
diff --git a/scripts/checkpoint_converters/convert_falcon_nemo_to_hf.py b/scripts/checkpoint_converters/convert_falcon_nemo_to_hf.py
index d14ac61ed7d7..df6935ca0ead 100644
--- a/scripts/checkpoint_converters/convert_falcon_nemo_to_hf.py
+++ b/scripts/checkpoint_converters/convert_falcon_nemo_to_hf.py
@@ -51,7 +51,10 @@ def get_args():

     parser = ArgumentParser()
     parser.add_argument(
-        "--input_name_or_path", type=str, required=True, help="Path to .nemo file",
+        "--input_name_or_path",
+        type=str,
+        required=True,
+        help="Path to .nemo file",
     )
     parser.add_argument("--output_path", type=str, required=True, help="Path to HF .bin file")
     parser.add_argument(
diff --git a/scripts/checkpoint_converters/convert_llama_nemo_to_hf.py b/scripts/checkpoint_converters/convert_llama_nemo_to_hf.py
index 69875148c2e1..a3c40676a980 100644
--- a/scripts/checkpoint_converters/convert_llama_nemo_to_hf.py
+++ b/scripts/checkpoint_converters/convert_llama_nemo_to_hf.py
@@ -53,7 +53,11 @@ def get_args():

     parser = ArgumentParser()
     parser.add_argument(
-        "--input_name_or_path", type=str, default=None, required=True, help="Path to .nemo file or extracted folder",
+        "--input_name_or_path",
+        type=str,
+        default=None,
+        required=True,
+        help="Path to .nemo file or extracted folder",
     )
     parser.add_argument("--output_path", type=str, default=None, required=True, help="Path to HF .bin file")
     parser.add_argument(
@@ -227,13 +231,26 @@ def convert(input_nemo_file, output_hf_file, precision=None, cpu_only=False) ->


 def replace_hf_weights_and_tokenizer(
-    weights_file, dtype, input_hf_path, output_hf_path, tokenizer_path, output_hf_tokenizer,
+    weights_file,
+    dtype,
+    input_hf_path,
+    output_hf_path,
+    tokenizer_path,
+    output_hf_tokenizer,
 ):
-    model = AutoModelForCausalLM.from_pretrained(input_hf_path, local_files_only=True, torch_dtype=dtype,)
+    model = AutoModelForCausalLM.from_pretrained(
+        input_hf_path,
+        local_files_only=True,
+        torch_dtype=dtype,
+    )
     nemo_exported = torch.load(weights_file)

     if tokenizer_path:
-        tokenizer = LlamaTokenizer.from_pretrained(tokenizer_path, local_files_only=True, legacy=False,)
+        tokenizer = LlamaTokenizer.from_pretrained(
+            tokenizer_path,
+            local_files_only=True,
+            legacy=False,
+        )
         tmp_tokenizer = convert_slow_tokenizer.convert_slow_tokenizer(tokenizer)
         fast_tokenizer = LlamaTokenizerFast(tokenizer_object=tmp_tokenizer)
         tokenizer_length = len(fast_tokenizer)

From 7f846a9a2d2ac9ab2837f93f10737208931ed9d8 Mon Sep 17 00:00:00 2001
From: Keval Morabia <28916987+kevalmorabia97@users.noreply.github.com>
Date: Sat, 26 Oct 2024 01:27:11 +0530
Subject: [PATCH 3/3] Update convert_falcon_nemo_to_hf.py

Signed-off-by: Keval Morabia <28916987+kevalmorabia97@users.noreply.github.com>
---
 scripts/checkpoint_converters/convert_falcon_nemo_to_hf.py | 1 -
 1 file changed, 1 deletion(-)

diff --git a/scripts/checkpoint_converters/convert_falcon_nemo_to_hf.py b/scripts/checkpoint_converters/convert_falcon_nemo_to_hf.py
index df6935ca0ead..da8f15b92649 100644
--- a/scripts/checkpoint_converters/convert_falcon_nemo_to_hf.py
+++ b/scripts/checkpoint_converters/convert_falcon_nemo_to_hf.py
@@ -97,7 +97,6 @@ def convert(input_nemo_file, output_hf_file, precision=None, cpu_only=False) ->
         model_config = MegatronGPTModel.restore_from(input_nemo_file, trainer=dummy_trainer, return_config=True)
         model_config.use_cpu_initialization = True
         model_config.tensor_model_parallel_size = 1
-        model_config.name = "te_gpt"
     else:
         map_location, model_config = None, None
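
Taken together, every hunk in PATCH 1/3 applies the same one-line change to a
config-restore sequence the converters share. Below is a minimal sketch of that
sequence with the new spec name in place; the .nemo path is a hypothetical
placeholder and the bare Trainer is an abbreviation (the real scripts build
their dummy trainer with additional settings), so treat this as an illustration
of the pattern, not a verbatim excerpt:

    import torch
    from pytorch_lightning import Trainer

    from nemo.collections.nlp.models.language_modeling.megatron_gpt_model import MegatronGPTModel

    nemo_file = "/path/to/model.nemo"  # hypothetical placeholder, not from the patch
    dummy_trainer = Trainer(devices=1, accelerator="cpu")  # abbreviated trainer setup

    # Restore only the config from the .nemo archive, then force single-device
    # parallelism and set the Transformer Engine GPT layer-spec name (the line
    # this patch series adds) before loading the full model on CPU.
    model_config = MegatronGPTModel.restore_from(nemo_file, trainer=dummy_trainer, return_config=True)
    model_config.tensor_model_parallel_size = 1
    model_config.pipeline_model_parallel_size = 1
    model_config.name = "te_gpt"
    model_config.use_cpu_initialization = True

    model = MegatronGPTModel.restore_from(
        nemo_file,
        trainer=dummy_trainer,
        override_config_path=model_config,  # reload with the patched config
        map_location=torch.device("cpu"),
    )

Without the spec name, a checkpoint restored this way may fall back to a
non-TE layer spec whose parameter names do not match the saved weights, which
is what setting model_config.name up front avoids.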