From 3e20b523c73c28d7521370365f3293e316eed95a Mon Sep 17 00:00:00 2001
From: Keval Morabia <28916987+kevalmorabia97@users.noreply.github.com>
Date: Tue, 29 Oct 2024 09:52:58 +0530
Subject: [PATCH] Set TE spec name for NeMo to HF checkpoint converters (#11036)

* Set TE spec name for NeMo to HF checkpoint converters

Signed-off-by: Keval Morabia <28916987+kevalmorabia97@users.noreply.github.com>

* Apply isort and black reformatting

Signed-off-by: kevalmorabia97

* Update convert_falcon_nemo_to_hf.py

Signed-off-by: Keval Morabia <28916987+kevalmorabia97@users.noreply.github.com>

---------

Signed-off-by: Keval Morabia <28916987+kevalmorabia97@users.noreply.github.com>
Signed-off-by: kevalmorabia97
Co-authored-by: kevalmorabia97
---
 .../convert_baichuan2_nemo_to_hf.py  |  7 ++++-
 .../convert_chatglm_nemo_to_hf.py    | 25 +++++++++++++++---
 .../convert_falcon_nemo_to_hf.py     |  5 +++-
 .../convert_llama_nemo_to_hf.py      | 26 ++++++++++++++++---
 .../convert_mistral_7b_nemo_to_hf.py |  1 +
 .../convert_mixtral_nemo_to_hf.py    |  1 +
 .../convert_nemotron_nemo_to_hf.py   |  1 +
 .../convert_qwen2_nemo_to_hf.py      |  1 +
 .../convert_starcoder2_nemo_to_hf.py |  1 +
 9 files changed, 58 insertions(+), 10 deletions(-)

diff --git a/scripts/checkpoint_converters/convert_baichuan2_nemo_to_hf.py b/scripts/checkpoint_converters/convert_baichuan2_nemo_to_hf.py
index 18ddb89359420..ec048e4b6f190 100644
--- a/scripts/checkpoint_converters/convert_baichuan2_nemo_to_hf.py
+++ b/scripts/checkpoint_converters/convert_baichuan2_nemo_to_hf.py
@@ -50,7 +50,11 @@ def get_args():
     parser = ArgumentParser()
     parser.add_argument(
-        "--input_name_or_path", type=str, default=None, required=True, help="Path to .nemo file",
+        "--input_name_or_path",
+        type=str,
+        default=None,
+        required=True,
+        help="Path to .nemo file",
     )
     parser.add_argument("--output_path", type=str, default=None, required=True, help="Path to HF .bin file")
     parser.add_argument(
@@ -94,6 +98,7 @@ def convert(input_nemo_file, output_hf_file, precision=None, cpu_only=False) ->
         model_config = MegatronGPTModel.restore_from(input_nemo_file, trainer=dummy_trainer, return_config=True)
         model_config.use_cpu_initialization = True
         model_config.tensor_model_parallel_size = 1
+        model_config.name = "te_gpt"
     else:
         map_location, model_config = None, None

diff --git a/scripts/checkpoint_converters/convert_chatglm_nemo_to_hf.py b/scripts/checkpoint_converters/convert_chatglm_nemo_to_hf.py
index 59bc0a64bbe99..5a8e52ee8be51 100644
--- a/scripts/checkpoint_converters/convert_chatglm_nemo_to_hf.py
+++ b/scripts/checkpoint_converters/convert_chatglm_nemo_to_hf.py
@@ -50,7 +50,11 @@ def get_args():
     parser = ArgumentParser()
     parser.add_argument(
-        "--input_name_or_path", type=str, default=None, required=True, help="Path to .nemo file",
+        "--input_name_or_path",
+        type=str,
+        default=None,
+        required=True,
+        help="Path to .nemo file",
     )
     parser.add_argument("--output_path", type=str, default=None, required=True, help="Path to HF .bin file")
     parser.add_argument(
@@ -90,6 +94,7 @@ def convert(input_nemo_file, output_hf_file, precision=None, cpu_only=False) ->
     model_config = MegatronGPTModel.restore_from(input_nemo_file, trainer=dummy_trainer, return_config=True)
     model_config.tensor_model_parallel_size = 1
     model_config.pipeline_model_parallel_size = 1
+    model_config.name = "te_gpt"
     if cpu_only:
         map_location = torch.device('cpu')
         model_config.use_cpu_initialization = True
@@ -168,9 +173,21 @@ def convert(input_nemo_file, output_hf_file, precision=None, cpu_only=False) ->
         v_slice = torch.arange(heads_per_group + 1, qkv_total_dim, (heads_per_group + 2))
         qkv_bias_base_name = f'transformer.encoder.layers.{l}.self_attention.query_key_value.bias'
-        q_bias = param_to_weights(qkv_bias[q_slice].reshape(-1,))
-        k_bias = param_to_weights(qkv_bias[k_slice].reshape(-1,))
-        v_bias = param_to_weights(qkv_bias[v_slice].reshape(-1,))
+        q_bias = param_to_weights(
+            qkv_bias[q_slice].reshape(
+                -1,
+            )
+        )
+        k_bias = param_to_weights(
+            qkv_bias[k_slice].reshape(
+                -1,
+            )
+        )
+        v_bias = param_to_weights(
+            qkv_bias[v_slice].reshape(
+                -1,
+            )
+        )
         checkpoint[qkv_bias_base_name] = torch.cat((q_bias, k_bias, v_bias))

         # attention dense

diff --git a/scripts/checkpoint_converters/convert_falcon_nemo_to_hf.py b/scripts/checkpoint_converters/convert_falcon_nemo_to_hf.py
index 997f0ac23835a..da8f15b92649c 100644
--- a/scripts/checkpoint_converters/convert_falcon_nemo_to_hf.py
+++ b/scripts/checkpoint_converters/convert_falcon_nemo_to_hf.py
@@ -51,7 +51,10 @@ def get_args():
     parser = ArgumentParser()
     parser.add_argument(
-        "--input_name_or_path", type=str, required=True, help="Path to .nemo file",
+        "--input_name_or_path",
+        type=str,
+        required=True,
+        help="Path to .nemo file",
     )
     parser.add_argument("--output_path", type=str, required=True, help="Path to HF .bin file")
     parser.add_argument(

diff --git a/scripts/checkpoint_converters/convert_llama_nemo_to_hf.py b/scripts/checkpoint_converters/convert_llama_nemo_to_hf.py
index 8da15148dfd87..a3c40676a9807 100644
--- a/scripts/checkpoint_converters/convert_llama_nemo_to_hf.py
+++ b/scripts/checkpoint_converters/convert_llama_nemo_to_hf.py
@@ -53,7 +53,11 @@ def get_args():
     parser = ArgumentParser()
     parser.add_argument(
-        "--input_name_or_path", type=str, default=None, required=True, help="Path to .nemo file or extracted folder",
+        "--input_name_or_path",
+        type=str,
+        default=None,
+        required=True,
+        help="Path to .nemo file or extracted folder",
     )
     parser.add_argument("--output_path", type=str, default=None, required=True, help="Path to HF .bin file")
     parser.add_argument(
@@ -105,6 +109,7 @@ def convert(input_nemo_file, output_hf_file, precision=None, cpu_only=False) ->
     model_config = MegatronGPTModel.restore_from(input_nemo_file, trainer=dummy_trainer, return_config=True)
     model_config.tensor_model_parallel_size = 1
     model_config.pipeline_model_parallel_size = 1
+    model_config.name = "te_gpt"
     if cpu_only:
         map_location = torch.device('cpu')
         model_config.use_cpu_initialization = True
@@ -226,13 +231,26 @@ def convert(input_nemo_file, output_hf_file, precision=None, cpu_only=False) ->

 def replace_hf_weights_and_tokenizer(
-    weights_file, dtype, input_hf_path, output_hf_path, tokenizer_path, output_hf_tokenizer,
+    weights_file,
+    dtype,
+    input_hf_path,
+    output_hf_path,
+    tokenizer_path,
+    output_hf_tokenizer,
 ):
-    model = AutoModelForCausalLM.from_pretrained(input_hf_path, local_files_only=True, torch_dtype=dtype,)
+    model = AutoModelForCausalLM.from_pretrained(
+        input_hf_path,
+        local_files_only=True,
+        torch_dtype=dtype,
+    )
     nemo_exported = torch.load(weights_file)

     if tokenizer_path:
-        tokenizer = LlamaTokenizer.from_pretrained(tokenizer_path, local_files_only=True, legacy=False,)
+        tokenizer = LlamaTokenizer.from_pretrained(
+            tokenizer_path,
+            local_files_only=True,
+            legacy=False,
+        )
         tmp_tokenizer = convert_slow_tokenizer.convert_slow_tokenizer(tokenizer)
         fast_tokenizer = LlamaTokenizerFast(tokenizer_object=tmp_tokenizer)
         tokenizer_length = len(fast_tokenizer)

diff --git a/scripts/checkpoint_converters/convert_mistral_7b_nemo_to_hf.py b/scripts/checkpoint_converters/convert_mistral_7b_nemo_to_hf.py
index 796819c38ba44..b8c30a1b929d2 100644
--- a/scripts/checkpoint_converters/convert_mistral_7b_nemo_to_hf.py
+++ b/scripts/checkpoint_converters/convert_mistral_7b_nemo_to_hf.py
@@ -81,6 +81,7 @@ def convert(in_file, precision=None, cpu_only=True) -> None:
     model_config.tensor_model_parallel_size = 1
     model_config.pipeline_model_parallel_size = 1
     model_config.sequence_parallel = False
+    model_config.name = "te_gpt"
     if cpu_only:
         map_location = torch.device('cpu')
         model_config.use_cpu_initialization = True

diff --git a/scripts/checkpoint_converters/convert_mixtral_nemo_to_hf.py b/scripts/checkpoint_converters/convert_mixtral_nemo_to_hf.py
index 58311d0324c2a..2bac2eaad616b 100644
--- a/scripts/checkpoint_converters/convert_mixtral_nemo_to_hf.py
+++ b/scripts/checkpoint_converters/convert_mixtral_nemo_to_hf.py
@@ -83,6 +83,7 @@ def convert(in_file, precision=None) -> None:
     model_config = MegatronGPTModel.restore_from(in_file, trainer=dummy_trainer, return_config=True)
     model_config.tensor_model_parallel_size = 1
     model_config.pipeline_model_parallel_size = 1
+    model_config.name = "te_gpt"
     cpu_only = True
     if cpu_only:
         map_location = torch.device('cpu')

diff --git a/scripts/checkpoint_converters/convert_nemotron_nemo_to_hf.py b/scripts/checkpoint_converters/convert_nemotron_nemo_to_hf.py
index 7a58573278afe..fc0f660cbd425 100644
--- a/scripts/checkpoint_converters/convert_nemotron_nemo_to_hf.py
+++ b/scripts/checkpoint_converters/convert_nemotron_nemo_to_hf.py
@@ -140,6 +140,7 @@ def convert(input_nemo_file, output_hf_file, precision=None, cpu_only=False) ->
     model_config.pipeline_model_parallel_size = 1
     model_config.sequence_parallel = False
     model_config.transformer_engine = True
+    model_config.name = "te_gpt"
     if cpu_only:
         map_location = torch.device("cpu")
         model_config.use_cpu_initialization = True

diff --git a/scripts/checkpoint_converters/convert_qwen2_nemo_to_hf.py b/scripts/checkpoint_converters/convert_qwen2_nemo_to_hf.py
index c6a218020c213..6080499ffdf8c 100644
--- a/scripts/checkpoint_converters/convert_qwen2_nemo_to_hf.py
+++ b/scripts/checkpoint_converters/convert_qwen2_nemo_to_hf.py
@@ -108,6 +108,7 @@ def convert(input_nemo_file, output_hf_file, precision=None, cpu_only=False) ->
     model_config = MegatronGPTModel.restore_from(input_nemo_file, trainer=dummy_trainer, return_config=True)
     model_config.tensor_model_parallel_size = 1
     model_config.pipeline_model_parallel_size = 1
+    model_config.name = "te_gpt"
     if cpu_only:
         map_location = torch.device('cpu')
         model_config.use_cpu_initialization = True

diff --git a/scripts/checkpoint_converters/convert_starcoder2_nemo_to_hf.py b/scripts/checkpoint_converters/convert_starcoder2_nemo_to_hf.py
index 043d1fd35261e..4b65533b74ec4 100644
--- a/scripts/checkpoint_converters/convert_starcoder2_nemo_to_hf.py
+++ b/scripts/checkpoint_converters/convert_starcoder2_nemo_to_hf.py
@@ -89,6 +89,7 @@ def convert(in_file, precision=None, cpu_only=True) -> None:
     model_config = MegatronGPTModel.restore_from(in_file, trainer=dummy_trainer, return_config=True)
     model_config.tensor_model_parallel_size = 1
     model_config.pipeline_model_parallel_size = 1
+    model_config.name = "te_gpt"
    if cpu_only:
         map_location = torch.device('cpu')
         model_config.use_cpu_initialization = True
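
For context, every converter touched here follows the same two-step restore that the hunks show in fragments: fetch only the config from the .nemo file, collapse tensor/pipeline parallelism to 1, and (with this patch) pin model_config.name = "te_gpt" so the model is rebuilt with the Transformer Engine GPT layer spec whose parameter names the weight-mapping code expects. Below is a minimal sketch of that pattern; the paths are placeholders, the dummy-trainer/strategy construction is not shown in this diff and varies across NeMo releases, so treat it as an illustration rather than a drop-in script.

    # Sketch of the restore pattern shared by these converters
    # (placeholder paths; Trainer/strategy details differ between NeMo releases).
    import torch
    from pytorch_lightning import Trainer

    from nemo.collections.nlp.models.language_modeling.megatron_gpt_model import MegatronGPTModel
    from nemo.collections.nlp.parts.nlp_overrides import NLPDDPStrategy

    input_nemo_file = "/path/to/model.nemo"  # placeholder

    dummy_trainer = Trainer(devices=1, accelerator="cpu", strategy=NLPDDPStrategy())

    # Step 1: pull only the config so it can be overridden before the real restore.
    model_config = MegatronGPTModel.restore_from(input_nemo_file, trainer=dummy_trainer, return_config=True)
    model_config.tensor_model_parallel_size = 1
    model_config.pipeline_model_parallel_size = 1
    model_config.use_cpu_initialization = True
    model_config.name = "te_gpt"  # the TE spec name this patch sets

    # Step 2: restore the full model under the overridden config; with name="te_gpt"
    # the module is built from the Transformer Engine layer spec, so its state_dict
    # keys line up with what the converters' weight-mapping code expects.
    model = MegatronGPTModel.restore_from(
        input_nemo_file,
        trainer=dummy_trainer,
        override_config_path=model_config,
        map_location=torch.device("cpu"),
    )
    checkpoint = model.state_dict()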