diff --git a/scripts/checkpoint_converters/convert_baichuan2_nemo_to_hf.py b/scripts/checkpoint_converters/convert_baichuan2_nemo_to_hf.py index 18ddb8935942..ec048e4b6f19 100644 --- a/scripts/checkpoint_converters/convert_baichuan2_nemo_to_hf.py +++ b/scripts/checkpoint_converters/convert_baichuan2_nemo_to_hf.py @@ -50,7 +50,11 @@ def get_args(): parser = ArgumentParser() parser.add_argument( - "--input_name_or_path", type=str, default=None, required=True, help="Path to .nemo file", + "--input_name_or_path", + type=str, + default=None, + required=True, + help="Path to .nemo file", ) parser.add_argument("--output_path", type=str, default=None, required=True, help="Path to HF .bin file") parser.add_argument( @@ -94,6 +98,7 @@ def convert(input_nemo_file, output_hf_file, precision=None, cpu_only=False) -> model_config = MegatronGPTModel.restore_from(input_nemo_file, trainer=dummy_trainer, return_config=True) model_config.use_cpu_initialization = True model_config.tensor_model_parallel_size = 1 + model_config.name = "te_gpt" else: map_location, model_config = None, None diff --git a/scripts/checkpoint_converters/convert_chatglm_nemo_to_hf.py b/scripts/checkpoint_converters/convert_chatglm_nemo_to_hf.py index 59bc0a64bbe9..5a8e52ee8be5 100644 --- a/scripts/checkpoint_converters/convert_chatglm_nemo_to_hf.py +++ b/scripts/checkpoint_converters/convert_chatglm_nemo_to_hf.py @@ -50,7 +50,11 @@ def get_args(): parser = ArgumentParser() parser.add_argument( - "--input_name_or_path", type=str, default=None, required=True, help="Path to .nemo file", + "--input_name_or_path", + type=str, + default=None, + required=True, + help="Path to .nemo file", ) parser.add_argument("--output_path", type=str, default=None, required=True, help="Path to HF .bin file") parser.add_argument( @@ -90,6 +94,7 @@ def convert(input_nemo_file, output_hf_file, precision=None, cpu_only=False) -> model_config = MegatronGPTModel.restore_from(input_nemo_file, trainer=dummy_trainer, return_config=True) model_config.tensor_model_parallel_size = 1 model_config.pipeline_model_parallel_size = 1 + model_config.name = "te_gpt" if cpu_only: map_location = torch.device('cpu') model_config.use_cpu_initialization = True @@ -168,9 +173,21 @@ def convert(input_nemo_file, output_hf_file, precision=None, cpu_only=False) -> v_slice = torch.arange(heads_per_group + 1, qkv_total_dim, (heads_per_group + 2)) qkv_bias_base_name = f'transformer.encoder.layers.{l}.self_attention.query_key_value.bias' - q_bias = param_to_weights(qkv_bias[q_slice].reshape(-1,)) - k_bias = param_to_weights(qkv_bias[k_slice].reshape(-1,)) - v_bias = param_to_weights(qkv_bias[v_slice].reshape(-1,)) + q_bias = param_to_weights( + qkv_bias[q_slice].reshape( + -1, + ) + ) + k_bias = param_to_weights( + qkv_bias[k_slice].reshape( + -1, + ) + ) + v_bias = param_to_weights( + qkv_bias[v_slice].reshape( + -1, + ) + ) checkpoint[qkv_bias_base_name] = torch.cat((q_bias, k_bias, v_bias)) # attention dense diff --git a/scripts/checkpoint_converters/convert_falcon_nemo_to_hf.py b/scripts/checkpoint_converters/convert_falcon_nemo_to_hf.py index 997f0ac23835..da8f15b92649 100644 --- a/scripts/checkpoint_converters/convert_falcon_nemo_to_hf.py +++ b/scripts/checkpoint_converters/convert_falcon_nemo_to_hf.py @@ -51,7 +51,10 @@ def get_args(): parser = ArgumentParser() parser.add_argument( - "--input_name_or_path", type=str, required=True, help="Path to .nemo file", + "--input_name_or_path", + type=str, + required=True, + help="Path to .nemo file", ) parser.add_argument("--output_path", type=str, required=True, help="Path to HF .bin file") parser.add_argument( diff --git a/scripts/checkpoint_converters/convert_llama_nemo_to_hf.py b/scripts/checkpoint_converters/convert_llama_nemo_to_hf.py index 8da15148dfd8..a3c40676a980 100644 --- a/scripts/checkpoint_converters/convert_llama_nemo_to_hf.py +++ b/scripts/checkpoint_converters/convert_llama_nemo_to_hf.py @@ -53,7 +53,11 @@ def get_args(): parser = ArgumentParser() parser.add_argument( - "--input_name_or_path", type=str, default=None, required=True, help="Path to .nemo file or extracted folder", + "--input_name_or_path", + type=str, + default=None, + required=True, + help="Path to .nemo file or extracted folder", ) parser.add_argument("--output_path", type=str, default=None, required=True, help="Path to HF .bin file") parser.add_argument( @@ -105,6 +109,7 @@ def convert(input_nemo_file, output_hf_file, precision=None, cpu_only=False) -> model_config = MegatronGPTModel.restore_from(input_nemo_file, trainer=dummy_trainer, return_config=True) model_config.tensor_model_parallel_size = 1 model_config.pipeline_model_parallel_size = 1 + model_config.name = "te_gpt" if cpu_only: map_location = torch.device('cpu') model_config.use_cpu_initialization = True @@ -226,13 +231,26 @@ def convert(input_nemo_file, output_hf_file, precision=None, cpu_only=False) -> def replace_hf_weights_and_tokenizer( - weights_file, dtype, input_hf_path, output_hf_path, tokenizer_path, output_hf_tokenizer, + weights_file, + dtype, + input_hf_path, + output_hf_path, + tokenizer_path, + output_hf_tokenizer, ): - model = AutoModelForCausalLM.from_pretrained(input_hf_path, local_files_only=True, torch_dtype=dtype,) + model = AutoModelForCausalLM.from_pretrained( + input_hf_path, + local_files_only=True, + torch_dtype=dtype, + ) nemo_exported = torch.load(weights_file) if tokenizer_path: - tokenizer = LlamaTokenizer.from_pretrained(tokenizer_path, local_files_only=True, legacy=False,) + tokenizer = LlamaTokenizer.from_pretrained( + tokenizer_path, + local_files_only=True, + legacy=False, + ) tmp_tokenizer = convert_slow_tokenizer.convert_slow_tokenizer(tokenizer) fast_tokenizer = LlamaTokenizerFast(tokenizer_object=tmp_tokenizer) tokenizer_length = len(fast_tokenizer) diff --git a/scripts/checkpoint_converters/convert_mistral_7b_nemo_to_hf.py b/scripts/checkpoint_converters/convert_mistral_7b_nemo_to_hf.py index 796819c38ba4..b8c30a1b929d 100644 --- a/scripts/checkpoint_converters/convert_mistral_7b_nemo_to_hf.py +++ b/scripts/checkpoint_converters/convert_mistral_7b_nemo_to_hf.py @@ -81,6 +81,7 @@ def convert(in_file, precision=None, cpu_only=True) -> None: model_config.tensor_model_parallel_size = 1 model_config.pipeline_model_parallel_size = 1 model_config.sequence_parallel = False + model_config.name = "te_gpt" if cpu_only: map_location = torch.device('cpu') model_config.use_cpu_initialization = True diff --git a/scripts/checkpoint_converters/convert_mixtral_nemo_to_hf.py b/scripts/checkpoint_converters/convert_mixtral_nemo_to_hf.py index 58311d0324c2..2bac2eaad616 100644 --- a/scripts/checkpoint_converters/convert_mixtral_nemo_to_hf.py +++ b/scripts/checkpoint_converters/convert_mixtral_nemo_to_hf.py @@ -83,6 +83,7 @@ def convert(in_file, precision=None) -> None: model_config = MegatronGPTModel.restore_from(in_file, trainer=dummy_trainer, return_config=True) model_config.tensor_model_parallel_size = 1 model_config.pipeline_model_parallel_size = 1 + model_config.name = "te_gpt" cpu_only = True if cpu_only: map_location = torch.device('cpu') diff --git a/scripts/checkpoint_converters/convert_nemotron_nemo_to_hf.py b/scripts/checkpoint_converters/convert_nemotron_nemo_to_hf.py index 7a58573278af..fc0f660cbd42 100644 --- a/scripts/checkpoint_converters/convert_nemotron_nemo_to_hf.py +++ b/scripts/checkpoint_converters/convert_nemotron_nemo_to_hf.py @@ -140,6 +140,7 @@ def convert(input_nemo_file, output_hf_file, precision=None, cpu_only=False) -> model_config.pipeline_model_parallel_size = 1 model_config.sequence_parallel = False model_config.transformer_engine = True + model_config.name = "te_gpt" if cpu_only: map_location = torch.device("cpu") model_config.use_cpu_initialization = True diff --git a/scripts/checkpoint_converters/convert_qwen2_nemo_to_hf.py b/scripts/checkpoint_converters/convert_qwen2_nemo_to_hf.py index c6a218020c21..6080499ffdf8 100644 --- a/scripts/checkpoint_converters/convert_qwen2_nemo_to_hf.py +++ b/scripts/checkpoint_converters/convert_qwen2_nemo_to_hf.py @@ -108,6 +108,7 @@ def convert(input_nemo_file, output_hf_file, precision=None, cpu_only=False) -> model_config = MegatronGPTModel.restore_from(input_nemo_file, trainer=dummy_trainer, return_config=True) model_config.tensor_model_parallel_size = 1 model_config.pipeline_model_parallel_size = 1 + model_config.name = "te_gpt" if cpu_only: map_location = torch.device('cpu') model_config.use_cpu_initialization = True diff --git a/scripts/checkpoint_converters/convert_starcoder2_nemo_to_hf.py b/scripts/checkpoint_converters/convert_starcoder2_nemo_to_hf.py index 043d1fd35261..4b65533b74ec 100644 --- a/scripts/checkpoint_converters/convert_starcoder2_nemo_to_hf.py +++ b/scripts/checkpoint_converters/convert_starcoder2_nemo_to_hf.py @@ -89,6 +89,7 @@ def convert(in_file, precision=None, cpu_only=True) -> None: model_config = MegatronGPTModel.restore_from(in_file, trainer=dummy_trainer, return_config=True) model_config.tensor_model_parallel_size = 1 model_config.pipeline_model_parallel_size = 1 + model_config.name = "te_gpt" if cpu_only: map_location = torch.device('cpu') model_config.use_cpu_initialization = True