Set TE spec name for NeMo to HF checkpoint converters (NVIDIA#11036)
* Set TE spec name for NeMo to HF checkpoint converters

Signed-off-by: Keval Morabia <[email protected]>

* Apply isort and black reformatting

Signed-off-by: kevalmorabia97 <[email protected]>

* Update convert_falcon_nemo_to_hf.py

Signed-off-by: Keval Morabia <[email protected]>

---------

Signed-off-by: Keval Morabia <[email protected]>
Signed-off-by: kevalmorabia97 <[email protected]>
Co-authored-by: kevalmorabia97 <[email protected]>
2 people authored and HuiyingLi committed Nov 15, 2024
1 parent 4893315 commit ae67d3d
Showing 9 changed files with 58 additions and 10 deletions.
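Every converter touched here follows the same pattern: the model config is restored from the .nemo file with return_config=True, patched, and then used to rebuild the model, and this commit pins the layer-spec name to the Transformer Engine GPT spec ("te_gpt") before that rebuild. A minimal sketch of the pattern, assuming NeMo 1.x import paths; the checkpoint path is a placeholder, and the dummy-trainer setup and override_config_path keyword reflect how these converter scripts are typically structured rather than lines from this diff:

from pytorch_lightning import Trainer
from nemo.collections.nlp.models.language_modeling.megatron_gpt_model import MegatronGPTModel

dummy_trainer = Trainer(devices=1, accelerator='cpu')  # placeholder trainer, never used for training

# First pass: fetch only the config so it can be edited before the model is built.
model_config = MegatronGPTModel.restore_from('/path/to/model.nemo', trainer=dummy_trainer, return_config=True)
model_config.tensor_model_parallel_size = 1
model_config.pipeline_model_parallel_size = 1
model_config.name = "te_gpt"  # the change this commit applies: select the TE layer spec

# Second pass: rebuild the model under the patched config, then export its weights to HF format.
model = MegatronGPTModel.restore_from(
    '/path/to/model.nemo', trainer=dummy_trainer, override_config_path=model_config
)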
@@ -50,7 +50,11 @@
 def get_args():
     parser = ArgumentParser()
     parser.add_argument(
-        "--input_name_or_path", type=str, default=None, required=True, help="Path to .nemo file",
+        "--input_name_or_path",
+        type=str,
+        default=None,
+        required=True,
+        help="Path to .nemo file",
     )
     parser.add_argument("--output_path", type=str, default=None, required=True, help="Path to HF .bin file")
     parser.add_argument(
@@ -94,6 +98,7 @@ def convert(input_nemo_file, output_hf_file, precision=None, cpu_only=False) -> None:
         model_config = MegatronGPTModel.restore_from(input_nemo_file, trainer=dummy_trainer, return_config=True)
         model_config.use_cpu_initialization = True
         model_config.tensor_model_parallel_size = 1
+        model_config.name = "te_gpt"
     else:
         map_location, model_config = None, None
25 changes: 21 additions & 4 deletions scripts/checkpoint_converters/convert_chatglm_nemo_to_hf.py
@@ -50,7 +50,11 @@
 def get_args():
     parser = ArgumentParser()
     parser.add_argument(
-        "--input_name_or_path", type=str, default=None, required=True, help="Path to .nemo file",
+        "--input_name_or_path",
+        type=str,
+        default=None,
+        required=True,
+        help="Path to .nemo file",
     )
     parser.add_argument("--output_path", type=str, default=None, required=True, help="Path to HF .bin file")
     parser.add_argument(
@@ -90,6 +94,7 @@ def convert(input_nemo_file, output_hf_file, precision=None, cpu_only=False) -> None:
     model_config = MegatronGPTModel.restore_from(input_nemo_file, trainer=dummy_trainer, return_config=True)
     model_config.tensor_model_parallel_size = 1
     model_config.pipeline_model_parallel_size = 1
+    model_config.name = "te_gpt"
     if cpu_only:
         map_location = torch.device('cpu')
         model_config.use_cpu_initialization = True
@@ -168,9 +173,21 @@ def convert(input_nemo_file, output_hf_file, precision=None, cpu_only=False) -> None:
         v_slice = torch.arange(heads_per_group + 1, qkv_total_dim, (heads_per_group + 2))

         qkv_bias_base_name = f'transformer.encoder.layers.{l}.self_attention.query_key_value.bias'
-        q_bias = param_to_weights(qkv_bias[q_slice].reshape(-1,))
-        k_bias = param_to_weights(qkv_bias[k_slice].reshape(-1,))
-        v_bias = param_to_weights(qkv_bias[v_slice].reshape(-1,))
+        q_bias = param_to_weights(
+            qkv_bias[q_slice].reshape(
+                -1,
+            )
+        )
+        k_bias = param_to_weights(
+            qkv_bias[k_slice].reshape(
+                -1,
+            )
+        )
+        v_bias = param_to_weights(
+            qkv_bias[v_slice].reshape(
+                -1,
+            )
+        )
         checkpoint[qkv_bias_base_name] = torch.cat((q_bias, k_bias, v_bias))

         # attention dense
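The q/k/v slices in the hunk above pull apart a fused QKV bias in which each key/value group stores its query heads first, then one key head, then one value head, giving a group stride of heads_per_group + 2. A self-contained sketch with illustrative dimensions; the q_slice construction is inferred from the k_slice/v_slice strides visible in the diff, not copied from it:

import torch

heads_per_group = 4      # query heads per KV group (illustrative)
num_query_groups = 2     # illustrative
head_size = 8            # illustrative
group = heads_per_group + 2
qkv_total_dim = num_query_groups * group

# Fused bias viewed as [total heads, head_size]; values are dummies.
qkv_bias = torch.randn(qkv_total_dim, head_size)

# Per group: indices [0, heads_per_group) are Q, then one K head, then one V head.
q_slice = torch.cat([torch.arange(i * group, i * group + heads_per_group) for i in range(num_query_groups)])
k_slice = torch.arange(heads_per_group, qkv_total_dim, group)
v_slice = torch.arange(heads_per_group + 1, qkv_total_dim, group)

# reshape(-1) flattens each slice back to 1-D; the converter then concatenates
# Q, K, V into the single bias tensor the HF checkpoint expects.
fused_for_hf = torch.cat((qkv_bias[q_slice].reshape(-1), qkv_bias[k_slice].reshape(-1), qkv_bias[v_slice].reshape(-1)))
assert fused_for_hf.numel() == qkv_total_dim * head_size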
5 changes: 4 additions & 1 deletion scripts/checkpoint_converters/convert_falcon_nemo_to_hf.py
@@ -51,7 +51,10 @@
 def get_args():
     parser = ArgumentParser()
     parser.add_argument(
-        "--input_name_or_path", type=str, required=True, help="Path to .nemo file",
+        "--input_name_or_path",
+        type=str,
+        required=True,
+        help="Path to .nemo file",
     )
     parser.add_argument("--output_path", type=str, required=True, help="Path to HF .bin file")
     parser.add_argument(
26 changes: 22 additions & 4 deletions scripts/checkpoint_converters/convert_llama_nemo_to_hf.py
@@ -53,7 +53,11 @@
 def get_args():
     parser = ArgumentParser()
     parser.add_argument(
-        "--input_name_or_path", type=str, default=None, required=True, help="Path to .nemo file or extracted folder",
+        "--input_name_or_path",
+        type=str,
+        default=None,
+        required=True,
+        help="Path to .nemo file or extracted folder",
     )
     parser.add_argument("--output_path", type=str, default=None, required=True, help="Path to HF .bin file")
     parser.add_argument(
@@ -105,6 +109,7 @@ def convert(input_nemo_file, output_hf_file, precision=None, cpu_only=False) -> None:
     model_config = MegatronGPTModel.restore_from(input_nemo_file, trainer=dummy_trainer, return_config=True)
     model_config.tensor_model_parallel_size = 1
     model_config.pipeline_model_parallel_size = 1
+    model_config.name = "te_gpt"
     if cpu_only:
         map_location = torch.device('cpu')
         model_config.use_cpu_initialization = True
@@ -226,13 +231,26 @@ def convert(input_nemo_file, output_hf_file, precision=None, cpu_only=False) -> None:


 def replace_hf_weights_and_tokenizer(
-    weights_file, dtype, input_hf_path, output_hf_path, tokenizer_path, output_hf_tokenizer,
+    weights_file,
+    dtype,
+    input_hf_path,
+    output_hf_path,
+    tokenizer_path,
+    output_hf_tokenizer,
 ):
-    model = AutoModelForCausalLM.from_pretrained(input_hf_path, local_files_only=True, torch_dtype=dtype,)
+    model = AutoModelForCausalLM.from_pretrained(
+        input_hf_path,
+        local_files_only=True,
+        torch_dtype=dtype,
+    )
     nemo_exported = torch.load(weights_file)

     if tokenizer_path:
-        tokenizer = LlamaTokenizer.from_pretrained(tokenizer_path, local_files_only=True, legacy=False,)
+        tokenizer = LlamaTokenizer.from_pretrained(
+            tokenizer_path,
+            local_files_only=True,
+            legacy=False,
+        )
         tmp_tokenizer = convert_slow_tokenizer.convert_slow_tokenizer(tokenizer)
         fast_tokenizer = LlamaTokenizerFast(tokenizer_object=tmp_tokenizer)
         tokenizer_length = len(fast_tokenizer)
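The replace_hf_weights_and_tokenizer function reformatted above also converts the slow SentencePiece LlamaTokenizer into a LlamaTokenizerFast via transformers' convert_slow_tokenizer, so the exported HF directory ships a fast tokenizer. For reference, the two required flags shown in the first hunk are how this converter is invoked; a hypothetical run (paths are placeholders, and the script accepts further options truncated out of this diff):

python scripts/checkpoint_converters/convert_llama_nemo_to_hf.py \
    --input_name_or_path /path/to/llama.nemo \
    --output_path /path/to/pytorch_model.bin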
@@ -81,6 +81,7 @@ def convert(in_file, precision=None, cpu_only=True) -> None:
     model_config.tensor_model_parallel_size = 1
     model_config.pipeline_model_parallel_size = 1
     model_config.sequence_parallel = False
+    model_config.name = "te_gpt"
     if cpu_only:
         map_location = torch.device('cpu')
         model_config.use_cpu_initialization = True
@@ -83,6 +83,7 @@ def convert(in_file, precision=None) -> None:
     model_config = MegatronGPTModel.restore_from(in_file, trainer=dummy_trainer, return_config=True)
     model_config.tensor_model_parallel_size = 1
     model_config.pipeline_model_parallel_size = 1
+    model_config.name = "te_gpt"
     cpu_only = True
     if cpu_only:
         map_location = torch.device('cpu')
@@ -140,6 +140,7 @@ def convert(input_nemo_file, output_hf_file, precision=None, cpu_only=False) -> None:
     model_config.pipeline_model_parallel_size = 1
     model_config.sequence_parallel = False
     model_config.transformer_engine = True
+    model_config.name = "te_gpt"
     if cpu_only:
         map_location = torch.device("cpu")
         model_config.use_cpu_initialization = True
1 change: 1 addition & 0 deletions scripts/checkpoint_converters/convert_qwen2_nemo_to_hf.py
@@ -108,6 +108,7 @@ def convert(input_nemo_file, output_hf_file, precision=None, cpu_only=False) -> None:
     model_config = MegatronGPTModel.restore_from(input_nemo_file, trainer=dummy_trainer, return_config=True)
     model_config.tensor_model_parallel_size = 1
     model_config.pipeline_model_parallel_size = 1
+    model_config.name = "te_gpt"
     if cpu_only:
         map_location = torch.device('cpu')
         model_config.use_cpu_initialization = True
@@ -89,6 +89,7 @@ def convert(in_file, precision=None, cpu_only=True) -> None:
     model_config = MegatronGPTModel.restore_from(in_file, trainer=dummy_trainer, return_config=True)
     model_config.tensor_model_parallel_size = 1
     model_config.pipeline_model_parallel_size = 1
+    model_config.name = "te_gpt"
     if cpu_only:
         map_location = torch.device('cpu')
         model_config.use_cpu_initialization = True
