From 3957dc65586dfc64062836c9231e64e09abbec8e Mon Sep 17 00:00:00 2001
From: Keval Morabia <=>
Date: Wed, 16 Oct 2024 03:33:39 -0700
Subject: [PATCH] Update Minitron Width Pruning example

---
 .github/workflows/cicd-main.yml             |  2 +-
 .../conf/megatron_gpt_prune.yaml            | 19 +++++++-----
 .../language_modeling/megatron_gpt_prune.py | 29 ++++++++++---------
 3 files changed, 27 insertions(+), 23 deletions(-)

diff --git a/.github/workflows/cicd-main.yml b/.github/workflows/cicd-main.yml
index 800d91acb7ed9..c08425625efd1 100644
--- a/.github/workflows/cicd-main.yml
+++ b/.github/workflows/cicd-main.yml
@@ -658,7 +658,7 @@ jobs:
             prune.ffn_hidden_size=192 \
             prune.num_attention_heads=2 \
             prune.num_query_groups=2 \
-            prune.hidden_size=null \
+            prune.hidden_size=128 \
             export.save_path=examples/nlp/language_modeling/ci_prune_width.nemo
       AFTER_SCRIPT: |
         rm -rf examples/nlp/language_modeling/ci_prune_width.nemo
diff --git a/examples/nlp/language_modeling/conf/megatron_gpt_prune.yaml b/examples/nlp/language_modeling/conf/megatron_gpt_prune.yaml
index cb26d5744b5b4..f174aafed0ee2 100644
--- a/examples/nlp/language_modeling/conf/megatron_gpt_prune.yaml
+++ b/examples/nlp/language_modeling/conf/megatron_gpt_prune.yaml
@@ -23,19 +23,22 @@ trainer:
 model:
   tensor_model_parallel_size: 1 # Pruning currently only supports tensor_model_parallel_size=1
   pipeline_model_parallel_size: 1
-  restore_from_path: llama3.1-8b-base.nemo # Nemo file path
+  sequence_parallel: false # Sequence parallelism is not supported with pruning
+  restore_from_path: llama3.1-8b-instruct.nemo # Nemo file path
 
   ## Activation Checkpoint
   activations_checkpoint_granularity: null # 'selective' or 'full'
   activations_checkpoint_method: null # 'uniform', 'block', not used with 'selective'
 
 prune:
-  calib_dataset: cnn_dailymail # wikitext, cnn_dailymail, or a local dataset
-  num_calib_size: 512 # number of samples used for calibration
-  ffn_hidden_size: 3584 # ffn_hidden_size in the pruned model, ffn_hidden_size // 4
-  num_attention_heads: 8 # num_attention_heads in the pruned model, num_attention_heads // 4
-  num_query_groups: 4 # num_query_groups in the pruned model, num_query_groups // 2
-  hidden_size: 2048 # hidden_size in the pruned model, hidden_size // 2
+  calib_dataset: wikitext # wikitext, cnn_dailymail, or a local dataset
+  num_calib_size: 1024 # number of samples used for calibration
+  # pruning constraints (null means no pruning)
+  ffn_hidden_size: 9216 # ffn_hidden_size in the pruned model
+  num_attention_heads: null # num_attention_heads in the pruned model
+  num_query_groups: null # num_query_groups in the pruned model
+  hidden_size: 3072 # hidden_size (embedding size) in the pruned model
+  num_layers: null # num_layers (depth) in the pruned model
 
 export:
-  save_path: llama3.1-8b-base-pruned.nemo # Path where the pruned model will be saved
+  save_path: llama3.1-8b-instruct-pruned.nemo # Path where the pruned model will be saved
diff --git a/examples/nlp/language_modeling/megatron_gpt_prune.py b/examples/nlp/language_modeling/megatron_gpt_prune.py
index b9bf8edbfb1ab..de12b861a1c0f 100644
--- a/examples/nlp/language_modeling/megatron_gpt_prune.py
+++ b/examples/nlp/language_modeling/megatron_gpt_prune.py
@@ -36,23 +36,23 @@
 Example usage:
 ```
 python examples/nlp/language_modeling/megatron_gpt_prune.py \
-    model.restore_from_path=llama3.1-8b-base.nemo \
+    model.restore_from_path=llama3.1-8b-instruct.nemo \
     model.tensor_model_parallel_size=1 \
     model.pipeline_model_parallel_size=8 \
     trainer.num_nodes=1 \
     trainer.precision=bf16 \
     trainer.devices=8 \
-    prune.ffn_hidden_size=3584 \
-    prune.num_attention_heads=8 \
-    prune.num_query_groups=4 \
-    prune.hidden_size=2048 \
-    export.save_path=llama3.1-8b-base-pruned.nemo
+    prune.ffn_hidden_size=9216 \
+    prune.num_attention_heads=null \
+    prune.num_query_groups=null \
+    prune.hidden_size=3072 \
+    export.save_path=llama3.1-8b-instruct-pruned.nemo
 ```
 where tensor_model_parallel_size must be 1 because of the current prune API limitation
 """
 
 
-def get_calib_data_iter(data="cnn_dailymail", batch_size=64, calib_size=512, max_sequence_length=512):
+def get_calib_data_iter(data="wikitext", batch_size=64, calib_size=512, max_sequence_length=512):
     if data == "wikitext":
         dataset = load_dataset("wikitext", "wikitext-103-v1", split="train")
         text_column = "text"
@@ -73,18 +73,12 @@ def get_calib_data_iter(data="cnn_dailymail", batch_size=64, calib_size=512, max
 
 @hydra_runner(config_path="conf", config_name="megatron_gpt_prune")
 def main(cfg) -> None:
-    if not torch.cuda.is_available():
-        raise EnvironmentError("GPU is required for the pruning.")
-
     # Overwrite model config with the one from the model checkpoint and apply pruning modifications
     model_cfg = load_config(cfg.model.restore_from_path)
     model_cfg.update(cfg.model)
     model_cfg.name = "modelopt" # Use modelopt transformer spec for pruning
 
     assert cfg.model.tensor_model_parallel_size == 1, "Pruning currently only supports tensor_model_parallel_size=1"
-    assert (
-        not hasattr(cfg.model, "sequence_parallel") or not cfg.model.sequence_parallel
-    ), "Pruning currently does not support sequence parallelism"
 
     trainer = Trainer(strategy=NLPDDPStrategy(), **cfg.trainer)
     model = MegatronGPTModel.restore_from(
@@ -112,7 +106,13 @@ def forward_loop(model):
         constraints={
             "export_config": {
                 k: cfg.prune.get(k)
-                for k in ["ffn_hidden_size", "num_attention_heads", "num_query_groups", "hidden_size"]
+                for k in [
+                    "ffn_hidden_size",
+                    "num_attention_heads",
+                    "num_query_groups",
+                    "hidden_size",
+                    "num_layers",
+                ]
                 if cfg.prune.get(k) is not None
             },
         },
@@ -121,6 +121,7 @@
     )
 
     model_pruned.save_to(cfg.export.save_path)
+    print(f"Pruned model saved to {cfg.export.save_path}")
 
 
 if __name__ == '__main__':
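
Note (illustrative, not part of the patch): the expanded `for k in [...]` comprehension in the last hunks builds the `export_config` pruning constraints from the `prune.*` section of the updated YAML, dropping any key left at `null`. A minimal, self-contained sketch of how the new defaults resolve, using a hypothetical `cfg_prune` dict in place of the Hydra config:

```
# Mirrors the new defaults in megatron_gpt_prune.yaml; YAML null -> Python None,
# i.e. "do not prune along this axis".
cfg_prune = {
    "ffn_hidden_size": 9216,
    "num_attention_heads": None,
    "num_query_groups": None,
    "hidden_size": 3072,
    "num_layers": None,
}

# Same filtering as the comprehension in megatron_gpt_prune.py: keep only the
# dimensions that are actually constrained.
export_config = {
    k: cfg_prune.get(k)
    for k in ["ffn_hidden_size", "num_attention_heads", "num_query_groups", "hidden_size", "num_layers"]
    if cfg_prune.get(k) is not None
}

print(export_config)  # {'ffn_hidden_size': 9216, 'hidden_size': 3072}
```

With these defaults only the MLP and embedding widths are pruned; nulling the width keys and setting `prune.num_layers` instead would switch the same script to depth pruning.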