From 60cce8d4470bd7ab0f8fb84e150fe082d45c5b6a Mon Sep 17 00:00:00 2001
From: Keval Morabia <28916987+kevalmorabia97@users.noreply.github.com>
Date: Tue, 29 Oct 2024 00:48:16 +0530
Subject: [PATCH] Update ModelOpt Width Pruning example defaults (#10902)

* update width pruning example defaults

Signed-off-by: Keval Morabia <28916987+kevalmorabia97@users.noreply.github.com>

* Update Dockerfile.ci

Signed-off-by: Keval Morabia <28916987+kevalmorabia97@users.noreply.github.com>

* Undo CI version update

Signed-off-by: Keval Morabia <28916987+kevalmorabia97@users.noreply.github.com>

---------

Signed-off-by: Keval Morabia <28916987+kevalmorabia97@users.noreply.github.com>
---
 .../conf/megatron_gpt_prune.yaml            | 19 +++++++-----
 .../language_modeling/megatron_gpt_prune.py | 29 ++++++++++---------
 2 files changed, 26 insertions(+), 22 deletions(-)

diff --git a/examples/nlp/language_modeling/conf/megatron_gpt_prune.yaml b/examples/nlp/language_modeling/conf/megatron_gpt_prune.yaml
index cb26d5744b5b..f174aafed0ee 100644
--- a/examples/nlp/language_modeling/conf/megatron_gpt_prune.yaml
+++ b/examples/nlp/language_modeling/conf/megatron_gpt_prune.yaml
@@ -23,19 +23,22 @@ trainer:
 model:
   tensor_model_parallel_size: 1 # Pruning currently only supports tensor_model_parallel_size=1
   pipeline_model_parallel_size: 1
-  restore_from_path: llama3.1-8b-base.nemo # Nemo file path
+  sequence_parallel: false # Sequence parallelism is not supported with pipeline parallelism
+  restore_from_path: llama3.1-8b-instruct.nemo # Nemo file path
 
   ## Activation Checkpoint
   activations_checkpoint_granularity: null # 'selective' or 'full'
   activations_checkpoint_method: null # 'uniform', 'block', not used with 'selective'
 
 prune:
-  calib_dataset: cnn_dailymail # wikitext, cnn_dailymail, or a local dataset
-  num_calib_size: 512 # number of samples used for calibration
-  ffn_hidden_size: 3584 # ffn_hidden_size in the pruned model, ffn_hidden_size // 4
-  num_attention_heads: 8 # num_attention_heads in the pruned model, num_attention_heads // 4
-  num_query_groups: 4 # num_query_groups in the pruned model, num_query_groups // 2
-  hidden_size: 2048 # hidden_size in the pruned model, hidden_size // 2
+  calib_dataset: wikitext # wikitext, cnn_dailymail, or a local dataset
+  num_calib_size: 1024 # number of samples used for calibration
+  # pruning constraints (null means no pruning)
+  ffn_hidden_size: 9216 # ffn_hidden_size in the pruned model
+  num_attention_heads: null # num_attention_heads in the pruned model
+  num_query_groups: null # num_query_groups in the pruned model
+  hidden_size: 3072 # hidden_size (embedding size) in the pruned model
+  num_layers: null # num_layers (depth) in the pruned model
 
 export:
-  save_path: llama3.1-8b-base-pruned.nemo # Path where the pruned model will be saved
+  save_path: llama3.1-8b-instruct-pruned.nemo # Path where the pruned model will be saved
diff --git a/examples/nlp/language_modeling/megatron_gpt_prune.py b/examples/nlp/language_modeling/megatron_gpt_prune.py
index b9bf8edbfb1a..de12b861a1c0 100644
--- a/examples/nlp/language_modeling/megatron_gpt_prune.py
+++ b/examples/nlp/language_modeling/megatron_gpt_prune.py
@@ -36,23 +36,23 @@
 Example usage:
 ```
 python examples/nlp/language_modeling/megatron_gpt_prune.py \
-    model.restore_from_path=llama3.1-8b-base.nemo \
+    model.restore_from_path=llama3.1-8b-instruct.nemo \
     model.tensor_model_parallel_size=1 \
     model.pipeline_model_parallel_size=8 \
     trainer.num_nodes=1 \
     trainer.precision=bf16 \
     trainer.devices=8 \
-    prune.ffn_hidden_size=3584 \
-    prune.num_attention_heads=8 \
-    prune.num_query_groups=4 \
-    prune.hidden_size=2048 \
-    export.save_path=llama3.1-8b-base-pruned.nemo
+    prune.ffn_hidden_size=9216 \
+    prune.num_attention_heads=null \
+    prune.num_query_groups=null \
+    prune.hidden_size=3072 \
+    export.save_path=llama3.1-8b-instruct-pruned.nemo
 ```
 where tensor_model_parallel_size must be 1 because of the current prune API limitation
 """
 
 
-def get_calib_data_iter(data="cnn_dailymail", batch_size=64, calib_size=512, max_sequence_length=512):
+def get_calib_data_iter(data="wikitext", batch_size=64, calib_size=512, max_sequence_length=512):
     if data == "wikitext":
         dataset = load_dataset("wikitext", "wikitext-103-v1", split="train")
         text_column = "text"
@@ -73,18 +73,12 @@ def get_calib_data_iter(data="cnn_dailymail", batch_size=64, calib_size=512, max
 @hydra_runner(config_path="conf", config_name="megatron_gpt_prune")
 def main(cfg) -> None:
-    if not torch.cuda.is_available():
-        raise EnvironmentError("GPU is required for the pruning.")
-
     # Overwrite model config with the one from the model checkpoint and apply pruning modifications
     model_cfg = load_config(cfg.model.restore_from_path)
     model_cfg.update(cfg.model)
     model_cfg.name = "modelopt"  # Use modelopt transformer spec for pruning
 
     assert cfg.model.tensor_model_parallel_size == 1, "Pruning currently only supports tensor_model_parallel_size=1"
-    assert (
-        not hasattr(cfg.model, "sequence_parallel") or not cfg.model.sequence_parallel
-    ), "Pruning currently does not support sequence parallelism"
 
     trainer = Trainer(strategy=NLPDDPStrategy(), **cfg.trainer)
 
     model = MegatronGPTModel.restore_from(
@@ -112,7 +106,13 @@ def forward_loop(model):
         constraints={
             "export_config": {
                 k: cfg.prune.get(k)
-                for k in ["ffn_hidden_size", "num_attention_heads", "num_query_groups", "hidden_size"]
+                for k in [
+                    "ffn_hidden_size",
+                    "num_attention_heads",
+                    "num_query_groups",
+                    "hidden_size",
+                    "num_layers",
+                ]
                 if cfg.prune.get(k) is not None
             },
         },
@@ -121,6 +121,7 @@
     )
 
     model_pruned.save_to(cfg.export.save_path)
+    print(f"Pruned model saved to {cfg.export.save_path}")
 
 
 if __name__ == '__main__':
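
For reference, the sketch below is not part of the patch: it is a minimal standalone illustration, assuming an OmegaConf/Hydra-style config like the one in megatron_gpt_prune.yaml, of how the updated dict comprehension keeps only the non-null prune options as the "export_config" constraints. With the new defaults, only ffn_hidden_size and hidden_size end up constrained.

```python
# Illustrative sketch only (not part of the patch). The cfg values below mirror
# the new megatron_gpt_prune.yaml defaults; null entries are skipped, so the
# corresponding dimensions are left unpruned.
from omegaconf import OmegaConf

cfg = OmegaConf.create(
    {
        "prune": {
            "ffn_hidden_size": 9216,      # pruned, as in the new default config
            "num_attention_heads": None,  # null -> not pruned
            "num_query_groups": None,     # null -> not pruned
            "hidden_size": 3072,          # pruned, as in the new default config
            "num_layers": None,           # null -> depth is not pruned
        }
    }
)

# Same comprehension as in the patched megatron_gpt_prune.py
export_config = {
    k: cfg.prune.get(k)
    for k in ["ffn_hidden_size", "num_attention_heads", "num_query_groups", "hidden_size", "num_layers"]
    if cfg.prune.get(k) is not None
}
print(export_config)  # {'ffn_hidden_size': 9216, 'hidden_size': 3072}
```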