Update Minitron Width Pruning example
Keval Morabia committed Oct 16, 2024
1 parent ebab657 commit 3957dc6
Showing 3 changed files with 27 additions and 23 deletions.
2 changes: 1 addition & 1 deletion .github/workflows/cicd-main.yml
@@ -658,7 +658,7 @@ jobs:
prune.ffn_hidden_size=192 \
prune.num_attention_heads=2 \
prune.num_query_groups=2 \
- prune.hidden_size=null \
+ prune.hidden_size=128 \
export.save_path=examples/nlp/language_modeling/ci_prune_width.nemo
AFTER_SCRIPT: |
rm -rf examples/nlp/language_modeling/ci_prune_width.nemo
19 changes: 11 additions & 8 deletions examples/nlp/language_modeling/conf/megatron_gpt_prune.yaml
@@ -23,19 +23,22 @@ trainer:
model:
tensor_model_parallel_size: 1 # Pruning currently only supports tensor_model_parallel_size=1
pipeline_model_parallel_size: 1
- restore_from_path: llama3.1-8b-base.nemo # Nemo file path
+ sequence_parallel: false # Sequence parallelism is not supported with pipeline parallelism
+ restore_from_path: llama3.1-8b-instruct.nemo # Nemo file path

## Activation Checkpoint
activations_checkpoint_granularity: null # 'selective' or 'full'
activations_checkpoint_method: null # 'uniform', 'block', not used with 'selective'

prune:
- calib_dataset: cnn_dailymail # wikitext, cnn_dailymail, or a local dataset
- num_calib_size: 512 # number of samples used for calibration
- ffn_hidden_size: 3584 # ffn_hidden_size in the pruned model, ffn_hidden_size // 4
- num_attention_heads: 8 # num_attention_heads in the pruned model, num_attention_heads // 4
- num_query_groups: 4 # num_query_groups in the pruned model, num_query_groups // 2
- hidden_size: 2048 # hidden_size in the pruned model, hidden_size // 2
+ calib_dataset: wikitext # wikitext, cnn_dailymail, or a local dataset
+ num_calib_size: 1024 # number of samples used for calibration
+ # pruning constraints (null means no pruning)
+ ffn_hidden_size: 9216 # ffn_hidden_size in the pruned model
+ num_attention_heads: null # num_attention_heads in the pruned model
+ num_query_groups: null # num_query_groups in the pruned model
+ hidden_size: 3072 # hidden_size (embedding size) in the pruned model
+ num_layers: null # num_layers (depth) in the pruned model

export:
- save_path: llama3.1-8b-base-pruned.nemo # Path where the pruned model will be saved
+ save_path: llama3.1-8b-instruct-pruned.nemo # Path where the pruned model will be saved
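
The new `# pruning constraints (null means no pruning)` comment pairs with the filtering added to the script below: any constraint left as `null` is dropped before pruning. A minimal sketch of that filtering, assuming the config is loaded standalone with OmegaConf (the load path and standalone usage here are illustrative, not part of the commit):

```python
# Illustrative only: load the updated YAML and build the same export_config dict
# that megatron_gpt_prune.py assembles, dropping null-valued constraints.
from omegaconf import OmegaConf

cfg = OmegaConf.load("examples/nlp/language_modeling/conf/megatron_gpt_prune.yaml")
keys = ["ffn_hidden_size", "num_attention_heads", "num_query_groups", "hidden_size", "num_layers"]
export_config = {k: cfg.prune.get(k) for k in keys if cfg.prune.get(k) is not None}

# With the defaults in this commit, only the width constraints survive:
# {'ffn_hidden_size': 9216, 'hidden_size': 3072}
print(export_config)
```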
29 changes: 15 additions & 14 deletions examples/nlp/language_modeling/megatron_gpt_prune.py
@@ -36,23 +36,23 @@
Example usage:
```
python examples/nlp/language_modeling/megatron_gpt_prune.py \
- model.restore_from_path=llama3.1-8b-base.nemo \
+ model.restore_from_path=llama3.1-8b-instruct.nemo \
model.tensor_model_parallel_size=1 \
model.pipeline_model_parallel_size=8 \
trainer.num_nodes=1 \
trainer.precision=bf16 \
trainer.devices=8 \
- prune.ffn_hidden_size=3584 \
- prune.num_attention_heads=8 \
- prune.num_query_groups=4 \
- prune.hidden_size=2048 \
- export.save_path=llama3.1-8b-base-pruned.nemo
+ prune.ffn_hidden_size=9216 \
+ prune.num_attention_heads=null \
+ prune.num_query_groups=null \
+ prune.hidden_size=3072 \
+ export.save_path=llama3.1-8b-instruct-pruned.nemo
```
where tensor_model_parallel_size must be 1 because of the current prune API limitation
"""


- def get_calib_data_iter(data="cnn_dailymail", batch_size=64, calib_size=512, max_sequence_length=512):
+ def get_calib_data_iter(data="wikitext", batch_size=64, calib_size=512, max_sequence_length=512):
if data == "wikitext":
dataset = load_dataset("wikitext", "wikitext-103-v1", split="train")
text_column = "text"
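
The diff collapses the rest of `get_calib_data_iter` behind the fold. For orientation only, here is a hedged sketch of how such a calibration iterator is typically completed; the `cnn_dailymail` branch, the local-dataset fallback, and the batching details are assumptions, not the hidden lines of this file:

```python
# Hypothetical completion of get_calib_data_iter -- illustrative, not the file's hidden code.
from datasets import load_dataset

def get_calib_data_iter(data="wikitext", batch_size=64, calib_size=512, max_sequence_length=512):
    if data == "wikitext":
        dataset = load_dataset("wikitext", "wikitext-103-v1", split="train")
        text_column = "text"
    elif data == "cnn_dailymail":
        dataset = load_dataset("cnn_dailymail", name="3.0.0", split="train")
        text_column = "article"
    else:
        # Assumed fallback: treat `data` as a path to a local JSON dataset with a "text" column.
        dataset = load_dataset("json", data_files=data, split="train")
        text_column = "text"
    for i in range(calib_size // batch_size):
        batch = dataset[i * batch_size : (i + 1) * batch_size][text_column]
        # Truncate each sample so calibration forward passes stay short.
        yield [text[:max_sequence_length] for text in batch]
```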
@@ -73,18 +73,12 @@ def get_calib_data_iter(data="cnn_dailymail", batch_size=64, calib_size=512, max

@hydra_runner(config_path="conf", config_name="megatron_gpt_prune")
def main(cfg) -> None:
- if not torch.cuda.is_available():
-     raise EnvironmentError("GPU is required for the pruning.")

# Overwrite model config with the one from the model checkpoint and apply pruning modifications
model_cfg = load_config(cfg.model.restore_from_path)
model_cfg.update(cfg.model)
model_cfg.name = "modelopt" # Use modelopt transformer spec for pruning

assert cfg.model.tensor_model_parallel_size == 1, "Pruning currently only supports tensor_model_parallel_size=1"
- assert (
-     not hasattr(cfg.model, "sequence_parallel") or not cfg.model.sequence_parallel
- ), "Pruning currently does not support sequence parallelism"

trainer = Trainer(strategy=NLPDDPStrategy(), **cfg.trainer)
model = MegatronGPTModel.restore_from(
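
The header of the next hunk references a `forward_loop(model)` whose body is collapsed. As a rough sketch of what a calibration loop over `get_calib_data_iter` could look like (the use of `predict_step` and the batching are assumptions, not code from this commit):

```python
# Hypothetical calibration driver -- shape only; not the collapsed code in this file.
from tqdm import tqdm

def make_forward_loop(cfg):
    batches = list(get_calib_data_iter(cfg.prune.calib_dataset, calib_size=cfg.prune.num_calib_size))

    def forward_loop(model):
        # Forward passes only; the pruning algorithm records activations via its own hooks.
        for i, batch in enumerate(tqdm(batches, desc="Calibrating")):
            model.predict_step(batch, i)

    return forward_loop
```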
@@ -112,7 +106,13 @@ def forward_loop(model):
constraints={
"export_config": {
k: cfg.prune.get(k)
- for k in ["ffn_hidden_size", "num_attention_heads", "num_query_groups", "hidden_size"]
+ for k in [
+     "ffn_hidden_size",
+     "num_attention_heads",
+     "num_query_groups",
+     "hidden_size",
+     "num_layers",
+ ]
if cfg.prune.get(k) is not None
},
},
@@ -121,6 +121,7 @@ def forward_loop(model):
)

model_pruned.save_to(cfg.export.save_path)
+ print(f"Pruned model saved to {cfg.export.save_path}")


if __name__ == '__main__':
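
The call that consumes the `export_config` constraints is collapsed just above `model_pruned.save_to(...)`. Assuming it is the TensorRT Model Optimizer pruning entry point (`modelopt.torch.prune.prune`), the surrounding call might look as sketched below; the mode name, `dummy_input`, `config` keys, and return shape are assumptions, not confirmed by this diff:

```python
# Assumed shape of the truncated pruning call -- every argument here is a guess
# except constraints={"export_config": ...}, which is visible in the diff above.
import modelopt.torch.prune as mtp

def prune_and_save(model, export_config, forward_loop, save_path):
    model_pruned, _ = mtp.prune(
        model,
        mode="mcore_gpt_minitron",              # assumed Minitron-style pruning mode name
        constraints={"export_config": export_config},
        dummy_input=None,                        # assumed unused when forward_loop drives calibration
        config={"forward_loop": forward_loop},
    )
    model_pruned.save_to(save_path)
    print(f"Pruned model saved to {save_path}")
    return model_pruned
```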