Skip to content

Commit

Permalink
Merge branch 'main' into cherry-pick-main-dffbaf4d21d0ee507d1e422b7d2…
Browse files Browse the repository at this point in the history
…12d3f12445428
  • Loading branch information
marcromeyn authored Jul 10, 2024
2 parents 53cbc7d + f5d5221 commit adc65d7
Show file tree
Hide file tree
Showing 2 changed files with 97 additions and 10 deletions.
65 changes: 61 additions & 4 deletions .github/workflows/cicd-main.yml
Original file line number Diff line number Diff line change
Expand Up @@ -2391,7 +2391,7 @@ jobs:
L2_Megatron_GPT_Pretraining_and_Resume_Training_TP2:
needs: [cicd-test-container-setup]
runs-on: self-hosted-azure
runs-on: self-hosted-azure-gpus-2-h100
timeout-minutes: 10
container:
image: nemoci.azurecr.io/nemo_container_${{ github.run_id }}
Expand All @@ -2403,6 +2403,21 @@ jobs:
--env TRANSFORMERS_OFFLINE=0
--env HYDRA_FULL_ERROR=1
--volume /mnt/datadrive/TestData:/home/TestData
env:
# This is to improve p2p overlap on H100
NVTE_FWD_LAYERNORM_SM_MARGIN: 8
NVTE_BWD_LAYERNORM_SM_MARGIN: 8
TORCH_NCCL_AVOID_RECORD_STREAMS: 1
NCCL_MIN_NCHANNELS: 4
# TP overlap is not supported in docker environment
#NVTE_UB_SPLIT_RS: 0
#NVTE_UB_ATOMIC_GEMM_RS: 1
#NVTE_RS_STRIDED_ATOMIC: 1
#NVTE_UB_FP8_RS: 1
# Increase p2p chunksize to 2MB
NCCL_P2P_NET_CHUNKSIZE: 2097152
# Disable gc when switching to/from validation steps
NEMO_MANUAL_GC_IN_VALIDATION: 0
steps:
- name: Checkout repository
uses: actions/checkout@v4
Expand All @@ -2417,8 +2432,17 @@ jobs:
trainer.max_steps=3 \
trainer.gradient_clip_val=1.0 \
exp_manager.exp_dir=examples/nlp/language_modeling/gpt_pretrain_results \
++model.transformer_engine=True \
++model.fp8=True \
++model.fp8_hybrid=True \
++model.fp8_amax_history_len=1024 \
++model.fp8_amax_compute_algo=max \
++model.reduce_amax=True \
++model.use_te_rng_tracker=True \
++model.name=megatron_gpt_full_te_layer_autocast \
model.ub_tp_comm_overlap=False \
model.tensor_model_parallel_size=2 \
model.optim.name=fused_adam \
model.optim.name=distributed_fused_adam \
model.optim.lr=2e-4 \
model.optim.sched.warmup_steps=1 \
model.optim.sched.constant_steps=1 \
Expand Down Expand Up @@ -2452,8 +2476,17 @@ jobs:
trainer.gradient_clip_val=1.0 \
exp_manager.exp_dir=examples/nlp/language_modeling/gpt_pretrain_results \
exp_manager.resume_if_exists=True \
++model.transformer_engine=True \
++model.fp8=True \
++model.fp8_hybrid=True \
++model.fp8_amax_history_len=1024 \
++model.fp8_amax_compute_algo=max \
++model.reduce_amax=True \
++model.use_te_rng_tracker=True \
++model.name=megatron_gpt_full_te_layer_autocast \
model.ub_tp_comm_overlap=False \
model.tensor_model_parallel_size=2 \
model.optim.name=fused_adam \
model.optim.name=distributed_fused_adam \
model.optim.lr=2e-4 \
model.optim.sched.warmup_steps=2 \
model.optim.sched.constant_steps=2 \
Expand Down Expand Up @@ -2945,10 +2978,11 @@ jobs:
needs: [cicd-test-container-setup]
uses: ./.github/workflows/_test_template.yml
with:
RUNNER: self-hosted-azure
RUNNER: self-hosted-azure-gpus-2-h100
SCRIPT: |
python examples/nlp/language_modeling/megatron_gpt_pretraining.py \
trainer.devices=2 \
trainer.accelerator=gpu \
trainer.log_every_n_steps=1 \
trainer.val_check_interval=2 \
trainer.limit_val_batches=2 \
Expand All @@ -2957,6 +2991,15 @@ jobs:
trainer.precision=bf16 \
trainer.gradient_clip_val=1.0 \
exp_manager.exp_dir=examples/nlp/language_modeling/gpt_pretrain_results \
++model.transformer_engine=True \
++model.fp8=True \
++model.fp8_hybrid=True \
++model.fp8_amax_history_len=1024 \
++model.fp8_amax_compute_algo=max \
++model.reduce_amax=True \
++model.use_te_rng_tracker=True \
++model.name=megatron_gpt_full_te_layer_autocast \
model.ub_tp_comm_overlap=False \
model.pipeline_model_parallel_size=2 \
model.tensor_model_parallel_size=1 \
model.mcore_gpt=True \
Expand All @@ -2981,12 +3024,15 @@ jobs:
model.hidden_size=256 \
model.num_attention_heads=8 \
model.activations_checkpoint_method=block \
model.activations_checkpoint_granularity=full \
model.activations_checkpoint_num_layers=1 \
model.data.validation_drop_last=False \
model.data.data_prefix=[.5,/home/TestData/nlp/megatron_gpt/data/gpt/simple_wiki_gpt_preproc_text_document,.5,/home/TestData/nlp/megatron_gpt/data/gpt/simple_wiki_gpt_preproc_text_document] \
model.data.index_mapping_dir=examples/nlp/language_modeling/gpt_index_mappings
python examples/nlp/language_modeling/megatron_gpt_pretraining.py \
trainer.devices=2 \
trainer.accelerator=gpu \
trainer.log_every_n_steps=1 \
trainer.val_check_interval=2 \
trainer.limit_val_batches=2 \
Expand All @@ -2998,6 +3044,15 @@ jobs:
model.megatron_amp_O2=True \
exp_manager.exp_dir=examples/nlp/language_modeling/gpt_pretrain_results \
exp_manager.resume_if_exists=True \
++model.transformer_engine=True \
++model.fp8=True \
++model.fp8_hybrid=True \
++model.fp8_amax_history_len=1024 \
++model.fp8_amax_compute_algo=max \
++model.reduce_amax=True \
++model.use_te_rng_tracker=True \
++model.name=megatron_gpt_full_te_layer_autocast \
model.ub_tp_comm_overlap=False \
model.pipeline_model_parallel_size=2 \
model.tensor_model_parallel_size=1 \
model.optim.name=distributed_fused_adam \
Expand All @@ -3020,7 +3075,9 @@ jobs:
model.hidden_size=256 \
model.num_attention_heads=8 \
model.activations_checkpoint_method=block \
model.activations_checkpoint_granularity=full \
model.activations_checkpoint_num_layers=1 \
model.data.validation_drop_last=False \
model.data.data_prefix=[.5,/home/TestData/nlp/megatron_gpt/data/gpt/simple_wiki_gpt_preproc_text_document,.5,/home/TestData/nlp/megatron_gpt/data/gpt/simple_wiki_gpt_preproc_text_document] \
model.data.index_mapping_dir=examples/nlp/language_modeling/gpt_index_mappings
AFTER_SCRIPT: |
Expand Down
42 changes: 36 additions & 6 deletions nemo/lightning/megatron_parallel.py
Original file line number Diff line number Diff line change
Expand Up @@ -25,9 +25,11 @@

import torch
import torch.distributed
from megatron.core import parallel_state
from megatron.core.distributed import DistributedDataParallel as McoreDDP
from megatron.core.distributed import DistributedDataParallelConfig
from megatron.core.transformer.transformer_config import TransformerConfig
from pytorch_lightning.utilities import move_data_to_device
from torch import Tensor, nn
from typing_extensions import override

Expand All @@ -43,15 +45,43 @@ def convert_output(self, output: torch.Tensor) -> torch.Tensor: ...


def default_data_step(dataloader_iter: Iterator[DataT]) -> DataT:
batch = next(dataloader_iter)
"""
Moves the data to a device.
In this case we utilize the match function to unpack the dataloader iterator. There may be a wrapper on the dataloader
iter from here: https://github.com/NVIDIA/NeMo/blob/main/nemo/lightning/fabric/strategies.py#L441.
if isinstance(batch, tuple) and len(batch) == 3:
batch = batch[0]
This will not subset the data for your with context parallel so please override this function if you
want to use context parallel.
if isinstance(batch, dict):
batch = {k: v.cuda(non_blocking=True) for k, v in batch.items()}
Examples:
If the dataloader_iter returns: [Tuple[<tensor>, <int>, <int>]] -> move to device
If the dataloader_iter returns: [<tensor>, <tensor>] -> move to device
return batch
Returns:
DataT: The data moved to the device.
"""
if parallel_state.get_context_parallel_world_size() > 1:
raise ValueError(
"Default data step is being used in a context parallel environment."
"Please define your own data step that appropriately slices the data for context parallel."
)

match next(dataloader_iter):
# If its wrapped in a tuple, unpack it.
case (batch, int(_), int(_)):
pass
# Canonical case.
case batch:
pass
# If the dataloader_iter is empty, return a ValueError.
case _:
batch = None

if batch is not None:
return move_data_to_device(batch, torch.cuda.current_device())
else:
raise ValueError("None returned from dataloader.")


def default_forward_step(model: nn.Module, batch, *args, **kwargs) -> torch.Tensor:
Expand Down

0 comments on commit adc65d7

Please sign in to comment.