
Commit

Merge branch 'main' into vchen/neva-blend-data
xuanzic authored Aug 6, 2024
2 parents 1ca19ac + 71ab9d7 commit 1df0603
Showing 70 changed files with 3,816 additions and 801 deletions.
264 changes: 188 additions & 76 deletions .github/workflows/cicd-main.yml
@@ -2359,81 +2359,128 @@ jobs:
L2_Megatron_GPT_Pretraining_and_Resume_Training_TP2:
needs: [cicd-test-container-setup]
uses: ./.github/workflows/_test_template.yml
with:
RUNNER: self-hosted-azure
SCRIPT: |
python examples/nlp/language_modeling/megatron_gpt_pretraining.py \
trainer.devices=2 \
trainer.accelerator=gpu \
trainer.log_every_n_steps=1 \
trainer.val_check_interval=2 \
trainer.limit_val_batches=2 \
trainer.accumulate_grad_batches=1 \
trainer.max_steps=3 \
trainer.gradient_clip_val=1.0 \
exp_manager.exp_dir=examples/nlp/language_modeling/gpt_pretrain_results \
model.tensor_model_parallel_size=2 \
model.optim.name=fused_adam \
model.optim.lr=2e-4 \
model.optim.sched.warmup_steps=1 \
model.optim.sched.constant_steps=1 \
model.optim.sched.min_lr=8e-5 \
model.max_position_embeddings=128 \
model.encoder_seq_length=128 \
model.data.seq_length=128 \
model.bias=False \
model.bias_activation_fusion=False \
model.bias_dropout_add_fusion=False \
model.tokenizer.vocab_file=/home/TestData/nlp/megatron_gpt/data/gpt/vocab.json \
model.tokenizer.merge_file=/home/TestData/nlp/megatron_gpt/data/gpt/merges.txt \
model.num_layers=8 \
model.hidden_size=256 \
model.num_attention_heads=8 \
model.activations_checkpoint_method=block \
model.activations_checkpoint_granularity=full \
model.activations_checkpoint_num_layers=1 \
model.data.validation_drop_last=False \
model.data.data_prefix=[.5,/home/TestData/nlp/megatron_gpt/data/gpt/simple_wiki_gpt_preproc_text_document,.5,/home/TestData/nlp/megatron_gpt/data/gpt/simple_wiki_gpt_preproc_text_document] \
model.data.index_mapping_dir=examples/nlp/language_modeling/gpt_index_mappings
python examples/nlp/language_modeling/megatron_gpt_pretraining.py \
trainer.devices=2 \
trainer.accelerator=gpu \
trainer.log_every_n_steps=1 \
trainer.val_check_interval=2 \
trainer.limit_val_batches=2 \
trainer.accumulate_grad_batches=1 \
trainer.max_steps=6 \
trainer.gradient_clip_val=1.0 \
exp_manager.exp_dir=examples/nlp/language_modeling/gpt_pretrain_results \
exp_manager.resume_if_exists=True \
model.tensor_model_parallel_size=2 \
model.optim.name=fused_adam \
model.optim.lr=2e-4 \
model.optim.sched.warmup_steps=2 \
model.optim.sched.constant_steps=2 \
model.optim.sched.min_lr=8e-5 \
model.max_position_embeddings=128 \
model.encoder_seq_length=128 \
model.data.seq_length=128 \
model.bias=False \
model.bias_activation_fusion=False \
model.bias_dropout_add_fusion=False \
model.tokenizer.vocab_file=/home/TestData/nlp/megatron_gpt/data/gpt/vocab.json \
model.tokenizer.merge_file=/home/TestData/nlp/megatron_gpt/data/gpt/merges.txt \
model.num_layers=8 \
model.hidden_size=256 \
model.num_attention_heads=8 \
model.activations_checkpoint_method=block \
model.activations_checkpoint_granularity=full \
model.activations_checkpoint_num_layers=1 \
model.data.validation_drop_last=False \
model.data.data_prefix=[.5,/home/TestData/nlp/megatron_gpt/data/gpt/simple_wiki_gpt_preproc_text_document,.5,/home/TestData/nlp/megatron_gpt/data/gpt/simple_wiki_gpt_preproc_text_document] \
model.data.index_mapping_dir=examples/nlp/language_modeling/gpt_index_mappings
AFTER_SCRIPT: |
rm -rf examples/nlp/language_modeling/gpt_pretrain_results
rm -rf examples/nlp/language_modeling/gpt_index_mappings
runs-on: self-hosted-azure-gpus-2-h100
timeout-minutes: 10
container:
image: nemoci.azurecr.io/nemo_container_${{ github.run_id }}
options:
# --user 0:128
--device=/dev/nvidia0
--gpus all
--shm-size=8g
--env TRANSFORMERS_OFFLINE=0
--env HYDRA_FULL_ERROR=1
--volume /mnt/datadrive/TestData:/home/TestData
env:
# This is to improve p2p overlap on H100
NVTE_FWD_LAYERNORM_SM_MARGIN: 8
NVTE_BWD_LAYERNORM_SM_MARGIN: 8
TORCH_NCCL_AVOID_RECORD_STREAMS: 1
NCCL_MIN_NCHANNELS: 4
# TP overlap is not supported in docker environment
#NVTE_UB_SPLIT_RS: 0
#NVTE_UB_ATOMIC_GEMM_RS: 1
#NVTE_RS_STRIDED_ATOMIC: 1
#NVTE_UB_FP8_RS: 1
# Increase p2p chunksize to 2MB
NCCL_P2P_NET_CHUNKSIZE: 2097152
# Disable gc when switching to/from validation steps
NEMO_MANUAL_GC_IN_VALIDATION: 0
steps:
- name: Checkout repository
uses: actions/checkout@v4
- run: |
python examples/nlp/language_modeling/megatron_gpt_pretraining.py \
trainer.devices=2 \
trainer.accelerator=gpu \
trainer.log_every_n_steps=1 \
trainer.val_check_interval=2 \
trainer.limit_val_batches=2 \
trainer.accumulate_grad_batches=1 \
trainer.max_steps=3 \
trainer.gradient_clip_val=1.0 \
exp_manager.exp_dir=examples/nlp/language_modeling/gpt_pretrain_results \
++model.transformer_engine=True \
++model.fp8=True \
++model.fp8_hybrid=True \
++model.fp8_amax_history_len=1024 \
++model.fp8_amax_compute_algo=max \
++model.reduce_amax=True \
++model.use_te_rng_tracker=True \
++model.name=megatron_gpt_full_te_layer_autocast \
model.ub_tp_comm_overlap=False \
model.tensor_model_parallel_size=2 \
model.optim.name=distributed_fused_adam \
model.optim.lr=2e-4 \
model.optim.sched.warmup_steps=1 \
model.optim.sched.constant_steps=1 \
model.optim.sched.min_lr=8e-5 \
model.max_position_embeddings=128 \
model.encoder_seq_length=128 \
model.data.seq_length=128 \
model.bias=False \
model.bias_activation_fusion=False \
model.bias_dropout_add_fusion=False \
model.tokenizer.vocab_file=/home/TestData/nlp/megatron_gpt/data/gpt/vocab.json \
model.tokenizer.merge_file=/home/TestData/nlp/megatron_gpt/data/gpt/merges.txt \
model.num_layers=8 \
model.hidden_size=256 \
model.num_attention_heads=8 \
model.activations_checkpoint_method=block \
model.activations_checkpoint_granularity=full \
model.activations_checkpoint_num_layers=1 \
model.data.validation_drop_last=False \
model.data.data_prefix=[.5,/home/TestData/nlp/megatron_gpt/data/gpt/simple_wiki_gpt_preproc_text_document,.5,/home/TestData/nlp/megatron_gpt/data/gpt/simple_wiki_gpt_preproc_text_document] \
model.data.index_mapping_dir=examples/nlp/language_modeling/gpt_index_mappings
python examples/nlp/language_modeling/megatron_gpt_pretraining.py \
trainer.devices=2 \
trainer.accelerator=gpu \
trainer.log_every_n_steps=1 \
trainer.val_check_interval=2 \
trainer.limit_val_batches=2 \
trainer.accumulate_grad_batches=1 \
trainer.max_steps=6 \
trainer.gradient_clip_val=1.0 \
exp_manager.exp_dir=examples/nlp/language_modeling/gpt_pretrain_results \
exp_manager.resume_if_exists=True \
++model.transformer_engine=True \
++model.fp8=True \
++model.fp8_hybrid=True \
++model.fp8_amax_history_len=1024 \
++model.fp8_amax_compute_algo=max \
++model.reduce_amax=True \
++model.use_te_rng_tracker=True \
++model.name=megatron_gpt_full_te_layer_autocast \
model.ub_tp_comm_overlap=False \
model.tensor_model_parallel_size=2 \
model.optim.name=distributed_fused_adam \
model.optim.lr=2e-4 \
model.optim.sched.warmup_steps=2 \
model.optim.sched.constant_steps=2 \
model.optim.sched.min_lr=8e-5 \
model.max_position_embeddings=128 \
model.encoder_seq_length=128 \
model.data.seq_length=128 \
model.bias=False \
model.bias_activation_fusion=False \
model.bias_dropout_add_fusion=False \
model.tokenizer.vocab_file=/home/TestData/nlp/megatron_gpt/data/gpt/vocab.json \
model.tokenizer.merge_file=/home/TestData/nlp/megatron_gpt/data/gpt/merges.txt \
model.num_layers=8 \
model.hidden_size=256 \
model.num_attention_heads=8 \
model.activations_checkpoint_method=block \
model.activations_checkpoint_granularity=full \
model.activations_checkpoint_num_layers=1 \
model.data.validation_drop_last=False \
model.data.data_prefix=[.5,/home/TestData/nlp/megatron_gpt/data/gpt/simple_wiki_gpt_preproc_text_document,.5,/home/TestData/nlp/megatron_gpt/data/gpt/simple_wiki_gpt_preproc_text_document] \
model.data.index_mapping_dir=examples/nlp/language_modeling/gpt_index_mappings
rm -rf examples/nlp/language_modeling/gpt_pretrain_results
rm -rf examples/nlp/language_modeling/gpt_index_mappings
- uses: "NVIDIA/NeMo/.github/actions/cancel-workflow@main"
if: "failure()"

L2_Megatron_GPT_with_Rope_Pretraining_and_Resume_Training_TP2:
needs: [cicd-test-container-setup]
@@ -2843,10 +2890,11 @@ jobs:
needs: [cicd-test-container-setup]
uses: ./.github/workflows/_test_template.yml
with:
RUNNER: self-hosted-azure
RUNNER: self-hosted-azure-gpus-2-h100
SCRIPT: |
python examples/nlp/language_modeling/megatron_gpt_pretraining.py \
trainer.devices=2 \
trainer.accelerator=gpu \
trainer.log_every_n_steps=1 \
trainer.val_check_interval=2 \
trainer.limit_val_batches=2 \
@@ -2855,6 +2903,15 @@ jobs:
trainer.precision=bf16 \
trainer.gradient_clip_val=1.0 \
exp_manager.exp_dir=examples/nlp/language_modeling/gpt_pretrain_results \
++model.transformer_engine=True \
++model.fp8=True \
++model.fp8_hybrid=True \
++model.fp8_amax_history_len=1024 \
++model.fp8_amax_compute_algo=max \
++model.reduce_amax=True \
++model.use_te_rng_tracker=True \
++model.name=megatron_gpt_full_te_layer_autocast \
model.ub_tp_comm_overlap=False \
model.pipeline_model_parallel_size=2 \
model.tensor_model_parallel_size=1 \
model.mcore_gpt=True \
@@ -2879,12 +2936,15 @@ jobs:
model.hidden_size=256 \
model.num_attention_heads=8 \
model.activations_checkpoint_method=block \
model.activations_checkpoint_granularity=full \
model.activations_checkpoint_num_layers=1 \
model.data.validation_drop_last=False \
model.data.data_prefix=[.5,/home/TestData/nlp/megatron_gpt/data/gpt/simple_wiki_gpt_preproc_text_document,.5,/home/TestData/nlp/megatron_gpt/data/gpt/simple_wiki_gpt_preproc_text_document] \
model.data.index_mapping_dir=examples/nlp/language_modeling/gpt_index_mappings
python examples/nlp/language_modeling/megatron_gpt_pretraining.py \
trainer.devices=2 \
trainer.accelerator=gpu \
trainer.log_every_n_steps=1 \
trainer.val_check_interval=2 \
trainer.limit_val_batches=2 \
@@ -2896,6 +2956,15 @@ jobs:
model.megatron_amp_O2=True \
exp_manager.exp_dir=examples/nlp/language_modeling/gpt_pretrain_results \
exp_manager.resume_if_exists=True \
++model.transformer_engine=True \
++model.fp8=True \
++model.fp8_hybrid=True \
++model.fp8_amax_history_len=1024 \
++model.fp8_amax_compute_algo=max \
++model.reduce_amax=True \
++model.use_te_rng_tracker=True \
++model.name=megatron_gpt_full_te_layer_autocast \
model.ub_tp_comm_overlap=False \
model.pipeline_model_parallel_size=2 \
model.tensor_model_parallel_size=1 \
model.optim.name=distributed_fused_adam \
Expand All @@ -2918,7 +2987,9 @@ jobs:
model.hidden_size=256 \
model.num_attention_heads=8 \
model.activations_checkpoint_method=block \
model.activations_checkpoint_granularity=full \
model.activations_checkpoint_num_layers=1 \
model.data.validation_drop_last=False \
model.data.data_prefix=[.5,/home/TestData/nlp/megatron_gpt/data/gpt/simple_wiki_gpt_preproc_text_document,.5,/home/TestData/nlp/megatron_gpt/data/gpt/simple_wiki_gpt_preproc_text_document] \
model.data.index_mapping_dir=examples/nlp/language_modeling/gpt_index_mappings
AFTER_SCRIPT: |
@@ -3025,6 +3096,47 @@ jobs:
AFTER_SCRIPT: |
rm -rf examples/nlp/language_modeling/gpt_sft_results
L2_Megatron_GPT_Reranker:
needs: [cicd-test-container-setup]
runs-on: self-hosted-azure
timeout-minutes: 10
container:
image: nemoci.azurecr.io/nemo_container_${{ github.run_id }}
options:
# --user 0:128
--device=/dev/nvidia0
--gpus all
--shm-size=8g
--env TRANSFORMERS_OFFLINE=0
--env HYDRA_FULL_ERROR=1
--volume /mnt/datadrive/TestData:/home/TestData
steps:
- name: Checkout repository
uses: actions/checkout@v4
- run: |
rm -rf /home/TestData/nlp/megatron_ir/working_dir
python examples/nlp/information_retrieval/megatron_gpt_reranker_finetuning.py \
exp_manager.exp_dir='/home/TestData/nlp/megatron_ir/working_dir' \
model.global_batch_size=4 \
model.micro_batch_size=4 \
trainer.devices=1 \
trainer.num_nodes=1 \
trainer.max_epochs=null \
trainer.max_steps=20 \
trainer.val_check_interval=10 \
model.restore_from_path='/home/TestData/nlp/megatron_gpt/mcore_45M/megatron_llama.nemo' \
model.peft.lora_tuning.adapter_dim=8 \
model.data.validation_ds.file_names=[/home/TestData/nlp/megatron_ir/train.jsonl] \
model.data.validation_ds.write_embeddings_to_file=True \
model.data.validation_ds.output_file_path_prefix='/home/TestData/nlp/megatron_ir/working_dir/val_embs' \
model.data.train_ds.file_names=[/home/TestData/nlp/megatron_ir/train.jsonl]
rm -rf /home/TestData/nlp/megatron_ir/working_dir
- uses: "NVIDIA/NeMo/.github/actions/cancel-workflow@main"
if: "failure()"

L2_Megatron_GPT_Embedding:
needs: [cicd-test-container-setup]
uses: ./.github/workflows/_test_template.yml
2 changes: 1 addition & 1 deletion Dockerfile.ci
@@ -34,7 +34,7 @@ WORKDIR /workspace
# Install NeMo requirements
ARG TE_TAG=7d576ed25266a17a7b651f2c12e8498f67e0baea
ARG MODELOPT_VERSION=0.13.0
ARG MCORE_TAG=2bbe55be32e2d478c4b2ce575af1cccb8fc3d9b9
ARG MCORE_TAG=2fd6e2b74efca73a1f2d27b89bb5419384b4d3bf
ARG APEX_TAG=810ffae374a2b9cb4b5c5e28eaeca7d7998fca0c
RUN \
--mount=type=bind,source=requirements,target=requirements \
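
The substantive change in this file is the Megatron-Core pin: `MCORE_TAG` moves from `2bbe55be...` to `2fd6e2b7...`. Because it is declared as a Dockerfile `ARG`, the pin can also be overridden at build time without editing the file. A hedged sketch of such an override; the image tag `nemo-ci-local` is illustrative, not the name the real pipeline uses:

# Override the default ARG value with --build-arg; -f selects Dockerfile.ci.
docker build -f Dockerfile.ci \
--build-arg MCORE_TAG=2fd6e2b74efca73a1f2d27b89bb5419384b4d3bf \
-t nemo-ci-local .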
4 changes: 0 additions & 4 deletions examples/asr/conf/asr_finetune/speech_to_text_finetune.yaml
@@ -6,10 +6,6 @@ init_from_nemo_model: null # path to nemo model

model:
sample_rate: 16000
compute_eval_loss: false # eval samples can be very long and exhaust memory. Disable computation of transducer loss during validation/testing with this flag.
log_prediction: true # enables logging sample predictions in the output during training
rnnt_reduction: 'mean_volume'
skip_nan_grad: false

train_ds:
manifest_filepath: ???
@@ -7,10 +7,6 @@ init_from_pretrained_model: null # name of pretrained NeMo model, e.g., `stt_en

model:
sample_rate: 16000
compute_eval_loss: false # eval samples can be very long and exhaust memory. Disable computation of transducer loss during validation/testing with this flag.
log_prediction: true # enables logging sample predictions in the output during training
rnnt_reduction: 'mean_volume'
skip_nan_grad: false

# configs for huggingface load_dataset function
data_path: "librispeech_asr"
8 changes: 6 additions & 2 deletions examples/asr/speech_to_text_finetune.py
@@ -19,7 +19,11 @@
1) `init_from_nemo_model` or
2) `init_from_pretrained_model` in the configuration.
To update the model architecture in conjunction with other modifications, it is advisable to use the primary 'speech_to_text_rnnt/ctc_*.py' script.
****************************************************************************************
This script is mainly intended for changing the dataset, optim, spec_augment, vocabulary/tokenizer of the model.
To update the model architecture in conjunction with other modifications,
it is advisable to use the primary 'speech_to_text_rnnt/ctc_*.py' script.
****************************************************************************************
Note: To create a single script for all model types, we currently only support two types of
initializations:
@@ -135,7 +139,7 @@ def check_vocabulary(asr_model, cfg):

def update_tokenizer(asr_model, tokenizer_dir, tokenizer_type):
"""
Updates the tokenizer of the model and also reinitializes the decoder if the vocabulary size
Updates the tokenizer of the model and also reinitializes the decoder if the vocabulary size
of the new tokenizer differs from that of the loaded model.
Args:
asr_model: ASRModel instance
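
Taken together, the updated docstring positions `speech_to_text_finetune.py` for changing the dataset, optimizer, spec_augment, or tokenizer of an existing model, initialized either from a local `.nemo` file (`init_from_nemo_model`) or from a pretrained model name (`init_from_pretrained_model`). A hedged invocation sketch; the model name and manifest path are placeholders, and only overrides visible in the configs above are used:

# Illustrative only: fine-tune a pretrained checkpoint on a custom manifest.
# 'stt_en_conformer_ctc_large' and the manifest path are placeholder values.
python examples/asr/speech_to_text_finetune.py \
init_from_pretrained_model="stt_en_conformer_ctc_large" \
model.train_ds.manifest_filepath=/data/train_manifest.json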