Merge branch 'main' into sdxl_draft
yaoyu-33 authored Jul 8, 2024
2 parents f411298 + 62459cc commit 188686c
Showing 328 changed files with 23,526 additions and 8,986 deletions.
7 changes: 7 additions & 0 deletions .github/labeler.yml
@@ -34,6 +34,13 @@ TTS:
- tests/collections/tts/**
- tests/collections/common/tokenizers/text_to_speech/**

Audio:
- nemo/collections/audio/**/*
- examples/audio/**/*
- tutorials/audio/**/*
- docs/source/audio/**/*
- tests/collections/audio/**

core:
- nemo/core/**/*
- tests/core/**
2 changes: 1 addition & 1 deletion .github/workflows/_test_template.yml
@@ -36,7 +36,6 @@ on:
jobs:
main:
runs-on: ${{ inputs.RUNNER }}
timeout-minutes: ${{ inputs.TIMEOUT }}
outputs:
conclusion: ${{ steps.main.conclusion }}
log: ${{ steps.main.outputs.log }}
@@ -54,6 +53,7 @@ jobs:
uses: actions/checkout@v4
- id: main
name: Run main script
timeout-minutes: ${{ inputs.TIMEOUT }}
run: |
set +e
(
216 changes: 208 additions & 8 deletions .github/workflows/cicd-main.yml
@@ -95,12 +95,12 @@ jobs:
### \'\'
OPTIONAL_L0_Unit_Tests_GPU:
L0_Unit_Tests_GPU:
needs: [cicd-test-container-setup]
uses: ./.github/workflows/_test_template.yml
with:
RUNNER: self-hosted-azure
TIMEOUT: 30
TIMEOUT: 60
SCRIPT: |
NEMO_NUMBA_MINVER=0.53 pytest -m "not pleasefixme" --with_downloads
IS_OPTIONAL: true
@@ -213,7 +213,7 @@ jobs:
with:
RUNNER: self-hosted-azure
SCRIPT: |
python examples/nlp/language_modeling/megatron_gpt_quantization.py \
python examples/nlp/language_modeling/megatron_gpt_ptq.py \
model.restore_from_path=/home/TestData/nlp/megatron_llama/llama_ci.nemo \
quantization.algorithm=null \
export.save_path=/home/TestData/nlp/megatron_llama/ci_baseline
@@ -226,7 +226,7 @@ jobs:
with:
RUNNER: self-hosted-azure
SCRIPT: |
python examples/nlp/language_modeling/megatron_gpt_quantization.py \
python examples/nlp/language_modeling/megatron_gpt_ptq.py \
model.restore_from_path=/home/TestData/nlp/megatron_llama/llama_ci.nemo \
model.tensor_model_parallel_size=2 \
trainer.devices=2 \
@@ -245,7 +245,7 @@ jobs:
with:
RUNNER: self-hosted-azure
SCRIPT: |
python examples/nlp/language_modeling/megatron_gpt_quantization.py \
python examples/nlp/language_modeling/megatron_gpt_ptq.py \
model.restore_from_path=/home/TestData/nlp/megatron_llama/llama_ci.nemo \
quantization.calib_dataset=/home/TestData/nlp/test_quantization/test.json \
quantization.algorithm=int8_sq \
@@ -274,7 +274,7 @@ jobs:
# - name: Checkout repository
# uses: actions/checkout@v4
# - run: |
# python examples/nlp/language_modeling/megatron_gpt_quantization.py \
# python examples/nlp/language_modeling/megatron_gpt_ptq.py \
# model.restore_from_path=/home/TestData/nlp/megatron_llama/llama_ci.nemo \
# model.tensor_model_parallel_size=1 \
# trainer.devices=1 \
@@ -288,6 +288,45 @@ jobs:
#- uses: "NVIDIA/NeMo/.github/actions/cancel-workflow@main"
# if: "failure()"

L2_QAT_Llama2_INT4:
needs: [cicd-test-container-setup]
runs-on: self-hosted-azure
timeout-minutes: 10
container:
image: nemoci.azurecr.io/nemo_container_${{ github.run_id }}
options:
# --user 0:128
--device=/dev/nvidia0
--gpus all
--shm-size=8g
--env TRANSFORMERS_OFFLINE=0
--env HYDRA_FULL_ERROR=1
--volume /mnt/datadrive/TestData:/home/TestData
steps:
- name: Checkout repository
uses: actions/checkout@v4
- run: |
python examples/nlp/language_modeling/tuning/megatron_gpt_qat.py \
quantization.algorithm=int4 \
quantization.num_calib_size=8 \
trainer.devices=1 \
trainer.num_nodes=1 \
trainer.max_steps=4 \
trainer.val_check_interval=4 \
+trainer.limit_val_batches=2 \
exp_manager.explicit_log_dir=llama2_qat_results \
model.restore_from_path=/home/TestData/nlp/megatron_llama/llama_ci.nemo \
model.tensor_model_parallel_size=1 \
model.pipeline_model_parallel_size=1 \
model.global_batch_size=2 \
model.data.train_ds.file_names=[/home/TestData/nlp/megatron_sft/quarel.jsonl] \
model.data.train_ds.concat_sampling_probabilities=[1.0] \
model.data.validation_ds.file_names=[/home/TestData/nlp/megatron_sft/quarel.jsonl]
rm -rf llama2_qat_results
- uses: "NVIDIA/NeMo/.github/actions/cancel-workflow@main"
if: "failure()"

# L2: ASR dev run
ASR_dev_run_Speech_to_Text:
needs: [cicd-test-container-setup]
@@ -2630,6 +2669,89 @@ jobs:
# }
# }

L2_Megatron_GPT_with_ResetLR_Pretraining_and_Resume_Training_TP2:
needs: [cicd-test-container-setup]
runs-on: self-hosted-azure
timeout-minutes: 10
container:
image: nemoci.azurecr.io/nemo_container_${{ github.run_id }}
options:
# --user 0:128
--device=/dev/nvidia0
--gpus all
--shm-size=8g
--env TRANSFORMERS_OFFLINE=0
--env HYDRA_FULL_ERROR=1
--volume /mnt/datadrive/TestData:/home/TestData
steps:
- name: Checkout repository
uses: actions/checkout@v4
- run: |
python examples/nlp/language_modeling/megatron_gpt_pretraining.py \
trainer.devices=2 \
trainer.accelerator=gpu \
trainer.log_every_n_steps=1 \
trainer.val_check_interval=3 \
trainer.limit_val_batches=2 \
trainer.accumulate_grad_batches=1 \
trainer.max_steps=3 \
trainer.precision=bf16 \
trainer.gradient_clip_val=1.0 \
exp_manager.exp_dir=examples/nlp/language_modeling/gpt_pretrain_results \
model.tensor_model_parallel_size=2 \
model.megatron_amp_O2=True \
model.optim.name=distributed_fused_adam \
model.optim.lr=2e-4 \
model.optim.sched.warmup_steps=2 \
model.optim.sched.constant_steps=2 \
model.optim.sched.min_lr=8e-5 \
model.max_position_embeddings=128 \
model.encoder_seq_length=128 \
model.data.seq_length=128 \
model.tokenizer.vocab_file=/home/TestData/nlp/megatron_gpt/data/gpt/vocab.json \
model.tokenizer.merge_file=/home/TestData/nlp/megatron_gpt/data/gpt/merges.txt \
model.num_layers=8 \
model.hidden_size=256 \
model.num_attention_heads=8 \
model.data.data_prefix=[.5,/home/TestData/nlp/megatron_gpt/data/gpt/simple_wiki_gpt_preproc_text_document,.5,/home/TestData/nlp/megatron_gpt/data/gpt/simple_wiki_gpt_preproc_text_document] \
model.data.index_mapping_dir=examples/nlp/language_modeling/gpt_index_mappings
python examples/nlp/language_modeling/megatron_gpt_pretraining.py \
trainer.devices=2 \
trainer.accelerator=gpu \
trainer.log_every_n_steps=1 \
trainer.val_check_interval=3 \
trainer.limit_val_batches=2 \
trainer.accumulate_grad_batches=1 \
trainer.max_steps=6 \
trainer.precision=bf16 \
trainer.gradient_clip_val=1.0 \
exp_manager.exp_dir=examples/nlp/language_modeling/gpt_pretrain_results \
exp_manager.resume_if_exists=True \
model.reset_lr=True \
model.tensor_model_parallel_size=2 \
model.megatron_amp_O2=True \
model.optim.name=distributed_fused_adam \
model.optim.lr=2e-4 \
model.optim.sched.warmup_steps=2 \
model.optim.sched.constant_steps=2 \
model.optim.sched.min_lr=8e-5 \
model.max_position_embeddings=128 \
model.encoder_seq_length=128 \
model.data.seq_length=128 \
model.tokenizer.vocab_file=/home/TestData/nlp/megatron_gpt/data/gpt/vocab.json \
model.tokenizer.merge_file=/home/TestData/nlp/megatron_gpt/data/gpt/merges.txt \
model.num_layers=8 \
model.hidden_size=256 \
model.num_attention_heads=8 \
model.data.data_prefix=[.5,/home/TestData/nlp/megatron_gpt/data/gpt/simple_wiki_gpt_preproc_text_document,.5,/home/TestData/nlp/megatron_gpt/data/gpt/simple_wiki_gpt_preproc_text_document] \
model.data.index_mapping_dir=examples/nlp/language_modeling/gpt_index_mappings
rm -rf examples/nlp/language_modeling/gpt_pretrain_results
rm -rf examples/nlp/language_modeling/gpt_index_mappings
- uses: "NVIDIA/NeMo/.github/actions/cancel-workflow@main"
if: "failure()"

L2_Megatron_GPT_with_ALiBi_Pretraining_and_Resume_Training_TP2:
needs: [cicd-test-container-setup]
runs-on: self-hosted-azure
@@ -3366,6 +3488,80 @@ jobs:
rm -rf examples/nlp/language_modeling/t5_pretrain_results
rm -rf examples/nlp/language_modeling/t5_index_mappings
L2_Megatron_Core_T5_Pretraining_and_Resume_Training_TP2:
needs: [cicd-test-container-setup]
uses: ./.github/workflows/_test_template.yml
with:
RUNNER: self-hosted-azure
SCRIPT: |
NVTE_FUSED_ATTN=0 NVTE_FLASH_ATTN=0 python examples/nlp/language_modeling/megatron_t5_pretraining.py \
trainer.devices=2 \
trainer.log_every_n_steps=1 \
trainer.max_epochs=null \
trainer.max_steps=10 \
trainer.val_check_interval=10 \
trainer.accumulate_grad_batches=1 \
trainer.precision=bf16 \
model.megatron_amp_O2=True \
exp_manager.exp_dir=examples/nlp/language_modeling/t5_pretrain_results \
model.mcore_t5=True \
model.transformer_engine=True \
model.tensor_model_parallel_size=2 \
model.micro_batch_size=4 \
model.global_batch_size=4 \
model.seq_length=128 \
model.encoder.num_layers=4 \
model.encoder.hidden_size=64 \
model.encoder.num_attention_heads=8 \
model.decoder.num_layers=4 \
model.decoder.hidden_size=64 \
model.decoder.num_attention_heads=8 \
model.encoder.transformer_block_type='pre_ln' \
model.decoder.transformer_block_type='pre_ln' \
model.data.data_prefix=[.5,/home/TestData/nlp/nmt/toy_data/wmt14-de-en.src,.5,/home/TestData/nlp/nmt/toy_data/wmt14-de-en.ref] \
model.data.index_mapping_dir=examples/nlp/language_modeling/t5_index_mappings \
model.data.data_impl=text_mmap \
+model.data.data_impl_kwargs.newline_int=10 \
+model.data.data_impl_kwargs.header_lines=0 \
+model.data.data_impl_kwargs.workers=null \
+model.data.data_impl_kwargs.sort_dataset_paths=False
NVTE_FUSED_ATTN=0 NVTE_FLASH_ATTN=0 python examples/nlp/language_modeling/megatron_t5_pretraining.py \
trainer.devices=2 \
trainer.log_every_n_steps=1 \
trainer.max_epochs=null \
trainer.max_steps=10 \
trainer.val_check_interval=10 \
trainer.accumulate_grad_batches=1 \
trainer.precision=bf16 \
model.megatron_amp_O2=True \
exp_manager.exp_dir=examples/nlp/language_modeling/t5_pretrain_results \
exp_manager.resume_if_exists=True \
model.mcore_t5=True \
model.transformer_engine=True \
model.tensor_model_parallel_size=2 \
model.micro_batch_size=4 \
model.global_batch_size=4 \
model.seq_length=128 \
model.encoder.num_layers=4 \
model.encoder.hidden_size=64 \
model.encoder.num_attention_heads=8 \
model.decoder.num_layers=4 \
model.decoder.hidden_size=64 \
model.decoder.num_attention_heads=8 \
model.encoder.transformer_block_type='pre_ln' \
model.decoder.transformer_block_type='pre_ln' \
model.data.data_prefix=[.5,/home/TestData/nlp/nmt/toy_data/wmt14-de-en.src,.5,/home/TestData/nlp/nmt/toy_data/wmt14-de-en.ref] \
model.data.index_mapping_dir=examples/nlp/language_modeling/t5_index_mappings \
model.data.data_impl=text_mmap \
+model.data.data_impl_kwargs.newline_int=10 \
+model.data.data_impl_kwargs.header_lines=0 \
+model.data.data_impl_kwargs.workers=null \
+model.data.data_impl_kwargs.sort_dataset_paths=False
AFTER_SCRIPT: |
rm -rf examples/nlp/language_modeling/t5_pretrain_results
rm -rf examples/nlp/language_modeling/t5_index_mappings
L2_Megatron_T5_with_ALiBi_Pretraining_and_Resume_Training_TP2:
needs: [cicd-test-container-setup]
uses: ./.github/workflows/_test_template.yml
@@ -4236,7 +4432,7 @@ jobs:
Nemo_CICD_Test:
needs:
#- OPTIONAL_L0_Unit_Tests_GPU
- L0_Unit_Tests_GPU
- L0_Unit_Tests_CPU
- L2_Community_LLM_Checkpoints_tests_Llama
- L2_Community_LLM_Checkpoints_tests_StarCoder
@@ -4296,6 +4492,7 @@ jobs:
- L2_BioMegatron_Bert_NER_Task
- L2_Megatron_GPT_Pretraining_and_Resume_Training_TP2
- L2_Megatron_GPT_with_Rope_Pretraining_and_Resume_Training_TP2
- L2_Megatron_GPT_with_ResetLR_Pretraining_and_Resume_Training_TP2
- L2_Megatron_GPT_with_ALiBi_Pretraining_and_Resume_Training_TP2
- L2_Megatron_GPT_with_KERPLE_Pretraining_and_Resume_Training_TP2
- L2_Megatron_GPT_Pretraining_and_Resume_Training_PP2
@@ -4310,6 +4507,7 @@
- L2_Megatron_Change_Partitions_Reduce_TP_Num_Partitions_-2_to_1-_and_PP_Num_Partitions_-1_to_2
- L2_Megatron_Change_Partitions_Increase_TP_Num_Partitions_-2_to_4-_and_PP_Num_Partitions_-1_to_2
- L2_Megatron_T5_Pretraining_and_Resume_Training_TP2
- L2_Megatron_Core_T5_Pretraining_and_Resume_Training_TP2
- L2_Megatron_T5_with_ALiBi_Pretraining_and_Resume_Training_TP2
- L2_Megatron_T5_with_KERPLE_Pretraining_and_Resume_Training_TP2
- L2_Megatron_T5_Pretraining_and_Resume_Training_PP2
@@ -4351,7 +4549,9 @@ jobs:
name: Checkout repository
uses: actions/checkout@v4

- if: ${{ always() && steps.pipeline-conclusion.outputs.FAILED == 'true' }}
- if: ${{ always() && steps.pipeline-conclusion.outputs.FAILED == 'true' && env.SLACK_WEBHOOK != '' }}
env:
SLACK_WEBHOOK: ${{ secrets.SLACK_WEBHOOK }}
run: |
set -x
10 changes: 5 additions & 5 deletions Dockerfile
@@ -167,12 +167,12 @@ COPY tutorials /workspace/nemo/tutorials
RUN printf "#!/bin/bash\njupyter lab --no-browser --allow-root --ip=0.0.0.0" >> start-jupyter.sh && \
chmod +x start-jupyter.sh

# If required, install AIS CLI
RUN if [ "${REQUIRE_AIS_CLI}" = true ]; then \
INSTALL_MSG=$(/bin/bash scripts/installers/install_ais_cli_latest.sh); INSTALL_CODE=$?; \
# If required, install AIS CLI and Python AIS SDK
RUN INSTALL_MSG=$(/bin/bash /tmp/nemo/scripts/installers/install_ais_cli_latest.sh && pip install aistore); INSTALL_CODE=$?; \
echo ${INSTALL_MSG}; \
if [ ${INSTALL_CODE} -ne 0 ]; then \
echo "AIS CLI installation failed"; \
if [ "${REQUIRE_AIS_CLI}" = true ]; then \
exit ${INSTALL_CODE}; \
else echo "AIS CLI installed successfully"; fi \
else echo "Skipping AIS CLI installation"; fi
else echo "Skipping AIS CLI installation"; fi \
else echo "AIS CLI installed successfully"; fi
23 changes: 20 additions & 3 deletions Dockerfile.ci
@@ -32,9 +32,9 @@ EOF
WORKDIR /workspace

# Install NeMo requirements
ARG TE_TAG=bfe21c3d68b0a9951e5716fb520045db53419c5e
ARG MODELOPT_VERSION=0.11.0
ARG MCORE_TAG=02871b4df8c69fac687ab6676c4246e936ce92d0
ARG TE_TAG=7d576ed25266a17a7b651f2c12e8498f67e0baea
ARG MODELOPT_VERSION=0.13.0
ARG MCORE_TAG=0bc3547702464501feefeb5523b7a17e591b21fa
ARG APEX_TAG=810ffae374a2b9cb4b5c5e28eaeca7d7998fca0c
RUN \
--mount=type=bind,source=requirements,target=requirements \
@@ -48,6 +48,7 @@ pip install --no-cache-dir --no-build-isolation --extra-index-url https://pypi.n
"nvidia-modelopt[torch]~=${MODELOPT_VERSION}" \
"apex @ git+https://github.com/NVIDIA/apex.git@${APEX_TAG}" \
"llama-index==0.10.43" \
"onnxscript @ git+https://github.com/microsoft/onnxscript" \
-r tools/ctc_segmentation/requirements.txt \
".[all]"

@@ -60,6 +61,22 @@ git checkout ${MCORE_TAG} && \
popd && \
popd
export PYTHONPATH="${PYTHONPATH}:/workspace/Megatron-LM"

# Mamba dependency installation
git clone https://github.com/state-spaces/mamba.git && \
cd mamba && \
git checkout v2.0.3 && \
python setup.py install && \
cd .. && \
rm -rf mamba

git clone https://github.com/Dao-AILab/causal-conv1d && \
cd causal-conv1d && \
git checkout v1.2.2.post1 && \
python setup.py install && \
cd .. && \
rm -rf causal-conv1d

EOF

# Copy over NeMo code