Merge branch 'main' into sdxl_draft
yaoyu-33 authored Jul 8, 2024
2 parents f411298 + 62459cc commit 188686c
Showing 328 changed files with 23,526 additions and 8,986 deletions.
7 changes: 7 additions & 0 deletions .github/labeler.yml
@@ -34,6 +34,13 @@ TTS:
- tests/collections/tts/**
- tests/collections/common/tokenizers/text_to_speech/**

Audio:
- nemo/collections/audio/**/*
- examples/audio/**/*
- tutorials/audio/**/*
- docs/source/audio/**/*
- tests/collections/audio/**

core:
- nemo/core/**/*
- tests/core/**
2 changes: 1 addition & 1 deletion .github/workflows/_test_template.yml
@@ -36,7 +36,6 @@ on:
jobs:
main:
runs-on: ${{ inputs.RUNNER }}
timeout-minutes: ${{ inputs.TIMEOUT }}
outputs:
conclusion: ${{ steps.main.conclusion }}
log: ${{ steps.main.outputs.log }}
@@ -54,6 +53,7 @@ jobs:
uses: actions/checkout@v4
- id: main
name: Run main script
timeout-minutes: ${{ inputs.TIMEOUT }}
run: |
set +e
(
216 changes: 208 additions & 8 deletions .github/workflows/cicd-main.yml
@@ -95,12 +95,12 @@ jobs:
### \'\'
OPTIONAL_L0_Unit_Tests_GPU:
L0_Unit_Tests_GPU:
needs: [cicd-test-container-setup]
uses: ./.github/workflows/_test_template.yml
with:
RUNNER: self-hosted-azure
TIMEOUT: 30
TIMEOUT: 60
SCRIPT: |
NEMO_NUMBA_MINVER=0.53 pytest -m "not pleasefixme" --with_downloads
IS_OPTIONAL: true
@@ -213,7 +213,7 @@ jobs:
with:
RUNNER: self-hosted-azure
SCRIPT: |
python examples/nlp/language_modeling/megatron_gpt_quantization.py \
python examples/nlp/language_modeling/megatron_gpt_ptq.py \
model.restore_from_path=/home/TestData/nlp/megatron_llama/llama_ci.nemo \
quantization.algorithm=null \
export.save_path=/home/TestData/nlp/megatron_llama/ci_baseline
@@ -226,7 +226,7 @@ jobs:
with:
RUNNER: self-hosted-azure
SCRIPT: |
python examples/nlp/language_modeling/megatron_gpt_quantization.py \
python examples/nlp/language_modeling/megatron_gpt_ptq.py \
model.restore_from_path=/home/TestData/nlp/megatron_llama/llama_ci.nemo \
model.tensor_model_parallel_size=2 \
trainer.devices=2 \
@@ -245,7 +245,7 @@ jobs:
with:
RUNNER: self-hosted-azure
SCRIPT: |
python examples/nlp/language_modeling/megatron_gpt_quantization.py \
python examples/nlp/language_modeling/megatron_gpt_ptq.py \
model.restore_from_path=/home/TestData/nlp/megatron_llama/llama_ci.nemo \
quantization.calib_dataset=/home/TestData/nlp/test_quantization/test.json \
quantization.algorithm=int8_sq \
@@ -274,7 +274,7 @@ jobs:
# - name: Checkout repository
# uses: actions/checkout@v4
# - run: |
# python examples/nlp/language_modeling/megatron_gpt_quantization.py \
# python examples/nlp/language_modeling/megatron_gpt_ptq.py \
# model.restore_from_path=/home/TestData/nlp/megatron_llama/llama_ci.nemo \
# model.tensor_model_parallel_size=1 \
# trainer.devices=1 \
@@ -288,6 +288,45 @@ jobs:
#- uses: "NVIDIA/NeMo/.github/actions/cancel-workflow@main"
# if: "failure()"

L2_QAT_Llama2_INT4:
needs: [cicd-test-container-setup]
runs-on: self-hosted-azure
timeout-minutes: 10
container:
image: nemoci.azurecr.io/nemo_container_${{ github.run_id }}
options:
# --user 0:128
--device=/dev/nvidia0
--gpus all
--shm-size=8g
--env TRANSFORMERS_OFFLINE=0
--env HYDRA_FULL_ERROR=1
--volume /mnt/datadrive/TestData:/home/TestData
steps:
- name: Checkout repository
uses: actions/checkout@v4
- run: |
python examples/nlp/language_modeling/tuning/megatron_gpt_qat.py \
quantization.algorithm=int4 \
quantization.num_calib_size=8 \
trainer.devices=1 \
trainer.num_nodes=1 \
trainer.max_steps=4 \
trainer.val_check_interval=4 \
+trainer.limit_val_batches=2 \
exp_manager.explicit_log_dir=llama2_qat_results \
model.restore_from_path=/home/TestData/nlp/megatron_llama/llama_ci.nemo \
model.tensor_model_parallel_size=1 \
model.pipeline_model_parallel_size=1 \
model.global_batch_size=2 \
model.data.train_ds.file_names=[/home/TestData/nlp/megatron_sft/quarel.jsonl] \
model.data.train_ds.concat_sampling_probabilities=[1.0] \
model.data.validation_ds.file_names=[/home/TestData/nlp/megatron_sft/quarel.jsonl]
rm -rf llama2_qat_results
- uses: "NVIDIA/NeMo/.github/actions/cancel-workflow@main"
if: "failure()"

# L2: ASR dev run
ASR_dev_run_Speech_to_Text:
needs: [cicd-test-container-setup]
@@ -2630,6 +2669,89 @@ jobs:
# }
# }

L2_Megatron_GPT_with_ResetLR_Pretraining_and_Resume_Training_TP2:
needs: [cicd-test-container-setup]
runs-on: self-hosted-azure
timeout-minutes: 10
container:
image: nemoci.azurecr.io/nemo_container_${{ github.run_id }}
options:
# --user 0:128
--device=/dev/nvidia0
--gpus all
--shm-size=8g
--env TRANSFORMERS_OFFLINE=0
--env HYDRA_FULL_ERROR=1
--volume /mnt/datadrive/TestData:/home/TestData
steps:
- name: Checkout repository
uses: actions/checkout@v4
- run: |
python examples/nlp/language_modeling/megatron_gpt_pretraining.py \
trainer.devices=2 \
trainer.accelerator=gpu \
trainer.log_every_n_steps=1 \
trainer.val_check_interval=3 \
trainer.limit_val_batches=2 \
trainer.accumulate_grad_batches=1 \
trainer.max_steps=3 \
trainer.precision=bf16 \
trainer.gradient_clip_val=1.0 \
exp_manager.exp_dir=examples/nlp/language_modeling/gpt_pretrain_results \
model.tensor_model_parallel_size=2 \
model.megatron_amp_O2=True \
model.optim.name=distributed_fused_adam \
model.optim.lr=2e-4 \
model.optim.sched.warmup_steps=2 \
model.optim.sched.constant_steps=2 \
model.optim.sched.min_lr=8e-5 \
model.max_position_embeddings=128 \
model.encoder_seq_length=128 \
model.data.seq_length=128 \
model.tokenizer.vocab_file=/home/TestData/nlp/megatron_gpt/data/gpt/vocab.json \
model.tokenizer.merge_file=/home/TestData/nlp/megatron_gpt/data/gpt/merges.txt \
model.num_layers=8 \
model.hidden_size=256 \
model.num_attention_heads=8 \
model.data.data_prefix=[.5,/home/TestData/nlp/megatron_gpt/data/gpt/simple_wiki_gpt_preproc_text_document,.5,/home/TestData/nlp/megatron_gpt/data/gpt/simple_wiki_gpt_preproc_text_document] \
model.data.index_mapping_dir=examples/nlp/language_modeling/gpt_index_mappings
python examples/nlp/language_modeling/megatron_gpt_pretraining.py \
trainer.devices=2 \
trainer.accelerator=gpu \
trainer.log_every_n_steps=1 \
trainer.val_check_interval=3 \
trainer.limit_val_batches=2 \
trainer.accumulate_grad_batches=1 \
trainer.max_steps=6 \
trainer.precision=bf16 \
trainer.gradient_clip_val=1.0 \
exp_manager.exp_dir=examples/nlp/language_modeling/gpt_pretrain_results \
exp_manager.resume_if_exists=True \
model.reset_lr=True \
model.tensor_model_parallel_size=2 \
model.megatron_amp_O2=True \
model.optim.name=distributed_fused_adam \
model.optim.lr=2e-4 \
model.optim.sched.warmup_steps=2 \
model.optim.sched.constant_steps=2 \
model.optim.sched.min_lr=8e-5 \
model.max_position_embeddings=128 \
model.encoder_seq_length=128 \
model.data.seq_length=128 \
model.tokenizer.vocab_file=/home/TestData/nlp/megatron_gpt/data/gpt/vocab.json \
model.tokenizer.merge_file=/home/TestData/nlp/megatron_gpt/data/gpt/merges.txt \
model.num_layers=8 \
model.hidden_size=256 \
model.num_attention_heads=8 \
model.data.data_prefix=[.5,/home/TestData/nlp/megatron_gpt/data/gpt/simple_wiki_gpt_preproc_text_document,.5,/home/TestData/nlp/megatron_gpt/data/gpt/simple_wiki_gpt_preproc_text_document] \
model.data.index_mapping_dir=examples/nlp/language_modeling/gpt_index_mappings
rm -rf examples/nlp/language_modeling/gpt_pretrain_results
rm -rf examples/nlp/language_modeling/gpt_index_mappings
- uses: "NVIDIA/NeMo/.github/actions/cancel-workflow@main"
if: "failure()"

L2_Megatron_GPT_with_ALiBi_Pretraining_and_Resume_Training_TP2:
needs: [cicd-test-container-setup]
runs-on: self-hosted-azure
@@ -3366,6 +3488,80 @@ jobs:
rm -rf examples/nlp/language_modeling/t5_pretrain_results
rm -rf examples/nlp/language_modeling/t5_index_mappings
L2_Megatron_Core_T5_Pretraining_and_Resume_Training_TP2:
needs: [cicd-test-container-setup]
uses: ./.github/workflows/_test_template.yml
with:
RUNNER: self-hosted-azure
SCRIPT: |
NVTE_FUSED_ATTN=0 NVTE_FLASH_ATTN=0 python examples/nlp/language_modeling/megatron_t5_pretraining.py \
trainer.devices=2 \
trainer.log_every_n_steps=1 \
trainer.max_epochs=null \
trainer.max_steps=10 \
trainer.val_check_interval=10 \
trainer.accumulate_grad_batches=1 \
trainer.precision=bf16 \
model.megatron_amp_O2=True \
exp_manager.exp_dir=examples/nlp/language_modeling/t5_pretrain_results \
model.mcore_t5=True \
model.transformer_engine=True \
model.tensor_model_parallel_size=2 \
model.micro_batch_size=4 \
model.global_batch_size=4 \
model.seq_length=128 \
model.encoder.num_layers=4 \
model.encoder.hidden_size=64 \
model.encoder.num_attention_heads=8 \
model.decoder.num_layers=4 \
model.decoder.hidden_size=64 \
model.decoder.num_attention_heads=8 \
model.encoder.transformer_block_type='pre_ln' \
model.decoder.transformer_block_type='pre_ln' \
model.data.data_prefix=[.5,/home/TestData/nlp/nmt/toy_data/wmt14-de-en.src,.5,/home/TestData/nlp/nmt/toy_data/wmt14-de-en.ref] \
model.data.index_mapping_dir=examples/nlp/language_modeling/t5_index_mappings \
model.data.data_impl=text_mmap \
+model.data.data_impl_kwargs.newline_int=10 \
+model.data.data_impl_kwargs.header_lines=0 \
+model.data.data_impl_kwargs.workers=null \
+model.data.data_impl_kwargs.sort_dataset_paths=False
NVTE_FUSED_ATTN=0 NVTE_FLASH_ATTN=0 python examples/nlp/language_modeling/megatron_t5_pretraining.py \
trainer.devices=2 \
trainer.log_every_n_steps=1 \
trainer.max_epochs=null \
trainer.max_steps=10 \
trainer.val_check_interval=10 \
trainer.accumulate_grad_batches=1 \
trainer.precision=bf16 \
model.megatron_amp_O2=True \
exp_manager.exp_dir=examples/nlp/language_modeling/t5_pretrain_results \
exp_manager.resume_if_exists=True \
model.mcore_t5=True \
model.transformer_engine=True \
model.tensor_model_parallel_size=2 \
model.micro_batch_size=4 \
model.global_batch_size=4 \
model.seq_length=128 \
model.encoder.num_layers=4 \
model.encoder.hidden_size=64 \
model.encoder.num_attention_heads=8 \
model.decoder.num_layers=4 \
model.decoder.hidden_size=64 \
model.decoder.num_attention_heads=8 \
model.encoder.transformer_block_type='pre_ln' \
model.decoder.transformer_block_type='pre_ln' \
model.data.data_prefix=[.5,/home/TestData/nlp/nmt/toy_data/wmt14-de-en.src,.5,/home/TestData/nlp/nmt/toy_data/wmt14-de-en.ref] \
model.data.index_mapping_dir=examples/nlp/language_modeling/t5_index_mappings \
model.data.data_impl=text_mmap \
+model.data.data_impl_kwargs.newline_int=10 \
+model.data.data_impl_kwargs.header_lines=0 \
+model.data.data_impl_kwargs.workers=null \
+model.data.data_impl_kwargs.sort_dataset_paths=False
AFTER_SCRIPT: |
rm -rf examples/nlp/language_modeling/t5_pretrain_results
rm -rf examples/nlp/language_modeling/t5_index_mappings
L2_Megatron_T5_with_ALiBi_Pretraining_and_Resume_Training_TP2:
needs: [cicd-test-container-setup]
uses: ./.github/workflows/_test_template.yml
@@ -4236,7 +4432,7 @@ jobs:
Nemo_CICD_Test:
needs:
#- OPTIONAL_L0_Unit_Tests_GPU
- L0_Unit_Tests_GPU
- L0_Unit_Tests_CPU
- L2_Community_LLM_Checkpoints_tests_Llama
- L2_Community_LLM_Checkpoints_tests_StarCoder
@@ -4296,6 +4492,7 @@ jobs:
- L2_BioMegatron_Bert_NER_Task
- L2_Megatron_GPT_Pretraining_and_Resume_Training_TP2
- L2_Megatron_GPT_with_Rope_Pretraining_and_Resume_Training_TP2
- L2_Megatron_GPT_with_ResetLR_Pretraining_and_Resume_Training_TP2
- L2_Megatron_GPT_with_ALiBi_Pretraining_and_Resume_Training_TP2
- L2_Megatron_GPT_with_KERPLE_Pretraining_and_Resume_Training_TP2
- L2_Megatron_GPT_Pretraining_and_Resume_Training_PP2
@@ -4310,6 +4507,7 @@
- L2_Megatron_Change_Partitions_Reduce_TP_Num_Partitions_-2_to_1-_and_PP_Num_Partitions_-1_to_2
- L2_Megatron_Change_Partitions_Increase_TP_Num_Partitions_-2_to_4-_and_PP_Num_Partitions_-1_to_2
- L2_Megatron_T5_Pretraining_and_Resume_Training_TP2
- L2_Megatron_Core_T5_Pretraining_and_Resume_Training_TP2
- L2_Megatron_T5_with_ALiBi_Pretraining_and_Resume_Training_TP2
- L2_Megatron_T5_with_KERPLE_Pretraining_and_Resume_Training_TP2
- L2_Megatron_T5_Pretraining_and_Resume_Training_PP2
@@ -4351,7 +4549,9 @@ jobs:
name: Checkout repository
uses: actions/checkout@v4

- if: ${{ always() && steps.pipeline-conclusion.outputs.FAILED == 'true' }}
- if: ${{ always() && steps.pipeline-conclusion.outputs.FAILED == 'true' && env.SLACK_WEBHOOK != '' }}
env:
SLACK_WEBHOOK: ${{ secrets.SLACK_WEBHOOK }}
run: |
set -x
10 changes: 5 additions & 5 deletions Dockerfile
@@ -167,12 +167,12 @@ COPY tutorials /workspace/nemo/tutorials
RUN printf "#!/bin/bash\njupyter lab --no-browser --allow-root --ip=0.0.0.0" >> start-jupyter.sh && \
chmod +x start-jupyter.sh

# If required, install AIS CLI
RUN if [ "${REQUIRE_AIS_CLI}" = true ]; then \
INSTALL_MSG=$(/bin/bash scripts/installers/install_ais_cli_latest.sh); INSTALL_CODE=$?; \
# If required, install AIS CLI and Python AIS SDK
RUN INSTALL_MSG=$(/bin/bash /tmp/nemo/scripts/installers/install_ais_cli_latest.sh && pip install aistore); INSTALL_CODE=$?; \
echo ${INSTALL_MSG}; \
if [ ${INSTALL_CODE} -ne 0 ]; then \
echo "AIS CLI installation failed"; \
if [ "${REQUIRE_AIS_CLI}" = true ]; then \
exit ${INSTALL_CODE}; \
else echo "AIS CLI installed successfully"; fi \
else echo "Skipping AIS CLI installation"; fi
else echo "Skipping AIS CLI installation"; fi \
else echo "AIS CLI installed successfully"; fi
23 changes: 20 additions & 3 deletions Dockerfile.ci
@@ -32,9 +32,9 @@ EOF
WORKDIR /workspace

# Install NeMo requirements
ARG TE_TAG=bfe21c3d68b0a9951e5716fb520045db53419c5e
ARG MODELOPT_VERSION=0.11.0
ARG MCORE_TAG=02871b4df8c69fac687ab6676c4246e936ce92d0
ARG TE_TAG=7d576ed25266a17a7b651f2c12e8498f67e0baea
ARG MODELOPT_VERSION=0.13.0
ARG MCORE_TAG=0bc3547702464501feefeb5523b7a17e591b21fa
ARG APEX_TAG=810ffae374a2b9cb4b5c5e28eaeca7d7998fca0c
RUN \
--mount=type=bind,source=requirements,target=requirements \
@@ -48,6 +48,7 @@ pip install --no-cache-dir --no-build-isolation --extra-index-url https://pypi.n
"nvidia-modelopt[torch]~=${MODELOPT_VERSION}" \
"apex @ git+https://github.com/NVIDIA/apex.git@${APEX_TAG}" \
"llama-index==0.10.43" \
"onnxscript @ git+https://github.com/microsoft/onnxscript" \
-r tools/ctc_segmentation/requirements.txt \
".[all]"

@@ -60,6 +61,22 @@ git checkout ${MCORE_TAG} && \
popd && \
popd
export PYTHONPATH="${PYTHONPATH}:/workspace/Megatron-LM"

# Mamba dependency installation
git clone https://github.com/state-spaces/mamba.git && \
cd mamba && \
git checkout v2.0.3 && \
python setup.py install && \
cd .. && \
rm -rf mamba

git clone https://github.com/Dao-AILab/causal-conv1d && \
cd causal-conv1d && \
git checkout v1.2.2.post1 && \
python setup.py install && \
cd .. && \
rm -rf causal-conv1d

EOF

# Copy over NeMo code