diff --git a/.github/workflows/cherry-pick-release-commit.yml b/.github/workflows/cherry-pick-release-commit.yml index 5f29832f0c0f..67bc69b1f8a5 100644 --- a/.github/workflows/cherry-pick-release-commit.yml +++ b/.github/workflows/cherry-pick-release-commit.yml @@ -120,7 +120,7 @@ jobs: "type": "section", "text": { "type": "mrkdwn", - "text": ":alert: Cherrypick bot 🤖: Hey <@'$USERNAME'>: Cherry-pick of <'$URL'|#'$PR_ID'> failed (3-way merge impossible). Please resolve manually and create a PR.\n\ncc: " + "text": ":alert: Cherrypick bot 🤖: Hey <@'$USERNAME'>: Cherry-pick of <'$URL'|#'$PR_ID'> failed (3-way merge impossible). Please resolve manually and create a PR.\n\ncc: " } } ] diff --git a/.github/workflows/cicd-main.yml b/.github/workflows/cicd-main.yml index 800d91acb7ed..345482e9a1a8 100644 --- a/.github/workflows/cicd-main.yml +++ b/.github/workflows/cicd-main.yml @@ -131,16 +131,16 @@ jobs: ### \'\' # L0: GPU unit tests - OPTIONAL_L0_Unit_Tests_GPU_ASR: + L0_Unit_Tests_GPU_ASR: needs: [cicd-test-container-setup] uses: ./.github/workflows/_test_template.yml - if: contains(fromJSON(needs.cicd-test-container-setup.outputs.test_to_run), 'OPTIONAL_L0_Unit_Tests_GPU_ASR') || needs.cicd-test-container-setup.outputs.all == 'true' + if: contains(fromJSON(needs.cicd-test-container-setup.outputs.test_to_run), 'L0_Unit_Tests_GPU_ASR') || needs.cicd-test-container-setup.outputs.all == 'true' with: RUNNER: self-hosted-azure TIMEOUT: 20 + # TODO: remove this hack SCRIPT: | - NEMO_NUMBA_MINVER=0.53 pytest tests/collections/asr -m "not pleasefixme" --with_downloads - IS_OPTIONAL: true + python -c "from nemo.collections.asr.models import ASRModel" && NEMO_NUMBA_MINVER=0.53 pytest tests/collections/asr -m "not pleasefixme" --with_downloads L0_Unit_Tests_GPU_Audio: needs: [cicd-test-container-setup] @@ -1212,18 +1212,6 @@ jobs: matmul_precision=medium AFTER_SCRIPT: | rm -rf preds.json - - - # L2: Transducer alignment - OPTIONAL_L2_Transducer_alignment_Running_pytest: - needs: [cicd-test-container-setup] - uses: ./.github/workflows/_test_template.yml - if: contains(fromJSON(needs.cicd-test-container-setup.outputs.test_to_run), 'OPTIONAL_L2_Transducer_alignment_Running_pytest') || needs.cicd-test-container-setup.outputs.all == 'true' - with: - RUNNER: self-hosted-azure - SCRIPT: | - pytest tests/collections/asr/decoding/rnnt_alignments_check.py --durations=-1 --with_downloads - IS_OPTIONAL: true # L2: Segmentation Tool L2_Segmentation_Tool_Parallel_ctc_segmentation_test_L2_Eng_CitriNet_with_wav: @@ -1345,275 +1333,6 @@ jobs: pretrained_model=${OUTPUT_DIR}/HeteronymClassification/test/checkpoints/HeteronymClassification.nemo \ output_manifest=preds.json - # L2: Duplex Text Normalization - L2_Duplex_Text_Normalization_with_Tarred_dataset: - needs: [cicd-test-container-setup] - uses: ./.github/workflows/_test_template.yml - if: contains(fromJSON(needs.cicd-test-container-setup.outputs.test_to_run), 'L2_Duplex_Text_Normalization_with_Tarred_dataset') || needs.cicd-test-container-setup.outputs.all == 'true' - with: - RUNNER: self-hosted-azure - SCRIPT: | - cd examples/nlp/duplex_text_normalization && \ - python duplex_text_normalization_train.py \ - data.validation_ds.data_path=/home/TestData/nlp/duplex_text_norm/small_test.tsv \ - mode=tn \ - lang=en \ - tagger_model.do_training=false \ - decoder_model.transformer=t5-small \ - data.validation_ds.batch_size=2 \ - data.train_ds.use_cache=false \ - data.validation_ds.use_cache=false \ - data.test_ds.batch_size=2 \ - 
data.train_ds.decoder_data_augmentation=false \ - data.train_ds.num_workers=2 \ - decoder_trainer.devices=[0,1] \ - decoder_trainer.accelerator="gpu" \ - data.train_ds.use_tarred_dataset=true \ - +decoder_trainer.fast_dev_run=true \ - decoder_exp_manager.create_checkpoint_callback=false \ - data.train_ds.tar_metadata_file=/home/TestData/nlp/duplex_text_norm/tarred_small/metadata.json \ - data.test_ds.use_cache=false \ - data.test_ds.data_path=/home/TestData/nlp/duplex_text_norm/small_test.tsv - - # L2: Intent and Slot Classification Tasks - L2_Intent_and_Slot_Classification_Tasks_Intent_and_Slot_Classification: - needs: [cicd-test-container-setup] - uses: ./.github/workflows/_test_template.yml - if: contains(fromJSON(needs.cicd-test-container-setup.outputs.test_to_run), 'L2_Intent_and_Slot_Classification_Tasks_Intent_and_Slot_Classification') || needs.cicd-test-container-setup.outputs.all == 'true' - with: - RUNNER: self-hosted-azure-gpus-1 - SCRIPT: | - cd examples/nlp/intent_slot_classification && \ - python intent_slot_classification.py \ - model.data_dir=/home/TestData/nlp/retail \ - model.validation_ds.prefix=dev \ - model.test_ds.prefix=dev \ - trainer.devices=1 \ - trainer.accelerator="gpu" \ - +trainer.fast_dev_run=true \ - exp_manager.exp_dir=checkpoints - AFTER_SCRIPT: | - rm -rf checkpoints - - L2_Intent_and_Slot_Classification_Tasks_Multi-Label_Intent_and_Slot_Classification: - needs: [cicd-test-container-setup] - uses: ./.github/workflows/_test_template.yml - if: contains(fromJSON(needs.cicd-test-container-setup.outputs.test_to_run), 'L2_Intent_and_Slot_Classification_Tasks_Multi-Label_Intent_and_Slot_Classification') || needs.cicd-test-container-setup.outputs.all == 'true' - with: - RUNNER: self-hosted-azure-gpus-1 - SCRIPT: | - cd examples/nlp/intent_slot_classification && \ - python multi_label_intent_slot_classification.py \ - model.data_dir=/home/TestData/nlp/new_multiatis \ - model.validation_ds.prefix=dev \ - model.test_ds.prefix=dev \ - trainer.devices=1 \ - +trainer.fast_dev_run=true \ - exp_manager.exp_dir=checkpoints2 - AFTER_SCRIPT: | - rm -rf checkpoints2 - - # TODO: add when megatron-bert is supported again - # stage("L2: Model Parallel Size 2 Megatron Text Classification") { - # when { - # anyOf{ - # branch "main" - # changeRequest target: "main" - # } - # } - # failFast true - # steps{ - # cd examples/nlp/text_classification && \ - # python text_classification_with_bert.py \ - # trainer.devices=[0,1] \ - # trainer.accelerator="gpu" \ - # trainer.num_nodes=1 \ - # trainer.precision=16 \ - # trainer.gradient_clip_val=1.0 \ - # +trainer.fast_dev_run=true \ - # model.dataset.num_classes=6 \ - # model.train_ds.file_path=/home/TestData/nlp/retail_text_classification/train.tsv \ - # model.train_ds.batch_size=4 \ - # model.language_model.pretrained_model_name=megatron-bert-uncased \ - # model.language_model.config_file=/home/TestData/nlp/mp_2_bert_toy/config.json \ - # model.language_model.lm_checkpoint=/home/TestData/nlp/mp_2_bert_toy/iter_2000000 \ - # model.nemo_path=null \ - # ~model.infer_samples \ - # exp_manager=null - # } - # } - - # stage("L2: Model Parallel Size 2 Megatron Autoresume") { - # when { - # anyOf{ - # branch "main" - # changeRequest target: "main" - # } - # } - # failFast true - # steps{ - # cd examples/nlp/text_classification && \ - # python text_classification_with_bert.py \ - # trainer.devices=[0,1] \ - # trainer.accelerator="gpu" \ - # trainer.num_nodes=1 \ - # trainer.precision=16 \ - # trainer.gradient_clip_val=1.0 \ - # trainer.max_epochs=1 
\ - # +trainer.fast_dev_run=true \ - # model.dataset.num_classes=6 \ - # model.train_ds.file_path=/home/TestData/nlp/retail_text_classification/train.tsv \ - # model.train_ds.batch_size=4 \ - # model.language_model.pretrained_model_name=megatron-bert-uncased \ - # model.language_model.config_file=/home/TestData/nlp/mp_2_bert_toy/config.json \ - # model.language_model.lm_checkpoint=/home/TestData/nlp/mp_2_bert_toy/iter_2000000 \ - # model.nemo_path=null \ - # ~model.infer_samples \ - # +exp_manager.explicit_log_dir=/home/TestData/nlp/mp_autoresume \ - # +exp_manager.resume_if_exists=true - # } - # } - - # stage("L2: Model Parallel Size 2 Megatron Evaluation from .nemo") { - # when { - # anyOf{ - # branch "main" - # changeRequest target: "main" - # } - # } - # failFast true - # steps{ - # cd examples/nlp/text_classification && \ - # python model_parallel_text_classification_evaluation.py \ - # trainer.devices=[0,1] \ - # trainer.accelerator="gpu" \ - # trainer.num_nodes=1 \ - # model.dataset.num_classes=6 \ - # model.test_ds.file_path=/home/TestData/nlp/retail_text_classification/dev.tsv \ - # model.nemo_path=/home/TestData/nlp/mp_2_nemo/retail_text_class_350M.nemo \ - # exp_manager=null - # } - # } - - # stage("L2: Model Parallel Size 2 Megatron Train from .nemo") { - # when { - # anyOf{ - # branch "main" - # changeRequest target: "main" - # } - # } - # failFast true - # steps{ - # cd examples/nlp/token_classification && \ - # python token_classification_train.py \ - # pretrained_model=/home/TestData/nlp/mp_2_nemo/ner_350M.nemo \ - # model.dataset.data_dir=/home/TestData/nlp/ner/ \ - # model.train_ds.batch_size=2 \ - # model.dataset.use_cache=false \ - # trainer.devices=[0,1] \ - # trainer.accelerator="gpu" \ - # +trainer.fast_dev_run=true \ - # model.dataset.class_balancing="weighted_loss" \ - # exp_manager=null - # } - # } - - - # L2: Parallel NLP Examples 2 - L2_Parallel_NLP_Examples2_NER_finetuning_from_pretrained_Test: - needs: [cicd-test-container-setup] - uses: ./.github/workflows/_test_template.yml - if: contains(fromJSON(needs.cicd-test-container-setup.outputs.test_to_run), 'L2_Parallel_NLP_Examples2_NER_finetuning_from_pretrained_Test') || needs.cicd-test-container-setup.outputs.all == 'true' - with: - RUNNER: self-hosted-azure-gpus-1 - SCRIPT: | - cd examples/nlp/token_classification && \ - python token_classification_train.py \ - pretrained_model=ner_en_bert \ - model.dataset.data_dir=/home/TestData/nlp/ner/ \ - model.train_ds.batch_size=2 \ - model.dataset.use_cache=false \ - trainer.devices=1 \ - trainer.accelerator="gpu" \ - +trainer.fast_dev_run=true \ - model.dataset.class_balancing="weighted_loss" \ - exp_manager.exp_dir=null - - L2_Parallel_NLP_Examples2_Punctuation_and_capitalization_finetuning_from_pretrained_test: - needs: [cicd-test-container-setup] - uses: ./.github/workflows/_test_template.yml - if: contains(fromJSON(needs.cicd-test-container-setup.outputs.test_to_run), 'L2_Parallel_NLP_Examples2_Punctuation_and_capitalization_finetuning_from_pretrained_test') || needs.cicd-test-container-setup.outputs.all == 'true' - with: - RUNNER: self-hosted-azure-gpus-1 - SCRIPT: | - cd examples/nlp/token_classification && \ - data_dir="$(mktemp -d -p "$(pwd)")" && \ - cp /home/TestData/nlp/token_classification_punctuation/*.txt "${data_dir}"/ && \ - python punctuation_capitalization_train_evaluate.py \ - pretrained_model=punctuation_en_bert \ - model.train_ds.ds_item="${data_dir}" \ - model.validation_ds.ds_item="${data_dir}" \ - model.test_ds.ds_item="${data_dir}" \ - 
+model.train_ds.use_cache=false \ - +model.validation_ds.use_cache=false \ - +model.test_ds.use_cache=false \ - trainer.devices=1 \ - trainer.accelerator="gpu" \ - +trainer.fast_dev_run=true \ - exp_manager.exp_dir=null; - - rm -rf "${data_dir}" - - L2_Parallel_NLP_Examples2_NER_with_TurkuNLP__bert-base-finnish-cased-v1: - needs: [cicd-test-container-setup] - uses: ./.github/workflows/_test_template.yml - if: contains(fromJSON(needs.cicd-test-container-setup.outputs.test_to_run), 'L2_Parallel_NLP_Examples2_NER_with_TurkuNLP__bert-base-finnish-cased-v1') || needs.cicd-test-container-setup.outputs.all == 'true' - with: - RUNNER: self-hosted-azure-gpus-1 - SCRIPT: | - cd examples/nlp/token_classification && \ - python token_classification_train.py \ - model.dataset.data_dir=/home/TestData/nlp/token_classification_punctuation/ \ - trainer.devices=1 \ - trainer.accelerator="gpu" \ - +trainer.fast_dev_run=true \ - model.dataset.use_cache=false \ - model.language_model.pretrained_model_name="TurkuNLP/bert-base-finnish-cased-v1" \ - exp_manager.exp_dir=null - - L2_Parallel_NLP_Examples2_Evaluation_script_for_Token_Classification: - needs: [cicd-test-container-setup] - uses: ./.github/workflows/_test_template.yml - if: contains(fromJSON(needs.cicd-test-container-setup.outputs.test_to_run), 'L2_Parallel_NLP_Examples2_Evaluation_script_for_Token_Classification') || needs.cicd-test-container-setup.outputs.all == 'true' - with: - RUNNER: self-hosted-azure - SCRIPT: | - python examples/nlp/token_classification/token_classification_evaluate.py \ - model.dataset.data_dir=/home/TestData/nlp/ner/ \ - model.dataset.use_cache=false \ - pretrained_model=/home/TestData/nlp/pretrained_models/NER_Model_with_BERT_base_uncased.nemo - - L2_Parallel_NLP_Examples2_Evaluation_script_for_Punctuation: - needs: [cicd-test-container-setup] - uses: ./.github/workflows/_test_template.yml - if: contains(fromJSON(needs.cicd-test-container-setup.outputs.test_to_run), 'L2_Parallel_NLP_Examples2_Evaluation_script_for_Punctuation') || needs.cicd-test-container-setup.outputs.all == 'true' - with: - RUNNER: self-hosted-azure - SCRIPT: | - data_dir="$(mktemp -d -p "$(pwd)")" && \ - cp /home/TestData/nlp/token_classification_punctuation/*.txt "${data_dir}"/ && \ - python examples/nlp/token_classification/punctuation_capitalization_train_evaluate.py \ - +do_training=false \ - +do_testing=true \ - model.test_ds.ds_item="${data_dir}" \ - ~model.train_ds \ - ~model.validation_ds \ - +model.test_ds.use_cache=false \ - pretrained_model=/home/TestData/nlp/pretrained_models/Punctuation_Capitalization_with_DistilBERT_base_uncased.nemo; - - rm -rf "${data_dir}" - - # L2: Parallel Pretraining BERT pretraining from Text/Preprocessed L2_Pretraining_BERT_pretraining_from_Text: needs: [cicd-test-container-setup] @@ -1990,313 +1709,6 @@ jobs: model.decoder_tokenizer.model=/home/TestData/nlp/nmt/toy_data/spm_64k_all_langs_plus_en.model AFTER_SCRIPT: | rm -rf examples/nlp/machine_translation/megatron_nmt_results - - L2_Megatron_BART_Perceiver_MIM_Training_TP2: - needs: [cicd-test-container-setup] - uses: ./.github/workflows/_test_template.yml - if: contains(fromJSON(needs.cicd-test-container-setup.outputs.test_to_run), 'L2_Megatron_BART_Perceiver_MIM_Training_TP2') || needs.cicd-test-container-setup.outputs.all == 'true' - with: - RUNNER: self-hosted-azure - SCRIPT: | - python examples/nlp/language_modeling/megatron_bart_pretraining.py \ - trainer.devices=2 \ - trainer.accelerator=gpu \ - trainer.log_every_n_steps=1 \ - trainer.val_check_interval=10 \ 
- trainer.limit_val_batches=2 \ - trainer.accumulate_grad_batches=1 \ - trainer.max_steps=10 \ - trainer.precision=16 \ - trainer.gradient_clip_val=1.0 \ - exp_manager.exp_dir=examples/nlp/language_modeling/megatron_mim_results \ - model.tensor_model_parallel_size=2 \ - model.seq_length=128 \ - model.encoder.num_layers=4 \ - model.encoder.hidden_size=64 \ - model.encoder.arch=perceiver \ - model.encoder.num_attention_heads=8 \ - model.encoder.activation="swiglu" \ - model.encoder.masked_softmax_fusion=False \ - model.encoder.bias_activation_fusion=False \ - model.encoder.activations_checkpoint_method="block" \ - model.encoder.activations_checkpoint_num_layers=1 \ - model.decoder.num_layers=2 \ - model.decoder.hidden_size=64 \ - model.decoder.num_attention_heads=8 \ - model.decoder.activation="swiglu" \ - model.decoder.masked_softmax_fusion=False \ - model.decoder.bias_activation_fusion=False \ - model.decoder.activations_checkpoint_method="block" \ - model.decoder.activations_checkpoint_num_layers=1 \ - model.micro_batch_size=2 \ - model.global_batch_size=4 \ - model.data.data_impl=text_mmap \ - model.data.data_prefix=[1.0,/home/TestData/nlp/nmt/toy_data/wmt14-de-en.src] \ - model.data.splits_string="\"800,100,100\"" \ - model.data.whole_word_masking=False \ - model.tokenizer.library=sentencepiece \ - model.tokenizer.model=/home/TestData/nlp/nmt/toy_data/spm_64k_all_langs_plus_en.model \ - ++model.hiddens.enc_output_name=z \ - ++model.hiddens.transform.q_z_given_x.cls_name=cond_gaussian \ - ++model.hiddens.transform.q_z_given_x.hidden_size=64 \ - ++model.hiddens.loss.mim.cls_name=a_mim \ - ++model.hiddens.loss.mim.loss_weight=0.5 - # Change val_check_interval to 1 for resume as the len(dataloder) is 1 due to max_steps being the same as that of training and Lightning 2.0 raises an error - # if val_check_interval > len(dataloder: https://github.com/Lightning-AI/lightning/blob/2.0.6/src/lightning/pytorch/loops/fit_loop.py#L259 at the beginning of fit_loop.run() - python examples/nlp/language_modeling/megatron_bart_pretraining.py \ - trainer.devices=2 \ - trainer.accelerator=gpu \ - trainer.log_every_n_steps=1 \ - trainer.val_check_interval=1 \ - trainer.limit_val_batches=2 \ - trainer.accumulate_grad_batches=1 \ - trainer.max_steps=10 \ - trainer.precision=16 \ - trainer.gradient_clip_val=1.0 \ - exp_manager.exp_dir=examples/nlp/language_modeling/megatron_mim_results \ - model.tensor_model_parallel_size=2 \ - model.seq_length=128 \ - model.encoder.num_layers=4 \ - model.encoder.hidden_size=64 \ - model.encoder.arch=perceiver \ - model.encoder.num_attention_heads=8 \ - model.encoder.activation="swiglu" \ - model.encoder.masked_softmax_fusion=False \ - model.encoder.bias_activation_fusion=False \ - model.encoder.activations_checkpoint_method="block" \ - model.encoder.activations_checkpoint_num_layers=1 \ - model.decoder.num_layers=2 \ - model.decoder.hidden_size=64 \ - model.decoder.num_attention_heads=8 \ - model.decoder.activation="swiglu" \ - model.decoder.masked_softmax_fusion=False \ - model.decoder.bias_activation_fusion=False \ - model.decoder.activations_checkpoint_method="block" \ - model.decoder.activations_checkpoint_num_layers=1 \ - model.micro_batch_size=2 \ - model.global_batch_size=4 \ - model.data.data_impl=text_mmap \ - model.data.data_prefix=[1.0,/home/TestData/nlp/nmt/toy_data/wmt14-de-en.src] \ - model.data.splits_string="\"800,100,100\"" \ - model.data.whole_word_masking=False \ - model.tokenizer.library=sentencepiece \ - 
model.tokenizer.model=/home/TestData/nlp/nmt/toy_data/spm_64k_all_langs_plus_en.model \ - ++model.hiddens.enc_output_name=z \ - ++model.hiddens.transform.q_z_given_x.cls_name=cond_gaussian \ - ++model.hiddens.transform.q_z_given_x.hidden_size=64 \ - ++model.hiddens.loss.mim.cls_name=a_mim \ - ++model.hiddens.loss.mim.loss_weight=0.5 - AFTER_SCRIPT: | - rm -rf examples/nlp/language_modeling/megatron_mim_results - - # stage("L2: NMT Bottleneck Fallback") { - # when { - # anyOf { - # branch "main" - # changeRequest target: "main" - # } - # } - # failFast true - # parallel { - # stage("L2: seq2seq (no bottleneck)") { - # steps { - # cd examples/nlp/machine_translation && \ - # enc_dec_nmt-bottleneck.py \ - # --config-path=conf \ - # --config-name=aayn_bottleneck \ - # do_testing=true \ - # model.model_type=nll \ - # model.encoder.arch=seq2seq \ - # model.encoder.hidden_steps=1 \ - # model.encoder.hidden_blocks=1 \ - # model.encoder.hidden_init_method=params \ - # model.encoder.hidden_size=64 \ - # model.encoder.inner_size=128 \ - # model.encoder.num_attention_heads=2 \ - # model.encoder.num_layers=2 \ - # model.decoder.hidden_size=64 \ - # model.decoder.inner_size=128 \ - # model.decoder.num_attention_heads=2 \ - # model.decoder.num_layers=2 \ - # model.train_ds.src_file_name=/home/TestData/nlp/nmt/toy_data/wmt14-en-de.src \ - # model.train_ds.tgt_file_name=/home/TestData/nlp/nmt/toy_data/wmt14-en-de.ref \ - # model.validation_ds.src_file_name=[/home/TestData/nlp/nmt/toy_data/wmt13-en-de.src,/home/TestData/nlp/nmt/toy_data/wmt14-en-de.src] \ - # model.validation_ds.tgt_file_name=[/home/TestData/nlp/nmt/toy_data/wmt13-en-de.ref,/home/TestData/nlp/nmt/toy_data/wmt14-en-de.ref] \ - # model.test_ds.src_file_name=/home/TestData/nlp/nmt/toy_data/wmt13-en-de.src \ - # model.test_ds.tgt_file_name=/home/TestData/nlp/nmt/toy_data/wmt13-en-de.ref \ - # model.encoder_tokenizer.tokenizer_model=/home/TestData/nlp/nmt/toy_data/tt_tokenizer.BPE.4096.model \ - # model.decoder_tokenizer.tokenizer_model=/home/TestData/nlp/nmt/toy_data/tt_tokenizer.BPE.4096.model \ - # trainer.devices=1 \ - # trainer.accelerator="gpu" \ - # +trainer.fast_dev_run=true \ - # +trainer.limit_test_batches=2 \ - # exp_manager=null \ - # } - # } - # } - # } - # stage("L2: NMT Bottleneck Architecture") { - # when { - # anyOf { - # branch "main" - # changeRequest target: "main" - # } - # } - # failFast true - # parallel { - # stage("Bridge Encoder (identity)") { - # steps { - # cd examples/nlp/machine_translation && \ - # enc_dec_nmt-bottleneck.py \ - # --config-path=conf \ - # --config-name=aayn_bottleneck \ - # do_testing=true \ - # model.model_type=nll \ - # model.encoder.arch=bridge \ - # model.encoder.hidden_steps=1 \ - # model.encoder.hidden_blocks=1 \ - # model.encoder.hidden_init_method=identity \ - # model.encoder.hidden_size=64 \ - # model.encoder.inner_size=128 \ - # model.encoder.num_attention_heads=2 \ - # model.encoder.num_layers=2 \ - # model.decoder.hidden_size=64 \ - # model.decoder.inner_size=128 \ - # model.decoder.num_attention_heads=2 \ - # model.decoder.num_layers=2 \ - # model.train_ds.src_file_name=/home/TestData/nlp/nmt/toy_data/wmt14-de-en.src \ - # model.train_ds.tgt_file_name=/home/TestData/nlp/nmt/toy_data/wmt14-de-en.ref \ - # model.validation_ds.src_file_name=/home/TestData/nlp/nmt/toy_data/wmt14-de-en.src \ - # model.validation_ds.tgt_file_name=/home/TestData/nlp/nmt/toy_data/wmt14-de-en.src \ - # model.test_ds.src_file_name=/home/TestData/nlp/nmt/toy_data/wmt14-de-en.src \ - # 
model.test_ds.tgt_file_name=/home/TestData/nlp/nmt/toy_data/wmt14-de-en.src \ - # model.encoder_tokenizer.tokenizer_model=/home/TestData/nlp/nmt/toy_data/tt_tokenizer.BPE.4096.model \ - # model.decoder_tokenizer.tokenizer_model=/home/TestData/nlp/nmt/toy_data/tt_tokenizer.BPE.4096.model \ - # trainer.devices=1 \ - # trainer.accelerator="gpu" \ - # +trainer.fast_dev_run=true \ - # +trainer.limit_test_batches=2 \ - # exp_manager=null - # } - # } - # stage("Perceiver Encoder (params)") { - # steps { - # cd examples/nlp/machine_translation && \ - # enc_dec_nmt-bottleneck.py \ - # --config-path=conf \ - # --config-name=aayn_bottleneck \ - # do_testing=true \ - # model.model_type=nll \ - # model.encoder.arch=perceiver \ - # model.encoder.hidden_steps=1 \ - # model.encoder.hidden_blocks=1 \ - # model.encoder.hidden_init_method=params \ - # model.encoder.hidden_size=64 \ - # model.encoder.inner_size=128 \ - # model.encoder.num_attention_heads=2 \ - # model.encoder.num_layers=2 \ - # model.decoder.hidden_size=64 \ - # model.decoder.inner_size=128 \ - # model.decoder.num_attention_heads=2 \ - # model.decoder.num_layers=2 \ - # model.train_ds.src_file_name=/home/TestData/nlp/nmt/toy_data/wmt14-de-en.src \ - # model.train_ds.tgt_file_name=/home/TestData/nlp/nmt/toy_data/wmt14-de-en.ref \ - # model.validation_ds.src_file_name=/home/TestData/nlp/nmt/toy_data/wmt14-de-en.src \ - # model.validation_ds.tgt_file_name=/home/TestData/nlp/nmt/toy_data/wmt14-de-en.src \ - # model.test_ds.src_file_name=/home/TestData/nlp/nmt/toy_data/wmt14-de-en.src \ - # model.test_ds.tgt_file_name=/home/TestData/nlp/nmt/toy_data/wmt14-de-en.src \ - # model.encoder_tokenizer.tokenizer_model=/home/TestData/nlp/nmt/toy_data/tt_tokenizer.BPE.4096.model \ - # model.decoder_tokenizer.tokenizer_model=/home/TestData/nlp/nmt/toy_data/tt_tokenizer.BPE.4096.model \ - # trainer.devices=1 \ - # trainer.accelerator="gpu" \ - # +trainer.fast_dev_run=true \ - # +trainer.limit_test_batches=2 \ - # exp_manager=null - # } - # } - # } - # } - # stage("L2: NMT Bottleneck LVM") { - # when { - # anyOf { - # branch "main" - # changeRequest target: "main" - # } - # } - # failFast true - # parallel { - # stage("VAE") { - # steps { - # cd examples/nlp/machine_translation && \ - # enc_dec_nmt-bottleneck.py \ - # --config-path=conf \ - # --config-name=aayn_bottleneck \ - # do_testing=true \ - # model.model_type=vae \ - # model.encoder.arch=perceiver \ - # model.encoder.hidden_steps=1 \ - # model.encoder.hidden_blocks=1 \ - # model.encoder.hidden_init_method=params \ - # model.encoder.hidden_size=64 \ - # model.encoder.inner_size=128 \ - # model.encoder.num_attention_heads=2 \ - # model.encoder.num_layers=2 \ - # model.decoder.hidden_size=64 \ - # model.decoder.inner_size=128 \ - # model.decoder.num_attention_heads=2 \ - # model.decoder.num_layers=2 \ - # model.train_ds.src_file_name=/home/TestData/nlp/nmt/toy_data/wmt14-de-en.src \ - # model.train_ds.tgt_file_name=/home/TestData/nlp/nmt/toy_data/wmt14-de-en.ref \ - # model.validation_ds.src_file_name=/home/TestData/nlp/nmt/toy_data/wmt14-de-en.src \ - # model.validation_ds.tgt_file_name=/home/TestData/nlp/nmt/toy_data/wmt14-de-en.src \ - # model.test_ds.src_file_name=/home/TestData/nlp/nmt/toy_data/wmt14-de-en.src \ - # model.test_ds.tgt_file_name=/home/TestData/nlp/nmt/toy_data/wmt14-de-en.src \ - # model.encoder_tokenizer.tokenizer_model=/home/TestData/nlp/nmt/toy_data/tt_tokenizer.BPE.4096.model \ - # model.decoder_tokenizer.tokenizer_model=/home/TestData/nlp/nmt/toy_data/tt_tokenizer.BPE.4096.model \ 
- # trainer.devices=1 \ - # trainer.accelerator="gpu" \ - # +trainer.fast_dev_run=true \ - # +trainer.limit_test_batches=2 \ - # exp_manager=null - # } - # } - # stage("MIM") { - # steps { - # cd examples/nlp/machine_translation && \ - # enc_dec_nmt-bottleneck.py \ - # --config-path=conf \ - # --config-name=aayn_bottleneck \ - # do_testing=true \ - # model.model_type=mim \ - # model.encoder.arch=perceiver \ - # model.encoder.hidden_steps=1 \ - # model.encoder.hidden_blocks=1 \ - # model.encoder.hidden_init_method=params \ - # model.encoder.hidden_size=64 \ - # model.encoder.inner_size=128 \ - # model.encoder.num_attention_heads=2 \ - # model.encoder.num_layers=2 \ - # model.decoder.hidden_size=64 \ - # model.decoder.inner_size=128 \ - # model.decoder.num_attention_heads=2 \ - # model.decoder.num_layers=2 \ - # model.train_ds.src_file_name=/home/TestData/nlp/nmt/toy_data/wmt14-de-en.src \ - # model.train_ds.tgt_file_name=/home/TestData/nlp/nmt/toy_data/wmt14-de-en.ref \ - # model.validation_ds.src_file_name=/home/TestData/nlp/nmt/toy_data/wmt14-de-en.src \ - # model.validation_ds.tgt_file_name=/home/TestData/nlp/nmt/toy_data/wmt14-de-en.src \ - # model.test_ds.src_file_name=/home/TestData/nlp/nmt/toy_data/wmt14-de-en.src \ - # model.test_ds.tgt_file_name=/home/TestData/nlp/nmt/toy_data/wmt14-de-en.src \ - # model.encoder_tokenizer.tokenizer_model=/home/TestData/nlp/nmt/toy_data/tt_tokenizer.BPE.4096.model \ - # model.decoder_tokenizer.tokenizer_model=/home/TestData/nlp/nmt/toy_data/tt_tokenizer.BPE.4096.model \ - # trainer.devices=1 \ - # trainer.accelerator="gpu" \ - # +trainer.fast_dev_run=true \ - # +trainer.limit_test_batches=2 \ - # exp_manager=null - # } - # } - # } - # } L2_Megatron_Bert_Pretraining_and_Resume_Training_with_Pipeline_Parallelism: needs: [cicd-test-container-setup] @@ -2366,82 +1778,10 @@ jobs: model.data.data_prefix=[.5,/home/TestData/nlp/megatron_bert/data/bert/simple_wiki_bert_preproc_text_sentence,.5,/home/TestData/nlp/megatron_bert/data/bert/simple_wiki_bert_preproc_text_sentence] \ model.data.index_mapping_dir=examples/nlp/language_modeling/bert_index_mappings - L2_Megatron_Bert_Pretraining_and_Resume_Training: + L2_Megatron_Core_Bert_Pretraining_and_Resume_Training: needs: [cicd-test-container-setup] uses: ./.github/workflows/_test_template.yml - if: contains(fromJSON(needs.cicd-test-container-setup.outputs.test_to_run), 'L2_Megatron_Bert_Pretraining_and_Resume_Training') || needs.cicd-test-container-setup.outputs.all == 'true' - with: - RUNNER: self-hosted-azure - SCRIPT: | - python examples/nlp/language_modeling/megatron_bert_pretraining.py \ - trainer.devices=2 \ - trainer.accelerator=gpu \ - trainer.log_every_n_steps=1 \ - trainer.val_check_interval=10 \ - trainer.limit_val_batches=2 \ - trainer.accumulate_grad_batches=1 \ - trainer.max_steps=10 \ - trainer.precision=bf16 \ - model.megatron_amp_O2=True \ - trainer.gradient_clip_val=1.0 \ - exp_manager.exp_dir=examples/nlp/language_modeling/bert_pretrain_results \ - model.tensor_model_parallel_size=2 \ - model.optim.name=fused_adam \ - model.optim.lr=2e-4 \ - model.sequence_parallel=True \ - model.optim.sched.warmup_steps=2 \ - model.optim.sched.constant_steps=2 \ - model.optim.sched.min_lr=8e-5 \ - model.max_position_embeddings=128 \ - model.encoder_seq_length=128 \ - model.data.seq_length=128 \ - model.tokenizer.vocab_file=/home/TestData/nlp/megatron_bert/data/bert/vocab.txt \ - model.num_layers=8 \ - model.hidden_size=256 \ - model.num_attention_heads=8 \ - model.activations_checkpoint_method=block \ - 
model.activations_checkpoint_num_layers=1 \ - model.data.data_prefix=[.5,/home/TestData/nlp/megatron_bert/data/bert/simple_wiki_bert_preproc_text_sentence,.5,/home/TestData/nlp/megatron_bert/data/bert/simple_wiki_bert_preproc_text_sentence] \ - model.data.index_mapping_dir=examples/nlp/language_modeling/bert_index_mappings - - python examples/nlp/language_modeling/megatron_bert_pretraining.py \ - trainer.devices=2 \ - trainer.accelerator=gpu \ - trainer.log_every_n_steps=1 \ - trainer.val_check_interval=10 \ - trainer.limit_val_batches=2 \ - trainer.accumulate_grad_batches=1 \ - trainer.max_steps=20 \ - trainer.precision=bf16 \ - model.megatron_amp_O2=True \ - trainer.gradient_clip_val=1.0 \ - exp_manager.exp_dir=examples/nlp/language_modeling/bert_pretrain_results \ - exp_manager.resume_if_exists=True \ - model.tensor_model_parallel_size=2 \ - model.optim.name=fused_adam \ - model.optim.lr=2e-4 \ - model.optim.sched.warmup_steps=2 \ - model.optim.sched.constant_steps=2 \ - model.optim.sched.min_lr=8e-5 \ - model.max_position_embeddings=128 \ - model.encoder_seq_length=128 \ - model.data.seq_length=128 \ - model.tokenizer.vocab_file=/home/TestData/nlp/megatron_bert/data/bert/vocab.txt \ - model.num_layers=8 \ - model.hidden_size=256 \ - model.num_attention_heads=8 \ - model.activations_checkpoint_method=block \ - model.activations_checkpoint_num_layers=1 \ - model.data.data_prefix=[.5,/home/TestData/nlp/megatron_bert/data/bert/simple_wiki_bert_preproc_text_sentence,.5,/home/TestData/nlp/megatron_bert/data/bert/simple_wiki_bert_preproc_text_sentence] \ - model.data.index_mapping_dir=examples/nlp/language_modeling/bert_index_mappings - AFTER_SCRIPT: | - rm -rf examples/nlp/language_modeling/bert_pretrain_results - rm -rf examples/nlp/language_modeling/bert_index_mappings - - L2_Megatron_Core_Bert_Pretraining_and_Resume_Training: - needs: [cicd-test-container-setup] - uses: ./.github/workflows/_test_template.yml - if: contains(fromJSON(needs.cicd-test-container-setup.outputs.test_to_run), 'L2_Megatron_Core_Bert_Pretraining_and_Resume_Training') || needs.cicd-test-container-setup.outputs.all == 'true' + if: contains(fromJSON(needs.cicd-test-container-setup.outputs.test_to_run), 'L2_Megatron_Core_Bert_Pretraining_and_Resume_Training') || needs.cicd-test-container-setup.outputs.all == 'true' with: RUNNER: self-hosted-azure SCRIPT: | @@ -2508,228 +1848,6 @@ jobs: rm -rf examples/nlp/language_modeling/bert_pretrain_results rm -rf examples/nlp/language_modeling/bert_index_mappings - L2_Megatron_RETRO_Pretraining_and_Resume_Training: - needs: [cicd-test-container-setup] - uses: ./.github/workflows/_test_template.yml - if: contains(fromJSON(needs.cicd-test-container-setup.outputs.test_to_run), 'L2_Megatron_RETRO_Pretraining_and_Resume_Training') || needs.cicd-test-container-setup.outputs.all == 'true' - with: - RUNNER: self-hosted-azure - SCRIPT: | - python examples/nlp/language_modeling/megatron_retro_pretraining.py \ - trainer.num_nodes=1 \ - trainer.devices=2 \ - trainer.precision=bf16 \ - trainer.accelerator=gpu \ - model.data.data_prefix=["none"] \ - exp_manager.exp_dir=examples/nlp/language_modeling/mcore_retro_results \ - model.mcore_gpt=True \ - model.tensor_model_parallel_size=1 \ - model.pipeline_model_parallel_size=1 \ - model.optim.name=distributed_fused_adam \ - model.retro.retro_project_dir=/home/TestData/nlp/megatron_retro/mcore_retro/micro-wiki-core \ - model.data.num_workers=4 \ - model.micro_batch_size=1 \ - model.data.shuffle_documents=False \ - trainer.val_check_interval=30 \ - 
+trainer.num_sanity_val_steps=0 \ - model.init_method_std=0.023 \ - model.optim.lr=6.0e-4 \ - model.megatron_amp_O2=True \ - model.data.splits_string="\"98,2,0\"" \ - model.data.dataloader_type=cyclic \ - trainer.max_steps=10 - - python examples/nlp/language_modeling/megatron_retro_pretraining.py \ - trainer.num_nodes=1 \ - trainer.devices=2 \ - trainer.precision=bf16 \ - trainer.accelerator=gpu \ - model.data.data_prefix=["none"] \ - exp_manager.exp_dir=examples/nlp/language_modeling/mcore_retro_results \ - model.mcore_gpt=True \ - model.tensor_model_parallel_size=1 \ - model.pipeline_model_parallel_size=1 \ - model.optim.name=distributed_fused_adam \ - model.retro.retro_project_dir=/home/TestData/nlp/megatron_retro/mcore_retro/micro-wiki-core \ - model.data.num_workers=4 \ - model.micro_batch_size=1 \ - model.data.shuffle_documents=False \ - trainer.val_check_interval=30 \ - +trainer.num_sanity_val_steps=0 \ - model.init_method_std=0.023 \ - model.optim.lr=6.0e-4 \ - model.megatron_amp_O2=True \ - model.data.splits_string="\"98,2,0\"" \ - model.data.dataloader_type=cyclic \ - trainer.max_steps=20 - AFTER_SCRIPT: | - rm -rf examples/nlp/language_modeling/mcore_retro_results - - L2_Legacy_Megatron_RETRO_Pretraining_and_Resume_Training: - needs: [cicd-test-container-setup] - uses: ./.github/workflows/_test_template.yml - if: contains(fromJSON(needs.cicd-test-container-setup.outputs.test_to_run), 'L2_Legacy_Megatron_RETRO_Pretraining_and_Resume_Training') || needs.cicd-test-container-setup.outputs.all == 'true' - with: - RUNNER: self-hosted-azure - SCRIPT: | - python examples/nlp/language_modeling/megatron_retro_pretraining_legacy.py \ - trainer.devices=2 \ - trainer.num_nodes=1 \ - trainer.accelerator=gpu \ - trainer.accumulate_grad_batches=1 \ - trainer.limit_val_batches=2 \ - exp_manager.resume_if_exists=True \ - trainer.max_steps=10 \ - trainer.precision=16 \ - trainer.gradient_clip_val=1.0 \ - trainer.val_check_interval=10 \ - exp_manager.exp_dir=examples/nlp/language_modeling/retro_legacy_results \ - model.data.data_prefix= \ - model.data.knn_index= \ - model.data.retrieval_prefix= \ - model.tensor_model_parallel_size=2 \ - model.micro_batch_size=4 \ - model.optim.name=fused_adam \ - model.optim.lr=2e-4 \ - model.optim.sched.warmup_steps=2 \ - model.optim.sched.constant_steps=2 \ - model.optim.sched.min_lr=8e-5 \ - model.max_position_embeddings=128 \ - model.encoder_seq_length=128 \ - model.chunk_size=32 \ - model.enc_num_layers=2 \ - model.dec_num_layers=2 \ - model.enc_cross_attention=[1] \ - model.dec_cross_attention=[1] \ - +model.data.mock=True - - python examples/nlp/language_modeling/megatron_retro_pretraining_legacy.py \ - trainer.devices=2 \ - trainer.num_nodes=1 \ - trainer.accelerator=gpu \ - trainer.accumulate_grad_batches=1 \ - trainer.limit_val_batches=2 \ - exp_manager.resume_if_exists=True \ - trainer.max_steps=20 \ - trainer.precision=16 \ - trainer.gradient_clip_val=1.0 \ - trainer.val_check_interval=10 \ - exp_manager.exp_dir=examples/nlp/language_modeling/retro_legacy_results \ - model.data.data_prefix= \ - model.data.knn_index= \ - model.data.retrieval_prefix= \ - model.tensor_model_parallel_size=2 \ - model.micro_batch_size=4 \ - model.optim.name=fused_adam \ - model.optim.lr=2e-4 \ - model.optim.sched.warmup_steps=2 \ - model.optim.sched.constant_steps=2 \ - model.optim.sched.min_lr=8e-5 \ - model.max_position_embeddings=128 \ - model.encoder_seq_length=128 \ - model.chunk_size=32 \ - model.enc_num_layers=2 \ - model.dec_num_layers=2 \ - 
model.enc_cross_attention=[1] \ - model.dec_cross_attention=[1] \ - +model.data.mock=True - AFTER_SCRIPT: | - rm -rf examples/nlp/language_modeling/retro_legacy_results - - # L2_Megatron_RETRO_muTransfer_Pretraining_Performance: - # needs: [cicd-test-container-setup] - # runs-on: self-hosted-azure - # container: - # image: nemoci.azurecr.io/nemo_container_${{ github.run_id }} - # options: - # # --user 0:128 - # --device=/dev/nvidia0 - # --gpus all - # --shm-size=8g - # --env TRANSFORMERS_OFFLINE=0 - # --env HYDRA_FULL_ERROR=1 - # --volume /mnt/datadrive/TestData:/home/TestData - # steps: - # - name: Checkout repository - # uses: actions/checkout@v4 - # - run: | - # python examples/nlp/language_modeling/megatron_retro_mutransfer_pretrain.py \ - # trainer.devices=2 \ - # trainer.num_nodes=1 \ - # trainer.accelerator=gpu \ - # trainer.accumulate_grad_batches=1 \ - # trainer.max_steps=100 \ - # trainer.log_every_n_steps=1 \ - # trainer.precision=16 \ - # trainer.val_check_interval=100 \ - # trainer.limit_val_batches=0 \ - # trainer.gradient_clip_val=1.0 \ - # +trainer.num_sanity_val_steps=0 \ - # exp_manager.exp_dir=examples/nlp/language_modeling/retro_results/ \ - # +exp_manager.version=smalltest \ - # model.data.neighbors=2 \ - # model.megatron_amp_O2=False \ - # model.apply_query_key_layer_scaling=False \ - # model.tensor_model_parallel_size=1 \ - # model.optim.name=muadamw \ - # model.optim.weight_decay=0.1 \ - # model.optim.betas=[0.9,0.95] \ - # model.optim.lr=6e-4 \ - # model.optim.sched.warmup_steps=1000 \ - # model.optim.sched.constant_steps=0 \ - # model.optim.sched.min_lr=6e-5 \ - # model.add_position_embedding=False \ - # model.enc_num_layers=2 \ - # model.dec_num_layers=6 \ - # model.enc_cross_attention=[0] \ - # model.dec_cross_attention=[3,5] \ - # model.hidden_size=96 \ - # model.ffn_hidden_size=384 \ - # model.init_method_std=0.023 \ - # model.num_attention_heads=12 \ - # model.max_position_embeddings=1024 \ - # model.encoder_seq_length=1024 \ - # model.tokenizer.library=megatron \ - # model.tokenizer.type=GPT2BPETokenizer \ - # model.tokenizer.merge_file=/home/TestData/nlp/megatron_retro/gpt2-merges.txt \ - # model.tokenizer.vocab_file=/home/TestData/nlp/megatron_retro/gpt2-vocab.json \ - # model.data.data_prefix=[/home/TestData/nlp/megatron_retro/retro_wiki_test_text_document] \ - # model.data.knn_index=[/home/TestData/nlp/megatron_retro/knn2_map_wiki_test.idx] \ - # model.data.retrieval_prefix=/home/TestData/nlp/megatron_retro/retro_wiki_test_text_document \ - # model.data.index_mapping_dir=/home/TestData/nlp/megatron_retro \ - # model.data.num_workers=8 \ - # model.micro_batch_size=8 \ - # model.normalization=rmsnorm \ - # model.transformer_block_type=pre_ln \ - # model.bias_activation_fusion=True \ - # model.bias_dropout_add_fusion=False \ - # model.masked_softmax_fusion=True \ - # model.hidden_dropout=0 \ - # model.attention_dropout=0 \ - # model.fp32_residual_connection=True \ - # model.shape_file=/home/TestData/nlp/megatron_retro/o1_rel_shape_info_tiny.yaml - - # python -c "import pandas as pd - # import pathlib - # from pandas.testing import assert_frame_equal - # from tensorboard.backend.event_processing.event_accumulator import EventAccumulator - # import torch - # if not (torch.cuda.is_available() and "A100" in torch.cuda.get_device_name()): - # import sys - # sys.exit(0) - # event_file = list(pathlib.Path("examples/nlp/language_modeling/retro_results/megatron_retro/smalltest").glob("events.out.tfevents*"))[0] - # ea = EventAccumulator(str(event_file)).Reload() - # 
vals = [] - # for i in ea.Scalars("reduced_train_loss"): - # vals.append(i.value) - # training_curve = pd.DataFrame({"loss": vals}) - # gt_curve = pd.read_csv("/home/TestData/nlp/megatron_retro/expected_learning_curve.csv") - # assert_frame_equal(training_curve, gt_curve, rtol=1e-3, atol=1e-3)" - - # rm -rf examples/nlp/language_modeling/retro_results - # - uses: "NVIDIA/NeMo/.github/actions/cancel-workflow@main" - # if: "failure()" - L2_RAG_Pipeline_Indexing: needs: [cicd-test-container-setup] uses: ./.github/workflows/_test_template.yml @@ -2766,22 +1884,6 @@ jobs: generating.inference.temperature=1.0 \ generating.query="Which art schools did I applied to?" - L2_BioMegatron_Bert_NER_Task: - needs: [cicd-test-container-setup] - uses: ./.github/workflows/_test_template.yml - if: contains(fromJSON(needs.cicd-test-container-setup.outputs.test_to_run), 'L2_BioMegatron_Bert_NER_Task') || needs.cicd-test-container-setup.outputs.all == 'true' - with: - RUNNER: self-hosted-azure - SCRIPT: | - python examples/nlp/token_classification/token_classification_train.py \ - exp_manager.exp_dir=examples/nlp/language_modeling/token_classification_results \ - trainer.max_epochs=1 \ - model.dataset.data_dir=/home/TestData/nlp/ner \ - model.language_model.pretrained_model_name=biomegatron345m_biovocab_30k_cased \ - model.tokenizer.tokenizer_name=null - AFTER_SCRIPT: | - rm -rf examples/nlp/language_modeling/token_classification_results - L2_Megatron_GPT_Pretraining_and_Resume_Training_TP2: needs: [cicd-test-container-setup] uses: ./.github/workflows/_test_template.yml @@ -3936,103 +3038,6 @@ jobs: AFTER_SCRIPT: | rm /home/TestData/nlp/megatron_gpt/TP2-Temp/test-increase.nemo - L2_Megatron_T5_Pretraining_and_Resume_Training_TP2: - needs: [cicd-test-container-setup] - uses: ./.github/workflows/_test_template.yml - if: contains(fromJSON(needs.cicd-test-container-setup.outputs.test_to_run), 'L2_Megatron_T5_Pretraining_and_Resume_Training_TP2') || needs.cicd-test-container-setup.outputs.all == 'true' - with: - RUNNER: self-hosted-azure - SCRIPT: | - python examples/nlp/language_modeling/megatron_t5_pretraining.py \ - trainer.devices=2 \ - trainer.accelerator=gpu \ - trainer.log_every_n_steps=1 \ - trainer.val_check_interval=10 \ - trainer.limit_val_batches=2 \ - trainer.accumulate_grad_batches=1 \ - trainer.max_steps=10 \ - trainer.precision=bf16 \ - model.megatron_amp_O2=True \ - trainer.gradient_clip_val=1.0 \ - exp_manager.exp_dir=examples/nlp/language_modeling/t5_pretrain_results \ - model.tensor_model_parallel_size=2 \ - model.seq_length=128 \ - model.encoder.num_layers=4 \ - model.encoder.hidden_size=64 \ - model.encoder.num_attention_heads=8 \ - model.encoder.activation=swiglu \ - model.encoder.masked_softmax_fusion=False \ - model.encoder.bias_activation_fusion=False \ - model.encoder.activations_checkpoint_method=block \ - model.encoder.activations_checkpoint_num_layers=1 \ - model.encoder.position_embedding_type=relative \ - model.decoder.num_layers=2 \ - model.decoder.hidden_size=64 \ - model.decoder.num_attention_heads=8 \ - model.decoder.activation=fast-swiglu \ - model.decoder.masked_softmax_fusion=False \ - model.decoder.bias_activation_fusion=False \ - model.decoder.activations_checkpoint_method=block \ - model.decoder.activations_checkpoint_num_layers=1 \ - model.encoder.transformer_block_type=pre_ln \ - model.decoder.transformer_block_type=pre_ln \ - model.data.data_prefix=[.5,/home/TestData/nlp/nmt/toy_data/wmt14-de-en.src,.5,/home/TestData/nlp/nmt/toy_data/wmt14-de-en.ref] \ - 
model.data.index_mapping_dir=examples/nlp/language_modeling/t5_index_mappings \ - model.data.data_impl=text_mmap \ - +model.data.data_impl_kwargs.newline_int=10 \ - +model.data.data_impl_kwargs.header_lines=0 \ - +model.data.data_impl_kwargs.workers=null \ - +model.data.data_impl_kwargs.sort_dataset_paths=False \ - model.share_token_embeddings=False \ - model.share_decoder_tokens_head_embeddings=False - - python examples/nlp/language_modeling/megatron_t5_pretraining.py \ - trainer.devices=2 \ - trainer.accelerator=gpu \ - trainer.log_every_n_steps=1 \ - trainer.val_check_interval=1 \ - trainer.limit_val_batches=2 \ - trainer.accumulate_grad_batches=1 \ - trainer.max_steps=10 \ - trainer.precision=bf16 \ - model.megatron_amp_O2=True \ - trainer.gradient_clip_val=1.0 \ - exp_manager.exp_dir=examples/nlp/language_modeling/t5_pretrain_results \ - exp_manager.resume_if_exists=True \ - model.tensor_model_parallel_size=2 \ - model.seq_length=128 \ - model.encoder.num_layers=4 \ - model.encoder.hidden_size=64 \ - model.encoder.num_attention_heads=8 \ - model.encoder.activation=swiglu \ - model.encoder.masked_softmax_fusion=False \ - model.encoder.bias_activation_fusion=False \ - model.encoder.activations_checkpoint_method=block \ - model.encoder.activations_checkpoint_num_layers=1 \ - model.encoder.position_embedding_type=relative \ - model.decoder.num_layers=2 \ - model.decoder.hidden_size=64 \ - model.decoder.num_attention_heads=8 \ - model.decoder.activation=fast-swiglu \ - model.decoder.masked_softmax_fusion=False \ - model.decoder.bias_activation_fusion=False \ - model.decoder.activations_checkpoint_method=block \ - model.decoder.activations_checkpoint_num_layers=1 \ - model.encoder.transformer_block_type=pre_ln \ - model.decoder.transformer_block_type=pre_ln \ - model.data.data_prefix=[.5,/home/TestData/nlp/nmt/toy_data/wmt14-de-en.src,.5,/home/TestData/nlp/nmt/toy_data/wmt14-de-en.ref] \ - model.data.index_mapping_dir=examples/nlp/language_modeling/t5_index_mappings \ - model.data.data_impl=text_mmap \ - +model.data.data_impl_kwargs.newline_int=10 \ - +model.data.data_impl_kwargs.header_lines=0 \ - +model.data.data_impl_kwargs.workers=null \ - +model.data.data_impl_kwargs.sort_dataset_paths=False \ - model.share_token_embeddings=False \ - model.share_decoder_tokens_head_embeddings=False - AFTER_SCRIPT: | - rm -rf examples/nlp/language_modeling/t5_pretrain_results - rm -rf examples/nlp/language_modeling/t5_index_mappings - L2_Megatron_Core_T5_Pretraining_and_Resume_Training_TP2: needs: [cicd-test-container-setup] uses: ./.github/workflows/_test_template.yml @@ -4488,18 +3493,6 @@ jobs: rm -rf examples/nlp/language_modeling/t5_pretrain_results rm -rf examples/nlp/language_modeling/t5_index_mappings - L2_Megatron_T5_Eval: - needs: [cicd-test-container-setup] - uses: ./.github/workflows/_test_template.yml - if: contains(fromJSON(needs.cicd-test-container-setup.outputs.test_to_run), 'L2_Megatron_T5_Eval') || needs.cicd-test-container-setup.outputs.all == 'true' - with: - RUNNER: self-hosted-azure - SCRIPT: | - python examples/nlp/language_modeling/megatron_t5_eval.py \ - --model_file /home/TestData/nlp/megatron_t5/8m/megatron_t5_8m-refactor.nemo \ - --prompt "How do I fix my GPU memory issue? I am seeing out of memory." \ - --tensor_model_parallel_size 1 - L2_Megatron_Core_T5_Eval: needs: [cicd-test-container-setup] uses: ./.github/workflows/_test_template.yml @@ -4512,196 +3505,6 @@ jobs: --prompt "How do I fix my GPU memory issue? I am seeing out of memory." 
\ --tensor_model_parallel_size 1 - L2_Megatron_BART_Pretraining_and_Resume_Training_TP2: - needs: [cicd-test-container-setup] - uses: ./.github/workflows/_test_template.yml - if: contains(fromJSON(needs.cicd-test-container-setup.outputs.test_to_run), 'L2_Megatron_BART_Pretraining_and_Resume_Training_TP2') || needs.cicd-test-container-setup.outputs.all == 'true' - with: - RUNNER: self-hosted-azure - SCRIPT: | - python examples/nlp/language_modeling/megatron_bart_pretraining.py \ - trainer.devices=2 \ - trainer.accelerator=gpu \ - trainer.log_every_n_steps=1 \ - trainer.val_check_interval=2 \ - trainer.limit_val_batches=2 \ - trainer.accumulate_grad_batches=1 \ - trainer.max_steps=3 \ - trainer.precision=16 \ - trainer.gradient_clip_val=1.0 \ - exp_manager.exp_dir=examples/nlp/language_modeling/bart_pretrain_results \ - model.tensor_model_parallel_size=2 \ - model.seq_length=128 \ - model.encoder.num_layers=4 \ - model.encoder.hidden_size=64 \ - model.encoder.num_attention_heads=8 \ - model.encoder.activation="reglu" \ - model.encoder.bias_activation_fusion=False \ - model.encoder.activations_checkpoint_method="block" \ - model.encoder.activations_checkpoint_num_layers=1 \ - model.decoder.num_layers=4 \ - model.decoder.hidden_size=64 \ - model.decoder.num_attention_heads=8 \ - model.decoder.activation="reglu" \ - model.decoder.bias_activation_fusion=False \ - model.decoder.activations_checkpoint_method="block" \ - model.decoder.activations_checkpoint_num_layers=1 \ - model.data.data_prefix="{train:[1.0,/home/TestData/nlp/megatron_t5/data/pile_val_small_bert_tokenizer_text_document],test:[/home/TestData/nlp/megatron_t5/data/pile_val_small_bert_tokenizer_text_document], validation:[/home/TestData/nlp/megatron_t5/data/pile_val_small_bert_tokenizer_text_document]}" - - python examples/nlp/language_modeling/megatron_bart_pretraining.py \ - trainer.devices=2 \ - trainer.accelerator=gpu \ - trainer.log_every_n_steps=1 \ - trainer.val_check_interval=2 \ - trainer.limit_val_batches=5 \ - trainer.accumulate_grad_batches=1 \ - trainer.max_steps=6 \ - trainer.precision=16 \ - trainer.gradient_clip_val=1.0 \ - exp_manager.exp_dir=examples/nlp/language_modeling/bart_pretrain_results \ - exp_manager.resume_if_exists=True \ - model.tensor_model_parallel_size=2 \ - model.seq_length=128 \ - model.encoder.num_layers=4 \ - model.encoder.hidden_size=64 \ - model.encoder.num_attention_heads=8 \ - model.encoder.activation="reglu" \ - model.encoder.bias_activation_fusion=False \ - model.encoder.activations_checkpoint_method="block" \ - model.encoder.activations_checkpoint_num_layers=1 \ - model.decoder.num_layers=4 \ - model.decoder.hidden_size=64 \ - model.decoder.num_attention_heads=8 \ - model.decoder.activation="reglu" \ - model.decoder.bias_activation_fusion=False \ - model.decoder.activations_checkpoint_method="block" \ - model.decoder.activations_checkpoint_num_layers=1 \ - model.data.data_prefix="{train:[1.0,/home/TestData/nlp/megatron_t5/data/pile_val_small_bert_tokenizer_text_document],test:[/home/TestData/nlp/megatron_t5/data/pile_val_small_bert_tokenizer_text_document], validation:[/home/TestData/nlp/megatron_t5/data/pile_val_small_bert_tokenizer_text_document]}" - AFTER_SCRIPT: | - rm -rf examples/nlp/language_modeling/bart_pretrain_results - - L2_Megatron_BART_Pretraining_and_Resume_Training_PP2: - needs: [cicd-test-container-setup] - uses: ./.github/workflows/_test_template.yml - if: contains(fromJSON(needs.cicd-test-container-setup.outputs.test_to_run), 
'L2_Megatron_BART_Pretraining_and_Resume_Training_PP2') || needs.cicd-test-container-setup.outputs.all == 'true' - with: - RUNNER: self-hosted-azure - SCRIPT: | - python examples/nlp/language_modeling/megatron_bart_pretraining.py \ - trainer.devices=2 \ - trainer.accelerator=gpu \ - trainer.log_every_n_steps=1 \ - trainer.val_check_interval=10 \ - trainer.limit_val_batches=2 \ - trainer.accumulate_grad_batches=1 \ - trainer.max_steps=10 \ - trainer.precision=16 \ - trainer.gradient_clip_val=1.0 \ - exp_manager.exp_dir=examples/nlp/language_modeling/bart_pretrain_results \ - model.pipeline_model_parallel_size=2 \ - model.pipeline_model_parallel_split_rank=1 \ - model.seq_length=256 \ - model.encoder.num_layers=4 \ - model.encoder.hidden_size=64 \ - model.encoder.num_attention_heads=8 \ - model.encoder.activation=geglu \ - model.encoder.bias_activation_fusion=False \ - model.encoder.activations_checkpoint_method=block \ - model.encoder.activations_checkpoint_num_layers=1 \ - model.decoder.num_layers=4 \ - model.decoder.hidden_size=64 \ - model.decoder.num_attention_heads=8 \ - model.decoder.activation=geglu \ - model.decoder.bias_activation_fusion=False \ - model.decoder.activations_checkpoint_method=block \ - model.decoder.activations_checkpoint_num_layers=1 \ - model.data.respect_document_boundaries=False \ - model.data.data_prefix=[.5,/home/TestData/nlp/megatron_t5/data/pile_val_small_bert_tokenizer_text_document,.5,/home/TestData/nlp/megatron_t5/data/pile_val_small_bert_tokenizer_text_document] - - python examples/nlp/language_modeling/megatron_bart_pretraining.py \ - trainer.devices=2 \ - trainer.accelerator=gpu \ - trainer.log_every_n_steps=1 \ - trainer.val_check_interval=1 \ - trainer.limit_val_batches=2 \ - trainer.accumulate_grad_batches=1 \ - trainer.max_steps=10 \ - trainer.precision=16 \ - trainer.gradient_clip_val=1.0 \ - exp_manager.exp_dir=examples/nlp/language_modeling/bart_pretrain_results \ - exp_manager.resume_if_exists=True \ - model.pipeline_model_parallel_size=2 \ - model.pipeline_model_parallel_split_rank=1 \ - model.seq_length=256 \ - model.encoder.num_layers=4 \ - model.encoder.hidden_size=64 \ - model.encoder.num_attention_heads=8 \ - model.encoder.activation=geglu \ - model.encoder.bias_activation_fusion=False \ - model.encoder.activations_checkpoint_method=block \ - model.encoder.activations_checkpoint_num_layers=1 \ - model.decoder.num_layers=4 \ - model.decoder.hidden_size=64 \ - model.decoder.num_attention_heads=8 \ - model.decoder.activation=geglu \ - model.decoder.bias_activation_fusion=False \ - model.decoder.activations_checkpoint_method=block \ - model.decoder.activations_checkpoint_num_layers=1 \ - model.data.respect_document_boundaries=False \ - model.data.data_prefix=[.5,/home/TestData/nlp/megatron_t5/data/pile_val_small_bert_tokenizer_text_document,.5,/home/TestData/nlp/megatron_t5/data/pile_val_small_bert_tokenizer_text_document] - AFTER_SCRIPT: | - rm -rf examples/nlp/language_modeling/bart_pretrain_results - - - L2_Megatron_T5_PEFT_Lora_TP2: - needs: [cicd-test-container-setup] - uses: ./.github/workflows/_test_template.yml - if: contains(fromJSON(needs.cicd-test-container-setup.outputs.test_to_run), 'L2_Megatron_T5_PEFT_Lora_TP2') || needs.cicd-test-container-setup.outputs.all == 'true' - with: - RUNNER: self-hosted-azure - SCRIPT: | - - python examples/nlp/language_modeling/tuning/megatron_t5_finetuning.py \ - trainer.devices=2 \ - trainer.log_every_n_steps=1 \ - trainer.max_epochs=9999 \ - trainer.max_steps=3 \ - trainer.val_check_interval=3 \ - 
++trainer.limit_val_batches=2 \ - trainer.precision=16 \ - exp_manager.exp_dir=/tmp/nlp_t5_lora_tuning_tp2 \ - model.pipeline_model_parallel_size=1 \ - model.tensor_model_parallel_size=2 \ - model.restore_from_path=/home/TestData/nlp/megatron_t5/8m/megatron_t5_8m_tp2.nemo \ - model.peft.peft_scheme=lora \ - model.answer_only_loss=True \ - model.micro_batch_size=1 \ - model.global_batch_size=1 \ - model.data.train_ds.file_names=[/home/TestData/nlp/megatron_sft/quarel.jsonl] \ - model.data.train_ds.concat_sampling_probabilities=[1.0] \ - model.data.train_ds.num_workers=0 \ - model.data.validation_ds.num_workers=0 \ - model.data.validation_ds.file_names=[/home/TestData/nlp/megatron_sft/quarel.jsonl] \ - model.data.validation_ds.names=[quarel] - - python examples/nlp/language_modeling/tuning/megatron_t5_generate.py \ - model.restore_from_path=/home/TestData/nlp/megatron_t5/8m/megatron_t5_8m_tp2.nemo \ - model.peft.restore_from_path=/tmp/nlp_t5_lora_tuning_tp2/megatron_t5_peft_lora_tuning/checkpoints/megatron_t5_peft_lora_tuning.nemo \ - model.peft.restore_from_ckpt_name=null \ - model.peft.restore_from_hparams_path=null \ - model.tensor_model_parallel_size=2 \ - trainer.devices=2 \ - model.data.test_ds.file_names=[/home/TestData/nlp/megatron_sft/quarel_4.jsonl] \ - model.data.test_ds.names=[quarel4] \ - model.global_batch_size=2 \ - model.micro_batch_size=1 \ - model.data.test_ds.tokens_to_generate=10 \ - model.data.test_ds.write_predictions_to_file=True \ - model.data.test_ds.output_file_path_prefix=/tmp/nlp_t5_lora_tuning_tp2/out \ - inference.greedy=True \ - inference.repetition_penalty=1.0 \ - inference.outfile_path=/tmp/nlp_t5_lora_tuning_tp2/out.jsonl - L2_Megatron_Core_T5_PEFT_Lora_TP2: needs: [cicd-test-container-setup] uses: ./.github/workflows/_test_template.yml @@ -5084,7 +3887,7 @@ jobs: rm -rf tests/collections/llm/gpt_pretrain_results rm -rf tests/collections/llm/gpt_index_mappings - OPTIONAL_L2_NeMo_2_GPT_DDP_Param_Parity_check: + L2_NeMo_2_GPT_DDP_Param_Parity_check: needs: [cicd-test-container-setup] uses: ./.github/workflows/_test_template.yml if: contains(fromJSON(needs.cicd-test-container-setup.outputs.test_to_run), 'OPTIONAL_L2_NeMo_2_GPT_DDP_Param_Parity_check') || needs.cicd-test-container-setup.outputs.all == 'true' @@ -5092,7 +3895,7 @@ jobs: RUNNER: self-hosted-azure SCRIPT: | - python tests/lightning/test_ddp_parity_checker.py \ + TORCHDYNAMO_DISABLE=1 python tests/lightning/test_ddp_parity_checker.py \ --vocab-path=/home/TestData/nlp/megatron_gpt/data/gpt/vocab.json \ --merges-path=/home/TestData/nlp/megatron_gpt/data/gpt/merges.txt \ --data-path=/home/TestData/nlp/megatron_gpt/data/gpt/simple_wiki_gpt_preproc_text_document @@ -5100,8 +3903,7 @@ jobs: AFTER_SCRIPT: | rm -rf tests/collections/llm/gpt_pretrain_results rm -rf tests/collections/llm/gpt_index_mappings - IS_OPTIONAL: true - + L2_NeMo_2_SSM_Pretraining: needs: [cicd-test-container-setup] uses: ./.github/workflows/_test_template.yml @@ -5182,6 +3984,22 @@ jobs: AFTER_SCRIPT: | rm -rf tests/collections/llm/t5_finetune_results/${{ github.run_id }} + L2_NeMo_2_T5_LoRA: + needs: [cicd-test-container-setup] + uses: ./.github/workflows/_test_template.yml + if: contains(fromJSON(needs.cicd-test-container-setup.outputs.test_to_run), 'L2_NeMo_2_T5_LoRA') || needs.cicd-test-container-setup.outputs.all == 'true' + with: + RUNNER: self-hosted-azure + SCRIPT: | + NVTE_FUSED_ATTN=0 NVTE_FLASH_ATTN=0 python tests/collections/llm/megatron_t5_finetuning.py \ + --devices=2 \ + --max-steps=250 \ + --peft=lora \ + 
--experiment-dir=tests/collections/llm/t5_peft_results/${{ github.run_id }} \ + --checkpoint-path=/home/TestData/nlp/megatron_t5/220m/nemo2.0_t5_220m_150steps + AFTER_SCRIPT: | + rm -rf tests/collections/llm/t5_peft_results/${{ github.run_id }} + L2_NeMo_2_Mixtral_Pretraining: needs: [cicd-test-container-setup] uses: ./.github/workflows/_test_template.yml @@ -5221,8 +4039,6 @@ jobs: --pp_size 1 \ --mbs 1 - AFTER_SCRIPT: | - rm -rf /tmp/nemo2_gpt_finetune/${{ github.run_id }} L2_NeMo_2_GPT_SFT_TP1PP1_MBS2: needs: [cicd-test-container-setup] @@ -5252,8 +4068,6 @@ jobs: --pp_size 1 \ --mbs 2 - AFTER_SCRIPT: | - rm -rf /tmp/nemo2_gpt_finetune/${{ github.run_id }} L2_NeMo_2_GPT_SFT_TP1PP2_MBS2: needs: [cicd-test-container-setup] @@ -5283,8 +4097,6 @@ jobs: --pp_size 2 \ --mbs 2 - AFTER_SCRIPT: | - rm -rf /tmp/nemo2_gpt_finetune/${{ github.run_id }} L2_NeMo_2_GPT_SFT_TP2PP1_MBS2: needs: [cicd-test-container-setup] @@ -5314,8 +4126,35 @@ jobs: --pp_size 1 \ --mbs 2 - AFTER_SCRIPT: | - rm -rf /tmp/nemo2_gpt_finetune/${{ github.run_id }} + + L2_NeMo_2_GPT_SFT_TP1PP1_MBS1_PACKED: + needs: [cicd-test-container-setup] + uses: ./.github/workflows/_test_template.yml + if: contains(fromJSON(needs.cicd-test-container-setup.outputs.test_to_run), 'L2_NeMo_2_GPT_SFT_TP1PP1_MBS1_PACKED') || needs.cicd-test-container-setup.outputs.all == 'true' + with: + RUNNER: self-hosted-azure + SCRIPT: | + + python tests/collections/llm/gpt_finetuning.py \ + --restore_path /home/TestData/nemo2_ckpt/llama_68M \ + --devices 2 \ + --max_steps 3 \ + --experiment_dir /tmp/nemo2_gpt_finetune/${{ github.run_id }} \ + --peft none \ + --tp_size 1 \ + --pp_size 1 \ + --mbs 1 --packed + + python tests/collections/llm/gpt_finetuning.py \ + --restore_path /home/TestData/nemo2_ckpt/llama_68M \ + --devices 2 \ + --max_steps 6 \ + --experiment_dir /tmp/nemo2_gpt_finetune/${{ github.run_id }} \ + --peft none \ + --tp_size 1 \ + --pp_size 1 \ + --mbs 1 --packed + L2_NeMo_2_GPT_LoRA_TP1PP1_MBS1: needs: [cicd-test-container-setup] @@ -5345,8 +4184,6 @@ jobs: --pp_size 1 \ --mbs 1 - AFTER_SCRIPT: | - rm -rf /tmp/nemo2_gpt_finetune/${{ github.run_id }} L2_NeMo_2_GPT_LoRA_TP1PP1_MBS2: needs: [cicd-test-container-setup] @@ -5376,8 +4213,6 @@ jobs: --pp_size 1 \ --mbs 2 - AFTER_SCRIPT: | - rm -rf /tmp/nemo2_gpt_finetune/${{ github.run_id }} L2_NeMo_2_GPT_LoRA_TP1PP2_MBS2: needs: [cicd-test-container-setup] @@ -5407,8 +4242,6 @@ jobs: --pp_size 2 \ --mbs 2 - AFTER_SCRIPT: | - rm -rf /tmp/nemo2_gpt_finetune/${{ github.run_id }} L2_NeMo_2_GPT_LoRA_TP2PP1_MBS2: needs: [cicd-test-container-setup] @@ -5438,8 +4271,33 @@ jobs: --pp_size 1 \ --mbs 2 - AFTER_SCRIPT: | - rm -rf /tmp/nemo2_gpt_finetune/${{ github.run_id }} + L2_NeMo_2_GPT_LoRA_TP1PP1_MBS1_PACKED: + needs: [cicd-test-container-setup] + uses: ./.github/workflows/_test_template.yml + if: contains(fromJSON(needs.cicd-test-container-setup.outputs.test_to_run), 'L2_NeMo_2_GPT_LoRA_TP1PP1_MBS1_PACKED') || needs.cicd-test-container-setup.outputs.all == 'true' + with: + RUNNER: self-hosted-azure + SCRIPT: | + + python tests/collections/llm/gpt_finetuning.py \ + --restore_path /home/TestData/nemo2_ckpt/llama_68M \ + --devices 2 \ + --max_steps 3 \ + --experiment_dir /tmp/nemo2_gpt_finetune/${{ github.run_id }} \ + --peft lora \ + --tp_size 1 \ + --pp_size 1 \ + --mbs 1 --packed + + python tests/collections/llm/gpt_finetuning.py \ + --restore_path /home/TestData/nemo2_ckpt/llama_68M \ + --devices 2 \ + --max_steps 6 \ + --experiment_dir /tmp/nemo2_gpt_finetune/${{ github.run_id }} \ + --peft lora \ + 
--tp_size 1 \ + --pp_size 1 \ + --mbs 1 --packed L2_NeMo_2_NeMo_Mcore_Mixtral_bitexact: needs: [cicd-test-container-setup] @@ -5456,7 +4314,7 @@ jobs: - gpu-test - cicd-test-container-setup - #- OPTIONAL_L0_Unit_Tests_GPU_ASR + - L0_Unit_Tests_GPU_ASR - L0_Unit_Tests_GPU_Audio - L0_Unit_Tests_GPU_Common - L0_Unit_Tests_GPU_LLM @@ -5507,19 +4365,10 @@ jobs: - L2_ASR_Adapters_Linear_Adapters - L2_ASR_Adapters_RelPos_MHA_Adapters - L2_Speech_Transcription_Speech_to_Text_Transcribe - #- OPTIONAL_L2_Transducer_alignment_Running_pytest - L2_Segmentation_Tool_Parallel_ctc_segmentation_test_L2_Eng_CitriNet_with_wav - L2_Segmentation_Tool_Parallel_ctc_segmentation_test_L2_Ru_QN_with_mp3 - L2_G2P_Models_G2P_Conformer_training_evaluation_and_inference - L2_G2P_Models_HeteronymClassificationModel_training_evaluation_and_inference - - L2_Duplex_Text_Normalization_with_Tarred_dataset - - L2_Intent_and_Slot_Classification_Tasks_Intent_and_Slot_Classification - - L2_Intent_and_Slot_Classification_Tasks_Multi-Label_Intent_and_Slot_Classification - - L2_Parallel_NLP_Examples2_NER_finetuning_from_pretrained_Test - - L2_Parallel_NLP_Examples2_Punctuation_and_capitalization_finetuning_from_pretrained_test - - L2_Parallel_NLP_Examples2_NER_with_TurkuNLP__bert-base-finnish-cased-v1 - - L2_Parallel_NLP_Examples2_Evaluation_script_for_Token_Classification - - L2_Parallel_NLP_Examples2_Evaluation_script_for_Punctuation - L2_Pretraining_BERT_pretraining_from_Text - L2_Pretraining_BERT_from_Preprocessed - L2_NMT_Attention_is_All_You_Need_Training_NMT_Training_Post-LN @@ -5530,15 +4379,10 @@ jobs: - L2_NMT_Tarred_Dataset_Creation_Auto_Tarred_Dataset_Creation - L2_NMT_Tarred_Dataset_Creation_Script_Tarred_Dataset_Creation - L2_Megatron_NMT_Training_TP2 - - L2_Megatron_BART_Perceiver_MIM_Training_TP2 - L2_Megatron_Bert_Pretraining_and_Resume_Training_with_Pipeline_Parallelism - - L2_Megatron_Bert_Pretraining_and_Resume_Training - L2_Megatron_Core_Bert_Pretraining_and_Resume_Training - - L2_Legacy_Megatron_RETRO_Pretraining_and_Resume_Training - - L2_Megatron_RETRO_Pretraining_and_Resume_Training - L2_RAG_Pipeline_Indexing - L2_RAG_Pipeline_Generating - - L2_BioMegatron_Bert_NER_Task - L2_Megatron_GPT_Pretraining_and_Resume_Training_TP2 - L2_Megatron_GPT_Skip_Train - L2_Megatron_GPT_with_Rope_Pretraining_and_Resume_Training_TP2 @@ -5559,18 +4403,13 @@ jobs: - L2_Megatron_GPT_SFT_Eval_inference_seq_len_greaterThan_training_seq_len - L2_Megatron_Change_Partitions_Reduce_TP_Num_Partitions_-2_to_1-_and_PP_Num_Partitions_-1_to_2 - L2_Megatron_Change_Partitions_Increase_TP_Num_Partitions_-2_to_4-_and_PP_Num_Partitions_-1_to_2 - - L2_Megatron_T5_Pretraining_and_Resume_Training_TP2 - L2_Megatron_Core_T5_Pretraining_and_Resume_Training_TP2 - L2_Megatron_T5_with_ALiBi_Pretraining_and_Resume_Training_TP2 - L2_Megatron_T5_with_KERPLE_Pretraining_and_Resume_Training_TP2 - L2_Megatron_T5_Pretraining_and_Resume_Training_PP2 - L2_Megatron_T5_w_Mixture_of_Expert_Pretraining - L2_Megatron_UL2_Pretraining_and_Resume_Training_TP2 - - L2_Megatron_T5_Eval - L2_Megatron_Core_T5_Eval - - L2_Megatron_BART_Pretraining_and_Resume_Training_TP2 - - L2_Megatron_BART_Pretraining_and_Resume_Training_PP2 - - L2_Megatron_T5_PEFT_Lora_TP2 - L2_Megatron_Core_T5_PEFT_Lora_TP2 - L2_Megatron_Mock_Data_Generation_MockGPTDataset - L2_Megatron_Mock_Data_Generation_MockT5Dataset @@ -5583,20 +4422,23 @@ jobs: - Speech_Checkpoints_tests - L2_Stable_Diffusion_Training - L2_NeMo_2_GPT_Pretraining_no_transformer_engine - #- 
OPTIONAL_L2_NeMo_2_GPT_DDP_Param_Parity_check + - L2_NeMo_2_GPT_DDP_Param_Parity_check - L2_NeMo_2_HF_MODEL_IMPORT - L2_NeMo_2_SSM_Pretraining - L2_NeMo_2_SSM_Finetuning - L2_NeMo_2_T5_Pretraining - L2_NeMo_2_T5_Finetuning + - L2_NeMo_2_T5_LoRA - L2_NeMo_2_GPT_SFT_TP1PP1_MBS1 - L2_NeMo_2_GPT_SFT_TP1PP1_MBS2 - L2_NeMo_2_GPT_SFT_TP1PP2_MBS2 - L2_NeMo_2_GPT_SFT_TP2PP1_MBS2 + - L2_NeMo_2_GPT_SFT_TP1PP1_MBS1_PACKED - L2_NeMo_2_GPT_LoRA_TP1PP1_MBS1 - L2_NeMo_2_GPT_LoRA_TP1PP1_MBS2 - L2_NeMo_2_GPT_LoRA_TP1PP2_MBS2 - L2_NeMo_2_GPT_LoRA_TP2PP1_MBS2 + - L2_NeMo_2_GPT_LoRA_TP1PP1_MBS1_PACKED - L2_NeMo_2_Mixtral_Pretraining - L2_PTQ_Llama2_INT8_SQ - L2_PTQ_Llama2_FP8 @@ -5744,4 +4586,4 @@ jobs: - name: "Pipeline not successful, set exit code to 1" if: ${{ always() && steps.pipeline-conclusion.outputs.SUCCESS == 'false' }} - run: exit 1 \ No newline at end of file + run: exit 1 diff --git a/Dockerfile.ci b/Dockerfile.ci index dbcd92cfcb65..09ffe9674e5d 100644 --- a/Dockerfile.ci +++ b/Dockerfile.ci @@ -53,7 +53,7 @@ RUN pip install nemo_run@git+https://github.com/NVIDIA/NeMo-Run.git@${NEMO_RUN_T # Install NeMo requirements ARG TE_TAG=7d576ed25266a17a7b651f2c12e8498f67e0baea ARG MODELOPT_VERSION=0.17.0 -ARG MCORE_TAG=0d89fc4c0d4394f915fffff11212d6957652337f +ARG MCORE_TAG=563d5d1726012e8077895b732d5bc81b6e975e8d ARG APEX_TAG=810ffae374a2b9cb4b5c5e28eaeca7d7998fca0c RUN \ diff --git a/docs/source/performance/performance_long_sequence.md b/docs/source/performance/performance_long_sequence.md new file mode 100644 index 000000000000..d9f26dcf0d61 --- /dev/null +++ b/docs/source/performance/performance_long_sequence.md @@ -0,0 +1,134 @@ +# Long Sequence Performance + +## LLAMA2-7B (FP8) + +- The table below shows the pre-training performance of the LLAMA2-7B with CP (context parallelism) and compares it against the results without CP at various input sequence lengths. The detailed model-parallel configurations and the achieved performance are shown in the training results with CP. In non-CP training runs, we use the most performant model- and data-parallel configurations without CP given the memory capacity constraint of the H100 GPU system. + + - Container: [NeMo24.03.01.framework](https://catalog.ngc.nvidia.com/orgs/nvidia/containers/nemo/tags) + - System: DGX-H100 + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
| SeqLen (K) | # of GPUs | TFLOPS / GPU (without CP) | TP | PP | DP | CP | TFLOPS / GPU (with CP) | Speedup (with CP / without CP) |
|---:|---:|---:|---:|---:|---:|---:|---:|---:|
| 4 | 4 | 768 | 1 | 1 | 4 | 1 | 768 | 1.00 |
| 8 | 8 | 730 | 1 | 2 | 4 | 1 | 730 | 1.00 |
| 16 | 16 | 660 | 2 | 1 | 8 | 1 | 660 | 1.00 |
| 32 | 32 | 595 | 2 | 1 | 8 | 2 | 610 | 1.03 |
| 64 | 64 | 534 | 4 | 1 | 8 | 2 | 574 | 1.07 |
| 128 | 128 | 424 | 4 | 1 | 8 | 4 | 555 | 1.31 |
| 256 | 256 | 392 | 4 | 1 | 8 | 8 | 549 | 1.40 |
| 512 | 512 | 104 | 8 | 1 | 4 | 16 | 549 | 5.28 |
| 1024 | 1024 | 26.5 | 8 | 1 | 4 | 32 | 536 | 20.23 |
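For each row, the parallel mapping multiplies out to the listed GPU count (TP × PP × DP × CP = # of GPUs). A quick standalone check of the configurations in the table:

```python
# Sanity check: every CP configuration above satisfies TP * PP * DP * CP == # of GPUs.
# Tuples are (seq_len_k, num_gpus, tp, pp, dp, cp), copied from the table.
rows = [
    (4, 4, 1, 1, 4, 1),
    (8, 8, 1, 2, 4, 1),
    (16, 16, 2, 1, 8, 1),
    (32, 32, 2, 1, 8, 2),
    (64, 64, 4, 1, 8, 2),
    (128, 128, 4, 1, 8, 4),
    (256, 256, 4, 1, 8, 8),
    (512, 512, 8, 1, 4, 16),
    (1024, 1024, 8, 1, 4, 32),
]

for seq_len_k, num_gpus, tp, pp, dp, cp in rows:
    assert tp * pp * dp * cp == num_gpus, (seq_len_k, num_gpus)
print("all CP configurations match their GPU counts")
```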
+ + +### Speedup of LLAMA2 7B training with CP over without CP +![cp_speedup_figure](https://github.com/NVIDIA/NeMo/releases/download/r2.0.0rc1/tutorial_cp_speedup_figure.png) \ No newline at end of file diff --git a/examples/audio/process_audio.py b/examples/audio/process_audio.py index e28fb4e69627..ec88bda34954 100644 --- a/examples/audio/process_audio.py +++ b/examples/audio/process_audio.py @@ -159,8 +159,8 @@ def main(cfg: ProcessConfig) -> ProcessConfig: audio_to_audio_model.set_trainer(trainer) audio_to_audio_model = audio_to_audio_model.eval() - # override sampler - if cfg.sampler is not None: + # override sampler if necessary + if cfg.sampler: logging.info('Overriding sampler with %s', cfg.sampler) if hasattr(audio_to_audio_model, 'sampler'): diff --git a/examples/llm/pretrain/README.md b/examples/llm/pretrain/README.md index c9bb7331f972..61f64d7792bb 100644 --- a/examples/llm/pretrain/README.md +++ b/examples/llm/pretrain/README.md @@ -3,7 +3,7 @@ ### Listing the available recipes for pretraining ```bash -nemorun llm pretrain --help +nemo llm pretrain --help ``` ![recipe-listing](https://github.com/NVIDIA/NeMo/releases/download/v2.0.0rc0/list-recipes.png) @@ -12,7 +12,7 @@ nemorun llm pretrain --help ### Run pre-training with a default recipe ```bash -nemorun llm pretrain --factory llama3_8b +nemo llm pretrain --factory llama3_8b ``` ![llama3_70b](https://github.com/NVIDIA/NeMo/releases/download/v2.0.0rc0/llama3_70b.png) @@ -20,7 +20,7 @@ nemorun llm pretrain --factory llama3_8b We can also call the factory function with custom parameters: ```bash -nemorun llm pretrain --factory "llama3_70b(num_nodes=128)" +nemo llm pretrain --factory "llama3_70b(num_nodes=128)" ``` ![llama3_70b-128-nodes](https://github.com/NVIDIA/NeMo/releases/download/v2.0.0rc0/llama3_70b_128nodes.png) @@ -29,13 +29,13 @@ nemorun llm pretrain --factory "llama3_70b(num_nodes=128)" The CLI allows you to overwrite any parameter. For example, to run the recipe with 2000 steps: ```bash -nemorun llm pretrain --factory llama3_70b trainer.max_steps=2000 +nemo llm pretrain --factory llama3_70b trainer.max_steps=2000 ``` The syntax of the CLI is the same as the Python code. Which is great but in some cases you might want to inspect & edit a recipe interactively. An easy way to do this using the cli is the use the `--repl` flag. ```bash -nemorun llm pretrain --factory llama3_70b --repl +nemo llm pretrain --factory llama3_70b --repl ``` ![repl](https://github.com/NVIDIA/NeMo/releases/download/v2.0.0rc0/repl.gif) diff --git a/examples/nlp/duplex_text_normalization/README.md b/examples/nlp/duplex_text_normalization/README.md new file mode 100644 index 000000000000..808ed2856fb2 --- /dev/null +++ b/examples/nlp/duplex_text_normalization/README.md @@ -0,0 +1,2 @@ +> [!IMPORTANT] +> This section is no longer supported in NeMo and is scheduled for removal in the 23.11 release. 
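The `examples/audio/process_audio.py` change above swaps an explicit `None` check for a truthiness check, so an unset or empty `sampler` section no longer triggers the override. A minimal plain-Python sketch of the difference (the sampler key shown is made up for illustration, not taken from the config schema):

```python
# `is not None` treats an empty sampler config as "present"; the truthiness check
# used after this change skips both None and empty values.
for sampler in (None, {}, {"num_steps": 50}):
    print(repr(sampler), "| is not None:", sampler is not None, "| truthy:", bool(sampler))
# None | is not None: False | truthy: False
# {} | is not None: True | truthy: False
# {'num_steps': 50} | is not None: True | truthy: True
```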
diff --git a/examples/nlp/language_modeling/megatron_ckpt_to_nemo.py b/examples/nlp/language_modeling/megatron_ckpt_to_nemo.py index 40ba35f819ef..c81119489582 100644 --- a/examples/nlp/language_modeling/megatron_ckpt_to_nemo.py +++ b/examples/nlp/language_modeling/megatron_ckpt_to_nemo.py @@ -21,7 +21,9 @@ --checkpoint_name \ --nemo_file_path \ --tensor_model_parallel_size \ - --pipeline_model_parallel_size + --pipeline_model_parallel_size \ + --gpus_per_node \ + --model_type """ import dis @@ -100,7 +102,7 @@ def get_args(): default="gpt", choices=["gpt", "sft", "t5", "bert", "nmt", "bart", "retro"], ) - parser.add_argument("--local_rank", type=int, required=False, default=os.getenv('LOCAL_RANK', -1)) + parser.add_argument("--local-rank", type=int, required=False, default=os.getenv('LOCAL_RANK', -1)) parser.add_argument("--bcp", action="store_true", help="Whether on BCP platform") parser.add_argument( "--precision", @@ -134,7 +136,7 @@ def convert(local_rank, rank, world_size, args): 'accelerator': 'gpu', 'precision': args.precision, }, - 'model': {'native_amp_init_scale': 2 ** 32, 'native_amp_growth_interval': 1000, 'hysteresis': 2}, + 'model': {'native_amp_init_scale': 2**32, 'native_amp_growth_interval': 1000, 'hysteresis': 2}, } cfg = OmegaConf.create(cfg) @@ -142,7 +144,7 @@ def convert(local_rank, rank, world_size, args): # If FP16 create a GradScaler as the build_model_parallel_config of MegatronBaseModel expects it if cfg.trainer.precision == '16-mixed': scaler = GradScaler( - init_scale=cfg.model.get('native_amp_init_scale', 2 ** 32), + init_scale=cfg.model.get('native_amp_init_scale', 2**32), growth_interval=cfg.model.get('native_amp_growth_interval', 1000), hysteresis=cfg.model.get('hysteresis', 2), ) diff --git a/examples/nlp/language_modeling/tuning/conf/megatron_gpt_finetuning_config.yaml b/examples/nlp/language_modeling/tuning/conf/megatron_gpt_finetuning_config.yaml index 06551f46486c..79a07ce4e2c0 100644 --- a/examples/nlp/language_modeling/tuning/conf/megatron_gpt_finetuning_config.yaml +++ b/examples/nlp/language_modeling/tuning/conf/megatron_gpt_finetuning_config.yaml @@ -126,6 +126,13 @@ model: tunable_base_param_names: ["self_attention", "word_embeddings"] # TODO: regex support @adithyre data: + chat: False # whether use chatbot data or not + chat_prompt_tokens: # special tokens for the chat prompts, a dictionary of {token_type: token}. note that some tokenizer may combine the characters at the junction between {end_of_turn}{turn_start}. e.g. '', the '><' sometimes is merged to be a single token. This is not supported, try to avoid + system_turn_start: "\x00" + turn_start: "\x11" + label_start: "\x12" + end_of_turn: "\x0A" # \0x0A is '\n' + end_of_name: "\x0A" # \0x0A is '\n' train_ds: # Example of how to specify paths to multiple datasets # file_names: diff --git a/examples/nlp/token_classification/README.md b/examples/nlp/token_classification/README.md new file mode 100644 index 000000000000..808ed2856fb2 --- /dev/null +++ b/examples/nlp/token_classification/README.md @@ -0,0 +1,2 @@ +> [!IMPORTANT] +> This section is no longer supported in NeMo and is scheduled for removal in the 23.11 release. 
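The `chat_prompt_tokens` defaults added to `megatron_gpt_finetuning_config.yaml` above are control characters. A small sketch of what the YAML escape codes resolve to and of the `{end_of_turn}{turn_start}` junction the config comment warns about (the dictionary simply mirrors the YAML; no prompt template beyond that is implied):

```python
# The chat special tokens added above, written as Python string literals.
chat_prompt_tokens = {
    "system_turn_start": "\x00",
    "turn_start": "\x11",
    "label_start": "\x12",
    "end_of_turn": "\x0A",  # '\n'
    "end_of_name": "\x0A",  # '\n'
}

# The config comment cautions that some tokenizers merge the characters at the
# junction between {end_of_turn}{turn_start}; this is the two-character span it means.
junction = chat_prompt_tokens["end_of_turn"] + chat_prompt_tokens["turn_start"]
print(repr(junction))  # '\n\x11' -- should remain two separate tokens after tokenization
```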
diff --git a/nemo/collections/asr/parts/submodules/cuda_graph_rnnt_greedy_decoding.py b/nemo/collections/asr/parts/submodules/cuda_graph_rnnt_greedy_decoding.py index aa49435ded16..fc501b3d00de 100644 --- a/nemo/collections/asr/parts/submodules/cuda_graph_rnnt_greedy_decoding.py +++ b/nemo/collections/asr/parts/submodules/cuda_graph_rnnt_greedy_decoding.py @@ -293,6 +293,13 @@ def __call__( device: torch.device, partial_hypotheses: Optional[List[rnnt_utils.Hypothesis]] = None, ): + if x.device.type != "cuda": + # If CUDA graphs are enabled and "frame-looping" algorithm is requested, current class + # is not suitable to handle non-CUDA inputs; thus we are passing them to original caller + return self.caller._greedy_decode_blank_as_pad_loop_frames( + x=x, out_len=out_len, device=device, partial_hypotheses=partial_hypotheses + ) + if partial_hypotheses is not None: raise NotImplementedError( "`partial_hypotheses` support is not available " diff --git a/nemo/collections/asr/parts/utils/transcribe_utils.py b/nemo/collections/asr/parts/utils/transcribe_utils.py index c1e712c44aeb..0d4f4c895bcf 100644 --- a/nemo/collections/asr/parts/utils/transcribe_utils.py +++ b/nemo/collections/asr/parts/utils/transcribe_utils.py @@ -314,7 +314,7 @@ def prepare_audio_data(cfg: DictConfig) -> Tuple[List[str], bool]: with NamedTemporaryFile(mode='w', suffix='.json', delete=False) as f: for item in read_and_maybe_sort_manifest(cfg.dataset_manifest, try_sort=cfg.presort_manifest): audio_file = get_full_path(audio_file=item[audio_key], manifest_file=cfg.dataset_manifest) - item[audio_key] = audio_file + item['audio_filepath'] = audio_file filepaths.append(audio_file) f.write(json.dumps(item) + "\n") sorted_manifest_path = f.name diff --git a/nemo/collections/common/tokenizers/huggingface/auto_tokenizer.py b/nemo/collections/common/tokenizers/huggingface/auto_tokenizer.py index 76dca1268c3b..439322b8e810 100644 --- a/nemo/collections/common/tokenizers/huggingface/auto_tokenizer.py +++ b/nemo/collections/common/tokenizers/huggingface/auto_tokenizer.py @@ -13,7 +13,7 @@ # limitations under the License. from collections import OrderedDict -from typing import Optional +from typing import List, Optional from transformers import AutoTokenizer as AUTOTOKENIZER @@ -43,6 +43,7 @@ def __init__( sep_token: Optional[str] = None, cls_token: Optional[str] = None, unk_token: Optional[str] = None, + additional_special_tokens: Optional[List] = [], use_fast: Optional[bool] = False, trust_remote_code: Optional[bool] = False, ): @@ -60,6 +61,7 @@ def __init__( sep_token: token used for separating sequences cls_token: class token. Usually equal to bos_token unk_token: token to use for unknown tokens + additional_special_tokens: list of other tokens beside standard special tokens (bos, eos, pad, etc.). For example, sentinel tokens for T5 (, , etc.) 
use_fast: whether to use fast HuggingFace tokenizer """ try: @@ -124,10 +126,17 @@ def __init__( elif self.tokenizer.cls_token is None and self.tokenizer.bos_token: special_tokens_dict["cls_token"] = self.tokenizer.bos_token + # add additional special tokens (not standard special tokens such as bos, eod, sep) + if additional_special_tokens is not None: + special_tokens_dict["additional_special_tokens"] = additional_special_tokens + new_tokens_in_vocab = [] for token in [mask_token, bos_token, eos_token, pad_token, sep_token, cls_token, unk_token]: if token is not None and token not in self.tokenizer.get_vocab(): new_tokens_in_vocab.append(token) + for token in additional_special_tokens: + if token is not None and token not in self.tokenizer.get_vocab(): + new_tokens_in_vocab.append(token) if len(new_tokens_in_vocab) > 0: """ diff --git a/nemo/collections/diffusion/encoders/__init__.py b/nemo/collections/diffusion/encoders/__init__.py new file mode 100644 index 000000000000..9e3250071955 --- /dev/null +++ b/nemo/collections/diffusion/encoders/__init__.py @@ -0,0 +1,13 @@ +# Copyright (c) 2020, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. diff --git a/nemo/collections/diffusion/encoders/conditioner.py b/nemo/collections/diffusion/encoders/conditioner.py new file mode 100644 index 000000000000..2bfb008c5d84 --- /dev/null +++ b/nemo/collections/diffusion/encoders/conditioner.py @@ -0,0 +1,199 @@ +# Copyright (c) 2020, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
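The `AutoTokenizer` wrapper change above threads `additional_special_tokens` through to the underlying Hugging Face tokenizer and registers any that are missing from the vocabulary. A hedged usage sketch (the model name and sentinel-token list are illustrative only; the keyword argument is the one added in this diff):

```python
from nemo.collections.common.tokenizers.huggingface.auto_tokenizer import AutoTokenizer

# Illustrative: register T5-style sentinel tokens as additional special tokens.
tokenizer = AutoTokenizer(
    "t5-small",
    additional_special_tokens=["<extra_id_0>", "<extra_id_1>", "<extra_id_2>"],
)
print(tokenizer.vocab_size)
```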
+ +from typing import Union + +import torch +import torch.nn as nn +from transformers import CLIPTextModel, CLIPTokenizer, T5EncoderModel, T5Tokenizer + + +class AbstractEmbModel(nn.Module): + def __init__(self, enable_lora_finetune=False, target_block=[], target_module=[]): + super().__init__() + self._is_trainable = None + self._ucg_rate = None + self._input_key = None + + self.TARGET_BLOCK = target_block + self.TARGET_MODULE = target_module + if enable_lora_finetune: + self.lora_layers = [] + + @property + def is_trainable(self) -> bool: + return self._is_trainable + + @property + def ucg_rate(self) -> Union[float, torch.Tensor]: + return self._ucg_rate + + @property + def input_key(self) -> str: + return self._input_key + + @is_trainable.setter + def is_trainable(self, value: bool): + self._is_trainable = value + + @ucg_rate.setter + def ucg_rate(self, value: Union[float, torch.Tensor]): + self._ucg_rate = value + + @input_key.setter + def input_key(self, value: str): + self._input_key = value + + @is_trainable.deleter + def is_trainable(self): + del self._is_trainable + + @ucg_rate.deleter + def ucg_rate(self): + del self._ucg_rate + + @input_key.deleter + def input_key(self): + del self._input_key + + def encode(self, *args, **kwargs): + raise NotImplementedError + + def _enable_lora(self, lora_model): + for module_name, module in lora_model.named_modules(): + if module.__class__.__name__ in self.TARGET_BLOCK: + tmp = {} + for sub_name, sub_module in module.named_modules(): + if sub_module.__class__.__name__ in self.TARGET_MODULE: + if hasattr(sub_module, "input_size") and hasattr( + sub_module, "output_size" + ): # for megatron ParallelLinear + lora = LoraWrapper(sub_module, sub_module.input_size, sub_module.output_size) + else: # for nn.Linear + lora = LoraWrapper(sub_module, sub_module.in_features, sub_module.out_features) + self.lora_layers.append(lora) + if sub_name not in tmp.keys(): + tmp.update({sub_name: lora}) + else: + print(f"Duplicate subnames are found in module {module_name}") + for sub_name, lora_layer in tmp.items(): + lora_name = f'{sub_name}_lora' + module.add_module(lora_name, lora_layer) + + +class FrozenCLIPEmbedder(AbstractEmbModel): + """Uses the CLIP transformer encoder for text (from Hugging Face)""" + + LAYERS = ["last", "pooled", "hidden"] + + def __init__( + self, + version="openai/clip-vit-large-patch14", + device="cuda", + max_length=77, + enable_lora_finetune=False, + layer="last", + layer_idx=None, + always_return_pooled=False, + dtype=torch.float, + ): + super().__init__(enable_lora_finetune, target_block=["CLIPAttention", "CLIPMLP"], target_module=["Linear"]) + self.tokenizer = CLIPTokenizer.from_pretrained("openai/clip-vit-large-patch14") + self.transformer = CLIPTextModel.from_pretrained(version, torch_dtype=dtype).to(device) + self.device = device + self.max_length = max_length + self.freeze() + if enable_lora_finetune: + self._enable_lora(self.transformer) + print(f"CLIP transformer encoder add {len(self.lora_layers)} lora layers.") + + self.layer = layer + self.layer_idx = layer_idx + self.return_pooled = always_return_pooled + if layer == "hidden": + assert layer_idx is not None + assert 0 <= abs(layer_idx) <= 12 + + def freeze(self): + self.transformer = self.transformer.eval() + for param in self.parameters(): + param.requires_grad = False + + def forward(self, text, max_sequence_length=None): + batch_encoding = self.tokenizer( + text, + truncation=True, + max_length=max_sequence_length if max_sequence_length else self.max_length, + 
return_length=True, + return_overflowing_tokens=False, + padding="max_length", + return_tensors="pt", + ) + tokens = batch_encoding["input_ids"].to(self.transformer.device, non_blocking=True) + outputs = self.transformer(input_ids=tokens, output_hidden_states=(self.layer == "hidden")) + + if self.layer == "last": + z = outputs.last_hidden_state + elif self.layer == "pooled": + z = outputs.pooler_output[:, None, :] + else: + z = outputs.hidden_states[self.layer_idx] + + # Pad the seq length to multiple of 8 + seq_len = (z.shape[1] + 8 - 1) // 8 * 8 + z = torch.nn.functional.pad(z, (0, 0, 0, seq_len - z.shape[1]), value=0.0) + if self.return_pooled: + return z, outputs.pooler_output + return z + + def encode(self, text): + return self(text) + + +class FrozenT5Embedder(AbstractEmbModel): + def __init__( + self, + version="google/t5-v1_1-xxl", + max_length=512, + device="cuda", + dtype=torch.float, + ): + super().__init__() + self.tokenizer = T5Tokenizer.from_pretrained("google/t5-v1_1-xxl", max_length=max_length) + self.transformer = T5EncoderModel.from_pretrained(version, torch_dtype=dtype).to(device) + self.max_length = max_length + self.freeze() + self.device = device + self.dtype = dtype + + def freeze(self): + self.transformer = self.transformer.eval() + for param in self.parameters(): + param.requires_grad = False + + def forward(self, text, max_sequence_length=None): + batch_encoding = self.tokenizer( + text, + truncation=True, + max_length=max_sequence_length if max_sequence_length else self.max_length, + return_length=False, + return_overflowing_tokens=False, + padding="max_length", + return_tensors="pt", + ) + + tokens = batch_encoding["input_ids"].to(self.transformer.device, non_blocking=True) + outputs = self.transformer(input_ids=tokens, output_hidden_states=None) + + return outputs.last_hidden_state diff --git a/nemo/collections/diffusion/flux_infer.py b/nemo/collections/diffusion/flux_infer.py new file mode 100644 index 000000000000..f914dbf50258 --- /dev/null +++ b/nemo/collections/diffusion/flux_infer.py @@ -0,0 +1,113 @@ +# Copyright (c) 2020, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
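A brief usage sketch of the frozen text encoders defined in `conditioner.py` above (both constructors pull full pretrained weights from Hugging Face, so this is illustrative rather than a lightweight test; arguments and shapes follow the defaults shown in the diff):

```python
import torch

from nemo.collections.diffusion.encoders.conditioner import FrozenCLIPEmbedder, FrozenT5Embedder

clip = FrozenCLIPEmbedder(device="cuda", max_length=77, always_return_pooled=True)
t5 = FrozenT5Embedder(device="cuda", max_length=512, dtype=torch.bfloat16)

prompts = ["a cat holding a sign that says hello world"]
clip_hidden, clip_pooled = clip(prompts)  # [1, seq padded to a multiple of 8, 768] and [1, 768]
t5_hidden = t5(prompts)                   # [1, 512, 4096]
print(clip_hidden.shape, clip_pooled.shape, t5_hidden.shape)
```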
+ +import argparse + +import torch + +from nemo.collections.diffusion.models.flux.pipeline import FluxInferencePipeline +from nemo.collections.diffusion.utils.flux_pipeline_utils import configs +from nemo.collections.diffusion.utils.mcore_parallel_utils import Utils + + +def parse_args(): + parser = argparse.ArgumentParser( + description="The flux inference pipeline is utilizing megatron core transformer.\nPlease prepare the necessary checkpoints for flux model on local disk in order to use this script" + ) + + parser.add_argument("--flux_ckpt", type=str, default="", help="Path to Flux transformer checkpoint(s)") + parser.add_argument("--vae_ckpt", type=str, default="/ckpts/ae.safetensors", help="Path to \'ae.safetensors\'") + parser.add_argument( + "--clip_version", + type=str, + default='/ckpts/text_encoder', + help="Clip version, provide either ckpt dir or clip version like openai/clip-vit-large-patch14", + ) + parser.add_argument( + "--t5_version", + type=str, + default='/ckpts/text_encoder_2', + help="Clip version, provide either ckpt dir or clip version like google/t5-v1_1-xxl", + ) + parser.add_argument( + "--do_convert_from_hf", + action='store_true', + default=False, + help="Must be true if provided checkpoint is not already converted to NeMo version", + ) + parser.add_argument( + "--save_converted_model", + action="store_true", + default=False, + help="Whether to save the converted NeMo transformer checkpoint for Flux", + ) + parser.add_argument( + "--version", + type=str, + default='dev', + choices=['dev', 'schnell'], + help="Must align with the checkpoint provided.", + ) + parser.add_argument("--height", type=int, default=1024, help="Image height.") + parser.add_argument("--width", type=int, default=1024, help="Image width.") + parser.add_argument("--inference_steps", type=int, default=10, help="Number of inference steps to run.") + parser.add_argument( + "--num_images_per_prompt", type=int, default=1, help="Number of images to generate for each prompt." + ) + parser.add_argument("--guidance", type=float, default=0.0, help="Guidance scale.") + parser.add_argument( + "--offload", action='store_true', default=False, help="Offload modules to cpu after being called." 
+ ) + parser.add_argument( + "--prompts", + type=str, + default="A cat holding a sign that says hello world", + help="Inference prompts, use \',\' to separate if multiple prompts are provided.", + ) + parser.add_argument("--bf16", action='store_true', default=False, help="Use bf16 in inference.") + args = parser.parse_args() + return args + + +if __name__ == '__main__': + args = parse_args() + print('Initializing model parallel config') + Utils.initialize_distributed(1, 1, 1) + + print('Initializing flux inference pipeline') + params = configs[args.version] + params.vae_params.ckpt = args.vae_ckpt + params.clip_params['version'] = args.clip_version + params.t5_params['version'] = args.t5_version + pipe = FluxInferencePipeline(params) + + print('Loading transformer weights') + pipe.load_from_pretrained( + args.flux_ckpt, + do_convert_from_hf=args.do_convert_from_hf, + save_converted_model=args.save_converted_model, + ) + dtype = torch.bfloat16 if args.bf16 else torch.float32 + text = args.prompts.split(',') + pipe( + text, + max_sequence_length=256, + height=args.height, + width=args.width, + num_inference_steps=args.inference_steps, + num_images_per_prompt=args.num_images_per_prompt, + offload=args.offload, + guidance_scale=args.guidance, + dtype=dtype, + ) diff --git a/nemo/collections/diffusion/models/dit/dit_attention.py b/nemo/collections/diffusion/models/dit/dit_attention.py new file mode 100644 index 000000000000..9e60b11dd1c6 --- /dev/null +++ b/nemo/collections/diffusion/models/dit/dit_attention.py @@ -0,0 +1,428 @@ +# Copyright (c) 2023, NVIDIA CORPORATION. All rights reserved. +from dataclasses import dataclass +from typing import Union + +import torch +from megatron.core.models.common.embeddings.rotary_pos_embedding import apply_rotary_pos_emb +from megatron.core.transformer.attention import Attention, SelfAttention +from megatron.core.transformer.custom_layers.transformer_engine import SplitAlongDim +from megatron.core.transformer.enums import AttnMaskType +from megatron.core.transformer.spec_utils import ModuleSpec, build_module +from megatron.core.transformer.transformer_config import TransformerConfig + + +@dataclass +class JointSelfAttentionSubmodules: + linear_qkv: Union[ModuleSpec, type] = None + added_linear_qkv: Union[ModuleSpec, type] = None + core_attention: Union[ModuleSpec, type] = None + linear_proj: Union[ModuleSpec, type] = None + q_layernorm: Union[ModuleSpec, type] = None + k_layernorm: Union[ModuleSpec, type] = None + added_q_layernorm: Union[ModuleSpec, type] = None + added_k_layernorm: Union[ModuleSpec, type] = None + + +class JointSelfAttention(Attention): + """Joint Self-attention layer class + + Used for MMDIT-like transformer block. 
+ """ + + def __init__( + self, + config: TransformerConfig, + submodules: JointSelfAttentionSubmodules, + layer_number: int, + attn_mask_type=AttnMaskType.padding, + context_pre_only: bool = False, + ): + super().__init__( + config=config, + submodules=submodules, + layer_number=layer_number, + attn_mask_type=attn_mask_type, + attention_type="self", + ) + + self.linear_qkv = build_module( + submodules.linear_qkv, + self.config.hidden_size, + self.query_projection_size + 2 * self.kv_projection_size, + config=self.config, + init_method=self.config.init_method, + gather_output=False, + bias=self.config.add_bias_linear or self.config.add_qkv_bias, + skip_bias_add=False, + is_expert=False, + tp_comm_buffer_name='qkv', + ) + + if submodules.added_linear_qkv is not None: + self.added_linear_qkv = build_module( + submodules.added_linear_qkv, + self.config.hidden_size, + self.query_projection_size + 2 * self.kv_projection_size, + config=self.config, + init_method=self.config.init_method, + gather_output=False, + bias=self.config.add_qkv_bias, + skip_bias_add=False, + is_expert=False, + tp_comm_buffer_name='qkv', + ) + + if not context_pre_only: + self.added_linear_proj = build_module( + submodules.linear_proj, + self.query_projection_size, + self.config.hidden_size, + config=self.config, + init_method=self.config.output_layer_init_method, + bias=self.config.add_bias_linear, + input_is_parallel=True, + skip_bias_add=True, + is_expert=False, + tp_comm_buffer_name='proj', + ) + + if submodules.q_layernorm is not None: + self.q_layernorm = build_module( + submodules.q_layernorm, + hidden_size=self.hidden_size_per_attention_head, + config=self.config, + eps=self.config.layernorm_epsilon, + ) + else: + self.q_layernorm = None + + if submodules.k_layernorm is not None: + self.k_layernorm = build_module( + submodules.k_layernorm, + hidden_size=self.hidden_size_per_attention_head, + config=self.config, + eps=self.config.layernorm_epsilon, + ) + else: + self.k_layernorm = None + + if submodules.added_q_layernorm is not None: + self.added_q_layernorm = build_module( + submodules.added_q_layernorm, + hidden_size=self.hidden_size_per_attention_head, + config=self.config, + eps=self.config.layernorm_epsilon, + ) + else: + self.added_q_layernorm = None + + if submodules.added_k_layernorm is not None: + self.added_k_layernorm = build_module( + submodules.added_k_layernorm, + hidden_size=self.hidden_size_per_attention_head, + config=self.config, + eps=self.config.layernorm_epsilon, + ) + else: + self.added_k_layernorm = None + + def _split_qkv(self, mixed_qkv): + # [sq, b, hp] --> [sq, b, ng, (np/ng + 2) * hn] + new_tensor_shape = mixed_qkv.size()[:-1] + ( + self.num_query_groups_per_partition, + ( + (self.num_attention_heads_per_partition // self.num_query_groups_per_partition + 2) + * self.hidden_size_per_attention_head + ), + ) + mixed_qkv = mixed_qkv.view(*new_tensor_shape) + + split_arg_list = [ + ( + self.num_attention_heads_per_partition + // self.num_query_groups_per_partition + * self.hidden_size_per_attention_head + ), + self.hidden_size_per_attention_head, + self.hidden_size_per_attention_head, + ] + + if SplitAlongDim is not None: + + # [sq, b, ng, (np/ng + 2) * hn] --> [sq, b, ng, np/ng * hn], [sq, b, ng, hn], [sq, b, ng, hn] + (query, key, value) = SplitAlongDim( + mixed_qkv, + 3, + split_arg_list, + ) + else: + + # [sq, b, ng, (np/ng + 2) * hn] --> [sq, b, ng, np/ng * hn], [sq, b, ng, hn], [sq, b, ng, hn] + (query, key, value) = torch.split( + mixed_qkv, + split_arg_list, + dim=3, + ) + + # [sq, b, 
ng, np/ng * hn] -> [sq, b, np, hn] + query = query.reshape(query.size(0), query.size(1), -1, self.hidden_size_per_attention_head) + return query, key, value + + def get_query_key_value_tensors(self, hidden_states, key_value_states=None): + """ + Derives `query`, `key` and `value` tensors from `hidden_states`. + """ + # Attention heads [sq, b, h] --> [sq, b, ng * (np/ng + 2) * hn)] + mixed_qkv, _ = self.linear_qkv(hidden_states) + + query, key, value = self._split_qkv(mixed_qkv) + + if self.config.test_mode: + self.run_realtime_tests() + + if self.q_layernorm is not None: + query = self.q_layernorm(query) + + if self.k_layernorm is not None: + key = self.k_layernorm(key) + + return query, key, value + + def get_added_query_key_value_tensors(self, added_hidden_states, key_value_states=None): + """ + Derives `query`, `key` and `value` tensors from `hidden_states`. + """ + # Attention heads [sq, b, h] --> [sq, b, ng * (np/ng + 2) * hn)] + mixed_qkv, _ = self.added_linear_qkv(added_hidden_states) + + query, key, value = self._split_qkv(mixed_qkv) + + if self.config.test_mode: + self.run_realtime_tests() + + if self.added_q_layernorm is not None: + query = self.added_q_layernorm(query) + + if self.added_k_layernorm is not None: + key = self.added_k_layernorm(key) + + return query, key, value + + def forward( + self, + hidden_states, + attention_mask, + key_value_states=None, + inference_params=None, + rotary_pos_emb=None, + packed_seq_params=None, + additional_hidden_states=None, + ): + # hidden_states: [sq, b, h] + + # For self attention we just duplicate the rotary_pos_emb if it isn't already + if rotary_pos_emb is not None and not isinstance(rotary_pos_emb, tuple): + rotary_pos_emb = (rotary_pos_emb,) * 2 + + # ===================== + # Query, Key, and Value + # ===================== + # Get the query, key and value tensors based on the type of attention - + # self or cross attn. + + query, key, value = self.get_query_key_value_tensors(hidden_states) + added_query, added_key, added_value = self.get_added_query_key_value_tensors(additional_hidden_states) + + query = torch.cat([added_query, query], dim=0) + key = torch.cat([added_key, key], dim=0) + value = torch.cat([added_value, value], dim=0) + + # =================================================== + # Adjust key, value, and rotary_pos_emb for inference + # =================================================== + key, value, rotary_pos_emb, attn_mask_type = self._adjust_key_value_for_inference( + inference_params, key, value, rotary_pos_emb + ) + + if packed_seq_params is not None: + query = query.squeeze(1) + key = key.squeeze(1) + value = value.squeeze(1) + + # ================================================ + # relative positional embedding (rotary embedding) + # ================================================ + if rotary_pos_emb is not None: + q_pos_emb, k_pos_emb = rotary_pos_emb + + if packed_seq_params is not None: + cu_seqlens_q = packed_seq_params.cu_seqlens_q + cu_seqlens_kv = packed_seq_params.cu_seqlens_kv + else: + cu_seqlens_q = cu_seqlens_kv = None + query = apply_rotary_pos_emb( + query, + q_pos_emb, + config=self.config, + cu_seqlens=cu_seqlens_q, + ) + key = apply_rotary_pos_emb( + key, + k_pos_emb, + config=self.config, + cu_seqlens=cu_seqlens_kv, + ) + + # TODO, can apply positional embedding to value_layer so it has + # absolute positional embedding. 
+ # otherwise, only relative positional embedding takes effect + # value_layer = apply_rotary_pos_emb(value_layer, k_pos_emb) + + # ================================== + # core attention computation + # ================================== + if self.checkpoint_core_attention and self.training: + core_attn_out = self._checkpointed_attention_forward( + query, + key, + value, + attention_mask, + attn_mask_type=attn_mask_type, + packed_seq_params=packed_seq_params, + ) + else: + core_attn_out = self.core_attention( + query, + key, + value, + attention_mask, + attn_mask_type=attn_mask_type, + packed_seq_params=packed_seq_params, + ) + + if packed_seq_params is not None: + # reshape to same output shape as unpacked case + # (t, np, hn) -> (t, b=1, h=np*hn) + # t is the pack size = sum (sq_i) + # note that batch is a dummy dimension in the packed case + core_attn_out = core_attn_out.reshape(core_attn_out.size(0), 1, -1) + + # ================= + # Output. [sq, b, h] + # ================= + encoder_attention_output = core_attn_out[: additional_hidden_states.shape[0], :, :] + attention_output = core_attn_out[additional_hidden_states.shape[0] :, :, :] + + output, bias = self.linear_proj(attention_output) + encoder_output, encoder_bias = self.added_linear_proj(encoder_attention_output) + + output = output + bias + encoder_output = encoder_output + encoder_bias + + return output, encoder_output + + +class FluxSingleAttention(SelfAttention): + """Self-attention layer class + + Self-attention layer takes input with size [s, b, h] + and returns output of the same size. + """ + + def forward( + self, + hidden_states, + attention_mask, + key_value_states=None, + inference_params=None, + rotary_pos_emb=None, + packed_seq_params=None, + ): + # hidden_states: [sq, b, h] + + # For self attention we just duplicate the rotary_pos_emb if it isn't already + if rotary_pos_emb is not None and not isinstance(rotary_pos_emb, tuple): + rotary_pos_emb = (rotary_pos_emb,) * 2 + + # ===================== + # Query, Key, and Value + # ===================== + # Get the query, key and value tensors based on the type of attention - + # self or cross attn. 
+ query, key, value = self.get_query_key_value_tensors(hidden_states, key_value_states) + # print(f'megatron q before ln: {query.transpose(0, 1).contiguous()}, {query.transpose(0, 1).contiguous().shape}') + # print(f'megatron k before ln: {key.transpose(0, 1).contiguous()}, {key.transpose(0, 1).contiguous().shape}') + # print(f'megatron v before ln: {value.transpose(0, 1).contiguous()}, {value.transpose(0, 1).contiguous().shape}') + + # =================================================== + # Adjust key, value, and rotary_pos_emb for inference + # =================================================== + key, value, rotary_pos_emb, attn_mask_type = self._adjust_key_value_for_inference( + inference_params, key, value, rotary_pos_emb + ) + + if packed_seq_params is not None: + query = query.squeeze(1) + key = key.squeeze(1) + value = value.squeeze(1) + + # ================================================ + # relative positional embedding (rotary embedding) + # ================================================ + if rotary_pos_emb is not None: + q_pos_emb, k_pos_emb = rotary_pos_emb + + if packed_seq_params is not None: + cu_seqlens_q = packed_seq_params.cu_seqlens_q + cu_seqlens_kv = packed_seq_params.cu_seqlens_kv + else: + cu_seqlens_q = cu_seqlens_kv = None + query = apply_rotary_pos_emb( + query, + q_pos_emb, + config=self.config, + cu_seqlens=cu_seqlens_q, + ) + key = apply_rotary_pos_emb( + key, + k_pos_emb, + config=self.config, + cu_seqlens=cu_seqlens_kv, + ) + + # TODO, can apply positional embedding to value_layer so it has + # absolute positional embedding. + # otherwise, only relative positional embedding takes effect + # value_layer = apply_rotary_pos_emb(value_layer, k_pos_emb) + + # ================================== + # core attention computation + # ================================== + + if self.checkpoint_core_attention and self.training: + core_attn_out = self._checkpointed_attention_forward( + query, + key, + value, + attention_mask, + attn_mask_type=attn_mask_type, + packed_seq_params=packed_seq_params, + ) + else: + core_attn_out = self.core_attention( + query, + key, + value, + attention_mask, + attn_mask_type=attn_mask_type, + packed_seq_params=packed_seq_params, + ) + + if packed_seq_params is not None: + # reshape to same output shape as unpacked case + # (t, np, hn) -> (t, b=1, h=np*hn) + # t is the pack size = sum (sq_i) + # note that batch is a dummy dimension in the packed case + core_attn_out = core_attn_out.reshape(core_attn_out.size(0), 1, -1) + + return core_attn_out diff --git a/nemo/collections/diffusion/models/dit/dit_layer_spec.py b/nemo/collections/diffusion/models/dit/dit_layer_spec.py index 672dcff3ba00..cb7c520493f0 100644 --- a/nemo/collections/diffusion/models/dit/dit_layer_spec.py +++ b/nemo/collections/diffusion/models/dit/dit_layer_spec.py @@ -42,6 +42,12 @@ from megatron.core.transformer.transformer_layer import TransformerLayer, TransformerLayerSubmodules from megatron.core.utils import make_viewless_tensor +from nemo.collections.diffusion.models.dit.dit_attention import ( + FluxSingleAttention, + JointSelfAttention, + JointSelfAttentionSubmodules, +) + @dataclass class DiTWithAdaLNSubmodules(TransformerLayerSubmodules): @@ -75,7 +81,14 @@ class AdaLN(MegatronModule): Adaptive Layer Normalization Module for DiT. 
""" - def __init__(self, config: TransformerConfig, n_adaln_chunks=9, norm=nn.LayerNorm): + def __init__( + self, + config: TransformerConfig, + n_adaln_chunks=9, + norm=nn.LayerNorm, + modulation_bias=False, + use_second_norm=False, + ): super().__init__(config) if norm == TENorm: self.ln = norm(config, config.hidden_size, config.layernorm_epsilon) @@ -83,8 +96,11 @@ def __init__(self, config: TransformerConfig, n_adaln_chunks=9, norm=nn.LayerNor self.ln = norm(config.hidden_size, elementwise_affine=False, eps=self.config.layernorm_epsilon) self.n_adaln_chunks = n_adaln_chunks self.adaLN_modulation = nn.Sequential( - nn.SiLU(), nn.Linear(config.hidden_size, self.n_adaln_chunks * config.hidden_size, bias=False) + nn.SiLU(), nn.Linear(config.hidden_size, self.n_adaln_chunks * config.hidden_size, bias=modulation_bias) ) + self.use_second_norm = use_second_norm + if self.use_second_norm: + self.ln2 = nn.LayerNorm(config.hidden_size, elementwise_affine=False, eps=1e-6) nn.init.constant_(self.adaLN_modulation[-1].weight, 0) setattr(self.adaLN_modulation[-1].weight, "sequence_parallel", config.sequence_parallel) @@ -92,29 +108,59 @@ def __init__(self, config: TransformerConfig, n_adaln_chunks=9, norm=nn.LayerNor def forward(self, timestep_emb): return self.adaLN_modulation(timestep_emb).chunk(self.n_adaln_chunks, dim=-1) - @jit_fuser + # @jit_fuser def modulate(self, x, shift, scale): return x * (1 + scale) + shift - @jit_fuser + # @jit_fuser def scale_add(self, residual, x, gate): return residual + gate * x - @jit_fuser - def modulated_layernorm(self, x, shift, scale): + # @jit_fuser + def modulated_layernorm(self, x, shift, scale, layernorm_idx=0): + if self.use_second_norm and layernorm_idx == 1: + layernorm = self.ln2 + else: + layernorm = self.ln # Optional Input Layer norm - input_layernorm_output = self.ln(x).type_as(x) + input_layernorm_output = layernorm(x).type_as(x) # DiT block specific return self.modulate(input_layernorm_output, shift, scale) # @jit_fuser - def scaled_modulated_layernorm(self, residual, x, gate, shift, scale): + def scaled_modulated_layernorm(self, residual, x, gate, shift, scale, layernorm_idx=0): hidden_states = self.scale_add(residual, x, gate) - shifted_pre_mlp_layernorm_output = self.modulated_layernorm(hidden_states, shift, scale) + shifted_pre_mlp_layernorm_output = self.modulated_layernorm(hidden_states, shift, scale, layernorm_idx) return hidden_states, shifted_pre_mlp_layernorm_output +class AdaLNContinuous(MegatronModule): + def __init__( + self, + config: TransformerConfig, + conditioning_embedding_dim: int, + modulation_bias: bool = True, + norm_type: str = "layer_norm", + ): + super().__init__(config) + self.adaLN_modulation = nn.Sequential( + nn.SiLU(), nn.Linear(conditioning_embedding_dim, config.hidden_size * 2, bias=modulation_bias) + ) + if norm_type == "layer_norm": + self.norm = nn.LayerNorm(config.hidden_size, elementwise_affine=False, eps=1e-6, bias=modulation_bias) + elif norm_type == "rms_norm": + self.norm = RMSNorm(config.hidden_size, eps=1e-6) + else: + raise ValueError("Unknown normalization type {}".format(norm_type)) + + def forward(self, x: torch.Tensor, conditioning_embedding: torch.Tensor) -> torch.Tensor: + emb = self.adaLN_modulation(conditioning_embedding) + scale, shift = torch.chunk(emb, 2, dim=1) + x = self.norm(x) * (1 + scale) + shift + return x + + class STDiTLayerWithAdaLN(TransformerLayer): """A single transformer layer. 
@@ -407,6 +453,225 @@ def forward( return output, context +class DiTLayer(TransformerLayer): + """A single transformer layer. + + Transformer layer takes input with size [s, b, h] and returns an + output of the same size. + + Original DiT layer implementation from [https://arxiv.org/pdf/2212.09748]. + """ + + def __init__( + self, + config: TransformerConfig, + submodules: TransformerLayerSubmodules, + layer_number: int = 1, + mlp_ratio: int = 4, + n_adaln_chunks: int = 6, + modulation_bias: bool = True, + ): + # Modify the mlp layer hidden_size of a dit layer according to mlp_ratio + config.ffn_hidden_size = int(mlp_ratio * config.hidden_size) + super().__init__(config=config, submodules=submodules, layer_number=layer_number) + + self.adaLN = AdaLN( + config=config, n_adaln_chunks=n_adaln_chunks, modulation_bias=modulation_bias, use_second_norm=True + ) + + def forward( + self, + hidden_states, + attention_mask, + context=None, + context_mask=None, + rotary_pos_emb=None, + inference_params=None, + packed_seq_params=None, + ): + # passing in conditioning information via attention mask here + c = attention_mask + + shift_msa, scale_msa, gate_msa, shift_mlp, scale_mlp, gate_mlp = self.adaLN(c) + + shifted_input_layernorm_output = self.adaLN.modulated_layernorm( + hidden_states, shift=shift_msa, scale=scale_msa, layernorm_idx=0 + ) + + x, bias = self.self_attention(shifted_input_layernorm_output, attention_mask=None) + + hidden_states = self.adaLN.scale_add(hidden_states, x=(x + bias), gate=gate_msa) + + residual = hidden_states + + shited_pre_mlp_layernorm_output = self.adaLN.modulated_layernorm( + hidden_states, shift=shift_mlp, scale=scale_mlp, layernorm_idx=1 + ) + + x, bias = self.mlp(shited_pre_mlp_layernorm_output) + + hidden_states = self.adaLN.scale_add(residual, x=(x + bias), gate=gate_mlp) + + return hidden_states, context + + +class MMDiTLayer(TransformerLayer): + """A single transformer layer. + + Transformer layer takes input with size [s, b, h] and returns an + output of the same size. + + MMDiT layer implementation from [https://arxiv.org/pdf/2403.03206]. + """ + + def __init__( + self, + config: TransformerConfig, + submodules: TransformerLayerSubmodules, + layer_number: int = 1, + context_pre_only: bool = False, + ): + + hidden_size = config.hidden_size + super().__init__(config=config, submodules=submodules, layer_number=layer_number) + + self.adaln = AdaLN(config, modulation_bias=True, n_adaln_chunks=6, use_second_norm=True) + + self.context_pre_only = context_pre_only + context_norm_type = "ada_norm_continous" if context_pre_only else "ada_norm_zero" + + if context_norm_type == "ada_norm_continous": + self.adaln_context = AdaLNContinous(config, hidden_size, modulation_bias=True, norm_type="layer_norm") + elif context_norm_type == "ada_norm_zero": + self.adaln_context = AdaLN(config, modulation_bias=True, n_adaln_chunks=6, use_second_norm=True) + else: + raise ValueError( + f"Unknown context_norm_type: {context_norm_type}, currently only support `ada_norm_continous`, `ada_norm_zero`" + ) + # Override Cross Attention to disable CP. + # Disable TP Comm overlap as well. Not disabling will attempt re-use of buffer size same as Q and lead to incorrect tensor shapes. 
+ cp_override_config = copy.deepcopy(config) + cp_override_config.context_parallel_size = 1 + cp_override_config.tp_comm_overlap = False + + if not context_pre_only: + self.context_mlp = build_module( + submodules.mlp, + config=cp_override_config, + ) + else: + self.context_mlp = None + + def forward( + self, + hidden_states, + encoder_hidden_states, + attention_mask=None, + context=None, + context_mask=None, + rotary_pos_emb=None, + inference_params=None, + packed_seq_params=None, + emb=None, + ): + shift_msa, scale_msa, gate_msa, shift_mlp, scale_mlp, gate_mlp = self.adaln(emb) + + norm_hidden_states = self.adaln.modulated_layernorm( + hidden_states, shift=shift_msa, scale=scale_msa, layernorm_idx=0 + ) + if self.context_pre_only: + norm_encoder_hidden_states = self.adaln_context(encoder_hidden_states, emb) + else: + c_shift_msa, c_scale_msa, c_gate_msa, c_shift_mlp, c_scale_mlp, c_gate_mlp = self.adaln_context(emb) + norm_encoder_hidden_states = self.adaln_context.modulated_layernorm( + encoder_hidden_states, shift=c_shift_msa, scale=c_scale_msa, layernorm_idx=0 + ) + + attention_output, encoder_attention_output = self.self_attention( + norm_hidden_states, + attention_mask=attention_mask, + key_value_states=None, + additional_hidden_states=norm_encoder_hidden_states, + rotary_pos_emb=rotary_pos_emb, + ) + hidden_states = self.adaln.scale_add(hidden_states, x=attention_output, gate=gate_msa) + norm_hidden_states = self.adaln.modulated_layernorm( + hidden_states, shift=shift_mlp, scale=scale_mlp, layernorm_idx=1 + ) + + mlp_output, mlp_output_bias = self.mlp(norm_hidden_states) + hidden_states = self.adaln.scale_add(hidden_states, x=(mlp_output + mlp_output_bias), gate=gate_mlp) + + if self.context_pre_only: + encoder_hidden_states = None + else: + encoder_hidden_states = self.adaln_context.scale_add( + encoder_hidden_states, x=encoder_attention_output, gate=c_gate_msa + ) + norm_encoder_hidden_states = self.adaln_context.modulated_layernorm( + encoder_hidden_states, shift=c_shift_mlp, scale=c_scale_mlp, layernorm_idx=1 + ) + + context_mlp_output, context_mlp_output_bias = self.context_mlp(norm_encoder_hidden_states) + encoder_hidden_states = self.adaln.scale_add( + encoder_hidden_states, x=(context_mlp_output + context_mlp_output_bias), gate=c_gate_mlp + ) + + return hidden_states, encoder_hidden_states + + +class FluxSingleTransformerBlock(TransformerLayer): + def __init__( + self, + config: TransformerConfig, + submodules: TransformerLayerSubmodules, + layer_number: int = 1, + mlp_ratio: int = 4, + n_adaln_chunks: int = 3, + modulation_bias: bool = True, + ): + super().__init__(config=config, submodules=submodules, layer_number=layer_number) + hidden_size = config.hidden_size + self.adaln = AdaLN( + config=config, n_adaln_chunks=n_adaln_chunks, modulation_bias=modulation_bias, use_second_norm=False + ) + self.mlp_hidden_dim = int(hidden_size * mlp_ratio) + self.proj_in = nn.Linear(hidden_size, self.mlp_hidden_dim) + self.activation = nn.GELU(approximate="tanh") + self.proj_out = nn.Linear(hidden_size + self.mlp_hidden_dim, hidden_size) + + def forward( + self, + hidden_states, + attention_mask=None, + context=None, + context_mask=None, + rotary_pos_emb=None, + inference_params=None, + packed_seq_params=None, + emb=None, + ): + residual = hidden_states + + shift, scale, gate = self.adaln(emb) + + norm_hidden_states = self.adaln.modulated_layernorm(hidden_states, shift=shift, scale=scale) + + mlp_hidden_states = self.activation(self.proj_in(norm_hidden_states)) + + attention_output = 
self.self_attention( + norm_hidden_states, attention_mask=attention_mask, rotary_pos_emb=rotary_pos_emb + ) + + hidden_states = torch.cat((attention_output, mlp_hidden_states), dim=2) + + hidden_states = self.proj_out(hidden_states) + + hidden_states = self.adaln.scale_add(residual, x=hidden_states, gate=gate) + + return hidden_states + + def get_stdit_adaln_block_with_transformer_engine_spec() -> ModuleSpec: params = {"attn_mask_type": AttnMaskType.padding} return ModuleSpec( @@ -530,3 +795,77 @@ def get_official_dit_adaln_block_with_transformer_engine_spec() -> ModuleSpec: ), ), ) + + +def get_mm_dit_block_with_transformer_engine_spec() -> ModuleSpec: + + return ModuleSpec( + module=MMDiTLayer, + submodules=TransformerLayerSubmodules( + self_attention=ModuleSpec( + module=JointSelfAttention, + params={"attn_mask_type": AttnMaskType.no_mask}, + submodules=JointSelfAttentionSubmodules( + linear_qkv=TEColumnParallelLinear, + added_linear_qkv=TEColumnParallelLinear, + core_attention=TEDotProductAttention, + linear_proj=TERowParallelLinear, + ), + ), + mlp=ModuleSpec( + module=MLP, + submodules=MLPSubmodules( + linear_fc1=TEColumnParallelLinear, + linear_fc2=TERowParallelLinear, + ), + ), + ), + ) + + +def get_flux_single_transformer_engine_spec() -> ModuleSpec: + return ModuleSpec( + module=FluxSingleTransformerBlock, + submodules=TransformerLayerSubmodules( + self_attention=ModuleSpec( + module=FluxSingleAttention, + params={"attn_mask_type": AttnMaskType.no_mask}, + submodules=SelfAttentionSubmodules( + linear_qkv=TEColumnParallelLinear, + core_attention=TEDotProductAttention, + q_layernorm=RMSNorm, + k_layernorm=RMSNorm, + linear_proj=IdentityOp, + ), + ), + ), + ) + + +def get_flux_double_transformer_engine_spec() -> ModuleSpec: + return ModuleSpec( + module=MMDiTLayer, + submodules=TransformerLayerSubmodules( + self_attention=ModuleSpec( + module=JointSelfAttention, + params={"attn_mask_type": AttnMaskType.no_mask}, + submodules=JointSelfAttentionSubmodules( + q_layernorm=RMSNorm, + k_layernorm=RMSNorm, + added_q_layernorm=RMSNorm, + added_k_layernorm=RMSNorm, + linear_qkv=TEColumnParallelLinear, + added_linear_qkv=TEColumnParallelLinear, + core_attention=TEDotProductAttention, + linear_proj=TERowParallelLinear, + ), + ), + mlp=ModuleSpec( + module=MLP, + submodules=MLPSubmodules( + linear_fc1=TEColumnParallelLinear, + linear_fc2=TERowParallelLinear, + ), + ), + ), + ) diff --git a/nemo/collections/diffusion/models/flux/__init__.py b/nemo/collections/diffusion/models/flux/__init__.py new file mode 100644 index 000000000000..9e3250071955 --- /dev/null +++ b/nemo/collections/diffusion/models/flux/__init__.py @@ -0,0 +1,13 @@ +# Copyright (c) 2020, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
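A standalone numeric sketch of the two `AdaLN` helpers (`modulate` and `scale_add`) that the DiT/MMDiT layers in `dit_layer_spec.py` above build on (no NeMo imports; shapes are arbitrary):

```python
import torch

x = torch.randn(2, 8)                # e.g. layer-normalized hidden states
shift, scale, gate = torch.randn(3, 2, 8).unbind(0)

modulated = x * (1 + scale) + shift  # AdaLN.modulate
residual = torch.randn(2, 8)
out = residual + gate * modulated    # AdaLN.scale_add
print(out.shape)                     # torch.Size([2, 8])
```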
diff --git a/nemo/collections/diffusion/models/flux/layers.py b/nemo/collections/diffusion/models/flux/layers.py new file mode 100644 index 000000000000..222a9a1d67ae --- /dev/null +++ b/nemo/collections/diffusion/models/flux/layers.py @@ -0,0 +1,173 @@ +# Copyright (c) 2020, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import math + +import torch +from torch import Tensor, nn + + +def rope(pos: torch.Tensor, dim: int, theta: int) -> torch.Tensor: + """ + Different from the original ROPE used for flux. + Megatron attention takes the out product and calculate sin/cos inside, so we only need to get the freqs here + in the shape of [seq, ..., dim] + """ + assert dim % 2 == 0, "The dimension must be even." + + scale = torch.arange(0, dim, 2, dtype=torch.float64, device=pos.device) / dim + omega = 1.0 / (theta**scale) + + out = torch.einsum("...n,d->...nd", pos, omega) + + return out.float() + + +class EmbedND(nn.Module): + def __init__(self, dim: int, theta: int, axes_dim: list[int]): + super().__init__() + self.dim = dim + self.theta = theta + self.axes_dim = axes_dim + + def forward(self, ids: torch.Tensor) -> torch.Tensor: + n_axes = ids.shape[-1] + emb = torch.cat( + [rope(ids[..., i], self.axes_dim[i], self.theta) for i in range(n_axes)], + dim=-1, + ) + emb = emb.unsqueeze(1).permute(2, 0, 1, 3) + return torch.stack([emb, emb], dim=-1).reshape(*emb.shape[:-1], -1) + + +class MLPEmbedder(nn.Module): + def __init__(self, in_dim: int, hidden_dim: int): + super().__init__() + self.in_layer = nn.Linear(in_dim, hidden_dim, bias=True) + self.silu = nn.SiLU() + self.out_layer = nn.Linear(hidden_dim, hidden_dim, bias=True) + + def forward(self, x: Tensor) -> Tensor: + return self.out_layer(self.silu(self.in_layer(x))) + + +def get_timestep_embedding( + timesteps: torch.Tensor, + embedding_dim: int, + flip_sin_to_cos: bool = True, + downscale_freq_shift: float = 0, + scale: float = 1, + max_period: int = 10000, +): + """ + This matches the implementation in Denoising Diffusion Probabilistic Models: Create sinusoidal timestep embeddings. + + Args + timesteps (torch.Tensor): + a 1-D Tensor of N indices, one per batch element. These may be fractional. + embedding_dim (int): + the dimension of the output. + flip_sin_to_cos (bool): + Whether the embedding order should be `cos, sin` (if True) or `sin, cos` (if False) + downscale_freq_shift (float): + Controls the delta between frequencies between dimensions + scale (float): + Scaling factor applied to the embeddings. + max_period (int): + Controls the maximum frequency of the embeddings + Returns + torch.Tensor: an [N x dim] Tensor of positional embeddings. 
+ """ + assert len(timesteps.shape) == 1, "Timesteps should be a 1d-array" + + half_dim = embedding_dim // 2 + exponent = -math.log(max_period) * torch.arange( + start=0, end=half_dim, dtype=torch.float32, device=timesteps.device + ) + exponent = exponent / (half_dim - downscale_freq_shift) + + emb = torch.exp(exponent) + emb = timesteps[:, None].float() * emb[None, :] + + # scale embeddings + emb = scale * emb + + # concat sine and cosine embeddings + emb = torch.cat([torch.sin(emb), torch.cos(emb)], dim=-1) + + # flip sine and cosine embeddings + if flip_sin_to_cos: + emb = torch.cat([emb[:, half_dim:], emb[:, :half_dim]], dim=-1) + + # zero pad + if embedding_dim % 2 == 1: + emb = torch.nn.functional.pad(emb, (0, 1, 0, 0)) + return emb + + +class Timesteps(nn.Module): + def __init__( + self, + embedding_dim: int, + flip_sin_to_cos: bool = True, + downscale_freq_shift: float = 0, + scale: float = 1, + max_period: int = 10000, + ): + super().__init__() + self.embedding_dim = embedding_dim + self.flip_sin_to_cos = flip_sin_to_cos + self.downscale_freq_shift = downscale_freq_shift + self.scale = scale + self.max_period = max_period + + def forward(self, timesteps: torch.Tensor) -> torch.Tensor: + t_emb = get_timestep_embedding( + timesteps, + self.embedding_dim, + flip_sin_to_cos=self.flip_sin_to_cos, + downscale_freq_shift=self.downscale_freq_shift, + scale=self.scale, + max_period=self.max_period, + ) + return t_emb + + +class TimeStepEmbedder(nn.Module): + def __init__( + self, + embedding_dim: int, + hidden_dim: int, + flip_sin_to_cos: bool = True, + downscale_freq_shift: float = 0, + scale: float = 1, + max_period: int = 10000, + ): + + super().__init__() + + self.time_proj = Timesteps( + embedding_dim=embedding_dim, + flip_sin_to_cos=flip_sin_to_cos, + downscale_freq_shift=downscale_freq_shift, + scale=scale, + max_period=max_period, + ) + self.time_embedder = MLPEmbedder(in_dim=embedding_dim, hidden_dim=hidden_dim) + + def forward(self, timesteps: torch.Tensor) -> torch.Tensor: + timesteps_proj = self.time_proj(timesteps) + timesteps_emb = self.time_embedder(timesteps_proj) + + return timesteps_emb diff --git a/nemo/collections/diffusion/models/flux/model.py b/nemo/collections/diffusion/models/flux/model.py new file mode 100644 index 000000000000..4d42c80a75a1 --- /dev/null +++ b/nemo/collections/diffusion/models/flux/model.py @@ -0,0 +1,156 @@ +# Copyright (c) 2020, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +from dataclasses import dataclass +from typing import Callable + +import torch +from megatron.core.models.common.vision_module.vision_module import VisionModule +from megatron.core.transformer.transformer_config import TransformerConfig +from megatron.core.transformer.utils import openai_gelu +from torch import nn + +from nemo.collections.diffusion.models.dit.dit_layer_spec import ( + AdaLNContinuous, + FluxSingleTransformerBlock, + MMDiTLayer, + get_flux_double_transformer_engine_spec, + get_flux_single_transformer_engine_spec, +) +from nemo.collections.diffusion.models.flux.layers import EmbedND, MLPEmbedder, TimeStepEmbedder + + +@dataclass +class FluxParams: + num_joint_layers: int = 19 + num_single_layers: int = 38 + hidden_size: int = 3072 + num_attention_heads: int = 24 + activation_func: Callable = openai_gelu + add_qkv_bias: bool = True + ffn_hidden_size: int = 16384 + in_channels: int = 64 + context_dim: int = 4096 + model_channels: int = 256 + patch_size: int = 1 + guidance_embed: bool = False + vec_in_dim: int = 768 + + +class Flux(VisionModule): + def __init__(self, config: FluxParams): + + self.out_channels = config.in_channels + self.hidden_size = config.hidden_size + self.num_attention_heads = config.num_attention_heads + self.patch_size = config.patch_size + self.in_channels = config.in_channels + self.guidance_embed = config.guidance_embed + transformer_config = TransformerConfig( + num_layers=1, + hidden_size=self.hidden_size, + num_attention_heads=self.num_attention_heads, + use_cpu_initialization=True, + activation_func=config.activation_func, + hidden_dropout=0, + attention_dropout=0, + layernorm_epsilon=1e-6, + add_qkv_bias=config.add_qkv_bias, + rotary_interleaved=True, + ) + super().__init__(transformer_config) + + self.pos_embed = EmbedND(dim=self.hidden_size, theta=10000, axes_dim=[16, 56, 56]) + self.img_embed = nn.Linear(config.in_channels, self.hidden_size) + self.txt_embed = nn.Linear(config.context_dim, self.hidden_size) + self.timestep_embedding = TimeStepEmbedder(config.model_channels, self.hidden_size) + self.vector_embedding = MLPEmbedder(in_dim=config.vec_in_dim, hidden_dim=self.hidden_size) + if config.guidance_embed: + self.guidance_embedding = ( + MLPEmbedder(in_dim=config.model_channels, hidden_dim=self.hidden_size) + if config.guidance_embed + else nn.Identity() + ) + + self.double_blocks = nn.ModuleList( + [ + MMDiTLayer( + config=transformer_config, + submodules=get_flux_double_transformer_engine_spec().submodules, + layer_number=i, + context_pre_only=False, + ) + for i in range(config.num_joint_layers) + ] + ) + + self.single_blocks = nn.ModuleList( + [ + FluxSingleTransformerBlock( + config=transformer_config, + submodules=get_flux_single_transformer_engine_spec().submodules, + layer_number=i, + ) + for i in range(config.num_single_layers) + ] + ) + + self.norm_out = AdaLNContinuous(config=transformer_config, conditioning_embedding_dim=self.hidden_size) + self.proj_out = nn.Linear(self.hidden_size, self.patch_size * self.patch_size * self.out_channels, bias=True) + + def forward( + self, + img: torch.Tensor, + txt: torch.Tensor = None, + y: torch.Tensor = None, + timesteps: torch.LongTensor = None, + img_ids: torch.Tensor = None, + txt_ids: torch.Tensor = None, + guidance: torch.Tensor = None, + ): + hidden_states = self.img_embed(img) + encoder_hidden_states = self.txt_embed(txt) + + timesteps = timesteps.to(img.dtype) * 1000 + vec_emb = self.timestep_embedding(timesteps) + + if guidance is not None: + vec_emb = vec_emb + 
self.guidance_embedding(self.timestep_embedding.time_proj(guidance * 1000)) + vec_emb = vec_emb + self.vector_embedding(y) + + ids = torch.cat((txt_ids, img_ids), dim=1) + rotary_pos_emb = self.pos_embed(ids) + for id_block, block in enumerate(self.double_blocks): + hidden_states, encoder_hidden_states = block( + hidden_states=hidden_states, + encoder_hidden_states=encoder_hidden_states, + rotary_pos_emb=rotary_pos_emb, + emb=vec_emb, + ) + + hidden_states = torch.cat([encoder_hidden_states, hidden_states], dim=0) + + for id_block, block in enumerate(self.single_blocks): + hidden_states = block( + hidden_states=hidden_states, + rotary_pos_emb=rotary_pos_emb, + emb=vec_emb, + ) + + hidden_states = hidden_states[encoder_hidden_states.shape[0] :, ...] + + hidden_states = self.norm_out(hidden_states, vec_emb) + output = self.proj_out(hidden_states) + + return output diff --git a/nemo/collections/diffusion/models/flux/pipeline.py b/nemo/collections/diffusion/models/flux/pipeline.py new file mode 100644 index 000000000000..e460f8f115bd --- /dev/null +++ b/nemo/collections/diffusion/models/flux/pipeline.py @@ -0,0 +1,342 @@ +# Copyright (c) 2020, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import os +from typing import List, Optional, Union + +import numpy as np +import torch +from PIL import Image +from safetensors.torch import load_file as load_safetensors +from safetensors.torch import save_file as save_safetensors +from torch import nn +from tqdm import tqdm + +from nemo.collections.diffusion.encoders.conditioner import FrozenCLIPEmbedder, FrozenT5Embedder +from nemo.collections.diffusion.models.flux.model import Flux, FluxParams +from nemo.collections.diffusion.sampler.flow_matching.flow_match_euler_discrete import FlowMatchEulerDiscreteScheduler +from nemo.collections.diffusion.utils.flux_ckpt_converter import flux_transformer_converter +from nemo.collections.diffusion.utils.flux_pipeline_utils import FluxModelParams +from nemo.collections.diffusion.vae.autoencoder import AutoEncoder + + +class FluxInferencePipeline(nn.Module): + def __init__(self, params: FluxModelParams): + super().__init__() + self.device = params.device + params.clip_params['device'] = self.device + params.t5_params['device'] = self.device + + self.vae = AutoEncoder(params.vae_params).to(self.device).eval() + self.clip_encoder = FrozenCLIPEmbedder(**params.clip_params) + self.t5_encoder = FrozenT5Embedder(**params.t5_params) + self.transformer = Flux(params.flux_params).to(self.device).eval() + self.vae_scale_factor = 2 ** (len(self.vae.params.ch_mult)) + self.scheduler = FlowMatchEulerDiscreteScheduler(**params.scheduler_params) + self.params = params + + def load_from_pretrained(self, ckpt_path, do_convert_from_hf=True, save_converted_model=None): + if do_convert_from_hf: + ckpt = flux_transformer_converter(ckpt_path, self.transformer.config) + if save_converted_model: + save_path = os.path.join(ckpt_path, 'nemo_flux_transformer.safetensors') + save_safetensors(ckpt, 
save_path) + print(f'saving converted transformer checkpoint to {save_path}') + else: + ckpt = load_safetensors(ckpt_path) + missing, unexpected = self.transformer.load_state_dict(ckpt, strict=False) + missing = [ + k for k in missing if not k.endswith('_extra_state') + ] # These keys are mcore specific and should not affect the model performance + if len(missing) > 0: + print( + f"The following keys are missing during checkpoint loading; please check the ckpt provided, or the image quality may be compromised.\n {missing}" + ) + print(f"Found unexpected keys: \n {unexpected}")
+ + def encoder_prompt( + self, + prompt: Union[str, List[str]], + num_images_per_prompt: int = 1, + prompt_embeds: Optional[torch.FloatTensor] = None, + pooled_prompt_embeds: Optional[torch.FloatTensor] = None, + max_sequence_length: int = 512, + device: Optional[torch.device] = 'cuda', + dtype: Optional[torch.dtype] = torch.float, + ):
+ if prompt is not None: + batch_size = len(prompt) + elif prompt_embeds is not None: + batch_size = prompt_embeds.shape[0] + else: + raise ValueError("Either prompt or prompt_embeds must be provided.") + if device == 'cuda' and self.t5_encoder.device != device: + self.t5_encoder.to(device) + if prompt_embeds is None: + prompt_embeds = self.t5_encoder(prompt, max_sequence_length=max_sequence_length) + seq_len = prompt_embeds.shape[1] + prompt_embeds = prompt_embeds.repeat(1, num_images_per_prompt, 1) + prompt_embeds = prompt_embeds.view(batch_size * num_images_per_prompt, seq_len, -1).to(dtype=dtype)
+ + if device == 'cuda' and self.clip_encoder.device != device: + self.clip_encoder.to(device) + if pooled_prompt_embeds is None: + _, pooled_prompt_embeds = self.clip_encoder(prompt) + + pooled_prompt_embeds = pooled_prompt_embeds.repeat(1, num_images_per_prompt, 1) + pooled_prompt_embeds = pooled_prompt_embeds.view(batch_size * num_images_per_prompt, -1).to(dtype=dtype) + + dtype = dtype if dtype is not None else self.t5_encoder.dtype + text_ids = torch.zeros(batch_size, prompt_embeds.shape[1], 3).to(device=device, dtype=dtype) + text_ids = text_ids.repeat(num_images_per_prompt, 1, 1) + + return prompt_embeds.transpose(0, 1), pooled_prompt_embeds, text_ids
+ + @staticmethod + def _prepare_latent_image_ids(batch_size: int, height: int, width: int, device: torch.device, dtype: torch.dtype): + latent_image_ids = torch.zeros(height // 2, width // 2, 3) + latent_image_ids[..., 1] = latent_image_ids[..., 1] + torch.arange(height // 2)[:, None] + latent_image_ids[..., 2] = latent_image_ids[..., 2] + torch.arange(width // 2)[None, :] + + latent_image_id_height, latent_image_id_width, latent_image_id_channels = latent_image_ids.shape + + latent_image_ids = latent_image_ids[None, :].repeat(batch_size, 1, 1, 1) + latent_image_ids = latent_image_ids.reshape( + batch_size, latent_image_id_height * latent_image_id_width, latent_image_id_channels + ) + + return latent_image_ids.to(device=device, dtype=dtype)
+ + @staticmethod + def _pack_latents(latents, batch_size, num_channels_latents, height, width): + latents = latents.view(batch_size, num_channels_latents, height // 2, 2, width // 2, 2) + latents = latents.permute(0, 2, 4, 1, 3, 5) + latents = latents.reshape(batch_size, (height // 2) * (width // 2), num_channels_latents * 4) + + return latents
+ + @staticmethod + def _unpack_latents(latents, height, width, vae_scale_factor): + batch_size, num_patches, channels = latents.shape + + height = height // vae_scale_factor + width = width // vae_scale_factor + + latents = latents.view(batch_size, height, width, channels // 4, 2, 2) + latents = latents.permute(0, 3, 1, 4, 2, 5) + + latents = latents.reshape(batch_size, channels // (2 * 2), height * 2, width * 2) + + return latents
+ + @staticmethod + def _calculate_shift( + image_seq_len, + base_seq_len: int = 256, + max_seq_len: int = 4096, + base_shift: float = 0.5, + max_shift: float = 1.16, + ): + m = (max_shift - base_shift) / (max_seq_len - base_seq_len) + b = base_shift - m * base_seq_len + mu = image_seq_len * m + b + return mu
+ + def prepare_latents( + self, + batch_size, + num_channels_latents, + height, + width, + dtype, + device, + generator, + latents=None, + ): + height = 2 * int(height) // self.vae_scale_factor + width = 2 * int(width) // self.vae_scale_factor + + shape = (batch_size, num_channels_latents, height, width) + + if latents is not None: + latent_image_ids = self._prepare_latent_image_ids(batch_size, height, width, device, dtype) + return latents.to(device=device, dtype=dtype), latent_image_ids
+ + if isinstance(generator, list) and len(generator) != batch_size: + raise ValueError( + f"You have passed a list of generators of length {len(generator)}, but requested an effective batch" + f" size of {batch_size}. Make sure the batch size matches the length of the generators." + ) + + latents = FluxInferencePipeline._generate_rand_latents(shape, generator=generator, device=device, dtype=dtype) + latents = self._pack_latents(latents, batch_size, num_channels_latents, height, width) + + latent_image_ids = self._prepare_latent_image_ids(batch_size, height, width, device, dtype) + + return latents.transpose(0, 1), latent_image_ids
+ + @staticmethod + def _generate_rand_latents( + shape, + generator, + device, + dtype, + ): + if isinstance(generator, list): + # one draw per generator in the list, concatenated along the batch dimension + batch_size = shape[0] + shape = (1,) + shape[1:] + latents = [ + torch.randn(shape, generator=generator[i], device=device, dtype=dtype) + for i in range(batch_size) + ] + latents = torch.cat(latents, dim=0).to(device=device) + else: + latents = torch.randn(shape, generator=generator, device=device, dtype=dtype) + + return latents
+ + @staticmethod + def numpy_to_pil(images): + """ + Convert a numpy image or a batch of images to a PIL image. + """ + if images.ndim == 3: + images = images[None, ...]
+ images = (images * 255).round().astype("uint8") + pil_images = [Image.fromarray(image) for image in images] + + return pil_images + + @staticmethod + def torch_to_numpy(images): + numpy_images = images.float().cpu().permute(0, 2, 3, 1).numpy() + return numpy_images + + @staticmethod + def denormalize(image): + return (image / 2 + 0.5).clamp(0, 1) + + def __call__( + self, + prompt: Union[str, List[str]] = None, + height: Optional[int] = 512, + width: Optional[int] = 512, + num_inference_steps: int = 28, + timesteps: Optional[List[int]] = None, + guidance_scale: float = 7.0, + num_images_per_prompt: Optional[int] = 1, + generator: Optional[Union[torch.Generator, List[torch.Generator]]] = None, + latents: Optional[torch.FloatTensor] = None, + prompt_embeds: Optional[torch.FloatTensor] = None, + pooled_prompt_embeds: Optional[torch.FloatTensor] = None, + output_type: Optional[str] = "pil", + max_sequence_length: int = 512, + device: torch.device = 'cuda', + dtype: torch.dtype = torch.float32, + save_to_disk: bool = True, + offload: bool = True, + ): + assert device == 'cuda', 'Transformer blocks in Mcore must run on cuda devices' + + if prompt is not None and isinstance(prompt, str): + batch_size = 1 + prompt = [prompt] + elif prompt is not None and isinstance(prompt, list): + batch_size = len(prompt) + elif prompt_embeds is not None and isinstance(prompt_embeds, torch.FloatTensor): + batch_size = prompt_embeds.shape[0] + else: + raise ValueError("Either prompt or prompt_embeds must be provided.") + + ## get text prompt embeddings + prompt_embeds, pooled_prompt_embeds, text_ids = self.encoder_prompt( + prompt=prompt, + prompt_embeds=prompt_embeds, + pooled_prompt_embeds=pooled_prompt_embeds, + num_images_per_prompt=num_images_per_prompt, + max_sequence_length=max_sequence_length, + device=device, + dtype=dtype, + ) + if offload: + self.t5_encoder.to('cpu') + self.clip_encoder.to('cpu') + torch.cuda.empty_cache() + + ## prepare image latents + num_channels_latents = self.transformer.in_channels // 4 + latents, latent_image_ids = self.prepare_latents( + batch_size * num_images_per_prompt, num_channels_latents, height, width, dtype, device, generator, latents + ) + # prepare timesteps + sigmas = np.linspace(1.0, 1 / num_inference_steps, num_inference_steps) + image_seq_len = latents.shape[0] + + mu = FluxInferencePipeline._calculate_shift( + image_seq_len, + self.scheduler.base_image_seq_len, + self.scheduler.max_image_seq_len, + self.scheduler.base_shift, + self.scheduler.max_shift, + ) + + self.scheduler.set_timesteps(sigmas=sigmas, device=device, mu=mu) + timesteps = self.scheduler.timesteps + + if device == 'cuda' and device != self.device: + self.transformer.to(device) + with torch.no_grad(): + for i, t in tqdm(enumerate(timesteps)): + timestep = t.expand(latents.shape[1]).to(device=latents.device, dtype=latents.dtype) + if self.transformer.guidance_embed: + guidance = torch.tensor([guidance_scale], device=device).expand(latents.shape[1]) + else: + guidance = None + with torch.autocast(device_type='cuda', dtype=latents.dtype): + pred = self.transformer( + img=latents, + txt=prompt_embeds, + y=pooled_prompt_embeds, + timesteps=timestep / 1000, + img_ids=latent_image_ids, + txt_ids=text_ids, + guidance=guidance, + ) + latents = self.scheduler.step(pred, t, latents)[0] + if offload: + self.transformer.to('cpu') + torch.cuda.empty_cache() + + if output_type == "latent": + return latents.transpose(0, 1) + elif output_type == "pil": + latents = self._unpack_latents(latents.transpose(0, 1), 
height, width, self.vae_scale_factor) + latents = (latents / self.vae.params.scale_factor) + self.vae.params.shift_factor + if device == 'cuda' and device != self.device: + self.vae.to(device) + with torch.autocast(device_type='cuda', dtype=latents.dtype): + image = self.vae.decode(latents) + if offload: + self.vae.to('cpu') + torch.cuda.empty_cache() + image = FluxInferencePipeline.denormalize(image) + image = FluxInferencePipeline.torch_to_numpy(image) + image = FluxInferencePipeline.numpy_to_pil(image) + if save_to_disk: + print('Saving to disk') + assert len(image) == int(len(prompt) * num_images_per_prompt) + prompt = [p[:40] + f'_{idx}' for p in prompt for idx in range(num_images_per_prompt)] + for file_name, image in zip(prompt, image): + image.save(f'{file_name}.png') + + return image diff --git a/nemo/collections/diffusion/sampler/flow_matching/__init__.py b/nemo/collections/diffusion/sampler/flow_matching/__init__.py new file mode 100644 index 000000000000..d9155f923f18 --- /dev/null +++ b/nemo/collections/diffusion/sampler/flow_matching/__init__.py @@ -0,0 +1,13 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. diff --git a/nemo/collections/diffusion/sampler/flow_matching/flow_match_euler_discrete.py b/nemo/collections/diffusion/sampler/flow_matching/flow_match_euler_discrete.py new file mode 100644 index 000000000000..5bde6b0d1dc1 --- /dev/null +++ b/nemo/collections/diffusion/sampler/flow_matching/flow_match_euler_discrete.py @@ -0,0 +1,284 @@ +# Copyright 2024 Stability AI, Katherine Crowson and The HuggingFace Team. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import math +from abc import ABC +from typing import List, Optional, Tuple, Union + + +import numpy as np +import torch + + +class FlowMatchEulerDiscreteScheduler(ABC): + """ + Euler scheduler. + + Args: + num_train_timesteps (`int`, defaults to 1000): + The number of diffusion steps to train the model. + timestep_spacing (`str`, defaults to `"linspace"`): + The way the timesteps should be scaled. Refer to Table 2 of the [Common Diffusion Noise Schedules and + Sample Steps are Flawed](https://huggingface.co/papers/2305.08891) for more information. + shift (`float`, defaults to 1.0): + The shift value for the timestep schedule. 
+ """ + + _compatibles = [] + order = 1 + + def __init__( + self, + num_train_timesteps: int = 1000, + shift: float = 1.0, + use_dynamic_shifting=False, + base_shift: Optional[float] = 0.5, + max_shift: Optional[float] = 1.15, + base_image_seq_len: Optional[int] = 256, + max_image_seq_len: Optional[int] = 4096, + ): + timesteps = np.linspace(1, num_train_timesteps, num_train_timesteps, dtype=np.float32)[::-1].copy() + timesteps = torch.from_numpy(timesteps).to(dtype=torch.float32) + + sigmas = timesteps / num_train_timesteps + if not use_dynamic_shifting: + # when use_dynamic_shifting is True, we apply the timestep shifting on the fly based on the image resolution + sigmas = shift * sigmas / (1 + (shift - 1) * sigmas) + + self.timesteps = sigmas * num_train_timesteps + + self._step_index = None + self._begin_index = None + + self.sigmas = sigmas.to("cpu") # to avoid too much CPU/GPU communication + self.sigma_min = self.sigmas[-1].item() + self.sigma_max = self.sigmas[0].item() + + self.base_shift = base_shift + self.max_shift = max_shift + self.base_image_seq_len = base_image_seq_len + self.max_image_seq_len = max_image_seq_len + self.use_dynamic_shifting = use_dynamic_shifting + self.num_train_timesteps = num_train_timesteps + self.shift = shift + + @property + def step_index(self): + """ + The index counter for current timestep. It will increase 1 after each scheduler step. + """ + return self._step_index + + @property + def begin_index(self): + """ + The index for the first timestep. It should be set from pipeline with `set_begin_index` method. + """ + return self._begin_index + + # Copied from diffusers.schedulers.scheduling_dpmsolver_multistep.DPMSolverMultistepScheduler.set_begin_index + def set_begin_index(self, begin_index: int = 0): + """ + Sets the begin index for the scheduler. This function should be run from pipeline before the inference. + + Args: + begin_index (`int`): + The begin index for the scheduler. + """ + self._begin_index = begin_index + + def scale_noise( + self, + sample: torch.FloatTensor, + timestep: Union[float, torch.FloatTensor], + noise: Optional[torch.FloatTensor] = None, + ) -> torch.FloatTensor: + """ + Forward process in flow-matching + + Args: + sample (`torch.FloatTensor`): + The input sample. + timestep (`int`, *optional*): + The current timestep in the diffusion chain. + + Returns: + `torch.FloatTensor`: + A scaled input sample. 
+ """ + # Make sure sigmas and timesteps have the same device and dtype as original_samples + sigmas = self.sigmas.to(device=sample.device, dtype=sample.dtype) + + if sample.device.type == "mps" and torch.is_floating_point(timestep): + # mps does not support float64 + schedule_timesteps = self.timesteps.to(sample.device, dtype=torch.float32) + timestep = timestep.to(sample.device, dtype=torch.float32) + else: + schedule_timesteps = self.timesteps.to(sample.device) + timestep = timestep.to(sample.device) + + # self.begin_index is None when scheduler is used for training, or pipeline does not implement set_begin_index + if self.begin_index is None: + step_indices = [self.index_for_timestep(t, schedule_timesteps) for t in timestep] + elif self.step_index is not None: + # add_noise is called after first denoising step (for inpainting) + step_indices = [self.step_index] * timestep.shape[0] + else: + # add noise is called before first denoising step to create initial latent(img2img) + step_indices = [self.begin_index] * timestep.shape[0] + + sigma = sigmas[step_indices].flatten() + while len(sigma.shape) < len(sample.shape): + sigma = sigma.unsqueeze(-1) + + sample = sigma * noise + (1.0 - sigma) * sample + + return sample + + def _sigma_to_t(self, sigma): + return sigma * self.num_train_timesteps + + def time_shift(self, mu: float, sigma: float, t: torch.Tensor): + return math.exp(mu) / (math.exp(mu) + (1 / t - 1) ** sigma) + + def set_timesteps( + self, + num_inference_steps: int = None, + device: Union[str, torch.device] = None, + sigmas: Optional[List[float]] = None, + mu: Optional[float] = None, + ): + """ + Sets the discrete timesteps used for the diffusion chain (to be run before inference). + + Args: + num_inference_steps (`int`): + The number of diffusion steps used when generating samples with a pre-trained model. + device (`str` or `torch.device`, *optional*): + The device to which the timesteps should be moved to. If `None`, the timesteps are not moved. + """ + + if self.use_dynamic_shifting and mu is None: + raise ValueError(" you have a pass a value for `mu` when `use_dynamic_shifting` is set to be `True`") + + if sigmas is None: + self.num_inference_steps = num_inference_steps + timesteps = np.linspace( + self._sigma_to_t(self.sigma_max), self._sigma_to_t(self.sigma_min), num_inference_steps + ) + + sigmas = timesteps / self.num_train_timesteps + + if self.use_dynamic_shifting: + sigmas = self.time_shift(mu, 1.0, sigmas) + else: + sigmas = self.shift * sigmas / (1 + (self.shift - 1) * sigmas) + sigmas = torch.from_numpy(sigmas).to(dtype=torch.float32, device=device) + timesteps = sigmas * self.num_train_timesteps + + self.timesteps = timesteps.to(device=device) + self.sigmas = torch.cat([sigmas, torch.zeros(1, device=sigmas.device)]) + + self._step_index = None + self._begin_index = None + + def index_for_timestep(self, timestep, schedule_timesteps=None): + if schedule_timesteps is None: + schedule_timesteps = self.timesteps + + indices = (schedule_timesteps == timestep).nonzero() + + # The sigma index that is taken for the **very** first `step` + # is always the second index (or the last index if there is only 1) + # This way we can ensure we don't accidentally skip a sigma in + # case we start in the middle of the denoising schedule (e.g. 
for image-to-image) + pos = 1 if len(indices) > 1 else 0 + + return indices[pos].item() + + def _init_step_index(self, timestep): + if self.begin_index is None: + if isinstance(timestep, torch.Tensor): + timestep = timestep.to(self.timesteps.device) + self._step_index = self.index_for_timestep(timestep) + else: + self._step_index = self._begin_index + + def step( + self, + model_output: torch.FloatTensor, + timestep: Union[float, torch.FloatTensor], + sample: torch.FloatTensor, + s_churn: float = 0.0, + s_tmin: float = 0.0, + s_tmax: float = float("inf"), + s_noise: float = 1.0, + generator: Optional[torch.Generator] = None, + ) -> Tuple: + """ + Predict the sample from the previous timestep by reversing the SDE. This function propagates the diffusion + process from the learned model outputs (most often the predicted noise). + + Args: + model_output (`torch.FloatTensor`): + The direct output from learned diffusion model. + timestep (`float`): + The current discrete timestep in the diffusion chain. + sample (`torch.FloatTensor`): + A current instance of a sample created by the diffusion process. + s_churn (`float`): + s_tmin (`float`): + s_tmax (`float`): + s_noise (`float`, defaults to 1.0): + Scaling factor for noise added to the sample. + generator (`torch.Generator`, *optional*): + A random number generator. + + Returns: + A tuple is returned where the first element is the sample tensor. + """ + + if ( + isinstance(timestep, int) + or isinstance(timestep, torch.IntTensor) + or isinstance(timestep, torch.LongTensor) + ): + raise ValueError( + ( + "Passing integer indices (e.g. from `enumerate(timesteps)`) as timesteps to" + " `EulerDiscreteScheduler.step()` is not supported. Make sure to pass" + " one of the `scheduler.timesteps` as a timestep." + ), + ) + + if self.step_index is None: + self._init_step_index(timestep) + + # Upcast to avoid precision issues when computing prev_sample + sample = sample.to(torch.float32) + + sigma = self.sigmas[self.step_index] + sigma_next = self.sigmas[self.step_index + 1] + prev_sample = sample + (sigma_next - sigma) * model_output + + # Cast sample back to model compatible dtype + prev_sample = prev_sample.to(model_output.dtype) + + # upon completion increase step index by one + self._step_index += 1 + + return (prev_sample,) + + def __len__(self): + return self.num_train_timesteps diff --git a/nemo/collections/diffusion/utils/__init__.py b/nemo/collections/diffusion/utils/__init__.py new file mode 100644 index 000000000000..9e3250071955 --- /dev/null +++ b/nemo/collections/diffusion/utils/__init__.py @@ -0,0 +1,13 @@ +# Copyright (c) 2020, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. diff --git a/nemo/collections/diffusion/utils/flux_ckpt_converter.py b/nemo/collections/diffusion/utils/flux_ckpt_converter.py new file mode 100644 index 000000000000..444a77bfad68 --- /dev/null +++ b/nemo/collections/diffusion/utils/flux_ckpt_converter.py @@ -0,0 +1,206 @@ +# Copyright (c) 2020, NVIDIA CORPORATION. 
All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import os + +import torch +from safetensors.torch import load_file as load_safetensors + + +def _import_qkv_bias(transformer_config, qb, kb, vb): + + head_num = transformer_config.num_attention_heads + num_query_groups = transformer_config.num_query_groups + heads_per_group = head_num // num_query_groups + hidden_size = transformer_config.hidden_size + head_num = transformer_config.num_attention_heads + head_size = hidden_size // head_num + + new_q_bias_tensor_shape = (head_num, head_size) + new_kv_bias_tensor_shape = (num_query_groups, head_size) + + qb = qb.view(*new_q_bias_tensor_shape) + kb = kb.view(*new_kv_bias_tensor_shape) + vb = vb.view(*new_kv_bias_tensor_shape) + + qkv_bias_l = [] + for i in range(num_query_groups): + qkv_bias_l.append(qb[i * heads_per_group : (i + 1) * heads_per_group, :]) + qkv_bias_l.append(kb[i : i + 1, :]) + qkv_bias_l.append(vb[i : i + 1, :]) + + qkv_bias = torch.cat(qkv_bias_l) + qkv_bias = qkv_bias.reshape([head_size * (head_num + 2 * num_query_groups)]) + + return qkv_bias + + +def _import_qkv(transformer_config, q, k, v): + + head_num = transformer_config.num_attention_heads + num_query_groups = transformer_config.num_query_groups + heads_per_group = head_num // num_query_groups + hidden_size = transformer_config.hidden_size + head_num = transformer_config.num_attention_heads + head_size = hidden_size // head_num + + old_tensor_shape = q.size() + new_q_tensor_shape = (head_num, head_size) + old_tensor_shape[1:] + new_kv_tensor_shape = (num_query_groups, head_size) + old_tensor_shape[1:] + + q = q.view(*new_q_tensor_shape) + k = k.view(*new_kv_tensor_shape) + v = v.view(*new_kv_tensor_shape) + + qkv_weights_l = [] + for i in range(num_query_groups): + qkv_weights_l.append(q[i * heads_per_group : (i + 1) * heads_per_group, :, :]) + qkv_weights_l.append(k[i : i + 1, :, :]) + qkv_weights_l.append(v[i : i + 1, :, :]) + qkv_weights = torch.cat(qkv_weights_l) + assert qkv_weights.ndim == 3, qkv_weights.shape + assert qkv_weights.shape[0] == (heads_per_group + 2) * num_query_groups, qkv_weights.shape + assert qkv_weights.shape[1] == head_size, qkv_weights.shape + assert qkv_weights.shape[2] == old_tensor_shape[1], qkv_weights.shape + + qkv_weights = qkv_weights.reshape([head_size * (head_num + 2 * num_query_groups), hidden_size]) + + return qkv_weights + + +key_mapping = { + 'double_blocks': { + 'norm1.linear.weight': 'adaln.adaLN_modulation.1.weight', + 'norm1.linear.bias': 'adaln.adaLN_modulation.1.bias', + 'norm1_context.linear.weight': 'adaln_context.adaLN_modulation.1.weight', + 'norm1_context.linear.bias': 'adaln_context.adaLN_modulation.1.bias', + 'attn.norm_q.weight': 'self_attention.q_layernorm.weight', + 'attn.norm_k.weight': 'self_attention.k_layernorm.weight', + 'attn.norm_added_q.weight': 'self_attention.added_q_layernorm.weight', + 'attn.norm_added_k.weight': 'self_attention.added_k_layernorm.weight', + 'attn.to_out.0.weight': 'self_attention.linear_proj.weight', + 
'attn.to_out.0.bias': 'self_attention.linear_proj.bias', + 'attn.to_add_out.weight': 'self_attention.added_linear_proj.weight', + 'attn.to_add_out.bias': 'self_attention.added_linear_proj.bias', + 'ff.net.0.proj.weight': 'mlp.linear_fc1.weight', + 'ff.net.0.proj.bias': 'mlp.linear_fc1.bias', + 'ff.net.2.weight': 'mlp.linear_fc2.weight', + 'ff.net.2.bias': 'mlp.linear_fc2.bias', + 'ff_context.net.0.proj.weight': 'context_mlp.linear_fc1.weight', + 'ff_context.net.0.proj.bias': 'context_mlp.linear_fc1.bias', + 'ff_context.net.2.weight': 'context_mlp.linear_fc2.weight', + 'ff_context.net.2.bias': 'context_mlp.linear_fc2.bias', + }, + 'single_blocks': { + 'norm.linear.weight': 'adaln.adaLN_modulation.1.weight', + 'norm.linear.bias': 'adaln.adaLN_modulation.1.bias', + 'proj_mlp.weight': 'proj_in.weight', + 'proj_mlp.bias': 'proj_in.bias', + 'proj_out.weight': 'proj_out.weight', + 'proj_out.bias': 'proj_out.bias', + 'attn.norm_q.weight': 'self_attention.q_layernorm.weight', + 'attn.norm_k.weight': 'self_attention.k_layernorm.weight', + }, + 'norm_out.linear.bias': 'norm_out.adaLN_modulation.1.bias', + 'norm_out.linear.weight': 'norm_out.adaLN_modulation.1.weight', + 'proj_out.bias': 'proj_out.bias', + 'proj_out.weight': 'proj_out.weight', + 'time_text_embed.guidance_embedder.linear_1.bias': 'guidance_embedding.in_layer.bias', + 'time_text_embed.guidance_embedder.linear_1.weight': 'guidance_embedding.in_layer.weight', + 'time_text_embed.guidance_embedder.linear_2.bias': 'guidance_embedding.out_layer.bias', + 'time_text_embed.guidance_embedder.linear_2.weight': 'guidance_embedding.out_layer.weight', + 'x_embedder.bias': 'img_embed.bias', + 'x_embedder.weight': 'img_embed.weight', + 'time_text_embed.timestep_embedder.linear_1.bias': 'timestep_embedding.time_embedder.in_layer.bias', + 'time_text_embed.timestep_embedder.linear_1.weight': 'timestep_embedding.time_embedder.in_layer.weight', + 'time_text_embed.timestep_embedder.linear_2.bias': 'timestep_embedding.time_embedder.out_layer.bias', + 'time_text_embed.timestep_embedder.linear_2.weight': 'timestep_embedding.time_embedder.out_layer.weight', + 'context_embedder.bias': 'txt_embed.bias', + 'context_embedder.weight': 'txt_embed.weight', + 'time_text_embed.text_embedder.linear_1.bias': 'vector_embedding.in_layer.bias', + 'time_text_embed.text_embedder.linear_1.weight': 'vector_embedding.in_layer.weight', + 'time_text_embed.text_embedder.linear_2.bias': 'vector_embedding.out_layer.bias', + 'time_text_embed.text_embedder.linear_2.weight': 'vector_embedding.out_layer.weight', +} + + +def flux_transformer_converter(ckpt_path=None, transformer_config=None): + diffuser_state_dict = {} + if os.path.isdir(ckpt_path): + files = os.listdir(ckpt_path) + for file in files: + if file.endswith('.safetensors'): + loaded_dict = load_safetensors(os.path.join(ckpt_path, file)) + diffuser_state_dict.update(loaded_dict) + elif os.path.isfile(ckpt_path): + diffuser_state_dict = load_safetensors(ckpt_path) + else: + raise FileNotFoundError("Please provide a valid ckpt path.") + new_state_dict = {} + num_single_blocks = 0 + num_double_blocks = 0 + for key, value in diffuser_state_dict.items(): + if 'attn.to_q' in key or 'attn.to_k' in key or 'attn.to_v' in key: + continue + if 'attn.add_q_proj' in key or 'attn.add_k_proj' in key or 'attn.add_v_proj' in key: + continue + if key.startswith('transformer_blocks'): + temp = key.split('.') + idx, k = temp[1], '.'.join(temp[2:]) + num_double_blocks = max(int(idx), num_double_blocks) + new_key = '.'.join(['double_blocks', idx, 
key_mapping['double_blocks'][k]]) + elif key.startswith('single_transformer_blocks'): + temp = key.split('.') + idx, k = temp[1], '.'.join(temp[2:]) + num_single_blocks = max(int(idx), num_single_blocks) + new_key = '.'.join(['single_blocks', idx, key_mapping['single_blocks'][k]]) + else: + new_key = key_mapping[key] + new_state_dict[new_key] = value + + for i in range(num_double_blocks + 1): + new_key = f'double_blocks.{str(i)}.self_attention.linear_qkv.weight' + qk, kk, vk = [f'transformer_blocks.{str(i)}.attn.to_{n}.weight' for n in ('q', 'k', 'v')] + new_state_dict[new_key] = _import_qkv( + transformer_config, diffuser_state_dict[qk], diffuser_state_dict[kk], diffuser_state_dict[vk] + ) + new_key = f'double_blocks.{str(i)}.self_attention.linear_qkv.bias' + qk, kk, vk = [f'transformer_blocks.{str(i)}.attn.to_{n}.bias' for n in ('q', 'k', 'v')] + new_state_dict[new_key] = _import_qkv_bias( + transformer_config, diffuser_state_dict[qk], diffuser_state_dict[kk], diffuser_state_dict[vk] + ) + new_key = f'double_blocks.{str(i)}.self_attention.added_linear_qkv.weight' + qk, kk, vk = [f'transformer_blocks.{str(i)}.attn.add_{n}_proj.weight' for n in ('q', 'k', 'v')] + new_state_dict[new_key] = _import_qkv( + transformer_config, diffuser_state_dict[qk], diffuser_state_dict[kk], diffuser_state_dict[vk] + ) + new_key = f'double_blocks.{str(i)}.self_attention.added_linear_qkv.bias' + qk, kk, vk = [f'transformer_blocks.{str(i)}.attn.add_{n}_proj.bias' for n in ('q', 'k', 'v')] + new_state_dict[new_key] = _import_qkv_bias( + transformer_config, diffuser_state_dict[qk], diffuser_state_dict[kk], diffuser_state_dict[vk] + ) + + for i in range(num_single_blocks + 1): + new_key = f'single_blocks.{str(i)}.self_attention.linear_qkv.weight' + qk, kk, vk = [f'single_transformer_blocks.{str(i)}.attn.to_{n}.weight' for n in ('q', 'k', 'v')] + new_state_dict[new_key] = _import_qkv( + transformer_config, diffuser_state_dict[qk], diffuser_state_dict[kk], diffuser_state_dict[vk] + ) + new_key = f'single_blocks.{str(i)}.self_attention.linear_qkv.bias' + qk, kk, vk = [f'single_transformer_blocks.{str(i)}.attn.to_{n}.bias' for n in ('q', 'k', 'v')] + new_state_dict[new_key] = _import_qkv_bias( + transformer_config, diffuser_state_dict[qk], diffuser_state_dict[kk], diffuser_state_dict[vk] + ) + + return new_state_dict diff --git a/nemo/collections/diffusion/utils/flux_pipeline_utils.py b/nemo/collections/diffusion/utils/flux_pipeline_utils.py new file mode 100644 index 000000000000..77dcfa58450f --- /dev/null +++ b/nemo/collections/diffusion/utils/flux_pipeline_utils.py @@ -0,0 +1,76 @@ +# Copyright (c) 2020, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
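The converter above folds separate q/k/v projection weights into Megatron's fused linear_qkv layout by interleaving each query group's attention heads with that group's K and V rows. A self-contained toy illustration of the re-grouping (hypothetical small sizes, not real Flux dimensions):

import torch

head_num, num_query_groups, head_size, hidden = 2, 2, 4, 8
heads_per_group = head_num // num_query_groups

# Separate projection weights as found in the source checkpoint (rows = output features).
q = torch.randn(head_num * head_size, hidden)
k = torch.randn(num_query_groups * head_size, hidden)
v = torch.randn(num_query_groups * head_size, hidden)

q = q.view(head_num, head_size, hidden)
k = k.view(num_query_groups, head_size, hidden)
v = v.view(num_query_groups, head_size, hidden)

chunks = []
for g in range(num_query_groups):
    chunks.append(q[g * heads_per_group:(g + 1) * heads_per_group])  # this group's query heads...
    chunks.append(k[g:g + 1])                                        # ...then its K head
    chunks.append(v[g:g + 1])                                        # ...then its V head

fused = torch.cat(chunks).reshape(head_size * (head_num + 2 * num_query_groups), hidden)
print(fused.shape)  # torch.Size([24, 8]): the row layout expected for linear_qkv.weight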
+ +from dataclasses import dataclass + +import torch +from megatron.core.transformer.utils import openai_gelu + +from nemo.collections.diffusion.models.flux.model import FluxParams +from nemo.collections.diffusion.vae.autoencoder import AutoEncoderParams + + +@dataclass +class FluxModelParams: + flux_params: FluxParams + vae_params: AutoEncoderParams + clip_params: dict | None + t5_params: dict | None + scheduler_params: dict | None + device: str | torch.device + + +configs = { + "dev": FluxModelParams( + flux_params=FluxParams( + num_joint_layers=19, + num_single_layers=38, + hidden_size=3072, + num_attention_heads=24, + activation_func=openai_gelu, + add_qkv_bias=True, + ffn_hidden_size=16384, + in_channels=64, + context_dim=4096, + model_channels=256, + patch_size=1, + guidance_embed=True, + vec_in_dim=768, + ), + vae_params=AutoEncoderParams( + ch_mult=[1, 2, 4, 4], + attn_resolutions=[], + resolution=256, + in_channels=3, + ch=128, + out_ch=3, + num_res_blocks=2, + z_channels=16, + scale_factor=0.3611, + shift_factor=0.1159, + ckpt=None, + ), + clip_params={ + 'max_length': 77, + 'always_return_pooled': True, + }, + t5_params={ + 'max_length': 512, + }, + scheduler_params={ + 'num_train_timesteps': 1000, + }, + device='cpu', + ) +} diff --git a/nemo/collections/diffusion/utils/mcore_parallel_utils.py b/nemo/collections/diffusion/utils/mcore_parallel_utils.py new file mode 100644 index 000000000000..0b9bdec97464 --- /dev/null +++ b/nemo/collections/diffusion/utils/mcore_parallel_utils.py @@ -0,0 +1,80 @@ +# Copyright (c) 2020, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
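The "dev" preset above bundles everything FluxInferencePipeline needs. A hedged usage sketch, assuming a CUDA device, that this patch's modules are importable, and that the checkpoint path is a placeholder for locally downloaded Flux transformer weights (the preset leaves the VAE ckpt unset, so real use also needs the VAE and text-encoder weights):

from nemo.collections.diffusion.models.flux.pipeline import FluxInferencePipeline
from nemo.collections.diffusion.utils.flux_pipeline_utils import configs

params = configs["dev"]
params.device = "cuda"  # the Mcore transformer blocks must run on CUDA
pipe = FluxInferencePipeline(params)
# Placeholder path: a local copy of the HF Flux transformer checkpoint to convert and load.
pipe.load_from_pretrained("/path/to/flux/transformer", do_convert_from_hf=True)
images = pipe(
    ["a photo of a cat"],
    height=512,
    width=512,
    num_inference_steps=28,
    save_to_disk=False,
)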
+ +""" +Megatron Model Parallel Initialization +""" + +import os + +import megatron.core.parallel_state as ps +import torch + + +class Utils: + world_size = torch.cuda.device_count() + # rank = int(os.environ["LOCAL_RANK"]) + rank = 0 + + @staticmethod + def initialize_distributed(tensor_model_parallel_size=1, pipeline_model_parallel_size=1, context_parallel_size=1): + ps.destroy_model_parallel() + + # Torch setup for distributed training + rank = int(os.environ['LOCAL_RANK']) + world_size = 1 # torch.cuda.device_count() + torch.cuda.set_device(rank) + torch.distributed.init_process_group(world_size=world_size, rank=rank) + + # Megatron core distributed training initialization + ps.initialize_model_parallel( + tensor_model_parallel_size, pipeline_model_parallel_size, context_parallel_size=context_parallel_size + ) + + @staticmethod + def set_world_size(world_size=None, rank=None): + Utils.world_size = torch.cuda.device_count() if world_size is None else world_size + if torch.distributed.is_initialized() and Utils.world_size != torch.distributed.get_world_size(): + torch.distributed.destroy_process_group() + + if rank is None: + # Utils.rank = int(os.environ["LOCAL_RANK"]) + Utils.rank = 0 + if Utils.rank >= Utils.world_size: + Utils.rank = -1 + else: + Utils.rank = rank + + @staticmethod + def destroy_model_parallel(): + ps.destroy_model_parallel() + torch.distributed.barrier() + + @staticmethod + def initialize_model_parallel( + tensor_model_parallel_size=1, + pipeline_model_parallel_size=1, + virtual_pipeline_model_parallel_size=None, + pipeline_model_parallel_split_rank=None, + **kwargs, + ): + ps.destroy_model_parallel() + Utils.initialize_distributed() + ps.initialize_model_parallel( + tensor_model_parallel_size, + pipeline_model_parallel_size, + virtual_pipeline_model_parallel_size, + pipeline_model_parallel_split_rank, + **kwargs, + ) diff --git a/nemo/collections/diffusion/vae/autoencoder.py b/nemo/collections/diffusion/vae/autoencoder.py new file mode 100644 index 000000000000..b356d74baac1 --- /dev/null +++ b/nemo/collections/diffusion/vae/autoencoder.py @@ -0,0 +1,334 @@ +# Copyright (c) 2020, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +from dataclasses import dataclass + +import numpy as np +import torch +from torch import Tensor, nn + +from nemo.collections.diffusion.vae.blocks import AttnBlock, Downsample, Normalize, ResnetBlock, Upsample, make_attn + + +@dataclass +class AutoEncoderParams: + ch_mult: list[int] + attn_resolutions: list[int] + resolution: int = 256 + in_channels: int = 3 + ch: int = 128 + out_ch: int = 3 + num_res_blocks: int = 2 + z_channels: int = 16 + scale_factor: float = 0.3611 + shift_factor: float = 0.1159 + attn_type: str = 'vanilla' + double_z: bool = True + dropout: float = 0.0 + ckpt: str = None + + +def nonlinearity(x): + # swish + return torch.nn.functional.silu(x) + + +class Encoder(nn.Module): + def __init__( + self, + *, + ch: int, + out_ch: int, + ch_mult: list[int], + num_res_blocks: int, + attn_resolutions: list[int], + in_channels: int, + resolution: int, + z_channels: int, + dropout=0.0, + resamp_with_conv=True, + double_z=True, + use_linear_attn=False, + attn_type="vanilla", + ): + super().__init__() + if use_linear_attn: + attn_type = "linear" + self.ch = ch + self.temb_ch = 0 + self.num_resolutions = len(ch_mult) + self.num_res_blocks = num_res_blocks + self.resolution = resolution + self.in_channels = in_channels + + # downsampling + self.conv_in = torch.nn.Conv2d(in_channels, self.ch, kernel_size=3, stride=1, padding=1) + + curr_res = resolution + in_ch_mult = (1,) + tuple(ch_mult) + self.in_ch_mult = in_ch_mult + self.down = nn.ModuleList() + for i_level in range(self.num_resolutions): + block = nn.ModuleList() + attn = nn.ModuleList() + block_in = ch * in_ch_mult[i_level] + block_out = ch * ch_mult[i_level] + for i_block in range(self.num_res_blocks): + block.append( + ResnetBlock( + in_channels=block_in, out_channels=block_out, temb_channels=self.temb_ch, dropout=dropout + ) + ) + block_in = block_out + if curr_res in attn_resolutions: + attn.append(make_attn(block_in, attn_type=attn_type)) + down = nn.Module() + down.block = block + down.attn = attn + if i_level != self.num_resolutions - 1: + down.downsample = Downsample(block_in, resamp_with_conv) + curr_res = curr_res // 2 + self.down.append(down) + + # middle + self.mid = nn.Module() + self.mid.block_1 = ResnetBlock( + in_channels=block_in, out_channels=block_in, temb_channels=self.temb_ch, dropout=dropout + ) + self.mid.attn_1 = make_attn(block_in, attn_type=attn_type) + self.mid.block_2 = ResnetBlock( + in_channels=block_in, out_channels=block_in, temb_channels=self.temb_ch, dropout=dropout + ) + + # end + self.norm_out = Normalize(block_in) + self.conv_out = torch.nn.Conv2d( + block_in, 2 * z_channels if double_z else z_channels, kernel_size=3, stride=1, padding=1 + ) + + def forward(self, x): + # timestep embedding + temb = None + + # downsampling + hs = [self.conv_in(x)] + for i_level in range(self.num_resolutions): + for i_block in range(self.num_res_blocks): + h = self.down[i_level].block[i_block](hs[-1], temb) + if len(self.down[i_level].attn) > 0: + h = self.down[i_level].attn[i_block](h) + hs.append(h) + if i_level != self.num_resolutions - 1: + hs.append(self.down[i_level].downsample(hs[-1])) + + # middle + h = hs[-1] + h = self.mid.block_1(h, temb) + h = self.mid.attn_1(h) + h = self.mid.block_2(h, temb) + + # end + h = self.norm_out(h) + h = nonlinearity(h) + h = self.conv_out(h) + return h + + +class Decoder(nn.Module): + def __init__( + self, + *, + ch: int, + out_ch: int, + ch_mult: list[int], + num_res_blocks: int, + attn_resolutions: list[int], + in_channels: int, + resolution: int, + z_channels: 
int, + dropout=0.0, + resamp_with_conv=True, + give_pre_end=False, + tanh_out=False, + use_linear_attn=False, + attn_type="vanilla", + **ignorekwargs, + ): + super().__init__() + if use_linear_attn: + attn_type = "linear" + self.ch = ch + self.temb_ch = 0 + self.num_resolutions = len(ch_mult) + self.num_res_blocks = num_res_blocks + self.resolution = resolution + self.in_channels = in_channels + self.give_pre_end = give_pre_end + self.tanh_out = tanh_out + + # compute in_ch_mult, block_in and curr_res at lowest res + in_ch_mult = (1,) + tuple(ch_mult) + block_in = ch * ch_mult[self.num_resolutions - 1] + curr_res = resolution // 2 ** (self.num_resolutions - 1) + self.z_shape = (1, z_channels, curr_res, curr_res) + print("Working with z of shape {} = {} dimensions.".format(self.z_shape, np.prod(self.z_shape))) + + # z to block_in + self.conv_in = torch.nn.Conv2d(z_channels, block_in, kernel_size=3, stride=1, padding=1) + + # middle + self.mid = nn.Module() + self.mid.block_1 = ResnetBlock( + in_channels=block_in, out_channels=block_in, temb_channels=self.temb_ch, dropout=dropout + ) + self.mid.attn_1 = make_attn(block_in, attn_type=attn_type) + self.mid.block_2 = ResnetBlock( + in_channels=block_in, out_channels=block_in, temb_channels=self.temb_ch, dropout=dropout + ) + + # upsampling + self.up = nn.ModuleList() + for i_level in reversed(range(self.num_resolutions)): + block = nn.ModuleList() + attn = nn.ModuleList() + block_out = ch * ch_mult[i_level] + for i_block in range(self.num_res_blocks + 1): + block.append( + ResnetBlock( + in_channels=block_in, out_channels=block_out, temb_channels=self.temb_ch, dropout=dropout + ) + ) + block_in = block_out + if curr_res in attn_resolutions: + attn.append(make_attn(block_in, attn_type=attn_type)) + up = nn.Module() + up.block = block + up.attn = attn + if i_level != 0: + up.upsample = Upsample(block_in, resamp_with_conv) + curr_res = curr_res * 2 + self.up.insert(0, up) # prepend to get consistent order + + # end + self.norm_out = Normalize(block_in) + self.conv_out = torch.nn.Conv2d(block_in, out_ch, kernel_size=3, stride=1, padding=1) + + def forward(self, z): + # assert z.shape[1:] == self.z_shape[1:] + self.last_z_shape = z.shape + + # timestep embedding + temb = None + + # z to block_in + h = self.conv_in(z) + + # middle + h = self.mid.block_1(h, temb) + h = self.mid.attn_1(h) + h = self.mid.block_2(h, temb) + + # upsampling + for i_level in reversed(range(self.num_resolutions)): + for i_block in range(self.num_res_blocks + 1): + h = self.up[i_level].block[i_block](h, temb) + if len(self.up[i_level].attn) > 0: + h = self.up[i_level].attn[i_block](h) + if i_level != 0: + h = self.up[i_level].upsample(h) + + # end + if self.give_pre_end: + return h + + h = self.norm_out(h) + h = nonlinearity(h) + h = self.conv_out(h) + if self.tanh_out: + h = torch.tanh(h) + return h + + +class DiagonalGaussian(nn.Module): + def __init__(self, sample: bool = True, chunk_dim: int = 1): + super().__init__() + self.sample = sample + self.chunk_dim = chunk_dim + + def forward(self, z: Tensor) -> Tensor: + mean, logvar = torch.chunk(z, 2, dim=self.chunk_dim) + if self.sample: + std = torch.exp(0.5 * logvar) + return mean + std * torch.randn_like(mean) + else: + return mean + + +class AutoEncoder(nn.Module): + def __init__(self, params: AutoEncoderParams): + super().__init__() + self.encoder = Encoder( + resolution=params.resolution, + in_channels=params.in_channels, + ch=params.ch, + ch_mult=params.ch_mult, + num_res_blocks=params.num_res_blocks, + 
z_channels=params.z_channels, + double_z=params.double_z, + attn_type=params.attn_type, + dropout=params.dropout, + out_ch=params.out_ch, + attn_resolutions=params.attn_resolutions, + ) + self.decoder = Decoder( + resolution=params.resolution, + in_channels=params.in_channels, + ch=params.ch, + out_ch=params.out_ch, + ch_mult=params.ch_mult, + num_res_blocks=params.num_res_blocks, + z_channels=params.z_channels, + double_z=params.double_z, + attn_type=params.attn_type, + dropout=params.dropout, + attn_resolutions=params.attn_resolutions, + ) + self.reg = DiagonalGaussian() + + self.scale_factor = params.scale_factor + self.shift_factor = params.shift_factor + self.params = params + + if params.ckpt is not None: + self.load_from_checkpoint(params.ckpt) + + def encode(self, x: Tensor) -> Tensor: + z = self.reg(self.encoder(x)) + z = self.scale_factor * (z - self.shift_factor) + return z + + def decode(self, z: Tensor) -> Tensor: + z = z / self.scale_factor + self.shift_factor + return self.decoder(z) + + def forward(self, x: Tensor) -> Tensor: + return self.decode(self.encode(x)) + + def load_from_checkpoint(self, ckpt_path): + from safetensors.torch import load_file as load_sft + + state_dict = load_sft(ckpt_path) + missing, unexpected = self.load_state_dict(state_dict) + if len(missing) > 0: + logger.warning(f"Following keys are missing from checkpoint loaded: {missing}") diff --git a/nemo/collections/diffusion/vae/blocks.py b/nemo/collections/diffusion/vae/blocks.py new file mode 100644 index 000000000000..ad38a7a463cf --- /dev/null +++ b/nemo/collections/diffusion/vae/blocks.py @@ -0,0 +1,180 @@ +# Copyright (c) 2023, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
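AutoEncoder.encode above scales and shifts raw VAE latents before they reach the flow model, and decode inverts that mapping before running the convolutional decoder. A tiny numeric sketch of just that normalization (the latent shape is an assumption based on z_channels=16):

import torch

scale_factor, shift_factor = 0.3611, 0.1159     # Flux VAE values from AutoEncoderParams above
z = torch.randn(1, 16, 64, 64)                  # assumed raw latent shape (z_channels=16)

z_model = scale_factor * (z - shift_factor)     # what encode() hands to the diffusion model
z_back = z_model / scale_factor + shift_factor  # the inverse applied on the decode path
print(torch.allclose(z, z_back, atol=1e-5))     # True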
+ +import torch +from einops import rearrange +from torch import Tensor, nn + +try: + from apex.contrib.group_norm import GroupNorm + + OPT_GROUP_NORM = True +except Exception: + print('Fused optimized group norm has not been installed.') + OPT_GROUP_NORM = False + + +def Normalize(in_channels, num_groups=32, act=""): + return GroupNorm(num_groups=num_groups, num_channels=in_channels, eps=1e-6, affine=True, act=act) + + +class ResnetBlock(nn.Module): + def __init__(self, in_channels, out_channels=None, conv_shortcut=False, dropout=0.0, temb_channels=0): + super().__init__() + self.in_channels = in_channels + out_channels = in_channels if out_channels is None else out_channels + self.out_channels = out_channels + self.use_conv_shortcut = conv_shortcut + + self.norm1 = Normalize(in_channels, act="silu") + self.conv1 = torch.nn.Conv2d(in_channels, out_channels, kernel_size=3, stride=1, padding=1) + if temb_channels > 0: + self.temb_proj = torch.nn.Linear(temb_channels, out_channels) + self.norm2 = Normalize(out_channels, act="silu") + self.dropout = torch.nn.Dropout(dropout) + self.conv2 = torch.nn.Conv2d(out_channels, out_channels, kernel_size=3, stride=1, padding=1) + if self.in_channels != self.out_channels: + if self.use_conv_shortcut: + self.conv_shortcut = torch.nn.Conv2d(in_channels, out_channels, kernel_size=3, stride=1, padding=1) + else: + self.nin_shortcut = torch.nn.Conv2d(in_channels, out_channels, kernel_size=1, stride=1, padding=0) + + def forward(self, x, temb): + h = x + h = self.norm1(h) + h = self.conv1(h) + + if temb is not None: + h = h + self.temb_proj(nonlinearity(temb))[:, :, None, None] + + h = self.norm2(h) + h = self.dropout(h) + h = self.conv2(h) + + if self.in_channels != self.out_channels: + if self.use_conv_shortcut: + x = self.conv_shortcut(x) + else: + x = self.nin_shortcut(x) + + return x + h + + +class Upsample(nn.Module): + def __init__(self, in_channels, with_conv): + super().__init__() + self.with_conv = with_conv + if self.with_conv: + self.conv = torch.nn.Conv2d(in_channels, in_channels, kernel_size=3, stride=1, padding=1) + + def forward(self, x): + # Cast to float32 to as 'upsample_nearest2d_out_frame' op does not support bfloat16 + # TODO(yuya): Remove this cast once the issue is fixed in PyTorch + # https://github.com/pytorch/pytorch/issues/86679 + dtype = x.dtype + if dtype == torch.bfloat16: + x = x.to(torch.float32) + x = torch.nn.functional.interpolate(x, scale_factor=2.0, mode="nearest") + if dtype == torch.bfloat16: + x = x.to(dtype) + if self.with_conv: + x = self.conv(x) + return x + + +class Downsample(nn.Module): + def __init__(self, in_channels, with_conv): + super().__init__() + self.with_conv = with_conv + if self.with_conv: + # no asymmetric padding in torch conv, must do it ourselves + self.conv = torch.nn.Conv2d(in_channels, in_channels, kernel_size=3, stride=2, padding=0) + + def forward(self, x): + if self.with_conv: + pad = (0, 1, 0, 1) + x = torch.nn.functional.pad(x, pad, mode="constant", value=0) + x = self.conv(x) + else: + x = torch.nn.functional.avg_pool2d(x, kernel_size=2, stride=2) + return x + + +class AttnBlock(nn.Module): + def __init__(self, in_channels: int): + super().__init__() + self.in_channels = in_channels + + self.norm = Normalize(in_channels, act="silu") + + self.q = nn.Conv2d(in_channels, in_channels, kernel_size=1) + self.k = nn.Conv2d(in_channels, in_channels, kernel_size=1) + self.v = nn.Conv2d(in_channels, in_channels, kernel_size=1) + self.proj_out = nn.Conv2d(in_channels, in_channels, kernel_size=1) + + 
def attention(self, h_: Tensor) -> Tensor: + h_ = self.norm(h_) + q = self.q(h_) + k = self.k(h_) + v = self.v(h_) + + b, c, h, w = q.shape + q = rearrange(q, "b c h w -> b 1 (h w) c").contiguous() + k = rearrange(k, "b c h w -> b 1 (h w) c").contiguous() + v = rearrange(v, "b c h w -> b 1 (h w) c").contiguous() + h_ = nn.functional.scaled_dot_product_attention(q, k, v) + + return rearrange(h_, "b 1 (h w) c -> b c h w", h=h, w=w, c=c, b=b) + + def forward(self, x: Tensor) -> Tensor: + return x + self.proj_out(self.attention(x)) + + +class LinearAttention(nn.Module): + def __init__(self, dim, heads=4, dim_head=32): + super().__init__() + self.heads = heads + hidden_dim = dim_head * heads + self.to_qkv = nn.Conv2d(dim, hidden_dim * 3, 1, bias=False) + self.to_out = nn.Conv2d(hidden_dim, dim, 1) + + def forward(self, x): + b, c, h, w = x.shape + qkv = self.to_qkv(x) + q, k, v = rearrange(qkv, 'b (qkv heads c) h w -> qkv b heads c (h w)', heads=self.heads, qkv=3) + k = k.softmax(dim=-1) + context = torch.einsum('bhdn,bhen->bhde', k, v) + out = torch.einsum('bhde,bhdn->bhen', context, q) + out = rearrange(out, 'b heads c (h w) -> b (heads c) h w', heads=self.heads, h=h, w=w) + return self.to_out(out) + + +class LinAttnBlock(LinearAttention): + """ + to match AttnBlock usage + """ + + def __init__(self, in_channels): + super().__init__(dim=in_channels, heads=1, dim_head=in_channels) + + +def make_attn(in_channels, attn_type="vanilla"): + assert attn_type in ["vanilla", "linear", "none"], f'attn_type {attn_type} unknown' + print(f"making attention of type '{attn_type}' with {in_channels} in_channels") + if attn_type == "vanilla": + return AttnBlock(in_channels) + elif attn_type == "none": + return nn.Identity(in_channels) + else: + return LinAttnBlock(in_channels) diff --git a/nemo/collections/llm/__init__.py b/nemo/collections/llm/__init__.py index 5ddbcf5913ad..4205c401eea8 100644 --- a/nemo/collections/llm/__init__.py +++ b/nemo/collections/llm/__init__.py @@ -70,6 +70,7 @@ MaskedTokenLossReduction, MistralConfig7B, MistralModel, + MistralNeMoConfig12B, MixtralConfig8x3B, MixtralConfig8x7B, MixtralConfig8x22B, @@ -115,6 +116,7 @@ "t5_forward_step", "MaskedTokenLossReduction", "MistralConfig7B", + "MistralNeMoConfig12B", "MistralModel", "MixtralConfig8x3B", "MixtralConfig8x7B", diff --git a/nemo/collections/llm/gpt/data/dolly.py b/nemo/collections/llm/gpt/data/dolly.py index 78751d60cdb0..fb8cf9fd5da0 100644 --- a/nemo/collections/llm/gpt/data/dolly.py +++ b/nemo/collections/llm/gpt/data/dolly.py @@ -26,6 +26,7 @@ if TYPE_CHECKING: from nemo.collections.common.tokenizers import TokenizerSpec + from nemo.collections.llm.gpt.data.packed_sequence import PackedSequenceSpecs class DollyDataModule(FineTuningDataModule, IOMixin): @@ -56,7 +57,7 @@ def __init__( pin_memory: bool = True, persistent_workers: bool = False, pad_to_max_length: bool = False, - packed_sequence_size: int = -1, + packed_sequence_specs: Optional["PackedSequenceSpecs"] = None, ): self.force_redownload = force_redownload self.delete_raw = delete_raw @@ -74,7 +75,7 @@ def __init__( pin_memory=pin_memory, persistent_workers=persistent_workers, pad_to_max_length=pad_to_max_length, - packed_sequence_size=packed_sequence_size, + packed_sequence_specs=packed_sequence_specs, ) def prepare_data(self) -> None: diff --git a/nemo/collections/llm/gpt/data/fine_tuning.py b/nemo/collections/llm/gpt/data/fine_tuning.py index 3e4dba7ec89c..01cf617a094d 100644 --- a/nemo/collections/llm/gpt/data/fine_tuning.py +++ 
b/nemo/collections/llm/gpt/data/fine_tuning.py @@ -20,12 +20,14 @@ import pytorch_lightning as pl from torch.utils.data import DataLoader +from nemo.collections.common.tokenizers import AutoTokenizer from nemo.collections.llm.gpt.data.core import create_sft_dataset from nemo.lightning.pytorch.plugins import MegatronDataSampler from nemo.utils import logging if TYPE_CHECKING: from nemo.collections.common.tokenizers import TokenizerSpec + from nemo.collections.llm.gpt.data.packed_sequence import PackedSequenceSpecs class FineTuningDataModule(pl.LightningDataModule): @@ -50,10 +52,7 @@ class FineTuningDataModule(pl.LightningDataModule): persistent_workers (bool, optional): Whether to keep data loading workers persistent across epochs. Defaults to False. max_train_steps (int, optional): Maximum number of steps to train. Used to calculate samples mapping for the mmap dataset pad_to_max_length (bool, optional): Whether to pad the input to the max sequence length. If False, will pad to the max length of the current batch. - packed_sequence_size (int, optional): If a positive integer, this arg enables training with sequence packing and specifies the pack size - If less than or equal to 0, sequence packing is disabled. Defaults to -1. - Note: This arg is distinct from `seq_length` because `seq_length` specifies the maximum length of the original sequence - (i.e. the length to truncate long sequences in the input data). + packed_sequence_specs (PackedSequenceSpecs, optional): See PackedSequenceSpecs for details """ def __init__( @@ -70,7 +69,7 @@ def __init__( pin_memory: bool = True, persistent_workers: bool = False, pad_to_max_length: bool = False, - packed_sequence_size: int = -1, + packed_sequence_specs: Optional["PackedSequenceSpecs"] = None, ): super().__init__() self.seq_length = seq_length @@ -87,22 +86,21 @@ def __init__( self.data_sampler = None self.max_train_samples = None self.pad_to_max_length = pad_to_max_length - self.packed_sequence_size = packed_sequence_size - self._adjust_batch_sizes_for_packed_sequence() + self.packed_sequence_specs = packed_sequence_specs + self.packed_sequence_size = -1 if not packed_sequence_specs else packed_sequence_specs.packed_sequence_size + self.validate_batch_size_for_packed_sequence() - def _adjust_batch_sizes_for_packed_sequence(self): + def validate_batch_size_for_packed_sequence(self): if self.packed_sequence_size > 0 and self.micro_batch_size > 1: - logging.warning( + raise ValueError( "Micro batch size should be 1 when training with packed sequence, but your micro batch size " - f"is {self.micro_batch_size}. Your config will be automatically updated to the following: " - f"MBS will be set to 1 (from {self.micro_batch_size}), " - f"GBS will be set to {self.global_batch_size // self.micro_batch_size} (from {self.global_batch_size}), " - f"packed sequence length will be set to {self.packed_sequence_size*self.micro_batch_size} (from {self.packed_sequence_size}). " + f"is {self.micro_batch_size}. \nThe following config is equivalent to your current setting for " + f"a packed dataset. 
Please update your config to the following: \n" + f"Set micro batch size to 1 (currently {self.micro_batch_size})\n" + f"Set global batch size to {self.global_batch_size // self.micro_batch_size} (currently {self.global_batch_size}) \n" + f"Set packed sequence length to {self.packed_sequence_size*self.micro_batch_size} (currently {self.packed_sequence_size}) \n" f"For details please visit https://docs.nvidia.com/nemo-framework/user-guide/latest/nemotoolkit/features/optimizations/sequence_packing.html" ) - self.global_batch_size //= self.micro_batch_size - self.packed_sequence_size *= self.micro_batch_size - self.micro_batch_size = 1 def prepare_data(self) -> None: if self.packed_sequence_size > 0 and not self.train_path_packed.is_file(): @@ -187,7 +185,12 @@ def train_path(self) -> Path: @property def train_path_packed(self) -> Path: if self.packed_sequence_size > 0: - return self.dataset_root / f"training_packed{self.packed_sequence_size}.npy" + if self.packed_sequence_specs.packed_data_path is not None: + return self.packed_sequence_specs.packed_data_path + tokenizer_model_name = self._extract_tokenizer_model_name() + folder_name = self.dataset_root / "packed" / tokenizer_model_name + folder_name.mkdir(parents=True, exist_ok=True) + return folder_name / f"training_{self.packed_sequence_size}.npy" else: raise ValueError("`train_path_packed` invalid since packed sequence size is not specified.") @@ -198,3 +201,18 @@ def validation_path(self) -> Path: @property def test_path(self) -> Path: return self.dataset_root / "test.jsonl" + + def _extract_tokenizer_model_name(self) -> str: + if self.packed_sequence_specs.tokenizer_model_name is not None: + tokenizer_model_name = self.packed_sequence_specs.tokenizer_model_name + elif isinstance(self.tokenizer, AutoTokenizer): + name = self.tokenizer.tokenizer.name_or_path + if name.endswith("nemo_tokenizer"): + # NEMO_HOME/hf_org/hf_model/nemo_tokenizer => hf_org--hf_model + tokenizer_model_name = '--'.join(name.split("/")[-3:-1]) + else: + # hf_org/hf_model => hf_org--hf_model + tokenizer_model_name = name.replace("/", "--") + else: + tokenizer_model_name = f"unknown_tokenizer_{hash(self.tokenizer)}" + return tokenizer_model_name diff --git a/nemo/collections/llm/gpt/data/packed_sequence.py b/nemo/collections/llm/gpt/data/packed_sequence.py index 4675b3fbb398..372e851da7cd 100644 --- a/nemo/collections/llm/gpt/data/packed_sequence.py +++ b/nemo/collections/llm/gpt/data/packed_sequence.py @@ -11,7 +11,7 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. - +from dataclasses import dataclass from pathlib import Path from typing import Optional @@ -83,3 +83,32 @@ def prepare_packed_sequence_data( # save output data np.save(output_path, output_data) logging.info(f"Packed sequence is prepared and saved to {output_path}") + + +@dataclass +class PackedSequenceSpecs: + packed_sequence_size: int = -1 + """ + If a positive integer, this arg enables training with sequence packing and specifies the pack size + If less than or equal to 0, sequence packing is disabled. Defaults to -1. + Note: This arg is distinct from `seq_length` because `seq_length` specifies the maximum length of the original sequence + (i.e. the length to truncate long sequences in the input data). + """ + + tokenizer_model_name: str = None + """ + Keep track of tokenizer model name, since each tokenizer produces a different packed sequence dataset file. 
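+ For example, a Hugging Face tokenizer loaded from "hf_org/hf_model" (or from a NeMo-converted
+ "nemo_tokenizer" directory under that path) is recorded as "hf_org--hf_model", which becomes the
+ sub-folder under <dataset_root>/packed/ where the packed .npy dataset is written.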
+ This field is set by llm.finetune api. + """ + + packed_data_path: Path = None + """ + If specified, use the packed dataset from this file instead of the default path. + """ + + def __post_init__(self): + if self.packed_data_path is not None: + assert ( + self.packed_data_path.suffix == ".npy" + ), f"packed data file must be a .npy file: {self.packed_data_path}" + assert self.packed_data_path.exists(), f"packed data file does not exist: {self.packed_data_path}" diff --git a/nemo/collections/llm/gpt/data/squad.py b/nemo/collections/llm/gpt/data/squad.py index ec0fc1aad02c..f872db94077d 100644 --- a/nemo/collections/llm/gpt/data/squad.py +++ b/nemo/collections/llm/gpt/data/squad.py @@ -24,6 +24,7 @@ if TYPE_CHECKING: from nemo.collections.common.tokenizers import TokenizerSpec + from nemo.collections.llm.gpt.data.packed_sequence import PackedSequenceSpecs class SquadDataModule(FineTuningDataModule, IOMixin): @@ -54,7 +55,7 @@ def __init__( pin_memory: bool = True, persistent_workers: bool = False, pad_to_max_length: bool = False, - packed_sequence_size: int = -1, + packed_sequence_specs: Optional["PackedSequenceSpecs"] = None, ): self.force_redownload = force_redownload self.delete_raw = delete_raw @@ -72,7 +73,7 @@ def __init__( pin_memory=pin_memory, persistent_workers=persistent_workers, pad_to_max_length=pad_to_max_length, - packed_sequence_size=packed_sequence_size, + packed_sequence_specs=packed_sequence_specs, ) def prepare_data(self) -> None: diff --git a/nemo/collections/llm/gpt/model/__init__.py b/nemo/collections/llm/gpt/model/__init__.py index aa3615b3ddfd..ebecc06140fe 100644 --- a/nemo/collections/llm/gpt/model/__init__.py +++ b/nemo/collections/llm/gpt/model/__init__.py @@ -53,7 +53,7 @@ LlamaConfig, LlamaModel, ) -from nemo.collections.llm.gpt.model.mistral import MistralConfig7B, MistralModel +from nemo.collections.llm.gpt.model.mistral import MistralConfig7B, MistralModel, MistralNeMoConfig12B from nemo.collections.llm.gpt.model.mixtral import ( MixtralConfig8x3B, MixtralConfig8x7B, diff --git a/nemo/collections/llm/gpt/model/baichuan.py b/nemo/collections/llm/gpt/model/baichuan.py index 56231978061f..c283b802a118 100644 --- a/nemo/collections/llm/gpt/model/baichuan.py +++ b/nemo/collections/llm/gpt/model/baichuan.py @@ -215,7 +215,7 @@ def _import_qkv(ctx: io.TransformCTX, qkv_weights): q = qkv_weights[0].squeeze().view(*new_q_tensor_shape) k = qkv_weights[1].squeeze().view(*new_kv_tensor_shape) v = qkv_weights[2].squeeze().view(*new_kv_tensor_shape) - qkv_weights = torch.empty((0, head_size) + old_tensor_shape[1:]) + qkv_weights = torch.empty((0, head_size) + old_tensor_shape[1:]).type_as(qkv_weights) for i in range(num_query_groups): qkv_weights = torch.cat((qkv_weights, q[i * heads_per_group : (i + 1) * heads_per_group, :, :])) qkv_weights = torch.cat((qkv_weights, k[i : i + 1, :, :])) diff --git a/nemo/collections/llm/gpt/model/base.py b/nemo/collections/llm/gpt/model/base.py index 4f65ce404356..62c49286146a 100644 --- a/nemo/collections/llm/gpt/model/base.py +++ b/nemo/collections/llm/gpt/model/base.py @@ -204,6 +204,9 @@ class GPTConfig5B(GPTConfig): ffn_hidden_size: int = 16384 num_attention_heads: int = 32 + bias_activation_fusion: bool = True + bias_dropout_add_fusion: bool = True + @dataclass class GPTConfig7B(GPTConfig): @@ -222,6 +225,9 @@ class GPTConfig20B(GPTConfig): ffn_hidden_size: int = 24576 num_attention_heads: int = 48 + bias_activation_fusion: bool = True + bias_dropout_add_fusion: bool = True + @dataclass class GPTConfig40B(GPTConfig): @@ -240,6 
+246,9 @@ class GPTConfig175B(GPTConfig): ffn_hidden_size: int = 49152 num_attention_heads: int = 96 + bias_activation_fusion: bool = True + bias_dropout_add_fusion: bool = True + class GPTModel(L.LightningModule, io.IOMixin, io.ConnectorMixin, fn.FNMixin): def __init__( diff --git a/nemo/collections/llm/gpt/model/chatglm.py b/nemo/collections/llm/gpt/model/chatglm.py index 5bd1319102e2..e7450a8db28d 100644 --- a/nemo/collections/llm/gpt/model/chatglm.py +++ b/nemo/collections/llm/gpt/model/chatglm.py @@ -221,7 +221,7 @@ def _import_qkv_weight(ctx: io.TransformCTX, hf_qkv_weights): k = k.view(*new_kv_tensor_shape) v = v.view(*new_kv_tensor_shape) - qkv_weights = torch.empty((0, head_size, old_tensor_shape[1])) + qkv_weights = torch.empty((0, head_size, old_tensor_shape[1])).type_as(hf_qkv_weights) for i in range(num_query_groups): qkv_weights = torch.cat((qkv_weights, q[i * heads_per_group : (i + 1) * heads_per_group, :, :])) qkv_weights = torch.cat((qkv_weights, k[i : i + 1, :, :])) @@ -251,7 +251,7 @@ def _import_qkv_bias(ctx: io.TransformCTX, hf_qkv_bias): q = q.view(*new_q_tensor_shape) k = k.view(*new_kv_tensor_shape) v = v.view(*new_kv_tensor_shape) - qkv_bias = torch.empty((0, head_size)) + qkv_bias = torch.empty((0, head_size)).type_as(hf_qkv_bias) for i in range(num_query_groups): qkv_bias = torch.cat((qkv_bias, q[i * heads_per_group : (i + 1) * heads_per_group, :])) qkv_bias = torch.cat((qkv_bias, k[i : i + 1, :])) diff --git a/nemo/collections/llm/gpt/model/mistral.py b/nemo/collections/llm/gpt/model/mistral.py index f353362c9cbd..b9f4b6fb8f65 100644 --- a/nemo/collections/llm/gpt/model/mistral.py +++ b/nemo/collections/llm/gpt/model/mistral.py @@ -59,7 +59,7 @@ class MistralConfig7B(GPTConfig): @dataclass -class MistralNeMo2407Config12B(MistralConfig7B): +class MistralNeMoConfig12B(MistralConfig7B): """ https://mistral.ai/news/mistral-nemo/ """ @@ -75,7 +75,7 @@ class MistralNeMo2407Config12B(MistralConfig7B): @dataclass -class MistralNeMo2407Config123B(MistralConfig7B): +class MistralNeMoConfig123B(MistralConfig7B): """ https://mistral.ai/news/mistral-large-2407/ """ diff --git a/nemo/collections/llm/gpt/model/ssm.py b/nemo/collections/llm/gpt/model/ssm.py index 954fa8bfe9f7..c7228951fa78 100644 --- a/nemo/collections/llm/gpt/model/ssm.py +++ b/nemo/collections/llm/gpt/model/ssm.py @@ -53,6 +53,9 @@ class SSMConfig(TransformerConfig, io.IOMixin): fp16_lm_cross_entropy: bool = False parallel_output: bool = True share_embeddings_and_output_weights: bool = False + params_dtype: torch.dtype = torch.bfloat16 + fp16: bool = False + bf16: bool = True num_layers: int = 2 mamba_ssm_ngroups: int = 8 num_attention_heads: int = 1 @@ -81,6 +84,7 @@ class SSMConfig(TransformerConfig, io.IOMixin): forward_step_fn: Callable = ssm_forward_step data_step_fn: Callable = gpt_data_step + tokenizer_model_path: str = None def configure_model(self, tokenizer) -> "MCoreMambaModel": @@ -127,9 +131,17 @@ def __init__(self, state_dict): def state_dict(self): return self._state_dict + def to(self, dtype): + for k, v in self._state_dict.items(): + if v.dtype != dtype: + logging.warning(f"Converting {k} from {v.dtype} (source model) to {dtype} (target model)") + self._state_dict[k] = v.to(dtype) + source = ModelState(source) target = self.init() - trainer = self.nemo_setup(target) + trainer = self.nemo_setup(target, ckpt_async_save=False) + source.to(self.config.params_dtype) + target.to(self.config.params_dtype) self.convert_state(source, target) self.nemo_save(output_path, trainer) diff --git 
a/nemo/collections/llm/peft/lora.py b/nemo/collections/llm/peft/lora.py index 36a6b0a3f350..579034deb19f 100644 --- a/nemo/collections/llm/peft/lora.py +++ b/nemo/collections/llm/peft/lora.py @@ -12,13 +12,14 @@ # See the License for the specific language governing permissions and # limitations under the License. +import re from dataclasses import dataclass, field from pathlib import Path from typing import Any, Dict, List, Literal import torch from megatron.core import parallel_state -from pytorch_lightning.trainer.states import TrainerFn +from megatron.core.tensor_parallel import ColumnParallelLinear, RowParallelLinear from torch import nn from nemo import lightning as nl @@ -32,15 +33,16 @@ from nemo.utils.import_utils import safe_import_from TEColumnParallelLinear, HAVE_TE_COL_LINEAR = safe_import_from( - "megatron.core.transformer.custom_layers.transformer_engine", "TEColumnParallelLinear" + "megatron.core.extensions.transformer_engine", "TEColumnParallelLinear" ) -TELayerNormColumnParallelLinear, HAVE_TE_COL_LINEAR = safe_import_from( - "megatron.core.transformer.custom_layers.transformer_engine", +TELayerNormColumnParallelLinear, HAVE_TE_LN_COL_LINEAR = safe_import_from( + "megatron.core.extensions.transformer_engine", "TELayerNormColumnParallelLinear", ) TERowParallelLinear, HAVE_TE_ROW_LINEAR = safe_import_from( - "megatron.core.transformer.custom_layers.transformer_engine", "TERowParallelLinear" + "megatron.core.extensions.transformer_engine", "TERowParallelLinear" ) +HAVE_TE = all((HAVE_TE_COL_LINEAR, HAVE_TE_LN_COL_LINEAR, HAVE_TE_ROW_LINEAR)) class AdapterParallelAdd(AdapterWrapper): @@ -91,6 +93,9 @@ class LoRA(PEFT): - 'linear_proj': Apply LoRA to the linear layer used for projecting the output of self-attention modules. - 'linear_fc1': Apply LoRA to the first fully-connected layer in MLP. - 'linear_fc2': Apply LoRA to the second fully-connected layer in MLP. + Target modules can also contain wildcards. For example, you can specify + target_modules=['*.layers.0.*.linear_qkv', '*.layers.1.*.linear_qkv'] to add LoRA to only linear_qkv + on the first two layers. dim (int): Dimension of the low-rank projection space. Defaults to 32. alpha (int): Weighting factor for the low-rank projection. Defaults to 32. dropout (float): Dropout rate for the low-rank projection. Defaults to 0.0. @@ -138,37 +143,43 @@ def transform(self, m: nn.Module, name=None, prefix=None): """ from nemo.collections.nlp.modules.common.megatron.adapters.parallel_adapters import ParallelLinearAdapter + def wildcard_match(pattern, key): + if key is None: + return None + regex_pattern = re.compile("^" + pattern.replace("*", "(.*)") + "$") + match = regex_pattern.match(key) + return match is not None + tp_size = parallel_state.get_tensor_model_parallel_world_size() - if name in self.target_modules: - if name in ['linear_qkv', 'linear_fc1']: - # Column Parallel Linear + full_name = f"{prefix}.{name}" if prefix else name + if name in self.target_modules or any(wildcard_match(pattern, full_name) for pattern in self.target_modules): + if HAVE_TE and isinstance(m, TEColumnParallelLinear) or isinstance(m, TELayerNormColumnParallelLinear): input_is_parallel = False - if HAVE_TE_COL_LINEAR and ( - isinstance(m, TEColumnParallelLinear) or isinstance(m, TELayerNormColumnParallelLinear) - ): - # m.in_features and m.out_features are divided by tp_size already, - # but in_features and out_features passed to ParallelLinearAdapter are not. 
- in_features = m.in_features - out_features = m.out_features * tp_size - else: - in_features = m.input_size - out_features = m.output_size + # m.in_features and m.out_features are divided by tp_size already, + # but in_features and out_features passed to ParallelLinearAdapter are not. + in_features = m.in_features + out_features = m.out_features * tp_size # LoRA is applied after layernorm, so layernorm output must be returned m.return_layernorm_output = True # perf optimization for LoRA + SP if m.config.sequence_parallel and not m.ub_overlap_ag: m.return_layernorm_output_gathered = True - else: # name in ['linear_proj', 'linear_fc2'] - # Row Parallel Linear + elif HAVE_TE and isinstance(m, TERowParallelLinear): + input_is_parallel = True + in_features = m.in_features * tp_size + out_features = m.out_features + elif isinstance(m, ColumnParallelLinear): + input_is_parallel = False + in_features = m.input_size + out_features = m.output_size + elif isinstance(m, RowParallelLinear): input_is_parallel = True - if HAVE_TE_ROW_LINEAR and isinstance(m, TERowParallelLinear): - in_features = m.in_features * tp_size - out_features = m.out_features - else: - in_features = m.input_size - out_features = m.output_size - - logging.info(f"Adding lora to: {prefix}.{name}") + in_features = m.input_size + out_features = m.output_size + else: + raise NotImplementedError(f"Layer type is unrecognized for LoRA: {type(m)}") + + logging.info(f"Adding lora to: {full_name}") adapter = ParallelLinearAdapter( in_features, out_features, diff --git a/nemo/collections/llm/recipes/__init__.py b/nemo/collections/llm/recipes/__init__.py index 6bee8c882ffd..ff81c3b383fc 100644 --- a/nemo/collections/llm/recipes/__init__.py +++ b/nemo/collections/llm/recipes/__init__.py @@ -14,6 +14,10 @@ from nemo.collections.llm.recipes import ( + baichuan2_7b, + chatglm3_6b, + gemma_2b, + gemma_7b, llama3_8b, llama3_8b_16k, llama3_8b_64k, @@ -21,7 +25,15 @@ llama3_70b_16k, llama3_70b_64k, llama31_405b, - mistral, + mamba2_1_3b, + mamba2_2_7b, + mamba2_8b, + mamba2_130m, + mamba2_370m, + mamba2_780m, + mamba2_hybrid_8b, + mistral_7b, + mistral_nemo_12b, mixtral_8x7b, mixtral_8x7b_16k, mixtral_8x7b_64k, @@ -41,6 +53,10 @@ from nemo.collections.llm.recipes.optim import adam __all__ = [ + "baichuan2_7b", + "chatglm3_6b", + "gemma_2b", + "gemma_7b", "llama3_8b", "llama3_8b_16k", "llama3_8b_64k", @@ -48,7 +64,15 @@ "llama3_70b_16k", "llama3_70b_64k", "llama31_405b", - "mistral", + "mamba2_130m", + "mamba2_370m", + "mamba2_780m", + "mamba2_1_3b", + "mamba2_2_7b", + "mamba2_8b", + "mamba2_hybrid_8b", + "mistral_7b", + "mistral_nemo_12b", "mixtral_8x7b", "mixtral_8x7b_16k", "mixtral_8x7b_64k", diff --git a/nemo/collections/llm/recipes/baichuan2_7b.py b/nemo/collections/llm/recipes/baichuan2_7b.py new file mode 100644 index 000000000000..3ebb643af779 --- /dev/null +++ b/nemo/collections/llm/recipes/baichuan2_7b.py @@ -0,0 +1,285 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
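+# Like the other model recipes in nemo.collections.llm.recipes, this module exposes nemo_run
+# factories for the Baichuan2 7B model (`model`), its trainer (`trainer`), pre-training recipes
+# (`pretrain_recipe`, `pretrain_recipe_performance`) and a fine-tuning recipe (`finetune_recipe`)
+# that defaults to LoRA, all registered under NAME = "baichuan2_7b".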
+ + +from typing import Callable, Optional + +import nemo_run as run +import pytorch_lightning as pl +import torch +from megatron.core.distributed import DistributedDataParallelConfig +from pytorch_lightning.callbacks.callback import Callback + +from nemo import lightning as nl +from nemo.collections.llm import Baichuan2Config7B, Baichuan2Model +from nemo.collections.llm.api import finetune, pretrain +from nemo.collections.llm.gpt.data.mock import MockDataModule +from nemo.collections.llm.peft.lora import LoRA +from nemo.collections.llm.recipes.finetune_default import default_finetune_recipe +from nemo.collections.llm.recipes.log.default import default_log, default_resume, tensorboard_logger +from nemo.collections.llm.recipes.optim.adam import distributed_fused_adam_with_cosine_annealing +from nemo.collections.llm.recipes.precision.mixed_precision import bf16_mixed +from nemo.lightning.pytorch.callbacks.megatron_comm_overlap import MegatronCommOverlapCallback +from nemo.utils.exp_manager import TimingCallback + +NAME = "baichuan2_7b" + + +@run.cli.factory(name=NAME) +def model() -> run.Config[pl.LightningModule]: + """ + Factory function to create a Baichuan2 7B model configuration. + + Returns: + run.Config[pl.LightningModule]: Configuration for the Baichuan2 7B model. + + Examples: + CLI usage: + $ nemo llm pretrain model=baichuan2_7b ... + + Python API usage: + >>> model_config = model() + >>> print(model_config) + """ + return run.Config(Baichuan2Model, config=run.Config(Baichuan2Config7B)) + + +def trainer( + tensor_parallelism: int = 1, + pipeline_parallelism: int = 1, + pipeline_parallelism_type: Optional[torch.dtype] = None, + virtual_pipeline_parallelism: Optional[int] = None, + context_parallelism: int = 2, + sequence_parallelism: bool = False, + num_nodes: int = 1, + num_gpus_per_node: int = 8, + max_steps: int = 1168251, + callbacks: Optional[list[run.Config[Callback]]] = None, +) -> run.Config[nl.Trainer]: + """ + Configure the NeMo Lightning Trainer for Baichuan2 7B model. + + This function sets up the distributed training strategy and other training parameters. + + Args: + tensor_parallelism (int): Degree of tensor model parallelism. + pipeline_parallelism (int): Degree of pipeline model parallelism. + pipeline_parallelism_type (Optional[torch.dtype]): Data type for pipeline parallelism. + virtual_pipeline_parallelism (Optional[int]): Size of virtual pipeline parallelism. + context_parallelism (int): Degree of context parallelism. + sequence_parallelism (bool): Whether to use sequence parallelism. + num_nodes (int): Number of compute nodes to use. + num_gpus_per_node (int): Number of GPUs per node. + max_steps (int): Maximum number of training steps. + callbacks (Optional[list[run.Config[Callback]]]): List of callback configurations. + + Returns: + run.Config[nl.Trainer]: Configuration for the NeMo Lightning Trainer. + + Examples: + CLI usage: + $ nemo llm pretrain trainer=baichuan2_7b ... + + Python API usage: + >>> trainer_config = trainer(num_nodes=2, num_gpus_per_node=8) + >>> print(trainer_config) + + Note: + For more information on distributed training strategies, refer to the + NeMo documentation on multi-GPU and multi-node training. 
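+ By default this trainer uses tensor/pipeline parallel size 1 with context parallel size 2,
+ bf16 mixed precision, async distributed checkpoint saving, and a DistributedDataParallelConfig
+ that overlaps gradient reduction and parameter gathering (see the strategy below).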
+ """ + strategy = run.Config( + nl.MegatronStrategy, + tensor_model_parallel_size=tensor_parallelism, + pipeline_model_parallel_size=pipeline_parallelism, + pipeline_dtype=pipeline_parallelism_type, + virtual_pipeline_model_parallel_size=virtual_pipeline_parallelism, + context_parallel_size=context_parallelism, + sequence_parallel=sequence_parallelism, + gradient_as_bucket_view=True, + ckpt_async_save=True, + ckpt_parallel_load=True, + ddp=run.Config( + DistributedDataParallelConfig, + check_for_nan_in_grad=True, + grad_reduce_in_fp32=True, + overlap_grad_reduce=True, + overlap_param_gather=True, + ), + ) + + trainer = run.Config( + nl.Trainer, + accelerator="gpu", + accumulate_grad_batches=1, + callbacks=callbacks, + devices=num_gpus_per_node, + limit_test_batches=50, + limit_val_batches=32, + log_every_n_steps=10, + max_steps=max_steps, + num_nodes=num_nodes, + plugins=bf16_mixed(), + strategy=strategy, + use_distributed_sampler=False, + val_check_interval=2000, + ) + + return trainer + + +@run.cli.factory(target=pretrain, name=NAME) +def pretrain_recipe( + dir: Optional[str] = None, name: str = "default", num_nodes: int = 1, num_gpus_per_node: int = 8, fn=pretrain +) -> run.Partial: + """ + Create a pre-training recipe for Baichuan2 7B model. + + This function sets up a complete configuration for pre-training, including + model, trainer, data, logging, optimization, and resumption settings. + + Args: + dir (Optional[str]): Directory for saving logs and checkpoints. + name (str): Name of the pre-training run. + num_nodes (int): Number of compute nodes to use. + num_gpus_per_node (int): Number of GPUs per node. + fn (Callable): The pre-training function to use. + + Returns: + run.Partial: Partial configuration for pre-training. + + Examples: + CLI usage: + $ nemo llm pretrain --factory baichuan2_7b + $ nemo llm pretrain --factory "baichuan2_7b(num_nodes=2, name='my_pretrain')" + + Python API usage: + >>> recipe = pretrain_recipe(name="baichuan2_7b_pretrain", num_nodes=2) + >>> print(recipe) + + Note: + For more details on pre-training LLMs with NeMo, see the pre-training + guide in the `examples/llm/pretrain/` directory. + """ + return run.Partial( + fn, + model=model(), + trainer=trainer( + num_nodes=num_nodes, + num_gpus_per_node=num_gpus_per_node, + callbacks=[run.Config(TimingCallback)], + ), + data=run.Config(MockDataModule, seq_length=8192, global_batch_size=512, micro_batch_size=1), + log=default_log(dir=dir, name=name, tensorboard_logger=tensorboard_logger(name=name)), + optim=distributed_fused_adam_with_cosine_annealing(max_lr=3e-4), + resume=default_resume(), + ) + + +@run.cli.factory(target=pretrain, name=NAME + "_optimized") +def pretrain_recipe_performance( + dir: Optional[str] = None, + name: str = "default", + num_nodes: int = 1, + num_gpus_per_node: int = 8, + fn: Callable = pretrain, +) -> run.Partial: + """ + Create a performance-optimized pre-training recipe for Baichuan2 7B model. + + This recipe enables performance optimizations that may not be suitable for all use cases. + It builds upon the standard pre-training recipe and adds additional performance enhancements. + + Args: + dir (Optional[str]): Directory for saving logs and checkpoints. + name (str): Name of the pre-training run. + num_nodes (int): Number of compute nodes to use. + num_gpus_per_node (int): Number of GPUs per node. + fn (Callable): The pre-training function to use. + + Returns: + run.Partial: Partial configuration for performance-optimized pre-training. 
+ + Examples: + $ nemo llm pretrain --factory baichuan2_7b_optimized + + Python API usage: + >>> recipe = pretrain_recipe_performance(name="baichuan2_7b_perf", num_nodes=4) + >>> print(recipe) + + Note: + Use this recipe with caution and only when you need maximum performance. + It may not be suitable for all hardware configurations or use cases. + """ + recipe = pretrain_recipe(name=name, dir=dir, num_nodes=num_nodes, num_gpus_per_node=num_gpus_per_node, fn=fn) + + recipe.trainer.callbacks.append( + run.Config( + MegatronCommOverlapCallback, + tp_comm_overlap=False, + ) + ) + return recipe + + +@run.cli.factory(target=finetune, name=NAME) +def finetune_recipe( + dir: Optional[str] = None, + name: str = "default", + num_nodes: int = 1, + num_gpus_per_node: int = 8, + peft_scheme: Optional[str] = 'lora', +) -> run.Partial: + """ + Create a fine-tuning recipe for Baichuan2 7B model. + + This function sets up a complete configuration for fine-tuning, including + model, trainer, data, logging, optimization, and resumption settings. + The recipe uses LoRA (Low-Rank Adaptation) for efficient fine-tuning, unless peft_scheme is set to None. + + Args: + dir (Optional[str]): Directory for saving logs and checkpoints. + name (str): Name of the fine-tuning run. + num_nodes (int): Number of compute nodes to use. + num_gpus_per_node (int): Number of GPUs per node. + peft_scheme (Optional[str]): Name of the peft scheme to use for fine-tuning. Allowed values: 'lora', 'none'/None. + + Returns: + run.Partial: Partial configuration for fine-tuning. + + Examples: + CLI usage: + $ nemo llm finetune --factory baichuan2_7b + + Python API usage: + >>> recipe = finetune_recipe(name="baichuan2_7b_finetune", num_nodes=2) + >>> print(recipe) + + Note: + This recipe uses the SQuAD dataset for fine-tuning. For more information + on fine-tuning LLMs with NeMo, see the fine-tuning guide in the + `examples/llm/finetune/` directory. + """ + recipe = default_finetune_recipe( + model(), "baichuan-inc/Baichuan2-7B-Base", dir, name, num_nodes, num_gpus_per_node + ) + if peft_scheme is None or peft_scheme.lower() == 'none': + recipe.trainer.strategy.tensor_model_parallel_size = 2 + recipe.optim.config.lr = 5e-6 + elif peft_scheme.lower() == 'lora': + recipe.peft = run.Config(LoRA) + recipe.optim.config.lr = 1e-4 + else: + raise ValueError(f"Unrecognized peft scheme: {peft_scheme}") + return recipe diff --git a/nemo/collections/llm/recipes/chatglm3_6b.py b/nemo/collections/llm/recipes/chatglm3_6b.py new file mode 100644 index 000000000000..f5d580a9c6ea --- /dev/null +++ b/nemo/collections/llm/recipes/chatglm3_6b.py @@ -0,0 +1,283 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ + +from typing import Callable, Optional + +import nemo_run as run +import pytorch_lightning as pl +import torch +from megatron.core.distributed import DistributedDataParallelConfig +from pytorch_lightning.callbacks.callback import Callback + +from nemo import lightning as nl +from nemo.collections.llm import ChatGLM3Config6B, ChatGLMModel +from nemo.collections.llm.api import finetune, pretrain +from nemo.collections.llm.gpt.data.mock import MockDataModule +from nemo.collections.llm.peft.lora import LoRA +from nemo.collections.llm.recipes.finetune_default import default_finetune_recipe +from nemo.collections.llm.recipes.log.default import default_log, default_resume, tensorboard_logger +from nemo.collections.llm.recipes.optim.adam import distributed_fused_adam_with_cosine_annealing +from nemo.collections.llm.recipes.precision.mixed_precision import bf16_mixed +from nemo.lightning.pytorch.callbacks.megatron_comm_overlap import MegatronCommOverlapCallback +from nemo.utils.exp_manager import TimingCallback + +NAME = "chatglm3_6b" + + +@run.cli.factory(name=NAME) +def model() -> run.Config[pl.LightningModule]: + """ + Factory function to create a ChatGLM3 6B model configuration. + + Returns: + run.Config[pl.LightningModule]: Configuration for the ChatGLM3 6B model. + + Examples: + CLI usage: + $ nemo llm pretrain model=chatglm3_6b ... + + Python API usage: + >>> model_config = model() + >>> print(model_config) + """ + return run.Config(ChatGLMModel, config=run.Config(ChatGLM3Config6B)) + + +def trainer( + tensor_parallelism: int = 1, + pipeline_parallelism: int = 1, + pipeline_parallelism_type: Optional[torch.dtype] = None, + virtual_pipeline_parallelism: Optional[int] = None, + context_parallelism: int = 2, + sequence_parallelism: bool = False, + num_nodes: int = 1, + num_gpus_per_node: int = 8, + max_steps: int = 1168251, + callbacks: Optional[list[run.Config[Callback]]] = None, +) -> run.Config[nl.Trainer]: + """ + Configure the NeMo Lightning Trainer for ChatGLM3 6B model. + + This function sets up the distributed training strategy and other training parameters. + + Args: + tensor_parallelism (int): Degree of tensor model parallelism. + pipeline_parallelism (int): Degree of pipeline model parallelism. + pipeline_parallelism_type (Optional[torch.dtype]): Data type for pipeline parallelism. + virtual_pipeline_parallelism (Optional[int]): Size of virtual pipeline parallelism. + context_parallelism (int): Degree of context parallelism. + sequence_parallelism (bool): Whether to use sequence parallelism. + num_nodes (int): Number of compute nodes to use. + num_gpus_per_node (int): Number of GPUs per node. + max_steps (int): Maximum number of training steps. + callbacks (Optional[list[run.Config[Callback]]]): List of callback configurations. + + Returns: + run.Config[nl.Trainer]: Configuration for the NeMo Lightning Trainer. + + Examples: + CLI usage: + $ nemo llm pretrain trainer=chatglm3_6b ... + + Python API usage: + >>> trainer_config = trainer(num_nodes=2, num_gpus_per_node=8) + >>> print(trainer_config) + + Note: + For more information on distributed training strategies, refer to the + NeMo documentation on multi-GPU and multi-node training. 
+ """ + strategy = run.Config( + nl.MegatronStrategy, + tensor_model_parallel_size=tensor_parallelism, + pipeline_model_parallel_size=pipeline_parallelism, + pipeline_dtype=pipeline_parallelism_type, + virtual_pipeline_model_parallel_size=virtual_pipeline_parallelism, + context_parallel_size=context_parallelism, + sequence_parallel=sequence_parallelism, + gradient_as_bucket_view=True, + ckpt_async_save=True, + ckpt_parallel_load=True, + ddp=run.Config( + DistributedDataParallelConfig, + check_for_nan_in_grad=True, + grad_reduce_in_fp32=True, + overlap_grad_reduce=True, + overlap_param_gather=True, + ), + ) + + trainer = run.Config( + nl.Trainer, + accelerator="gpu", + accumulate_grad_batches=1, + callbacks=callbacks, + devices=num_gpus_per_node, + limit_test_batches=50, + limit_val_batches=32, + log_every_n_steps=10, + max_steps=max_steps, + num_nodes=num_nodes, + plugins=bf16_mixed(), + strategy=strategy, + use_distributed_sampler=False, + val_check_interval=2000, + ) + + return trainer + + +@run.cli.factory(target=pretrain, name=NAME) +def pretrain_recipe( + dir: Optional[str] = None, name: str = "default", num_nodes: int = 1, num_gpus_per_node: int = 8, fn=pretrain +) -> run.Partial: + """ + Create a pre-training recipe for ChatGLM3 6B model. + + This function sets up a complete configuration for pre-training, including + model, trainer, data, logging, optimization, and resumption settings. + + Args: + dir (Optional[str]): Directory for saving logs and checkpoints. + name (str): Name of the pre-training run. + num_nodes (int): Number of compute nodes to use. + num_gpus_per_node (int): Number of GPUs per node. + fn (Callable): The pre-training function to use. + + Returns: + run.Partial: Partial configuration for pre-training. + + Examples: + CLI usage: + $ nemo llm pretrain --factory chatglm3_6b + $ nemo llm pretrain --factory "chatglm3_6b(num_nodes=2, name='my_pretrain')" + + Python API usage: + >>> recipe = pretrain_recipe(name="chatglm3_6b_pretrain", num_nodes=2) + >>> print(recipe) + + Note: + For more details on pre-training LLMs with NeMo, see the pre-training + guide in the `examples/llm/pretrain/` directory. + """ + return run.Partial( + fn, + model=model(), + trainer=trainer( + num_nodes=num_nodes, + num_gpus_per_node=num_gpus_per_node, + callbacks=[run.Config(TimingCallback)], + ), + data=run.Config(MockDataModule, seq_length=8192, global_batch_size=512, micro_batch_size=1), + log=default_log(dir=dir, name=name, tensorboard_logger=tensorboard_logger(name=name)), + optim=distributed_fused_adam_with_cosine_annealing(max_lr=3e-4), + resume=default_resume(), + ) + + +@run.cli.factory(target=pretrain, name=NAME + "_optimized") +def pretrain_recipe_performance( + dir: Optional[str] = None, + name: str = "default", + num_nodes: int = 1, + num_gpus_per_node: int = 8, + fn: Callable = pretrain, +) -> run.Partial: + """ + Create a performance-optimized pre-training recipe for ChatGLM3 6B model. + + This recipe enables performance optimizations that may not be suitable for all use cases. + It builds upon the standard pre-training recipe and adds additional performance enhancements. + + Args: + dir (Optional[str]): Directory for saving logs and checkpoints. + name (str): Name of the pre-training run. + num_nodes (int): Number of compute nodes to use. + num_gpus_per_node (int): Number of GPUs per node. + fn (Callable): The pre-training function to use. + + Returns: + run.Partial: Partial configuration for performance-optimized pre-training. 
+ + Examples: + $ nemo llm pretrain --factory chatglm3_6b_optimized + + Python API usage: + >>> recipe = pretrain_recipe_performance(name="chatglm3_6b_perf", num_nodes=4) + >>> print(recipe) + + Note: + Use this recipe with caution and only when you need maximum performance. + It may not be suitable for all hardware configurations or use cases. + """ + recipe = pretrain_recipe(name=name, dir=dir, num_nodes=num_nodes, num_gpus_per_node=num_gpus_per_node, fn=fn) + + recipe.trainer.callbacks.append( + run.Config( + MegatronCommOverlapCallback, + tp_comm_overlap=False, + ) + ) + return recipe + + +@run.cli.factory(target=finetune, name=NAME) +def finetune_recipe( + dir: Optional[str] = None, + name: str = "default", + num_nodes: int = 1, + num_gpus_per_node: int = 8, + peft_scheme: Optional[str] = 'lora', +) -> run.Partial: + """ + Create a fine-tuning recipe for ChatGLM3 6B model. + + This function sets up a complete configuration for fine-tuning, including + model, trainer, data, logging, optimization, and resumption settings. + The recipe uses LoRA (Low-Rank Adaptation) for efficient fine-tuning, unless peft_scheme is set to None. + + Args: + dir (Optional[str]): Directory for saving logs and checkpoints. + name (str): Name of the fine-tuning run. + num_nodes (int): Number of compute nodes to use. + num_gpus_per_node (int): Number of GPUs per node. + peft_scheme (Optional[str]): Name of the peft scheme to use for fine-tuning. Allowed values: 'lora', 'none'/None. + + Returns: + run.Partial: Partial configuration for fine-tuning. + + Examples: + CLI usage: + $ nemo llm finetune --factory chatglm3_6b + + Python API usage: + >>> recipe = finetune_recipe(name="chatglm3_6b_finetune", num_nodes=2) + >>> print(recipe) + + Note: + This recipe uses the SQuAD dataset for fine-tuning. For more information + on fine-tuning LLMs with NeMo, see the fine-tuning guide in the + `examples/llm/finetune/` directory. 
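+ With the default peft_scheme='lora', the recipe attaches a LoRA adapter and uses a 1e-4
+ learning rate; passing peft_scheme=None (or 'none') switches to full fine-tuning with
+ tensor parallel size 2 and a 5e-6 learning rate.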
+ """ + recipe = default_finetune_recipe(model(), "THUDM/chatglm3-6b", dir, name, num_nodes, num_gpus_per_node) + if peft_scheme is None or peft_scheme.lower() == 'none': + recipe.trainer.strategy.tensor_model_parallel_size = 2 + recipe.optim.config.lr = 5e-6 + elif peft_scheme.lower() == 'lora': + recipe.peft = run.Config(LoRA) + recipe.optim.config.lr = 1e-4 + else: + raise ValueError(f"Unrecognized peft scheme: {peft_scheme}") + return recipe diff --git a/nemo/collections/llm/recipes/finetune_default.py b/nemo/collections/llm/recipes/finetune_default.py index 89c982613126..255763abbf50 100644 --- a/nemo/collections/llm/recipes/finetune_default.py +++ b/nemo/collections/llm/recipes/finetune_default.py @@ -60,7 +60,7 @@ def default_finetune_recipe( ), data=run.Config(llm.SquadDataModule, seq_length=2048, global_batch_size=128, micro_batch_size=1), log=llm.default_log(dir=dir, name=name, tensorboard_logger=tensorboard_logger(name=name)), - optim=distributed_fused_adam_with_cosine_annealing(max_lr=1e-4, min_lr=0, warmup_steps=50), + optim=distributed_fused_adam_with_cosine_annealing(max_lr=1e-4, min_lr=0, warmup_steps=50, adam_beta2=0.98), resume=nemo_resume(resume_path), ) @@ -77,9 +77,9 @@ def default_finetune_trainer( num_nodes=1, num_gpus_per_node=8, max_steps=1000, - limit_test_batches=None, - limit_val_batches=None, - val_check_interval=5, + limit_test_batches=1, + limit_val_batches=1, + val_check_interval=30, ): strategy = run.Config( nl.MegatronStrategy, diff --git a/nemo/collections/llm/recipes/gemma_2b.py b/nemo/collections/llm/recipes/gemma_2b.py new file mode 100644 index 000000000000..cbcd340c1e92 --- /dev/null +++ b/nemo/collections/llm/recipes/gemma_2b.py @@ -0,0 +1,285 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +import os +from typing import Callable, Optional + +import nemo_run as run +import pytorch_lightning as pl +import torch +from megatron.core.distributed import DistributedDataParallelConfig +from pytorch_lightning.callbacks.callback import Callback + +from nemo import lightning as nl +from nemo.collections.llm import GemmaConfig2B, GemmaModel +from nemo.collections.llm.api import finetune, pretrain +from nemo.collections.llm.gpt.data.mock import MockDataModule +from nemo.collections.llm.peft.lora import LoRA +from nemo.collections.llm.recipes.finetune_default import default_finetune_recipe +from nemo.collections.llm.recipes.log.default import default_log, default_resume, tensorboard_logger +from nemo.collections.llm.recipes.optim.adam import distributed_fused_adam_with_cosine_annealing +from nemo.collections.llm.recipes.precision.mixed_precision import bf16_mixed +from nemo.lightning.pytorch.callbacks.megatron_comm_overlap import MegatronCommOverlapCallback +from nemo.utils.exp_manager import TimingCallback + +NAME = "gemma_2b" + + +@run.cli.factory(name=NAME) +def model() -> run.Config[pl.LightningModule]: + """ + Factory function to create a Gemma 2B model configuration. 
+ + Returns: + run.Config[pl.LightningModule]: Configuration for the Gemma 2B model. + + Examples: + CLI usage: + $ nemo llm pretrain model=gemma_2b ... + + Python API usage: + >>> model_config = model() + >>> print(model_config) + """ + return run.Config(GemmaModel, config=run.Config(GemmaConfig2B)) + + +def trainer( + tensor_parallelism: int = 1, + pipeline_parallelism: int = 1, + pipeline_parallelism_type: Optional[torch.dtype] = None, + virtual_pipeline_parallelism: Optional[int] = None, + context_parallelism: int = 2, + sequence_parallelism: bool = False, + num_nodes: int = 1, + num_gpus_per_node: int = 8, + max_steps: int = 1168251, + callbacks: Optional[list[run.Config[Callback]]] = None, +) -> run.Config[nl.Trainer]: + """ + Configure the NeMo Lightning Trainer for Gemma 2B model. + + This function sets up the distributed training strategy and other training parameters. + + Args: + tensor_parallelism (int): Degree of tensor model parallelism. + pipeline_parallelism (int): Degree of pipeline model parallelism. + pipeline_parallelism_type (Optional[torch.dtype]): Data type for pipeline parallelism. + virtual_pipeline_parallelism (Optional[int]): Size of virtual pipeline parallelism. + context_parallelism (int): Degree of context parallelism. + sequence_parallelism (bool): Whether to use sequence parallelism. + num_nodes (int): Number of compute nodes to use. + num_gpus_per_node (int): Number of GPUs per node. + max_steps (int): Maximum number of training steps. + callbacks (Optional[list[run.Config[Callback]]]): List of callback configurations. + + Returns: + run.Config[nl.Trainer]: Configuration for the NeMo Lightning Trainer. + + Examples: + CLI usage: + $ nemo llm pretrain trainer=gemma_2b ... + + Python API usage: + >>> trainer_config = trainer(num_nodes=2, num_gpus_per_node=8) + >>> print(trainer_config) + + Note: + For more information on distributed training strategies, refer to the + NeMo documentation on multi-GPU and multi-node training. + """ + strategy = run.Config( + nl.MegatronStrategy, + tensor_model_parallel_size=tensor_parallelism, + pipeline_model_parallel_size=pipeline_parallelism, + pipeline_dtype=pipeline_parallelism_type, + virtual_pipeline_model_parallel_size=virtual_pipeline_parallelism, + context_parallel_size=context_parallelism, + sequence_parallel=sequence_parallelism, + gradient_as_bucket_view=True, + ckpt_async_save=True, + ckpt_parallel_load=True, + ddp=run.Config( + DistributedDataParallelConfig, + check_for_nan_in_grad=True, + grad_reduce_in_fp32=True, + overlap_grad_reduce=True, + overlap_param_gather=True, + ), + ) + + trainer = run.Config( + nl.Trainer, + accelerator="gpu", + accumulate_grad_batches=1, + callbacks=callbacks, + devices=num_gpus_per_node, + limit_test_batches=50, + limit_val_batches=32, + log_every_n_steps=10, + max_steps=max_steps, + num_nodes=num_nodes, + plugins=bf16_mixed(), + strategy=strategy, + use_distributed_sampler=False, + val_check_interval=2000, + ) + + return trainer + + +@run.cli.factory(target=pretrain, name=NAME) +def pretrain_recipe( + dir: Optional[str] = None, name: str = "default", num_nodes: int = 1, num_gpus_per_node: int = 8, fn=pretrain +) -> run.Partial: + """ + Create a pre-training recipe for Gemma 2B model. + + This function sets up a complete configuration for pre-training, including + model, trainer, data, logging, optimization, and resumption settings. + + Args: + dir (Optional[str]): Directory for saving logs and checkpoints. + name (str): Name of the pre-training run. 
+ num_nodes (int): Number of compute nodes to use. + num_gpus_per_node (int): Number of GPUs per node. + fn (Callable): The pre-training function to use. + + Returns: + run.Partial: Partial configuration for pre-training. + + Examples: + CLI usage: + $ nemo llm pretrain --factory gemma_2b + $ nemo llm pretrain --factory "gemma_2b(num_nodes=2, name='my_pretrain')" + + Python API usage: + >>> recipe = pretrain_recipe(name="gemma_2b_pretrain", num_nodes=2) + >>> print(recipe) + + Note: + For more details on pre-training LLMs with NeMo, see the pre-training + guide in the `examples/llm/pretrain/` directory. + """ + return run.Partial( + fn, + model=model(), + trainer=trainer( + num_nodes=num_nodes, + num_gpus_per_node=num_gpus_per_node, + callbacks=[run.Config(TimingCallback)], + ), + data=run.Config(MockDataModule, seq_length=8192, global_batch_size=512, micro_batch_size=1), + log=default_log(dir=dir, name=name, tensorboard_logger=tensorboard_logger(name=name)), + optim=distributed_fused_adam_with_cosine_annealing(max_lr=3e-4), + resume=default_resume(), + ) + + +@run.cli.factory(target=pretrain, name=NAME + "_optimized") +def pretrain_recipe_performance( + dir: Optional[str] = None, + name: str = "default", + num_nodes: int = 1, + num_gpus_per_node: int = 8, + fn: Callable = pretrain, +) -> run.Partial: + """ + Create a performance-optimized pre-training recipe for Gemma 2B model. + + This recipe enables performance optimizations that may not be suitable for all use cases. + It builds upon the standard pre-training recipe and adds additional performance enhancements. + + Args: + dir (Optional[str]): Directory for saving logs and checkpoints. + name (str): Name of the pre-training run. + num_nodes (int): Number of compute nodes to use. + num_gpus_per_node (int): Number of GPUs per node. + fn (Callable): The pre-training function to use. + + Returns: + run.Partial: Partial configuration for performance-optimized pre-training. + + Examples: + $ nemo llm pretrain --factory gemma_2b_optimized + + Python API usage: + >>> recipe = pretrain_recipe_performance(name="gemma_2b_perf", num_nodes=4) + >>> print(recipe) + + Note: + Use this recipe with caution and only when you need maximum performance. + It may not be suitable for all hardware configurations or use cases. + """ + recipe = pretrain_recipe(name=name, dir=dir, num_nodes=num_nodes, num_gpus_per_node=num_gpus_per_node, fn=fn) + + recipe.trainer.callbacks.append( + run.Config( + MegatronCommOverlapCallback, + tp_comm_overlap=False, + ) + ) + return recipe + + +@run.cli.factory(target=finetune, name=NAME) +def finetune_recipe( + dir: Optional[str] = None, + name: str = "default", + num_nodes: int = 1, + num_gpus_per_node: int = 8, + peft_scheme: Optional[str] = 'lora', +) -> run.Partial: + """ + Create a fine-tuning recipe for Gemma 2B model. + + This function sets up a complete configuration for fine-tuning, including + model, trainer, data, logging, optimization, and resumption settings. + The recipe uses LoRA (Low-Rank Adaptation) for efficient fine-tuning, unless peft_scheme is set to None. + + Args: + dir (Optional[str]): Directory for saving logs and checkpoints. + name (str): Name of the fine-tuning run. + num_nodes (int): Number of compute nodes to use. + num_gpus_per_node (int): Number of GPUs per node. + peft_scheme (Optional[str]): Name of the peft scheme to use for fine-tuning. Allowed values: 'lora', 'none'/None. + + Returns: + run.Partial: Partial configuration for fine-tuning. 
+ + Examples: + CLI usage: + $ nemo llm finetune --factory gemma_2b + + Python API usage: + >>> recipe = finetune_recipe(name="gemma_2b_finetune", num_nodes=2) + >>> print(recipe) + + Note: + This recipe uses the SQuAD dataset for fine-tuning. For more information + on fine-tuning LLMs with NeMo, see the fine-tuning guide in the + `examples/llm/finetune/` directory. + """ + # Disable cuDNN attention since TE 1.8 does not support head dim > 128 + os.environ['NVTE_FUSED_ATTN'] = "0" + + recipe = default_finetune_recipe(model(), "google/gemma-2b", dir, name, num_nodes, num_gpus_per_node) + if peft_scheme is None or peft_scheme.lower() == 'none': + recipe.trainer.strategy.tensor_model_parallel_size = 2 + recipe.optim.config.lr = 5e-6 + elif peft_scheme.lower() == 'lora': + recipe.peft = run.Config(LoRA) + recipe.optim.config.lr = 1e-4 + else: + raise ValueError(f"Unrecognized peft scheme: {peft_scheme}") + return recipe diff --git a/nemo/collections/llm/recipes/gemma_7b.py b/nemo/collections/llm/recipes/gemma_7b.py new file mode 100644 index 000000000000..3b0e206d9ce7 --- /dev/null +++ b/nemo/collections/llm/recipes/gemma_7b.py @@ -0,0 +1,285 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +import os +from typing import Callable, Optional + +import nemo_run as run +import pytorch_lightning as pl +import torch +from megatron.core.distributed import DistributedDataParallelConfig +from pytorch_lightning.callbacks.callback import Callback + +from nemo import lightning as nl +from nemo.collections.llm import GemmaConfig7B, GemmaModel +from nemo.collections.llm.api import finetune, pretrain +from nemo.collections.llm.gpt.data.mock import MockDataModule +from nemo.collections.llm.peft.lora import LoRA +from nemo.collections.llm.recipes.finetune_default import default_finetune_recipe +from nemo.collections.llm.recipes.log.default import default_log, default_resume, tensorboard_logger +from nemo.collections.llm.recipes.optim.adam import distributed_fused_adam_with_cosine_annealing +from nemo.collections.llm.recipes.precision.mixed_precision import bf16_mixed +from nemo.lightning.pytorch.callbacks.megatron_comm_overlap import MegatronCommOverlapCallback +from nemo.utils.exp_manager import TimingCallback + +NAME = "gemma_7b" + + +@run.cli.factory(name=NAME) +def model() -> run.Config[pl.LightningModule]: + """ + Factory function to create a Gemma 7B model configuration. + + Returns: + run.Config[pl.LightningModule]: Configuration for the Gemma 7B model. + + Examples: + CLI usage: + $ nemo llm pretrain model=gemma_7b ... 
+ + Python API usage: + >>> model_config = model() + >>> print(model_config) + """ + return run.Config(GemmaModel, config=run.Config(GemmaConfig7B)) + + +def trainer( + tensor_parallelism: int = 1, + pipeline_parallelism: int = 1, + pipeline_parallelism_type: Optional[torch.dtype] = None, + virtual_pipeline_parallelism: Optional[int] = None, + context_parallelism: int = 2, + sequence_parallelism: bool = False, + num_nodes: int = 1, + num_gpus_per_node: int = 8, + max_steps: int = 1168251, + callbacks: Optional[list[run.Config[Callback]]] = None, +) -> run.Config[nl.Trainer]: + """ + Configure the NeMo Lightning Trainer for Gemma 7B model. + + This function sets up the distributed training strategy and other training parameters. + + Args: + tensor_parallelism (int): Degree of tensor model parallelism. + pipeline_parallelism (int): Degree of pipeline model parallelism. + pipeline_parallelism_type (Optional[torch.dtype]): Data type for pipeline parallelism. + virtual_pipeline_parallelism (Optional[int]): Size of virtual pipeline parallelism. + context_parallelism (int): Degree of context parallelism. + sequence_parallelism (bool): Whether to use sequence parallelism. + num_nodes (int): Number of compute nodes to use. + num_gpus_per_node (int): Number of GPUs per node. + max_steps (int): Maximum number of training steps. + callbacks (Optional[list[run.Config[Callback]]]): List of callback configurations. + + Returns: + run.Config[nl.Trainer]: Configuration for the NeMo Lightning Trainer. + + Examples: + CLI usage: + $ nemo llm pretrain trainer=gemma_7b ... + + Python API usage: + >>> trainer_config = trainer(num_nodes=2, num_gpus_per_node=8) + >>> print(trainer_config) + + Note: + For more information on distributed training strategies, refer to the + NeMo documentation on multi-GPU and multi-node training. + """ + strategy = run.Config( + nl.MegatronStrategy, + tensor_model_parallel_size=tensor_parallelism, + pipeline_model_parallel_size=pipeline_parallelism, + pipeline_dtype=pipeline_parallelism_type, + virtual_pipeline_model_parallel_size=virtual_pipeline_parallelism, + context_parallel_size=context_parallelism, + sequence_parallel=sequence_parallelism, + gradient_as_bucket_view=True, + ckpt_async_save=True, + ckpt_parallel_load=True, + ddp=run.Config( + DistributedDataParallelConfig, + check_for_nan_in_grad=True, + grad_reduce_in_fp32=True, + overlap_grad_reduce=True, + overlap_param_gather=True, + ), + ) + + trainer = run.Config( + nl.Trainer, + accelerator="gpu", + accumulate_grad_batches=1, + callbacks=callbacks, + devices=num_gpus_per_node, + limit_test_batches=50, + limit_val_batches=32, + log_every_n_steps=10, + max_steps=max_steps, + num_nodes=num_nodes, + plugins=bf16_mixed(), + strategy=strategy, + use_distributed_sampler=False, + val_check_interval=2000, + ) + + return trainer + + +@run.cli.factory(target=pretrain, name=NAME) +def pretrain_recipe( + dir: Optional[str] = None, name: str = "default", num_nodes: int = 1, num_gpus_per_node: int = 8, fn=pretrain +) -> run.Partial: + """ + Create a pre-training recipe for Gemma 7B model. + + This function sets up a complete configuration for pre-training, including + model, trainer, data, logging, optimization, and resumption settings. + + Args: + dir (Optional[str]): Directory for saving logs and checkpoints. + name (str): Name of the pre-training run. + num_nodes (int): Number of compute nodes to use. + num_gpus_per_node (int): Number of GPUs per node. + fn (Callable): The pre-training function to use. 
+ + Returns: + run.Partial: Partial configuration for pre-training. + + Examples: + CLI usage: + $ nemo llm pretrain --factory gemma_7b + $ nemo llm pretrain --factory "gemma_7b(num_nodes=2, name='my_pretrain')" + + Python API usage: + >>> recipe = pretrain_recipe(name="gemma_7b_pretrain", num_nodes=2) + >>> print(recipe) + + Note: + For more details on pre-training LLMs with NeMo, see the pre-training + guide in the `examples/llm/pretrain/` directory. + """ + return run.Partial( + fn, + model=model(), + trainer=trainer( + num_nodes=num_nodes, + num_gpus_per_node=num_gpus_per_node, + callbacks=[run.Config(TimingCallback)], + ), + data=run.Config(MockDataModule, seq_length=8192, global_batch_size=512, micro_batch_size=1), + log=default_log(dir=dir, name=name, tensorboard_logger=tensorboard_logger(name=name)), + optim=distributed_fused_adam_with_cosine_annealing(max_lr=3e-4), + resume=default_resume(), + ) + + +@run.cli.factory(target=pretrain, name=NAME + "_optimized") +def pretrain_recipe_performance( + dir: Optional[str] = None, + name: str = "default", + num_nodes: int = 1, + num_gpus_per_node: int = 8, + fn: Callable = pretrain, +) -> run.Partial: + """ + Create a performance-optimized pre-training recipe for Gemma 7B model. + + This recipe enables performance optimizations that may not be suitable for all use cases. + It builds upon the standard pre-training recipe and adds additional performance enhancements. + + Args: + dir (Optional[str]): Directory for saving logs and checkpoints. + name (str): Name of the pre-training run. + num_nodes (int): Number of compute nodes to use. + num_gpus_per_node (int): Number of GPUs per node. + fn (Callable): The pre-training function to use. + + Returns: + run.Partial: Partial configuration for performance-optimized pre-training. + + Examples: + $ nemo llm pretrain --factory gemma_7b_optimized + + Python API usage: + >>> recipe = pretrain_recipe_performance(name="gemma_7b_perf", num_nodes=4) + >>> print(recipe) + + Note: + Use this recipe with caution and only when you need maximum performance. + It may not be suitable for all hardware configurations or use cases. + """ + recipe = pretrain_recipe(name=name, dir=dir, num_nodes=num_nodes, num_gpus_per_node=num_gpus_per_node, fn=fn) + + recipe.trainer.callbacks.append( + run.Config( + MegatronCommOverlapCallback, + tp_comm_overlap=False, + ) + ) + return recipe + + +@run.cli.factory(target=finetune, name=NAME) +def finetune_recipe( + dir: Optional[str] = None, + name: str = "default", + num_nodes: int = 1, + num_gpus_per_node: int = 8, + peft_scheme: Optional[str] = 'lora', +) -> run.Partial: + """ + Create a fine-tuning recipe for Gemma 7B model. + + This function sets up a complete configuration for fine-tuning, including + model, trainer, data, logging, optimization, and resumption settings. + The recipe uses LoRA (Low-Rank Adaptation) for efficient fine-tuning, unless peft_scheme is set to None. + + Args: + dir (Optional[str]): Directory for saving logs and checkpoints. + name (str): Name of the fine-tuning run. + num_nodes (int): Number of compute nodes to use. + num_gpus_per_node (int): Number of GPUs per node. + peft_scheme (Optional[str]): Name of the peft scheme to use for fine-tuning. Allowed values: 'lora', 'none'/None. + + Returns: + run.Partial: Partial configuration for fine-tuning. 
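A minimal sketch of how the peft_scheme switch in the fine-tuning factory here plays out when driven from Python (the import path is the new recipe module added in this diff; the run names are arbitrary and the checked values come from the branches in the function body below):

    from nemo.collections.llm.recipes import gemma_7b

    # Default peft_scheme='lora': LoRA adapters with lr=1e-4.
    lora = gemma_7b.finetune_recipe(name="gemma_7b_lora", num_nodes=1)

    # peft_scheme=None (or 'none'): full fine-tuning with TP=2 and lr=5e-6.
    full = gemma_7b.finetune_recipe(name="gemma_7b_sft", num_nodes=1, peft_scheme=None)

    assert full.trainer.strategy.tensor_model_parallel_size == 2
    assert full.optim.config.lr == 5e-6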
+ + Examples: + CLI usage: + $ nemo llm finetune --factory gemma_7b + + Python API usage: + >>> recipe = finetune_recipe(name="gemma_7b_finetune", num_nodes=2) + >>> print(recipe) + + Note: + This recipe uses the SQuAD dataset for fine-tuning. For more information + on fine-tuning LLMs with NeMo, see the fine-tuning guide in the + `examples/llm/finetune/` directory. + """ + # Disable cuDNN attention since TE 1.8 does not support head dim > 128 + os.environ['NVTE_FUSED_ATTN'] = "0" + + recipe = default_finetune_recipe(model(), "google/gemma-7b", dir, name, num_nodes, num_gpus_per_node) + if peft_scheme is None or peft_scheme.lower() == 'none': + recipe.trainer.strategy.tensor_model_parallel_size = 2 + recipe.optim.config.lr = 5e-6 + elif peft_scheme.lower() == 'lora': + recipe.peft = run.Config(LoRA) + recipe.optim.config.lr = 1e-4 + else: + raise ValueError(f"Unrecognized peft scheme: {peft_scheme}") + return recipe diff --git a/nemo/collections/llm/recipes/gpt3_175b.py b/nemo/collections/llm/recipes/gpt3_175b.py new file mode 100644 index 000000000000..1abe8a218e82 --- /dev/null +++ b/nemo/collections/llm/recipes/gpt3_175b.py @@ -0,0 +1,237 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + + +from typing import Callable, Optional + +import nemo_run as run +import pytorch_lightning as pl +import torch +from megatron.core.distributed import DistributedDataParallelConfig +from pytorch_lightning.callbacks.callback import Callback + +from nemo import lightning as nl +from nemo.collections.llm.api import pretrain +from nemo.collections.llm.gpt.data.mock import MockDataModule +from nemo.collections.llm.gpt.model import GPTConfig175B, GPTModel +from nemo.collections.llm.recipes.log.default import default_log, default_resume, tensorboard_logger +from nemo.collections.llm.recipes.optim.adam import distributed_fused_adam_with_cosine_annealing +from nemo.collections.llm.recipes.precision.mixed_precision import bf16_mixed +from nemo.collections.llm.recipes.tp_overlap_configs.userbuffers import ( + userbuffers_bf16_h100_h12288_tp4_mbs1_seqlen2048, +) +from nemo.lightning.pytorch.callbacks.megatron_comm_overlap import MegatronCommOverlapCallback +from nemo.utils.exp_manager import TimingCallback + +NAME = "gpt3_175b" + + +@run.cli.factory(name=NAME) +def model() -> run.Config[pl.LightningModule]: + """ + Factory function to create a GPT3 175B model configuration. + + Returns: + run.Config[pl.LightningModule]: Configuration for the GPT3 175B model. + + Examples: + CLI usage: + $ nemo llm pretrain model=gpt3_175b ... 
+ + Python API usage: + >>> model_config = model() + >>> print(model_config) + """ + return run.Config(GPTModel, config=run.Config(GPTConfig175B)) + + +def trainer( + tensor_parallelism: int = 4, + pipeline_parallelism: int = 8, + pipeline_parallelism_type: Optional[torch.dtype] = torch.bfloat16, + virtual_pipeline_parallelism: Optional[int] = 6, + context_parallelism: int = 1, + sequence_parallelism: bool = True, + num_nodes: int = 64, + num_gpus_per_node: int = 8, + max_steps: int = 1168251, + callbacks: Optional[list[run.Config[Callback]]] = None, +) -> run.Config[nl.Trainer]: + """ + Configure the NeMo Lightning Trainer for GPT3 175B model. + + This function sets up the distributed training strategy optimized for the large 175B model. + + Args: + tensor_parallelism (int): Degree of tensor model parallelism. + pipeline_parallelism (int): Degree of pipeline model parallelism. + pipeline_parallelism_type (Optional[torch.dtype]): Data type for pipeline parallelism. + virtual_pipeline_parallelism (Optional[int]): Size of virtual pipeline parallelism. + context_parallelism (int): Degree of context parallelism. + sequence_parallelism (bool): Whether to use sequence parallelism. + num_nodes (int): Number of compute nodes to use. + num_gpus_per_node (int): Number of GPUs per node. + max_steps (int): Maximum number of training steps. + callbacks (Optional[list[run.Config[Callback]]]): List of callback configurations. + + Returns: + run.Config[nl.Trainer]: Configuration for the NeMo Lightning Trainer. + + Examples: + CLI usage: + $ nemo llm pretrain trainer=gpt3_175b ... + + Python API usage: + >>> trainer_config = trainer(num_nodes=64, num_gpus_per_node=8) + >>> print(trainer_config) + + Note: + This configuration uses extensive parallelism to handle the large model size efficiently. + """ + strategy = run.Config( + nl.MegatronStrategy, + tensor_model_parallel_size=tensor_parallelism, + pipeline_model_parallel_size=pipeline_parallelism, + pipeline_dtype=pipeline_parallelism_type, + virtual_pipeline_model_parallel_size=virtual_pipeline_parallelism, + context_parallel_size=context_parallelism, + sequence_parallel=sequence_parallelism, + gradient_as_bucket_view=True, + ckpt_async_save=True, + ckpt_parallel_load=True, + ddp=run.Config( + DistributedDataParallelConfig, + check_for_nan_in_grad=True, + grad_reduce_in_fp32=True, + overlap_grad_reduce=True, + overlap_param_gather=True, + average_in_collective=True, + ), + ) + + trainer = run.Config( + nl.Trainer, + accelerator="gpu", + accumulate_grad_batches=1, + callbacks=callbacks, + devices=num_gpus_per_node, + limit_test_batches=50, + limit_val_batches=32, + log_every_n_steps=10, + max_steps=max_steps, + num_nodes=num_nodes, + plugins=bf16_mixed(), + strategy=strategy, + use_distributed_sampler=False, + val_check_interval=2000, + ) + + return trainer + + +@run.cli.factory(target=pretrain, name=NAME) +def pretrain_recipe( + dir: Optional[str] = None, + name: str = "default", + num_nodes: int = 1, + num_gpus_per_node: int = 8, + performance_mode: bool = False, + fn: Callable = pretrain, +) -> run.Partial: + """ + Create a pre-training recipe for GPT3 175B model. + + This function sets up a complete configuration for pre-training, including + model, trainer, data, logging, optimization, and resumption settings. + + Args: + dir (Optional[str]): Directory for saving logs and checkpoints. + name (str): Name of the pre-training run. + num_nodes (int): Number of compute nodes to use. + num_gpus_per_node (int): Number of GPUs per node. 
+ performance_mode (bool): If true, enables optimizations for maximum performance. + fn (Callable): The pre-training function to use. + + Returns: + run.Partial: Partial configuration for pre-training. + + Examples: + CLI usage: + $ nemo llm pretrain --factory gpt3_175b + $ nemo llm pretrain --factory "gpt3_175b(num_nodes=64, name='my_175b_pretrain')" + + Python API usage: + >>> recipe = pretrain_recipe(name="gpt3_175b_pretrain", num_nodes=64) + >>> print(recipe) + + Note: + This recipe is optimized for the large 175B model and requires significant computational resources. + """ + recipe = run.Partial( + fn, + model=model(), + trainer=trainer( + num_nodes=num_nodes, + num_gpus_per_node=num_gpus_per_node, + callbacks=[run.Config(TimingCallback)], + ), + data=run.Config(MockDataModule, seq_length=2048, global_batch_size=2048, micro_batch_size=2), + log=default_log(dir=dir, name=name, tensorboard_logger=tensorboard_logger(name=name)), + optim=distributed_fused_adam_with_cosine_annealing(max_lr=0.9e-4), + resume=default_resume(), + ) + + if performance_mode: + recipe = pretrain_performance_optimizations(recipe) + + return recipe + + +def pretrain_performance_optimizations(recipe: run.Partial) -> run.Partial: + """ + Create a performance-optimized pre-training recipe for GPT3 175B model. + + This method enables performance optimizations that may not be suitable for all use cases. + It builds upon the standard pre-training recipe and adds additional performance enhancements. + + Args: + recipe (run.Partial): Base pre-train recipe to which performance optimizations will be added + + Returns: + run.Partial: Partial configuration for performance-optimized pre-training. + + Note: + Use this method with caution and only when you need maximum performance. + It may not be suitable for all hardware configurations or use cases. + """ + + # 'overlap_param_gather_with_optimizer_step' and 'align_param_gather' params are set automatically + # by MegatronCommOverlapCallback. They are added here for user's knowledge. + # overlap_param_gather_with_optimizer_step- Overlap param all-gather of first bucket with optimizer step. + # align_param_gather- If true, all PP stages launch param all-gathers simultaneously, else + # each PP stage launches independently as needed. + + recipe.trainer.callbacks.append( + run.Config( + MegatronCommOverlapCallback, + tp_comm_overlap=True, + tp_comm_overlap_cfg=userbuffers_bf16_h100_h12288_tp4_mbs1_seqlen2048, + defer_embedding_wgrad_compute=True, + wgrad_deferral_limit=50, + overlap_param_gather_with_optimizer_step=True, + align_param_gather=True, + ) + ) + + return recipe diff --git a/nemo/collections/llm/recipes/llama31_405b.py b/nemo/collections/llm/recipes/llama31_405b.py index f36773551ea0..055e9a06fcba 100644 --- a/nemo/collections/llm/recipes/llama31_405b.py +++ b/nemo/collections/llm/recipes/llama31_405b.py @@ -13,11 +13,12 @@ # limitations under the License. 
-from typing import Optional +from typing import Callable, Optional import nemo_run as run import pytorch_lightning as pl import torch +from megatron.core.distributed import DistributedDataParallelConfig from pytorch_lightning.callbacks.callback import Callback from nemo import lightning as nl @@ -27,6 +28,10 @@ from nemo.collections.llm.recipes.log.default import default_log, default_resume, tensorboard_logger from nemo.collections.llm.recipes.optim.adam import distributed_fused_adam_with_cosine_annealing from nemo.collections.llm.recipes.precision.mixed_precision import bf16_mixed +from nemo.collections.llm.recipes.tp_overlap_configs.userbuffers import ( + userbuffers_bf16_h100_h16384_tp8_cp2_mbs1_seqlen8192, +) +from nemo.lightning.pytorch.callbacks.megatron_comm_overlap import MegatronCommOverlapCallback from nemo.utils.exp_manager import TimingCallback NAME = "llama31_405b" @@ -107,6 +112,14 @@ def trainer( gradient_as_bucket_view=True, ckpt_async_save=True, ckpt_parallel_load=True, + ddp=run.Config( + DistributedDataParallelConfig, + check_for_nan_in_grad=True, + grad_reduce_in_fp32=True, + overlap_grad_reduce=True, + overlap_param_gather=True, + average_in_collective=True, + ), ) trainer = run.Config( @@ -131,7 +144,12 @@ def trainer( @run.cli.factory(target=pretrain, name=NAME) def pretrain_recipe( - dir: Optional[str] = None, name: str = "default", num_nodes: int = 1, num_gpus_per_node: int = 8, fn=pretrain + dir: Optional[str] = None, + name: str = "default", + num_nodes: int = 1, + num_gpus_per_node: int = 8, + performance_mode: bool = False, + fn: Callable = pretrain, ) -> run.Partial: """ Create a pre-training recipe for Llama3.1 405B model. @@ -144,6 +162,7 @@ def pretrain_recipe( name (str): Name of the pre-training run. num_nodes (int): Number of compute nodes to use. num_gpus_per_node (int): Number of GPUs per node. + performance_mode (bool): If true, enables optimizations for maximum performance. fn (Callable): The pre-training function to use. Returns: @@ -161,7 +180,7 @@ def pretrain_recipe( Note: This recipe is optimized for the large 405B model and requires significant computational resources. """ - return run.Partial( + recipe = run.Partial( fn, model=model(), trainer=trainer( @@ -174,3 +193,47 @@ def pretrain_recipe( optim=distributed_fused_adam_with_cosine_annealing(max_lr=3e-4), resume=default_resume(), ) + + if performance_mode: + recipe = pretrain_performance_optimizations(recipe) + + return recipe + + +def pretrain_performance_optimizations(recipe: run.Partial) -> run.Partial: + """ + Create a performance-optimized pre-training recipe for Llama3.1 405B model. + + This method enables performance optimizations that may not be suitable for all use cases. + It builds upon the standard pre-training recipe and adds additional performance enhancements. + + Args: + recipe (run.Partial): Base pre-train recipe to which performance optimizations will be added + + Returns: + run.Partial: Partial configuration for performance-optimized pre-training. + + Note: + Use this method with caution and only when you need maximum performance. + It may not be suitable for all hardware configurations or use cases. + """ + + # 'overlap_param_gather_with_optimizer_step' and 'align_param_gather' params are set automatically + # by MegatronCommOverlapCallback. They are added here for user's knowledge. + # overlap_param_gather_with_optimizer_step- Overlap param all-gather of first bucket with optimizer step. 
+ # align_param_gather- If true, all PP stages launch param all-gathers simultaneously, else + # each PP stage launches independently as needed. + + recipe.trainer.callbacks.append( + run.Config( + MegatronCommOverlapCallback, + tp_comm_overlap=True, + tp_comm_overlap_cfg=userbuffers_bf16_h100_h16384_tp8_cp2_mbs1_seqlen8192, + defer_embedding_wgrad_compute=True, + wgrad_deferral_limit=50, + overlap_param_gather_with_optimizer_step=True, + align_param_gather=True, + ) + ) + + return recipe diff --git a/nemo/collections/llm/recipes/llama3_70b.py b/nemo/collections/llm/recipes/llama3_70b.py index 9cfc198038f2..b283c68b222b 100644 --- a/nemo/collections/llm/recipes/llama3_70b.py +++ b/nemo/collections/llm/recipes/llama3_70b.py @@ -13,7 +13,7 @@ # limitations under the License. -from typing import Optional +from typing import Callable, Optional import nemo_run as run import pytorch_lightning as pl @@ -24,7 +24,6 @@ from nemo import lightning as nl from nemo.collections.llm.api import finetune, pretrain from nemo.collections.llm.gpt.data.mock import MockDataModule -from nemo.collections.llm.gpt.data.squad import SquadDataModule from nemo.collections.llm.gpt.model.llama import Llama3Config70B, LlamaModel from nemo.collections.llm.peft.lora import LoRA from nemo.collections.llm.recipes.finetune_default import default_finetune_recipe @@ -64,7 +63,7 @@ def trainer( virtual_pipeline_parallelism: Optional[int] = 5, context_parallelism: int = 2, sequence_parallelism: bool = True, - num_nodes: int = 1, + num_nodes: int = 4, num_gpus_per_node: int = 8, max_steps: int = 1168251, callbacks: Optional[list[run.Config[Callback]]] = None, @@ -117,6 +116,7 @@ def trainer( grad_reduce_in_fp32=True, overlap_grad_reduce=True, overlap_param_gather=True, + average_in_collective=True, ), ) @@ -142,7 +142,12 @@ def trainer( @run.cli.factory(target=pretrain, name=NAME) def pretrain_recipe( - dir: Optional[str] = None, name: str = "default", num_nodes: int = 1, num_gpus_per_node: int = 8, fn=pretrain + dir: Optional[str] = None, + name: str = "default", + num_nodes: int = 1, + num_gpus_per_node: int = 8, + performance_mode: bool = False, + fn: Callable = pretrain, ) -> run.Partial: """ Create a pre-training recipe for Llama3 70B model. @@ -155,6 +160,7 @@ def pretrain_recipe( name (str): Name of the pre-training run. num_nodes (int): Number of compute nodes to use. num_gpus_per_node (int): Number of GPUs per node. + performance_mode (bool): If true, enables optimizations for maximum performance. fn (Callable): The pre-training function to use. Returns: @@ -172,7 +178,8 @@ def pretrain_recipe( Note: This recipe is optimized for the large 70B model and requires significant computational resources. """ - return run.Partial( + + recipe = run.Partial( fn, model=model(), trainer=trainer( @@ -186,40 +193,35 @@ def pretrain_recipe( resume=default_resume(), ) + if performance_mode: + recipe = pretrain_performance_optimizations(recipe) -@run.cli.factory(target=pretrain, name=NAME + "_performance") -def pretrain_recipe_performance( - dir: Optional[str] = None, name: str = "default", num_nodes: int = 1, num_gpus_per_node: int = 8, fn=pretrain -) -> run.Partial: + return recipe + + +def pretrain_performance_optimizations(recipe: run.Partial) -> run.Partial: """ Create a performance-optimized pre-training recipe for Llama3 70B model. - This recipe enables performance optimizations that may not be suitable for all use cases. + This method enables performance optimizations that may not be suitable for all use cases. 
It builds upon the standard pre-training recipe and adds additional performance enhancements. Args: - dir (Optional[str]): Directory for saving logs and checkpoints. - name (str): Name of the pre-training run. - num_nodes (int): Number of compute nodes to use. - num_gpus_per_node (int): Number of GPUs per node. - fn (Callable): The pre-training function to use. + recipe (run.Partial): Base pre-train recipe to which performance optimizations will be added Returns: run.Partial: Partial configuration for performance-optimized pre-training. - Examples: - CLI usage: - $ nemo llm pretrain --factory "llama3_70b.pretrain_recipe_performance(num_nodes=4, name='perf_pretrain')" - - Python API usage: - >>> recipe = pretrain_recipe_performance(name="llama3_70b_perf", num_nodes=4) - >>> print(recipe) - Note: - Use this recipe with caution and only when you need maximum performance. + Use this method with caution and only when you need maximum performance. It may not be suitable for all hardware configurations or use cases. """ - recipe = pretrain_recipe(name=name, dir=dir, num_nodes=num_nodes, num_gpus_per_node=num_gpus_per_node, fn=fn) + + # 'overlap_param_gather_with_optimizer_step' and 'align_param_gather' params are set automatically + # by MegatronCommOverlapCallback. They are added here for user's knowledge. + # overlap_param_gather_with_optimizer_step- Overlap param all-gather of first bucket with optimizer step. + # align_param_gather- If true, all PP stages launch param all-gathers simultaneously, else + # each PP stage launches independently as needed. recipe.trainer.callbacks.append( run.Config( @@ -228,6 +230,8 @@ def pretrain_recipe_performance( tp_comm_overlap_cfg=userbuffers_bf16_h100_h8192_tp4_mbs1_seqlen8192, defer_embedding_wgrad_compute=True, wgrad_deferral_limit=22, + overlap_param_gather_with_optimizer_step=True, + align_param_gather=True, ) ) diff --git a/nemo/collections/llm/recipes/llama3_70b_16k.py b/nemo/collections/llm/recipes/llama3_70b_16k.py index c8c1957d7bdc..928f961f7cf3 100644 --- a/nemo/collections/llm/recipes/llama3_70b_16k.py +++ b/nemo/collections/llm/recipes/llama3_70b_16k.py @@ -49,7 +49,7 @@ def model() -> run.Config[pl.LightningModule]: def trainer( - num_nodes: int = 2, + num_nodes: int = 4, num_gpus_per_node: int = 8, ) -> run.Config: """ @@ -58,8 +58,8 @@ def trainer( This function sets up the distributed training strategy optimized for the large 70B model with longer sequences. Args: - num_nodes (int): Number of compute nodes to use. - num_gpus_per_node (int): Number of GPUs per node. + num_nodes (int, optional): Number of compute nodes to use. Defaults to 4. + num_gpus_per_node (int, optional): Number of GPUs per node. Defaults to 8. Returns: run.Config: Configuration for the NeMo Lightning Trainer. @@ -76,10 +76,10 @@ def trainer( This configuration uses extensive parallelism to handle the large model size and longer sequence length efficiently. """ return llama3_70b.trainer( - tensor_parallelism=2, - pipeline_parallelism=4, + tensor_parallelism=8, + pipeline_parallelism=2, pipeline_parallelism_type=torch.bfloat16, - virtual_pipeline_parallelism=5, + virtual_pipeline_parallelism=None, context_parallelism=2, sequence_parallelism=True, num_nodes=num_nodes, @@ -91,7 +91,7 @@ def trainer( def pretrain_recipe( dir: Optional[str] = None, name: str = "default", - num_nodes: int = 2, + num_nodes: int = 4, num_gpus_per_node: int = 8, ) -> run.Partial: """ @@ -103,8 +103,8 @@ def pretrain_recipe( Args: dir (Optional[str]): Directory for saving logs and checkpoints. 
name (str): Name of the pre-training run. - num_nodes (int): Number of compute nodes to use. - num_gpus_per_node (int): Number of GPUs per node. + num_nodes (int, optional): Number of compute nodes to use. Defaults to 4. + num_gpus_per_node (int, optional): Number of GPUs per node. Defaults to 8. Returns: run.Partial: Partial configuration for pre-training. diff --git a/nemo/collections/llm/recipes/llama3_70b_64k.py b/nemo/collections/llm/recipes/llama3_70b_64k.py index 5d9845d9aaa7..ffadf5ca8084 100644 --- a/nemo/collections/llm/recipes/llama3_70b_64k.py +++ b/nemo/collections/llm/recipes/llama3_70b_64k.py @@ -21,7 +21,6 @@ from nemo.collections.llm.api import finetune, pretrain from nemo.collections.llm.gpt.data.mock import MockDataModule -from nemo.collections.llm.gpt.data.squad import SquadDataModule from nemo.collections.llm.recipes import llama3_70b from nemo.utils.exp_manager import TimingCallback @@ -59,8 +58,8 @@ def trainer( This function sets up the distributed training strategy optimized for the large 70B model with long sequences. Args: - num_nodes (int): Number of compute nodes to use. - num_gpus_per_node (int): Number of GPUs per node. + num_nodes (int, optional): Number of compute nodes to use. Defaults to 32. + num_gpus_per_node (int, optional): Number of GPUs per node. Defaults to 8. Returns: run.Config: Configuration for the NeMo Lightning Trainer. @@ -81,7 +80,7 @@ def trainer( tensor_parallelism=8, pipeline_parallelism=4, pipeline_parallelism_type=torch.bfloat16, - virtual_pipeline_parallelism=5, + virtual_pipeline_parallelism=None, context_parallelism=8, sequence_parallelism=True, num_nodes=num_nodes, @@ -106,8 +105,8 @@ def pretrain_recipe( Args: dir (Optional[str]): Directory for saving logs and checkpoints. name (str): Name of the pre-training run. - num_nodes (int): Number of compute nodes to use. - num_gpus_per_node (int): Number of GPUs per node. + num_nodes (int, optional): Number of compute nodes to use. Defaults to 32. + num_gpus_per_node (int, optional): Number of GPUs per node. Defaults to 8. Returns: run.Partial: Partial configuration for pre-training. diff --git a/nemo/collections/llm/recipes/llama3_8b.py b/nemo/collections/llm/recipes/llama3_8b.py index 4b2934739529..269eb7865dcf 100644 --- a/nemo/collections/llm/recipes/llama3_8b.py +++ b/nemo/collections/llm/recipes/llama3_8b.py @@ -117,6 +117,7 @@ def trainer( grad_reduce_in_fp32=True, overlap_grad_reduce=True, overlap_param_gather=True, + average_in_collective=True, ), ) @@ -142,7 +143,12 @@ def trainer( @run.cli.factory(target=pretrain, name=NAME) def pretrain_recipe( - dir: Optional[str] = None, name: str = "default", num_nodes: int = 1, num_gpus_per_node: int = 8, fn=pretrain + dir: Optional[str] = None, + name: str = "default", + num_nodes: int = 1, + num_gpus_per_node: int = 8, + performance_mode: bool = False, + fn: Callable = pretrain, ) -> run.Partial: """ Create a pre-training recipe for Llama3 8B model. @@ -155,6 +161,7 @@ def pretrain_recipe( name (str): Name of the pre-training run. num_nodes (int): Number of compute nodes to use. num_gpus_per_node (int): Number of GPUs per node. + performance_mode (bool): If true, enables optimizations for maximum performance. fn (Callable): The pre-training function to use. Returns: @@ -173,7 +180,7 @@ def pretrain_recipe( For more details on pre-training LLMs with NeMo, see the pre-training guide in the `examples/llm/pretrain/` directory. 
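The net effect of the new performance_mode flag, sketched as a quick check one could run in Python (the import path is this module; the callback behavior is taken from pretrain_performance_optimizations below):

    from nemo.collections.llm.recipes import llama3_8b

    base = llama3_8b.pretrain_recipe(name="llama3_8b_base", num_nodes=1)
    fast = llama3_8b.pretrain_recipe(name="llama3_8b_fast", num_nodes=1, performance_mode=True)

    # performance_mode routes the recipe through pretrain_performance_optimizations(),
    # which appends a MegatronCommOverlapCallback to the trainer's callback list.
    assert len(fast.trainer.callbacks) == len(base.trainer.callbacks) + 1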
""" - return run.Partial( + recipe = run.Partial( fn, model=model(), trainer=trainer( @@ -187,44 +194,29 @@ def pretrain_recipe( resume=default_resume(), ) + if performance_mode: + recipe = pretrain_performance_optimizations(recipe) -@run.cli.factory(target=pretrain, name=NAME + "_optimized") -def pretrain_recipe_performance( - dir: Optional[str] = None, - name: str = "default", - num_nodes: int = 1, - num_gpus_per_node: int = 8, - fn: Callable = pretrain, -) -> run.Partial: + return recipe + + +def pretrain_performance_optimizations(recipe: run.Partial) -> run.Partial: """ Create a performance-optimized pre-training recipe for Llama3 8B model. - This recipe enables performance optimizations that may not be suitable for all use cases. + This method enables performance optimizations that may not be suitable for all use cases. It builds upon the standard pre-training recipe and adds additional performance enhancements. Args: - dir (Optional[str]): Directory for saving logs and checkpoints. - name (str): Name of the pre-training run. - num_nodes (int): Number of compute nodes to use. - num_gpus_per_node (int): Number of GPUs per node. - fn (Callable): The pre-training function to use. + recipe (run.Partial): Base pre-train recipe to which performance optimizations will be added Returns: run.Partial: Partial configuration for performance-optimized pre-training. - Examples: - $ nemo llm pretrain --factory llama3_8b_optimized - - Python API usage: - >>> recipe = pretrain_recipe_performance(name="llama3_8b_perf", num_nodes=4) - >>> print(recipe) - Note: - Use this recipe with caution and only when you need maximum performance. + Use this method with caution and only when you need maximum performance. It may not be suitable for all hardware configurations or use cases. """ - recipe = pretrain_recipe(name=name, dir=dir, num_nodes=num_nodes, num_gpus_per_node=num_gpus_per_node, fn=fn) - recipe.trainer.callbacks.append( run.Config( MegatronCommOverlapCallback, diff --git a/nemo/collections/llm/recipes/llama3_8b_16k.py b/nemo/collections/llm/recipes/llama3_8b_16k.py index 0b42b392827a..d6c1677a3b4b 100644 --- a/nemo/collections/llm/recipes/llama3_8b_16k.py +++ b/nemo/collections/llm/recipes/llama3_8b_16k.py @@ -49,7 +49,7 @@ def model() -> run.Config[pl.LightningModule]: def trainer( - num_nodes: int = 1, + num_nodes: int = 2, num_gpus_per_node: int = 8, ) -> run.Config: """ @@ -58,8 +58,8 @@ def trainer( This function sets up the distributed training strategy optimized for longer sequences. Args: - num_nodes (int): Number of compute nodes to use. - num_gpus_per_node (int): Number of GPUs per node. + num_nodes (int, optional): Number of compute nodes to use. Defaults to 2. + num_gpus_per_node (int, optional): Number of GPUs per node. Defaults to 8. Returns: run.Config: Configuration for the NeMo Lightning Trainer. @@ -76,10 +76,10 @@ def trainer( This configuration uses increased parallelism to handle the longer sequence length efficiently. 
""" return llama3_8b.trainer( - tensor_parallelism=2, - pipeline_parallelism=4, + tensor_parallelism=4, + pipeline_parallelism=2, pipeline_parallelism_type=torch.bfloat16, - virtual_pipeline_parallelism=5, + virtual_pipeline_parallelism=None, context_parallelism=2, sequence_parallelism=True, num_nodes=num_nodes, @@ -91,7 +91,7 @@ def trainer( def pretrain_recipe( dir: Optional[str] = None, name: str = "default", - num_nodes: int = 1, + num_nodes: int = 2, num_gpus_per_node: int = 8, ) -> run.Partial: """ @@ -103,8 +103,8 @@ def pretrain_recipe( Args: dir (Optional[str]): Directory for saving logs and checkpoints. name (str): Name of the pre-training run. - num_nodes (int): Number of compute nodes to use. - num_gpus_per_node (int): Number of GPUs per node. + num_nodes (int, optional): Number of compute nodes to use. Defaults to 2. + num_gpus_per_node (int, optional): Number of GPUs per node. Defaults to 8. Returns: run.Partial: Partial configuration for pre-training. diff --git a/nemo/collections/llm/recipes/llama3_8b_64k.py b/nemo/collections/llm/recipes/llama3_8b_64k.py index 38f787113bf5..692347ea8dd0 100644 --- a/nemo/collections/llm/recipes/llama3_8b_64k.py +++ b/nemo/collections/llm/recipes/llama3_8b_64k.py @@ -49,7 +49,7 @@ def model() -> run.Config[pl.LightningModule]: def trainer( - num_nodes: int = 1, + num_nodes: int = 4, num_gpus_per_node: int = 8, ) -> run.Config: """ @@ -58,8 +58,8 @@ def trainer( This function sets up the distributed training strategy optimized for long sequences. Args: - num_nodes (int): Number of compute nodes to use. - num_gpus_per_node (int): Number of GPUs per node. + num_nodes (int, optional): Number of compute nodes to use. Defaults to 4. + num_gpus_per_node (int, optional): Number of GPUs per node. Defaults to 8. Returns: run.Config: Configuration for the NeMo Lightning Trainer. @@ -69,17 +69,17 @@ def trainer( $ nemo llm pretrain trainer=llama3_8b_64k ... Python API usage: - >>> trainer_config = trainer(num_nodes=2, num_gpus_per_node=8) + >>> trainer_config = trainer(num_nodes=4, num_gpus_per_node=8) >>> print(trainer_config) Note: This configuration uses significantly increased parallelism to handle the long sequence length efficiently. """ return llama3_8b.trainer( - tensor_parallelism=2, - pipeline_parallelism=4, + tensor_parallelism=4, + pipeline_parallelism=2, pipeline_parallelism_type=torch.bfloat16, - virtual_pipeline_parallelism=5, + virtual_pipeline_parallelism=None, context_parallelism=4, sequence_parallelism=True, num_nodes=num_nodes, @@ -91,7 +91,7 @@ def trainer( def pretrain_recipe( dir: Optional[str] = None, name: str = "default", - num_nodes: int = 1, + num_nodes: int = 4, num_gpus_per_node: int = 8, ) -> run.Partial: """ @@ -103,8 +103,8 @@ def pretrain_recipe( Args: dir (Optional[str]): Directory for saving logs and checkpoints. name (str): Name of the pre-training run. - num_nodes (int): Number of compute nodes to use. - num_gpus_per_node (int): Number of GPUs per node. + num_nodes (int, optional): Number of compute nodes to use. Defaults to 4. + num_gpus_per_node (int, optional): Number of GPUs per node. Defaults to 8. Returns: run.Partial: Partial configuration for pre-training. 
@@ -112,10 +112,10 @@ def pretrain_recipe( Examples: CLI usage: $ nemo llm pretrain --factory llama3_8b_64k - $ nemo llm pretrain --factory "llama3_8b_64k(num_nodes=2, name='my_64k_pretrain')" + $ nemo llm pretrain --factory "llama3_8b_64k(num_nodes=4, name='my_64k_pretrain')" Python API usage: - >>> recipe = pretrain_recipe(name="llama3_8b_64k_pretrain", num_nodes=2) + >>> recipe = pretrain_recipe(name="llama3_8b_64k_pretrain", num_nodes=4) >>> print(recipe) Note: diff --git a/nemo/collections/llm/recipes/mamba2_130m.py b/nemo/collections/llm/recipes/mamba2_130m.py new file mode 100644 index 000000000000..08640604a112 --- /dev/null +++ b/nemo/collections/llm/recipes/mamba2_130m.py @@ -0,0 +1,321 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + + +from typing import Optional + +import nemo_run as run +import pytorch_lightning as pl +import torch +from megatron.core.distributed import DistributedDataParallelConfig +from pytorch_lightning.callbacks.callback import Callback + +from nemo import lightning as nl +from nemo.collections import llm +from nemo.collections.llm.api import finetune, pretrain +from nemo.collections.llm.gpt.data.mock import MockDataModule +from nemo.collections.llm.recipes.log.default import default_log, default_resume, tensorboard_logger +from nemo.collections.llm.recipes.optim.adam import distributed_fused_adam_with_cosine_annealing +from nemo.collections.llm.recipes.precision.mixed_precision import bf16_mixed +from nemo.collections.nlp.modules.common.tokenizer_utils import get_nmt_tokenizer +from nemo.utils.exp_manager import TimingCallback + +NAME = "mamba2_130m" + + +@run.cli.factory(name=NAME) +def tokenizer(tokenizer_model: str = None) -> run.Config[pl.LightningModule]: + + return run.Config( + get_nmt_tokenizer, + library='huggingface', + model_name="EleutherAI/gpt-neox-20b", + tokenizer_model=tokenizer_model, + use_fast=True, + ) + + +@run.cli.factory(name=NAME) +def model(tokenizer_model: str = None) -> run.Config[pl.LightningModule]: + """ + Factory function to create a Mamba2 130M model configuration. + + Returns: + run.Config[pl.LightningModule]: Configuration for the Mamba2 130M model. + + Examples: + CLI usage: + $ nemo llm pretrain model=mamba2_130m ... + + Python API usage: + >>> model_config = model() + >>> print(model_config) + """ + return run.Config( + llm.GPTModel, config=run.Config(llm.BaseMambaConfig130M), tokenizer=tokenizer(tokenizer_model=tokenizer_model) + ) + + +def trainer( + tensor_parallelism: int = 1, + pipeline_parallelism: int = 1, + pipeline_parallelism_type: Optional[torch.dtype] = None, + virtual_pipeline_parallelism: Optional[int] = None, + context_parallelism: int = 1, + sequence_parallelism: bool = False, + num_nodes: int = 1, + num_gpus_per_node: int = 8, + max_steps: int = 1168251, + callbacks: Optional[list[run.Config[Callback]]] = None, +) -> run.Config[nl.Trainer]: + """ + Configure the NeMo Lightning Trainer for Mamba2 130M model. 
+ + This function sets up the distributed training strategy and other training parameters. + + Args: + tensor_parallelism (int): Degree of tensor model parallelism. + pipeline_parallelism (int): Degree of pipeline model parallelism. + pipeline_parallelism_type (Optional[torch.dtype]): Data type for pipeline parallelism. + virtual_pipeline_parallelism (Optional[int]): Size of virtual pipeline parallelism. + context_parallelism (int): Degree of context parallelism. + sequence_parallelism (bool): Whether to use sequence parallelism. + num_nodes (int): Number of compute nodes to use. + num_gpus_per_node (int): Number of GPUs per node. + max_steps (int): Maximum number of training steps. + callbacks (Optional[list[run.Config[Callback]]]): List of callback configurations. + + Returns: + run.Config[nl.Trainer]: Configuration for the NeMo Lightning Trainer. + + Examples: + CLI usage: + $ nemo llm pretrain trainer=mamba2_130m ... + + Python API usage: + >>> trainer_config = trainer(num_nodes=1, num_gpus_per_node=1) + >>> print(trainer_config) + + Note: + For more information on distributed training strategies, refer to the + NeMo documentation on multi-GPU and multi-node training. + """ + strategy = run.Config( + nl.MegatronStrategy, + tensor_model_parallel_size=tensor_parallelism, + pipeline_model_parallel_size=pipeline_parallelism, + pipeline_dtype=pipeline_parallelism_type, + virtual_pipeline_model_parallel_size=virtual_pipeline_parallelism, + context_parallel_size=context_parallelism, + sequence_parallel=sequence_parallelism, + gradient_as_bucket_view=True, + ckpt_async_save=False, + ckpt_parallel_load=True, + ddp=run.Config( + DistributedDataParallelConfig, + check_for_nan_in_grad=True, + grad_reduce_in_fp32=True, + overlap_grad_reduce=True, + overlap_param_gather=True, + ), + ) + + trainer = run.Config( + nl.Trainer, + accelerator="gpu", + accumulate_grad_batches=1, + callbacks=callbacks, + devices=num_gpus_per_node, + limit_test_batches=50, + limit_val_batches=32, + log_every_n_steps=10, + max_steps=max_steps, + num_nodes=num_nodes, + plugins=bf16_mixed(), + strategy=strategy, + use_distributed_sampler=False, + val_check_interval=2000, + ) + + return trainer + + +@run.cli.factory(target=pretrain, name=NAME) +def pretrain_recipe( + dir: Optional[str] = None, + name: str = "default", + tokenizer_model: str = None, + num_nodes: int = 1, + num_gpus_per_node: int = 8, + fn=pretrain, +) -> run.Partial: + """ + Create a pre-training recipe for Mamba2 130M model. + + This function sets up a complete configuration for pre-training, including + model, trainer, data, logging, optimization, and resumption settings. + + Args: + dir (Optional[str]): Directory for saving logs and checkpoints. + name (str): Name of the pre-training run. + num_nodes (int): Number of compute nodes to use. + num_gpus_per_node (int): Number of GPUs per node. + fn (Callable): The pre-training function to use. + + Returns: + run.Partial: Partial configuration for pre-training. + + Examples: + CLI usage: + $ nemo llm pretrain --factory mamba2_130M + $ nemo llm pretrain --factory "mamba2_130M(num_nodes=1, name='my_pretrain')" + + Python API usage: + >>> recipe = pretrain_recipe(name="mamba2_130M_pretrain", num_nodes=1) + >>> print(recipe) + + Note: + For more details on pre-training LLMs with NeMo, see the pre-training + guide in the `examples/llm/pretrain/` directory. 
+    """
+    return run.Partial(
+        fn,
+        model=model(),
+        trainer=trainer(
+            num_nodes=num_nodes,
+            num_gpus_per_node=num_gpus_per_node,
+            callbacks=[run.Config(TimingCallback)],
+        ),
+        data=run.Config(
+            MockDataModule,
+            seq_length=4096,
+            global_batch_size=8,
+            micro_batch_size=1,
+            tokenizer=tokenizer(tokenizer_model=tokenizer_model),
+        ),
+        log=default_log(dir=dir, name=name, tensorboard_logger=tensorboard_logger(name=name)),
+        optim=distributed_fused_adam_with_cosine_annealing(max_lr=3e-4),
+        resume=default_resume(),
+    )
+
+
+@run.cli.factory(target=finetune, name=NAME)
+def finetune_recipe(
+    dir: Optional[str] = None,
+    name: str = "default",
+    resume_path: str = None,
+    tokenizer_model: str = None,
+    num_nodes: int = 1,
+    num_gpus_per_node: int = 8,
+    gbs: int = 8,
+    mbs: int = 1,
+    peft_scheme: Optional[str] = 'none',
+) -> run.Partial:
+    """
+    Create a fine-tuning recipe for Mamba2 130M model.
+
+    This function sets up a complete configuration for fine-tuning, including
+    model, trainer, data, logging, optimization, and resumption settings.
+
+    Args:
+        dir (Optional[str]): Directory for saving logs and checkpoints.
+        name (str): Name of the fine-tuning run.
+        resume_path (str): Path to the NeMo checkpoint (refer to notes below
+            on how to convert a pytorch checkpoint to NeMo)
+        tokenizer_model (str): Path to tokenizer model (defaults to None)
+        num_nodes (int): Number of compute nodes to use.
+        num_gpus_per_node (int): Number of GPUs per node.
+        gbs (int): Global batch size.
+        mbs (int): Micro batch size.
+        peft_scheme (Optional[str]): Name of the peft scheme to use for fine-tuning.
+            Only 'none'/None (full fine-tuning) is supported by this recipe.
+
+    Returns:
+        run.Partial: Partial configuration for fine-tuning.
+
+    Examples:
+        CLI usage:
+            $ nemo llm finetune --factory mamba2_130m
+
+        Python API usage:
+            >>> recipe = finetune_recipe(name="mamba2_130m_finetune", num_nodes=1)
+            >>> print(recipe)
+
+    Note:
+        This recipe uses the SQuAD dataset for fine-tuning. For more information
+        on fine-tuning LLMs with NeMo, see the fine-tuning guide in the
+        `examples/llm/finetune/` directory.
+ For converting an SSM pytorch checkpoint, use the following line of python code: + + llm.GPTModel(llm.BaseMambaConfig130M(), tokenizer=tokenizer()).import_ckpt( + path="pytorch://ABSOLUTE_PATH_TO_CKPT/your_pytorch_state_dict_file", + model_config=llm.BaseMambaConfig130M()) + This line will cache the nemo checkpoint to following directory: + /root/.cache/nemo/models/your_pytorch_state_dict_file + + """ + nemo_resume = run.Config( + nl.AutoResume, + restore_config=run.Config(nl.RestoreConfig, path=resume_path), + ) + strategy = run.Config( + nl.MegatronStrategy, + tensor_model_parallel_size=1, + pipeline_model_parallel_size=1, + gradient_as_bucket_view=True, + ckpt_load_optimizer=False, + ckpt_save_optimizer=False, + ckpt_async_save=False, + ) + checkpoint_callback = run.Config( + nl.ModelCheckpoint, + every_n_train_steps=10, + dirpath=dir, + ) + trainer = run.Config( + nl.Trainer, + accelerator="gpu", + accumulate_grad_batches=1, + devices=num_gpus_per_node, + limit_test_batches=10, + limit_val_batches=10, + log_every_n_steps=20, + max_steps=100, + num_nodes=num_nodes, + plugins=run.Config( + nl.MegatronMixedPrecision, + precision="bf16-mixed", + params_dtype=torch.bfloat16, + ), + callbacks=[checkpoint_callback], + strategy=strategy, + use_distributed_sampler=False, + val_check_interval=20, + ) + recipe = run.Partial( + llm.finetune, + model=model(tokenizer_model=tokenizer_model), + trainer=trainer, + data=run.Config( + llm.SquadDataModule, + seq_length=2048, + global_batch_size=gbs, + micro_batch_size=mbs, + tokenizer=tokenizer(tokenizer_model=tokenizer_model), + ), + log=llm.default_log(dir=dir, name=name, tensorboard_logger=tensorboard_logger(name=name)), + optim=distributed_fused_adam_with_cosine_annealing(max_lr=1e-4, min_lr=0, warmup_steps=50), + resume=nemo_resume, + ) + if peft_scheme is None or peft_scheme.lower() == 'none': + recipe.trainer.strategy.tensor_model_parallel_size = 1 + recipe.optim.config.lr = 5e-6 + else: + raise ValueError(f"Unrecognized peft scheme: {peft_scheme}") + return recipe diff --git a/nemo/collections/llm/recipes/mamba2_1_3b.py b/nemo/collections/llm/recipes/mamba2_1_3b.py new file mode 100644 index 000000000000..58eaf049b059 --- /dev/null +++ b/nemo/collections/llm/recipes/mamba2_1_3b.py @@ -0,0 +1,321 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
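Putting the conversion note and the factory together, an end-to-end sketch for the Mamba2 130M recipe above (the import_ckpt call is the one quoted in the docstring note; the placeholder paths and the cache location are taken from that note, not new values):

    from nemo.collections import llm
    from nemo.collections.llm.recipes import mamba2_130m

    # One-time conversion of a PyTorch SSM state dict into a NeMo checkpoint,
    # as described in the finetune_recipe docstring above.
    llm.GPTModel(llm.BaseMambaConfig130M(), tokenizer=mamba2_130m.tokenizer()).import_ckpt(
        path="pytorch://ABSOLUTE_PATH_TO_CKPT/your_pytorch_state_dict_file",
        model_config=llm.BaseMambaConfig130M(),
    )

    # Fine-tune from the cached NeMo checkpoint produced by the conversion step.
    recipe = mamba2_130m.finetune_recipe(
        name="mamba2_130m_finetune",
        resume_path="/root/.cache/nemo/models/your_pytorch_state_dict_file",
        num_nodes=1,
    )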
+ + +from typing import Optional + +import nemo_run as run +import pytorch_lightning as pl +import torch +from megatron.core.distributed import DistributedDataParallelConfig +from pytorch_lightning.callbacks.callback import Callback + +from nemo import lightning as nl +from nemo.collections import llm +from nemo.collections.llm.api import finetune, pretrain +from nemo.collections.llm.gpt.data.mock import MockDataModule +from nemo.collections.llm.recipes.log.default import default_log, default_resume, tensorboard_logger +from nemo.collections.llm.recipes.optim.adam import distributed_fused_adam_with_cosine_annealing +from nemo.collections.llm.recipes.precision.mixed_precision import bf16_mixed +from nemo.collections.nlp.modules.common.tokenizer_utils import get_nmt_tokenizer +from nemo.utils.exp_manager import TimingCallback + +NAME = "mamba2_1_3b" + + +@run.cli.factory(name=NAME) +def tokenizer(tokenizer_model: str = None) -> run.Config[pl.LightningModule]: + + return run.Config( + get_nmt_tokenizer, + library='huggingface', + model_name="EleutherAI/gpt-neox-20b", + tokenizer_model=tokenizer_model, + use_fast=True, + ) + + +@run.cli.factory(name=NAME) +def model(tokenizer_model: str = None) -> run.Config[pl.LightningModule]: + """ + Factory function to create a Mamba2 1.3B model configuration. + + Returns: + run.Config[pl.LightningModule]: Configuration for the Mamba2 1.3B model. + + Examples: + CLI usage: + $ nemo llm pretrain model=mamba2_1_3B ... + + Python API usage: + >>> model_config = model() + >>> print(model_config) + """ + return run.Config( + llm.GPTModel, config=run.Config(llm.BaseMambaConfig1_3B), tokenizer=tokenizer(tokenizer_model=tokenizer_model) + ) + + +def trainer( + tensor_parallelism: int = 1, + pipeline_parallelism: int = 1, + pipeline_parallelism_type: Optional[torch.dtype] = None, + virtual_pipeline_parallelism: Optional[int] = None, + context_parallelism: int = 1, + sequence_parallelism: bool = False, + num_nodes: int = 1, + num_gpus_per_node: int = 8, + max_steps: int = 1168251, + callbacks: Optional[list[run.Config[Callback]]] = None, +) -> run.Config[nl.Trainer]: + """ + Configure the NeMo Lightning Trainer for Mamba2 1.3B model. + + This function sets up the distributed training strategy and other training parameters. + + Args: + tensor_parallelism (int): Degree of tensor model parallelism. + pipeline_parallelism (int): Degree of pipeline model parallelism. + pipeline_parallelism_type (Optional[torch.dtype]): Data type for pipeline parallelism. + virtual_pipeline_parallelism (Optional[int]): Size of virtual pipeline parallelism. + context_parallelism (int): Degree of context parallelism. + sequence_parallelism (bool): Whether to use sequence parallelism. + num_nodes (int): Number of compute nodes to use. + num_gpus_per_node (int): Number of GPUs per node. + max_steps (int): Maximum number of training steps. + callbacks (Optional[list[run.Config[Callback]]]): List of callback configurations. + + Returns: + run.Config[nl.Trainer]: Configuration for the NeMo Lightning Trainer. + + Examples: + CLI usage: + $ nemo llm pretrain trainer=mamba2_1_3b ... + + Python API usage: + >>> trainer_config = trainer(num_nodes=1, num_gpus_per_node=1) + >>> print(trainer_config) + + Note: + For more information on distributed training strategies, refer to the + NeMo documentation on multi-GPU and multi-node training. 
+ """ + strategy = run.Config( + nl.MegatronStrategy, + tensor_model_parallel_size=tensor_parallelism, + pipeline_model_parallel_size=pipeline_parallelism, + pipeline_dtype=pipeline_parallelism_type, + virtual_pipeline_model_parallel_size=virtual_pipeline_parallelism, + context_parallel_size=context_parallelism, + sequence_parallel=sequence_parallelism, + gradient_as_bucket_view=True, + ckpt_async_save=False, + ckpt_parallel_load=True, + ddp=run.Config( + DistributedDataParallelConfig, + check_for_nan_in_grad=True, + grad_reduce_in_fp32=True, + overlap_grad_reduce=True, + overlap_param_gather=True, + ), + ) + + trainer = run.Config( + nl.Trainer, + accelerator="gpu", + accumulate_grad_batches=1, + callbacks=callbacks, + devices=num_gpus_per_node, + limit_test_batches=50, + limit_val_batches=32, + log_every_n_steps=10, + max_steps=max_steps, + num_nodes=num_nodes, + plugins=bf16_mixed(), + strategy=strategy, + use_distributed_sampler=False, + val_check_interval=2000, + ) + + return trainer + + +@run.cli.factory(target=pretrain, name=NAME) +def pretrain_recipe( + dir: Optional[str] = None, + name: str = "default", + tokenizer_model: str = None, + num_nodes: int = 1, + num_gpus_per_node: int = 8, + fn=pretrain, +) -> run.Partial: + """ + Create a pre-training recipe for Mamba2 1.3B model. + + This function sets up a complete configuration for pre-training, including + model, trainer, data, logging, optimization, and resumption settings. + + Args: + dir (Optional[str]): Directory for saving logs and checkpoints. + name (str): Name of the pre-training run. + num_nodes (int): Number of compute nodes to use. + num_gpus_per_node (int): Number of GPUs per node. + fn (Callable): The pre-training function to use. + + Returns: + run.Partial: Partial configuration for pre-training. + + Examples: + CLI usage: + $ nemo llm pretrain --factory mamba2_1_3b + $ nemo llm pretrain --factory "mamba2_1_3b(num_nodes=1, name='my_pretrain')" + + Python API usage: + >>> recipe = pretrain_recipe(name="mamba2_1_3b_pretrain", num_nodes=1) + >>> print(recipe) + + Note: + For more details on pre-training LLMs with NeMo, see the pre-training + guide in the `examples/llm/pretrain/` directory. + """ + return run.Partial( + fn, + model=model(), + trainer=trainer( + num_nodes=num_nodes, + num_gpus_per_node=num_gpus_per_node, + callbacks=[run.Config(TimingCallback)], + ), + data=run.Config( + MockDataModule, + seq_length=4096, + global_batch_size=8, + micro_batch_size=1, + tokenizer=tokenizer(tokenizer_model=tokenizer_model), + ), + log=default_log(dir=dir, name=name, tensorboard_logger=tensorboard_logger(name=name)), + optim=distributed_fused_adam_with_cosine_annealing(max_lr=3e-4), + resume=default_resume(), + ) + + +@run.cli.factory(target=finetune, name=NAME) +def finetune_recipe( + dir: Optional[str] = None, + name: str = "default", + resume_path: str = None, + tokenizer_model: str = None, + num_nodes: int = 1, + num_gpus_per_node: int = 8, + gbs: int = 8, + mbs: int = 1, + peft_scheme: Optional[str] = 'none', +) -> run.Partial: + """ + Create a fine-tuning recipe for Mamba2 1.3B model. + + This function sets up a complete configuration for fine-tuning, including + model, trainer, data, logging, optimization, and resumption settings. + + Args: + dir (Optional[str]): Directory for saving logs and checkpoints. + name (str): Name of the fine-tuning run. 
+ resume_path (str): Path to the NeMo checkpoint (refer to notes below + on how to convert a pytorch checkpoint to NeMo) + tokenizer_model (str): Path to tokenizer model (defaults to None) + num_nodes (int): Number of compute nodes to use. + num_gpus_per_node (int): Number of GPUs per node. + Returns: + run.Partial: Partial configuration for fine-tuning. + + Examples: + CLI usage: + $ nemo llm finetune --factory mamba2_1_3b + + Python API usage: + >>> recipe = finetune_recipe(name="mamba2_1_3b_finetune", num_nodes=1) + >>> print(recipe) + + Note: + This recipe uses the SQuAD dataset for fine-tuning. For more information + on fine-tuning LLMs with NeMo, see the fine-tuning guide in the + `examples/llm/finetune/` directory. + For converting an SSM pytorch checkpoint, use the following line of python code: + + llm.GPTModel(llm.BaseMambaConfig1_3B(), tokenizer=tokenizer()).import_ckpt( + path="pytorch://ABSOLUTE_PATH_TO_CKPT/your_pytorch_state_dict_file", + model_config=llm.BaseMambaConfig1_3B()) + This line will cache the nemo checkpoint to following directory: + /root/.cache/nemo/models/your_pytorch_state_dict_file + + """ + nemo_resume = run.Config( + nl.AutoResume, + restore_config=run.Config(nl.RestoreConfig, path=resume_path), + ) + strategy = run.Config( + nl.MegatronStrategy, + tensor_model_parallel_size=1, + pipeline_model_parallel_size=1, + gradient_as_bucket_view=True, + ckpt_load_optimizer=False, + ckpt_save_optimizer=False, + ckpt_async_save=False, + ) + checkpoint_callback = run.Config( + nl.ModelCheckpoint, + every_n_train_steps=10, + dirpath=dir, + ) + trainer = run.Config( + nl.Trainer, + accelerator="gpu", + accumulate_grad_batches=1, + devices=num_gpus_per_node, + limit_test_batches=10, + limit_val_batches=10, + log_every_n_steps=20, + max_steps=100, + num_nodes=num_nodes, + plugins=run.Config( + nl.MegatronMixedPrecision, + precision="bf16-mixed", + params_dtype=torch.bfloat16, + ), + callbacks=[checkpoint_callback], + strategy=strategy, + use_distributed_sampler=False, + val_check_interval=20, + ) + recipe = run.Partial( + llm.finetune, + model=model(tokenizer_model=tokenizer_model), + trainer=trainer, + data=run.Config( + llm.SquadDataModule, + seq_length=2048, + global_batch_size=gbs, + micro_batch_size=mbs, + tokenizer=tokenizer(tokenizer_model=tokenizer_model), + ), + log=llm.default_log(dir=dir, name=name, tensorboard_logger=tensorboard_logger(name=name)), + optim=distributed_fused_adam_with_cosine_annealing(max_lr=1e-4, min_lr=0, warmup_steps=50), + resume=nemo_resume, + ) + if peft_scheme is None or peft_scheme.lower() == 'none': + recipe.trainer.strategy.tensor_model_parallel_size = 1 + recipe.optim.config.lr = 5e-6 + else: + raise ValueError(f"Unrecognized peft scheme: {peft_scheme}") + return recipe diff --git a/nemo/collections/llm/recipes/mamba2_2_7b.py b/nemo/collections/llm/recipes/mamba2_2_7b.py new file mode 100644 index 000000000000..5cb37c6a02a5 --- /dev/null +++ b/nemo/collections/llm/recipes/mamba2_2_7b.py @@ -0,0 +1,321 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+# See the License for the specific language governing permissions and +# limitations under the License. + + +from typing import Optional + +import nemo_run as run +import pytorch_lightning as pl +import torch +from megatron.core.distributed import DistributedDataParallelConfig +from pytorch_lightning.callbacks.callback import Callback + +from nemo import lightning as nl +from nemo.collections import llm +from nemo.collections.llm.api import finetune, pretrain +from nemo.collections.llm.gpt.data.mock import MockDataModule +from nemo.collections.llm.recipes.log.default import default_log, default_resume, tensorboard_logger +from nemo.collections.llm.recipes.optim.adam import distributed_fused_adam_with_cosine_annealing +from nemo.collections.llm.recipes.precision.mixed_precision import bf16_mixed +from nemo.collections.nlp.modules.common.tokenizer_utils import get_nmt_tokenizer +from nemo.utils.exp_manager import TimingCallback + +NAME = "mamba2_2_7b" + + +@run.cli.factory(name=NAME) +def tokenizer(tokenizer_model: str = None) -> run.Config[pl.LightningModule]: + + return run.Config( + get_nmt_tokenizer, + library='huggingface', + model_name="EleutherAI/gpt-neox-20b", + tokenizer_model=tokenizer_model, + use_fast=True, + ) + + +@run.cli.factory(name=NAME) +def model(tokenizer_model: str = None) -> run.Config[pl.LightningModule]: + """ + Factory function to create a Mamba2 2.7B model configuration. + + Returns: + run.Config[pl.LightningModule]: Configuration for the Mamba2 2.7B model. + + Examples: + CLI usage: + $ nemo llm pretrain model=mamba2_2_7B ... + + Python API usage: + >>> model_config = model() + >>> print(model_config) + """ + return run.Config( + llm.GPTModel, config=run.Config(llm.BaseMambaConfig2_7B), tokenizer=tokenizer(tokenizer_model=tokenizer_model) + ) + + +def trainer( + tensor_parallelism: int = 1, + pipeline_parallelism: int = 1, + pipeline_parallelism_type: Optional[torch.dtype] = None, + virtual_pipeline_parallelism: Optional[int] = None, + context_parallelism: int = 1, + sequence_parallelism: bool = False, + num_nodes: int = 1, + num_gpus_per_node: int = 8, + max_steps: int = 1168251, + callbacks: Optional[list[run.Config[Callback]]] = None, +) -> run.Config[nl.Trainer]: + """ + Configure the NeMo Lightning Trainer for Mamba2 2.7B model. + + This function sets up the distributed training strategy and other training parameters. + + Args: + tensor_parallelism (int): Degree of tensor model parallelism. + pipeline_parallelism (int): Degree of pipeline model parallelism. + pipeline_parallelism_type (Optional[torch.dtype]): Data type for pipeline parallelism. + virtual_pipeline_parallelism (Optional[int]): Size of virtual pipeline parallelism. + context_parallelism (int): Degree of context parallelism. + sequence_parallelism (bool): Whether to use sequence parallelism. + num_nodes (int): Number of compute nodes to use. + num_gpus_per_node (int): Number of GPUs per node. + max_steps (int): Maximum number of training steps. + callbacks (Optional[list[run.Config[Callback]]]): List of callback configurations. + + Returns: + run.Config[nl.Trainer]: Configuration for the NeMo Lightning Trainer. + + Examples: + CLI usage: + $ nemo llm pretrain trainer=mamba2_2_7b ... + + Python API usage: + >>> trainer_config = trainer(num_nodes=1, num_gpus_per_node=1) + >>> print(trainer_config) + + Note: + For more information on distributed training strategies, refer to the + NeMo documentation on multi-GPU and multi-node training. 
+ """ + strategy = run.Config( + nl.MegatronStrategy, + tensor_model_parallel_size=tensor_parallelism, + pipeline_model_parallel_size=pipeline_parallelism, + pipeline_dtype=pipeline_parallelism_type, + virtual_pipeline_model_parallel_size=virtual_pipeline_parallelism, + context_parallel_size=context_parallelism, + sequence_parallel=sequence_parallelism, + gradient_as_bucket_view=True, + ckpt_async_save=False, + ckpt_parallel_load=True, + ddp=run.Config( + DistributedDataParallelConfig, + check_for_nan_in_grad=True, + grad_reduce_in_fp32=True, + overlap_grad_reduce=True, + overlap_param_gather=True, + ), + ) + + trainer = run.Config( + nl.Trainer, + accelerator="gpu", + accumulate_grad_batches=1, + callbacks=callbacks, + devices=num_gpus_per_node, + limit_test_batches=50, + limit_val_batches=32, + log_every_n_steps=10, + max_steps=max_steps, + num_nodes=num_nodes, + plugins=bf16_mixed(), + strategy=strategy, + use_distributed_sampler=False, + val_check_interval=2000, + ) + + return trainer + + +@run.cli.factory(target=pretrain, name=NAME) +def pretrain_recipe( + dir: Optional[str] = None, + name: str = "default", + tokenizer_model: str = None, + num_nodes: int = 1, + num_gpus_per_node: int = 8, + fn=pretrain, +) -> run.Partial: + """ + Create a pre-training recipe for Mamba2 2.7B model. + + This function sets up a complete configuration for pre-training, including + model, trainer, data, logging, optimization, and resumption settings. + + Args: + dir (Optional[str]): Directory for saving logs and checkpoints. + name (str): Name of the pre-training run. + num_nodes (int): Number of compute nodes to use. + num_gpus_per_node (int): Number of GPUs per node. + fn (Callable): The pre-training function to use. + + Returns: + run.Partial: Partial configuration for pre-training. + + Examples: + CLI usage: + $ nemo llm pretrain --factory mamba2_2_7b + $ nemo llm pretrain --factory "mamba2_2_7b(num_nodes=1, name='my_pretrain')" + + Python API usage: + >>> recipe = pretrain_recipe(name="mamba2_2_7b_pretrain", num_nodes=1) + >>> print(recipe) + + Note: + For more details on pre-training LLMs with NeMo, see the pre-training + guide in the `examples/llm/pretrain/` directory. + """ + return run.Partial( + fn, + model=model(), + trainer=trainer( + num_nodes=num_nodes, + num_gpus_per_node=num_gpus_per_node, + callbacks=[run.Config(TimingCallback)], + ), + data=run.Config( + MockDataModule, + seq_length=4096, + global_batch_size=8, + micro_batch_size=1, + tokenizer=tokenizer(tokenizer_model=tokenizer_model), + ), + log=default_log(dir=dir, name=name, tensorboard_logger=tensorboard_logger(name=name)), + optim=distributed_fused_adam_with_cosine_annealing(max_lr=3e-4), + resume=default_resume(), + ) + + +@run.cli.factory(target=finetune, name=NAME) +def finetune_recipe( + dir: Optional[str] = None, + name: str = "default", + resume_path: str = None, + tokenizer_model: str = None, + num_nodes: int = 1, + num_gpus_per_node: int = 8, + gbs: int = 8, + mbs: int = 1, + peft_scheme: Optional[str] = 'none', +) -> run.Partial: + """ + Create a fine-tuning recipe for Mamba2 2.7B model. + + This function sets up a complete configuration for fine-tuning, including + model, trainer, data, logging, optimization, and resumption settings. + + Args: + dir (Optional[str]): Directory for saving logs and checkpoints. + name (str): Name of the fine-tuning run. 
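For context, a sketch of how a pre-training recipe like mamba2_2_7b.pretrain_recipe above is typically launched with NeMo-Run. The LocalExecutor/torchrun launcher shown is an assumption based on the common NeMo-Run local-execution pattern and is not defined in this diff.

# Sketch (assumes nemo_run's LocalExecutor with a torchrun launcher is available).
import nemo_run as run
from nemo.collections.llm.recipes import mamba2_2_7b

recipe = mamba2_2_7b.pretrain_recipe(
    name="mamba2_2_7b_pretrain",
    num_nodes=1,
    num_gpus_per_node=8,
)

# One task per GPU, launched through torchrun on the local node.
executor = run.LocalExecutor(ntasks_per_node=8, launcher="torchrun")

run.run(recipe, executor=executor)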
+ resume_path (str): Path to the NeMo checkpoint (refer to notes below + on how to convert a pytorch checkpoint to NeMo) + tokenizer_model (str): Path to tokenizer model (defaults to None) + num_nodes (int): Number of compute nodes to use. + num_gpus_per_node (int): Number of GPUs per node. + Returns: + run.Partial: Partial configuration for fine-tuning. + + Examples: + CLI usage: + $ nemo llm finetune --factory mamba2_2_7b + + Python API usage: + >>> recipe = finetune_recipe(name="mamba2_2_7b_finetune", num_nodes=1) + >>> print(recipe) + + Note: + This recipe uses the SQuAD dataset for fine-tuning. For more information + on fine-tuning LLMs with NeMo, see the fine-tuning guide in the + `examples/llm/finetune/` directory. + For converting an SSM pytorch checkpoint, use the following line of python code: + + llm.GPTModel(llm.BaseMambaConfig2_7B(), tokenizer=tokenizer()).import_ckpt( + path="pytorch://ABSOLUTE_PATH_TO_CKPT/your_pytorch_state_dict_file", + model_config=llm.BaseMambaConfig2_7B()) + This line will cache the nemo checkpoint to following directory: + /root/.cache/nemo/models/your_pytorch_state_dict_file + + """ + nemo_resume = run.Config( + nl.AutoResume, + restore_config=run.Config(nl.RestoreConfig, path=resume_path), + ) + strategy = run.Config( + nl.MegatronStrategy, + tensor_model_parallel_size=1, + pipeline_model_parallel_size=1, + gradient_as_bucket_view=True, + ckpt_load_optimizer=False, + ckpt_save_optimizer=False, + ckpt_async_save=False, + ) + checkpoint_callback = run.Config( + nl.ModelCheckpoint, + every_n_train_steps=10, + dirpath=dir, + ) + trainer = run.Config( + nl.Trainer, + accelerator="gpu", + accumulate_grad_batches=1, + devices=num_gpus_per_node, + limit_test_batches=10, + limit_val_batches=10, + log_every_n_steps=20, + max_steps=100, + num_nodes=num_nodes, + plugins=run.Config( + nl.MegatronMixedPrecision, + precision="bf16-mixed", + params_dtype=torch.bfloat16, + ), + callbacks=[checkpoint_callback], + strategy=strategy, + use_distributed_sampler=False, + val_check_interval=20, + ) + recipe = run.Partial( + llm.finetune, + model=model(tokenizer_model=tokenizer_model), + trainer=trainer, + data=run.Config( + llm.SquadDataModule, + seq_length=2048, + global_batch_size=gbs, + micro_batch_size=mbs, + tokenizer=tokenizer(tokenizer_model=tokenizer_model), + ), + log=llm.default_log(dir=dir, name=name, tensorboard_logger=tensorboard_logger(name=name)), + optim=distributed_fused_adam_with_cosine_annealing(max_lr=1e-4, min_lr=0, warmup_steps=50), + resume=nemo_resume, + ) + if peft_scheme is None or peft_scheme.lower() == 'none': + recipe.trainer.strategy.tensor_model_parallel_size = 1 + recipe.optim.config.lr = 5e-6 + else: + raise ValueError(f"Unrecognized peft scheme: {peft_scheme}") + return recipe diff --git a/nemo/collections/llm/recipes/mamba2_370m.py b/nemo/collections/llm/recipes/mamba2_370m.py new file mode 100644 index 000000000000..bb8bddc4045a --- /dev/null +++ b/nemo/collections/llm/recipes/mamba2_370m.py @@ -0,0 +1,321 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+# See the License for the specific language governing permissions and +# limitations under the License. + + +from typing import Optional + +import nemo_run as run +import pytorch_lightning as pl +import torch +from megatron.core.distributed import DistributedDataParallelConfig +from pytorch_lightning.callbacks.callback import Callback + +from nemo import lightning as nl +from nemo.collections import llm +from nemo.collections.llm.api import finetune, pretrain +from nemo.collections.llm.gpt.data.mock import MockDataModule +from nemo.collections.llm.recipes.log.default import default_log, default_resume, tensorboard_logger +from nemo.collections.llm.recipes.optim.adam import distributed_fused_adam_with_cosine_annealing +from nemo.collections.llm.recipes.precision.mixed_precision import bf16_mixed +from nemo.collections.nlp.modules.common.tokenizer_utils import get_nmt_tokenizer +from nemo.utils.exp_manager import TimingCallback + +NAME = "mamba2_370m" + + +@run.cli.factory(name=NAME) +def tokenizer(tokenizer_model: str = None) -> run.Config[pl.LightningModule]: + + return run.Config( + get_nmt_tokenizer, + library='huggingface', + model_name="EleutherAI/gpt-neox-20b", + tokenizer_model=tokenizer_model, + use_fast=True, + ) + + +@run.cli.factory(name=NAME) +def model(tokenizer_model: str = None) -> run.Config[pl.LightningModule]: + """ + Factory function to create a Mamba2 370M model configuration. + + Returns: + run.Config[pl.LightningModule]: Configuration for the Mamba2 370M model. + + Examples: + CLI usage: + $ nemo llm pretrain model=mamba2_370m ... + + Python API usage: + >>> model_config = model() + >>> print(model_config) + """ + return run.Config( + llm.GPTModel, config=run.Config(llm.BaseMambaConfig370M), tokenizer=tokenizer(tokenizer_model=tokenizer_model) + ) + + +def trainer( + tensor_parallelism: int = 1, + pipeline_parallelism: int = 1, + pipeline_parallelism_type: Optional[torch.dtype] = None, + virtual_pipeline_parallelism: Optional[int] = None, + context_parallelism: int = 1, + sequence_parallelism: bool = False, + num_nodes: int = 1, + num_gpus_per_node: int = 8, + max_steps: int = 1168251, + callbacks: Optional[list[run.Config[Callback]]] = None, +) -> run.Config[nl.Trainer]: + """ + Configure the NeMo Lightning Trainer for Mamba2 370M model. + + This function sets up the distributed training strategy and other training parameters. + + Args: + tensor_parallelism (int): Degree of tensor model parallelism. + pipeline_parallelism (int): Degree of pipeline model parallelism. + pipeline_parallelism_type (Optional[torch.dtype]): Data type for pipeline parallelism. + virtual_pipeline_parallelism (Optional[int]): Size of virtual pipeline parallelism. + context_parallelism (int): Degree of context parallelism. + sequence_parallelism (bool): Whether to use sequence parallelism. + num_nodes (int): Number of compute nodes to use. + num_gpus_per_node (int): Number of GPUs per node. + max_steps (int): Maximum number of training steps. + callbacks (Optional[list[run.Config[Callback]]]): List of callback configurations. + + Returns: + run.Config[nl.Trainer]: Configuration for the NeMo Lightning Trainer. + + Examples: + CLI usage: + $ nemo llm pretrain trainer=mamba2_370m ... + + Python API usage: + >>> trainer_config = trainer(num_nodes=1, num_gpus_per_node=1) + >>> print(trainer_config) + + Note: + For more information on distributed training strategies, refer to the + NeMo documentation on multi-GPU and multi-node training. 
+ """ + strategy = run.Config( + nl.MegatronStrategy, + tensor_model_parallel_size=tensor_parallelism, + pipeline_model_parallel_size=pipeline_parallelism, + pipeline_dtype=pipeline_parallelism_type, + virtual_pipeline_model_parallel_size=virtual_pipeline_parallelism, + context_parallel_size=context_parallelism, + sequence_parallel=sequence_parallelism, + gradient_as_bucket_view=True, + ckpt_async_save=False, + ckpt_parallel_load=True, + ddp=run.Config( + DistributedDataParallelConfig, + check_for_nan_in_grad=True, + grad_reduce_in_fp32=True, + overlap_grad_reduce=True, + overlap_param_gather=True, + ), + ) + + trainer = run.Config( + nl.Trainer, + accelerator="gpu", + accumulate_grad_batches=1, + callbacks=callbacks, + devices=num_gpus_per_node, + limit_test_batches=50, + limit_val_batches=32, + log_every_n_steps=10, + max_steps=max_steps, + num_nodes=num_nodes, + plugins=bf16_mixed(), + strategy=strategy, + use_distributed_sampler=False, + val_check_interval=2000, + ) + + return trainer + + +@run.cli.factory(target=pretrain, name=NAME) +def pretrain_recipe( + dir: Optional[str] = None, + name: str = "default", + tokenizer_model: str = None, + num_nodes: int = 1, + num_gpus_per_node: int = 8, + fn=pretrain, +) -> run.Partial: + """ + Create a pre-training recipe for Mamba2 370M model. + + This function sets up a complete configuration for pre-training, including + model, trainer, data, logging, optimization, and resumption settings. + + Args: + dir (Optional[str]): Directory for saving logs and checkpoints. + name (str): Name of the pre-training run. + num_nodes (int): Number of compute nodes to use. + num_gpus_per_node (int): Number of GPUs per node. + fn (Callable): The pre-training function to use. + + Returns: + run.Partial: Partial configuration for pre-training. + + Examples: + CLI usage: + $ nemo llm pretrain --factory mamba2_370M + $ nemo llm pretrain --factory "mamba2_370M(num_nodes=1, name='my_pretrain')" + + Python API usage: + >>> recipe = pretrain_recipe(name="mamba2_370M_pretrain", num_nodes=1) + >>> print(recipe) + + Note: + For more details on pre-training LLMs with NeMo, see the pre-training + guide in the `examples/llm/pretrain/` directory. + """ + return run.Partial( + fn, + model=model(), + trainer=trainer( + num_nodes=num_nodes, + num_gpus_per_node=num_gpus_per_node, + callbacks=[run.Config(TimingCallback)], + ), + data=run.Config( + MockDataModule, + seq_length=4096, + global_batch_size=8, + micro_batch_size=1, + tokenizer=tokenizer(tokenizer_model=tokenizer_model), + ), + log=default_log(dir=dir, name=name, tensorboard_logger=tensorboard_logger(name=name)), + optim=distributed_fused_adam_with_cosine_annealing(max_lr=3e-4), + resume=default_resume(), + ) + + +@run.cli.factory(target=finetune, name=NAME) +def finetune_recipe( + dir: Optional[str] = None, + name: str = "default", + resume_path: str = None, + tokenizer_model: str = None, + num_nodes: int = 1, + num_gpus_per_node: int = 8, + gbs: int = 8, + mbs: int = 1, + peft_scheme: Optional[str] = 'none', +) -> run.Partial: + """ + Create a fine-tuning recipe for Mamba2 370M model. + + This function sets up a complete configuration for fine-tuning, including + model, trainer, data, logging, optimization, and resumption settings. + + Args: + dir (Optional[str]): Directory for saving logs and checkpoints. + name (str): Name of the fine-tuning run. 
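Because pretrain_recipe returns a run.Partial, individual fields can be overridden after construction, which is exactly how finetune_recipe below adjusts recipe.trainer.strategy and recipe.optim.config.lr. A short illustrative sketch against the 370M recipe; the override values are arbitrary.

# Sketch: tweak a built recipe for a quick functional run (values are illustrative only).
from nemo.collections.llm.recipes import mamba2_370m

recipe = mamba2_370m.pretrain_recipe(name="mamba2_370m_pretrain", num_nodes=1, num_gpus_per_node=8)

recipe.trainer.max_steps = 100          # default is 1168251
recipe.trainer.val_check_interval = 50  # default is 2000
recipe.data.global_batch_size = 8
recipe.optim.config.lr = 3e-4           # same attribute path the finetune recipes override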
+ resume_path (str): Path to the NeMo checkpoint (refer to notes below + on how to convert a pytorch checkpoint to NeMo) + tokenizer_model (str): Path to tokenizer model (defaults to None) + num_nodes (int): Number of compute nodes to use. + num_gpus_per_node (int): Number of GPUs per node. + Returns: + run.Partial: Partial configuration for fine-tuning. + + Examples: + CLI usage: + $ nemo llm finetune --factory mamba2_370m + + Python API usage: + >>> recipe = finetune_recipe(name="mamba2_370m_finetune", num_nodes=1) + >>> print(recipe) + + Note: + This recipe uses the SQuAD dataset for fine-tuning. For more information + on fine-tuning LLMs with NeMo, see the fine-tuning guide in the + `examples/llm/finetune/` directory. + For converting an SSM pytorch checkpoint, use the following line of python code: + + llm.GPTModel(llm.BaseMambaConfig370M(), tokenizer=tokenizer()).import_ckpt( + path="pytorch://ABSOLUTE_PATH_TO_CKPT/your_pytorch_state_dict_file", + model_config=llm.BaseMambaConfig370M()) + This line will cache the nemo checkpoint to following directory: + /root/.cache/nemo/models/your_pytorch_state_dict_file + + """ + nemo_resume = run.Config( + nl.AutoResume, + restore_config=run.Config(nl.RestoreConfig, path=resume_path), + ) + strategy = run.Config( + nl.MegatronStrategy, + tensor_model_parallel_size=1, + pipeline_model_parallel_size=1, + gradient_as_bucket_view=True, + ckpt_load_optimizer=False, + ckpt_save_optimizer=False, + ckpt_async_save=False, + ) + checkpoint_callback = run.Config( + nl.ModelCheckpoint, + every_n_train_steps=10, + dirpath=dir, + ) + trainer = run.Config( + nl.Trainer, + accelerator="gpu", + accumulate_grad_batches=1, + devices=num_gpus_per_node, + limit_test_batches=10, + limit_val_batches=10, + log_every_n_steps=20, + max_steps=100, + num_nodes=num_nodes, + plugins=run.Config( + nl.MegatronMixedPrecision, + precision="bf16-mixed", + params_dtype=torch.bfloat16, + ), + callbacks=[checkpoint_callback], + strategy=strategy, + use_distributed_sampler=False, + val_check_interval=20, + ) + recipe = run.Partial( + llm.finetune, + model=model(tokenizer_model=tokenizer_model), + trainer=trainer, + data=run.Config( + llm.SquadDataModule, + seq_length=2048, + global_batch_size=gbs, + micro_batch_size=mbs, + tokenizer=tokenizer(tokenizer_model=tokenizer_model), + ), + log=llm.default_log(dir=dir, name=name, tensorboard_logger=tensorboard_logger(name=name)), + optim=distributed_fused_adam_with_cosine_annealing(max_lr=1e-4, min_lr=0, warmup_steps=50), + resume=nemo_resume, + ) + if peft_scheme is None or peft_scheme.lower() == 'none': + recipe.trainer.strategy.tensor_model_parallel_size = 1 + recipe.optim.config.lr = 5e-6 + else: + raise ValueError(f"Unrecognized peft scheme: {peft_scheme}") + return recipe diff --git a/nemo/collections/llm/recipes/mamba2_780m.py b/nemo/collections/llm/recipes/mamba2_780m.py new file mode 100644 index 000000000000..2f6ab6717ae1 --- /dev/null +++ b/nemo/collections/llm/recipes/mamba2_780m.py @@ -0,0 +1,321 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+# See the License for the specific language governing permissions and +# limitations under the License. + + +from typing import Optional + +import nemo_run as run +import pytorch_lightning as pl +import torch +from megatron.core.distributed import DistributedDataParallelConfig +from pytorch_lightning.callbacks.callback import Callback + +from nemo import lightning as nl +from nemo.collections import llm +from nemo.collections.llm.api import finetune, pretrain +from nemo.collections.llm.gpt.data.mock import MockDataModule +from nemo.collections.llm.recipes.log.default import default_log, default_resume, tensorboard_logger +from nemo.collections.llm.recipes.optim.adam import distributed_fused_adam_with_cosine_annealing +from nemo.collections.llm.recipes.precision.mixed_precision import bf16_mixed +from nemo.collections.nlp.modules.common.tokenizer_utils import get_nmt_tokenizer +from nemo.utils.exp_manager import TimingCallback + +NAME = "mamba2_780m" + + +@run.cli.factory(name=NAME) +def tokenizer(tokenizer_model: str = None) -> run.Config[pl.LightningModule]: + + return run.Config( + get_nmt_tokenizer, + library='huggingface', + model_name="EleutherAI/gpt-neox-20b", + tokenizer_model=tokenizer_model, + use_fast=True, + ) + + +@run.cli.factory(name=NAME) +def model(tokenizer_model: str = None) -> run.Config[pl.LightningModule]: + """ + Factory function to create a Mamba2 780M model configuration. + + Returns: + run.Config[pl.LightningModule]: Configuration for the Mamba2 780M model. + + Examples: + CLI usage: + $ nemo llm pretrain model=mamba2_780m ... + + Python API usage: + >>> model_config = model() + >>> print(model_config) + """ + return run.Config( + llm.GPTModel, config=run.Config(llm.BaseMambaConfig780M), tokenizer=tokenizer(tokenizer_model=tokenizer_model) + ) + + +def trainer( + tensor_parallelism: int = 1, + pipeline_parallelism: int = 1, + pipeline_parallelism_type: Optional[torch.dtype] = None, + virtual_pipeline_parallelism: Optional[int] = None, + context_parallelism: int = 1, + sequence_parallelism: bool = False, + num_nodes: int = 1, + num_gpus_per_node: int = 8, + max_steps: int = 1168251, + callbacks: Optional[list[run.Config[Callback]]] = None, +) -> run.Config[nl.Trainer]: + """ + Configure the NeMo Lightning Trainer for Mamba2 780M model. + + This function sets up the distributed training strategy and other training parameters. + + Args: + tensor_parallelism (int): Degree of tensor model parallelism. + pipeline_parallelism (int): Degree of pipeline model parallelism. + pipeline_parallelism_type (Optional[torch.dtype]): Data type for pipeline parallelism. + virtual_pipeline_parallelism (Optional[int]): Size of virtual pipeline parallelism. + context_parallelism (int): Degree of context parallelism. + sequence_parallelism (bool): Whether to use sequence parallelism. + num_nodes (int): Number of compute nodes to use. + num_gpus_per_node (int): Number of GPUs per node. + max_steps (int): Maximum number of training steps. + callbacks (Optional[list[run.Config[Callback]]]): List of callback configurations. + + Returns: + run.Config[nl.Trainer]: Configuration for the NeMo Lightning Trainer. + + Examples: + CLI usage: + $ nemo llm pretrain trainer=mamba2_780m ... + + Python API usage: + >>> trainer_config = trainer(num_nodes=1, num_gpus_per_node=1) + >>> print(trainer_config) + + Note: + For more information on distributed training strategies, refer to the + NeMo documentation on multi-GPU and multi-node training. 
+ """ + strategy = run.Config( + nl.MegatronStrategy, + tensor_model_parallel_size=tensor_parallelism, + pipeline_model_parallel_size=pipeline_parallelism, + pipeline_dtype=pipeline_parallelism_type, + virtual_pipeline_model_parallel_size=virtual_pipeline_parallelism, + context_parallel_size=context_parallelism, + sequence_parallel=sequence_parallelism, + gradient_as_bucket_view=True, + ckpt_async_save=False, + ckpt_parallel_load=True, + ddp=run.Config( + DistributedDataParallelConfig, + check_for_nan_in_grad=True, + grad_reduce_in_fp32=True, + overlap_grad_reduce=True, + overlap_param_gather=True, + ), + ) + + trainer = run.Config( + nl.Trainer, + accelerator="gpu", + accumulate_grad_batches=1, + callbacks=callbacks, + devices=num_gpus_per_node, + limit_test_batches=50, + limit_val_batches=32, + log_every_n_steps=10, + max_steps=max_steps, + num_nodes=num_nodes, + plugins=bf16_mixed(), + strategy=strategy, + use_distributed_sampler=False, + val_check_interval=2000, + ) + + return trainer + + +@run.cli.factory(target=pretrain, name=NAME) +def pretrain_recipe( + dir: Optional[str] = None, + name: str = "default", + tokenizer_model: str = None, + num_nodes: int = 1, + num_gpus_per_node: int = 8, + fn=pretrain, +) -> run.Partial: + """ + Create a pre-training recipe for Mamba2 780M model. + + This function sets up a complete configuration for pre-training, including + model, trainer, data, logging, optimization, and resumption settings. + + Args: + dir (Optional[str]): Directory for saving logs and checkpoints. + name (str): Name of the pre-training run. + num_nodes (int): Number of compute nodes to use. + num_gpus_per_node (int): Number of GPUs per node. + fn (Callable): The pre-training function to use. + + Returns: + run.Partial: Partial configuration for pre-training. + + Examples: + CLI usage: + $ nemo llm pretrain --factory mamba2_780M + $ nemo llm pretrain --factory "mamba2_780M(num_nodes=1, name='my_pretrain')" + + Python API usage: + >>> recipe = pretrain_recipe(name="mamba2_780M_pretrain", num_nodes=1) + >>> print(recipe) + + Note: + For more details on pre-training LLMs with NeMo, see the pre-training + guide in the `examples/llm/pretrain/` directory. + """ + return run.Partial( + fn, + model=model(), + trainer=trainer( + num_nodes=num_nodes, + num_gpus_per_node=num_gpus_per_node, + callbacks=[run.Config(TimingCallback)], + ), + data=run.Config( + MockDataModule, + seq_length=4096, + global_batch_size=8, + micro_batch_size=1, + tokenizer=tokenizer(tokenizer_model=tokenizer_model), + ), + log=default_log(dir=dir, name=name, tensorboard_logger=tensorboard_logger(name=name)), + optim=distributed_fused_adam_with_cosine_annealing(max_lr=3e-4), + resume=default_resume(), + ) + + +@run.cli.factory(target=finetune, name=NAME) +def finetune_recipe( + dir: Optional[str] = None, + name: str = "default", + resume_path: str = None, + tokenizer_model: str = None, + num_nodes: int = 1, + num_gpus_per_node: int = 8, + gbs: int = 8, + mbs: int = 1, + peft_scheme: Optional[str] = 'none', +) -> run.Partial: + """ + Create a fine-tuning recipe for Mamba2 780M model. + + This function sets up a complete configuration for fine-tuning, including + model, trainer, data, logging, optimization, and resumption settings. + + Args: + dir (Optional[str]): Directory for saving logs and checkpoints. + name (str): Name of the fine-tuning run. 
+ resume_path (str): Path to the NeMo checkpoint (refer to notes below + on how to convert a pytorch checkpoint to NeMo) + tokenizer_model (str): Path to tokenizer model (defaults to None) + num_nodes (int): Number of compute nodes to use. + num_gpus_per_node (int): Number of GPUs per node. + Returns: + run.Partial: Partial configuration for fine-tuning. + + Examples: + CLI usage: + $ nemo llm finetune --factory mamba2_780m + + Python API usage: + >>> recipe = finetune_recipe(name="mamba2_780m_finetune", num_nodes=1) + >>> print(recipe) + + Note: + This recipe uses the SQuAD dataset for fine-tuning. For more information + on fine-tuning LLMs with NeMo, see the fine-tuning guide in the + `examples/llm/finetune/` directory. + For converting an SSM pytorch checkpoint, use the following line of python code: + + llm.GPTModel(llm.BaseMambaConfig780M(), tokenizer=tokenizer()).import_ckpt( + path="pytorch://ABSOLUTE_PATH_TO_CKPT/your_pytorch_state_dict_file", + model_config=llm.BaseMambaConfig780M()) + This line will cache the nemo checkpoint to following directory: + /root/.cache/nemo/models/your_pytorch_state_dict_file + + """ + nemo_resume = run.Config( + nl.AutoResume, + restore_config=run.Config(nl.RestoreConfig, path=resume_path), + ) + strategy = run.Config( + nl.MegatronStrategy, + tensor_model_parallel_size=1, + pipeline_model_parallel_size=1, + gradient_as_bucket_view=True, + ckpt_load_optimizer=False, + ckpt_save_optimizer=False, + ckpt_async_save=False, + ) + checkpoint_callback = run.Config( + nl.ModelCheckpoint, + every_n_train_steps=10, + dirpath=dir, + ) + trainer = run.Config( + nl.Trainer, + accelerator="gpu", + accumulate_grad_batches=1, + devices=num_gpus_per_node, + limit_test_batches=10, + limit_val_batches=10, + log_every_n_steps=20, + max_steps=100, + num_nodes=num_nodes, + plugins=run.Config( + nl.MegatronMixedPrecision, + precision="bf16-mixed", + params_dtype=torch.bfloat16, + ), + callbacks=[checkpoint_callback], + strategy=strategy, + use_distributed_sampler=False, + val_check_interval=20, + ) + recipe = run.Partial( + llm.finetune, + model=model(tokenizer_model=tokenizer_model), + trainer=trainer, + data=run.Config( + llm.SquadDataModule, + seq_length=2048, + global_batch_size=gbs, + micro_batch_size=mbs, + tokenizer=tokenizer(tokenizer_model=tokenizer_model), + ), + log=llm.default_log(dir=dir, name=name, tensorboard_logger=tensorboard_logger(name=name)), + optim=distributed_fused_adam_with_cosine_annealing(max_lr=1e-4, min_lr=0, warmup_steps=50), + resume=nemo_resume, + ) + if peft_scheme is None or peft_scheme.lower() == 'none': + recipe.trainer.strategy.tensor_model_parallel_size = 1 + recipe.optim.config.lr = 5e-6 + else: + raise ValueError(f"Unrecognized peft scheme: {peft_scheme}") + return recipe diff --git a/nemo/collections/llm/recipes/mamba2_8b.py b/nemo/collections/llm/recipes/mamba2_8b.py new file mode 100644 index 000000000000..58883deba732 --- /dev/null +++ b/nemo/collections/llm/recipes/mamba2_8b.py @@ -0,0 +1,321 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+# See the License for the specific language governing permissions and +# limitations under the License. + + +from typing import Optional + +import nemo_run as run +import pytorch_lightning as pl +import torch +from megatron.core.distributed import DistributedDataParallelConfig +from pytorch_lightning.callbacks.callback import Callback + +from nemo import lightning as nl +from nemo.collections import llm +from nemo.collections.llm.api import finetune, pretrain +from nemo.collections.llm.gpt.data.mock import MockDataModule +from nemo.collections.llm.recipes.log.default import default_log, default_resume, tensorboard_logger +from nemo.collections.llm.recipes.optim.adam import distributed_fused_adam_with_cosine_annealing +from nemo.collections.llm.recipes.precision.mixed_precision import bf16_mixed +from nemo.collections.nlp.modules.common.tokenizer_utils import get_nmt_tokenizer +from nemo.utils.exp_manager import TimingCallback + +NAME = "mamba2_8b" + + +@run.cli.factory(name=NAME) +def tokenizer(tokenizer_model: str = None) -> run.Config[pl.LightningModule]: + + return run.Config( + get_nmt_tokenizer, + library='megatron', + model_name="GPTSentencePieceTokenizer", + tokenizer_model=tokenizer_model, + use_fast=True, + ) + + +@run.cli.factory(name=NAME) +def model(tokenizer_model: str = None) -> run.Config[pl.LightningModule]: + """ + Factory function to create a Mamba2 8B model configuration. + + Returns: + run.Config[pl.LightningModule]: Configuration for the Mamba2 Hybrid 8B model. + + Examples: + CLI usage: + $ nemo llm pretrain model=mamba2_8b ... + + Python API usage: + >>> model_config = model() + >>> print(model_config) + """ + return run.Config( + llm.GPTModel, config=run.Config(llm.NVIDIAMambaConfig8B), tokenizer=tokenizer(tokenizer_model=tokenizer_model) + ) + + +def trainer( + tensor_parallelism: int = 8, + pipeline_parallelism: int = 1, + pipeline_parallelism_type: Optional[torch.dtype] = None, + virtual_pipeline_parallelism: Optional[int] = None, + context_parallelism: int = 1, + sequence_parallelism: bool = False, + num_nodes: int = 1, + num_gpus_per_node: int = 8, + max_steps: int = 1168251, + callbacks: Optional[list[run.Config[Callback]]] = None, +) -> run.Config[nl.Trainer]: + """ + Configure the NeMo Lightning Trainer for Mamba2 Hybrid 8B model. + + This function sets up the distributed training strategy and other training parameters. + + Args: + tensor_parallelism (int): Degree of tensor model parallelism. + pipeline_parallelism (int): Degree of pipeline model parallelism. + pipeline_parallelism_type (Optional[torch.dtype]): Data type for pipeline parallelism. + virtual_pipeline_parallelism (Optional[int]): Size of virtual pipeline parallelism. + context_parallelism (int): Degree of context parallelism. + sequence_parallelism (bool): Whether to use sequence parallelism. + num_nodes (int): Number of compute nodes to use. + num_gpus_per_node (int): Number of GPUs per node. + max_steps (int): Maximum number of training steps. + callbacks (Optional[list[run.Config[Callback]]]): List of callback configurations. + + Returns: + run.Config[nl.Trainer]: Configuration for the NeMo Lightning Trainer. + + Examples: + CLI usage: + $ nemo llm pretrain trainer=mamba2_8b ... + + Python API usage: + >>> trainer_config = trainer(num_nodes=1, num_gpus_per_node=1) + >>> print(trainer_config) + + Note: + For more information on distributed training strategies, refer to the + NeMo documentation on multi-GPU and multi-node training. 
+ """ + strategy = run.Config( + nl.MegatronStrategy, + tensor_model_parallel_size=tensor_parallelism, + pipeline_model_parallel_size=pipeline_parallelism, + pipeline_dtype=pipeline_parallelism_type, + virtual_pipeline_model_parallel_size=virtual_pipeline_parallelism, + context_parallel_size=context_parallelism, + sequence_parallel=sequence_parallelism, + gradient_as_bucket_view=True, + ckpt_async_save=False, + ckpt_parallel_load=True, + ddp=run.Config( + DistributedDataParallelConfig, + check_for_nan_in_grad=True, + grad_reduce_in_fp32=True, + overlap_grad_reduce=True, + overlap_param_gather=True, + ), + ) + + trainer = run.Config( + nl.Trainer, + accelerator="gpu", + accumulate_grad_batches=1, + callbacks=callbacks, + devices=num_gpus_per_node, + limit_test_batches=50, + limit_val_batches=32, + log_every_n_steps=10, + max_steps=max_steps, + num_nodes=num_nodes, + plugins=bf16_mixed(), + strategy=strategy, + use_distributed_sampler=False, + val_check_interval=2000, + ) + + return trainer + + +@run.cli.factory(target=pretrain, name=NAME) +def pretrain_recipe( + dir: Optional[str] = None, + name: str = "default", + tokenizer_model: str = None, + num_nodes: int = 1, + num_gpus_per_node: int = 8, + fn=pretrain, +) -> run.Partial: + """ + Create a pre-training recipe for Mamba2 Hybrid 8B model. + + This function sets up a complete configuration for pre-training, including + model, trainer, data, logging, optimization, and resumption settings. + + Args: + dir (Optional[str]): Directory for saving logs and checkpoints. + name (str): Name of the pre-training run. + num_nodes (int): Number of compute nodes to use. + num_gpus_per_node (int): Number of GPUs per node. + fn (Callable): The pre-training function to use. + + Returns: + run.Partial: Partial configuration for pre-training. + + Examples: + CLI usage: + $ nemo llm pretrain --factory mamba2_8b + $ nemo llm pretrain --factory "mamba2_8b(num_nodes=1, name='my_pretrain')" + + Python API usage: + >>> recipe = pretrain_recipe(name="mamba2_8b_pretrain", num_nodes=1) + >>> print(recipe) + + Note: + For more details on pre-training LLMs with NeMo, see the pre-training + guide in the `examples/llm/pretrain/` directory. + """ + return run.Partial( + fn, + model=model(), + trainer=trainer( + num_nodes=num_nodes, + num_gpus_per_node=num_gpus_per_node, + callbacks=[run.Config(TimingCallback)], + ), + data=run.Config( + MockDataModule, + seq_length=4096, + global_batch_size=8, + micro_batch_size=1, + tokenizer=tokenizer(tokenizer_model=tokenizer_model), + ), + log=default_log(dir=dir, name=name, tensorboard_logger=tensorboard_logger(name=name)), + optim=distributed_fused_adam_with_cosine_annealing(max_lr=3e-4), + resume=default_resume(), + ) + + +@run.cli.factory(target=finetune, name=NAME) +def finetune_recipe( + resume_path, + tokenizer_model, + dir: Optional[str] = None, + name: str = "default", + num_nodes: int = 1, + num_gpus_per_node: int = 8, + gbs: int = 8, + mbs: int = 1, + peft_scheme: Optional[str] = 'none', +) -> run.Partial: + """ + Create a fine-tuning recipe for Mamba2 8B model. + + This function sets up a complete configuration for fine-tuning, including + model, trainer, data, logging, optimization, and resumption settings. + + Args: + dir (Optional[str]): Directory for saving logs and checkpoints. + name (str): Name of the fine-tuning run. 
+ resume_path (str): Path to the NeMo checkpoint (refer to notes below + on how to convert a pytorch checkpoint to NeMo) + tokenizer_model (str): Path to tokenizer model (defaults to None) + num_nodes (int): Number of compute nodes to use. + num_gpus_per_node (int): Number of GPUs per node. + Returns: + run.Partial: Partial configuration for fine-tuning. + + Examples: + CLI usage: + $ nemo llm finetune --factory mamba2_8b + + Python API usage: + >>> recipe = finetune_recipe(name="mamba2_8b_finetune", num_nodes=1) + >>> print(recipe) + + Note: + This recipe uses the SQuAD dataset for fine-tuning. For more information + on fine-tuning LLMs with NeMo, see the fine-tuning guide in the + `examples/llm/finetune/` directory. + For converting an SSM pytorch checkpoint, use the following line of python code: + + llm.GPTModel(llm.NVIDIAMambaConfig8B(), tokenizer=tokenizer(tokenizer_model=tokenizer_model)).import_ckpt( + path="pytorch://ABSOLUTE_PATH_TO_CKPT/your_pytorch_state_dict_file", + model_config=llm.NVIDIAMambaConfig8B()) + This line will cache the nemo checkpoint to following directory: + /root/.cache/nemo/models/your_pytorch_state_dict_file + + """ + nemo_resume = run.Config( + nl.AutoResume, + restore_config=run.Config(nl.RestoreConfig, path=resume_path), + ) + strategy = run.Config( + nl.MegatronStrategy, + tensor_model_parallel_size=8, + pipeline_model_parallel_size=1, + gradient_as_bucket_view=True, + ckpt_load_optimizer=False, + ckpt_save_optimizer=False, + ckpt_async_save=False, + ) + checkpoint_callback = run.Config( + nl.ModelCheckpoint, + every_n_train_steps=10, + dirpath=dir, + ) + trainer = run.Config( + nl.Trainer, + accelerator="gpu", + accumulate_grad_batches=1, + devices=num_gpus_per_node, + limit_test_batches=10, + limit_val_batches=10, + log_every_n_steps=20, + max_steps=100, + num_nodes=num_nodes, + plugins=run.Config( + nl.MegatronMixedPrecision, + precision="bf16-mixed", + params_dtype=torch.bfloat16, + ), + callbacks=[checkpoint_callback], + strategy=strategy, + use_distributed_sampler=False, + val_check_interval=20, + ) + recipe = run.Partial( + llm.finetune, + model=model(tokenizer_model=tokenizer_model), + trainer=trainer, + data=run.Config( + llm.SquadDataModule, + seq_length=2048, + global_batch_size=gbs, + micro_batch_size=mbs, + tokenizer=tokenizer(tokenizer_model=tokenizer_model), + ), + log=llm.default_log(dir=dir, name=name, tensorboard_logger=tensorboard_logger(name=name)), + optim=distributed_fused_adam_with_cosine_annealing(max_lr=1e-4, min_lr=0, warmup_steps=50), + resume=nemo_resume, + ) + if peft_scheme is None or peft_scheme.lower() == 'none': + recipe.trainer.strategy.tensor_model_parallel_size = 8 + recipe.optim.config.lr = 5e-6 + else: + raise ValueError(f"Unrecognized peft scheme: {peft_scheme}") + return recipe diff --git a/nemo/collections/llm/recipes/mamba2_hybrid_8b.py b/nemo/collections/llm/recipes/mamba2_hybrid_8b.py new file mode 100644 index 000000000000..eff37da46fca --- /dev/null +++ b/nemo/collections/llm/recipes/mamba2_hybrid_8b.py @@ -0,0 +1,323 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
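Note that, unlike the smaller variants, the 8B fine-tuning factory above takes resume_path and tokenizer_model as required positional parameters and pins tensor parallelism to 8. A usage sketch with placeholder paths:

# Sketch: both positional arguments are required here; paths are placeholders.
from nemo.collections.llm.recipes import mamba2_8b

recipe = mamba2_8b.finetune_recipe(
    "/path/to/converted_nemo_checkpoint",   # resume_path (placeholder)
    "/path/to/gpt_sentencepiece.model",     # tokenizer_model (placeholder)
    name="mamba2_8b_finetune",
    num_nodes=1,
    num_gpus_per_node=8,                    # matches tensor_model_parallel_size=8 in the strategy
)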
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + + +from typing import Optional + +import nemo_run as run +import pytorch_lightning as pl +import torch +from megatron.core.distributed import DistributedDataParallelConfig +from pytorch_lightning.callbacks.callback import Callback + +from nemo import lightning as nl +from nemo.collections import llm +from nemo.collections.llm.api import finetune, pretrain +from nemo.collections.llm.gpt.data.mock import MockDataModule +from nemo.collections.llm.recipes.log.default import default_log, default_resume, tensorboard_logger +from nemo.collections.llm.recipes.optim.adam import distributed_fused_adam_with_cosine_annealing +from nemo.collections.llm.recipes.precision.mixed_precision import bf16_mixed +from nemo.collections.nlp.modules.common.tokenizer_utils import get_nmt_tokenizer +from nemo.utils.exp_manager import TimingCallback + +NAME = "mamba2_hybrid_8b" + + +@run.cli.factory(name=NAME) +def tokenizer(tokenizer_model: str = None) -> run.Config[pl.LightningModule]: + + return run.Config( + get_nmt_tokenizer, + library='megatronNVIDIAMambaConfig8B', + model_name="GPTSentencePieceTokenizer", + tokenizer_model=tokenizer_model, + use_fast=True, + ) + + +@run.cli.factory(name=NAME) +def model(tokenizer_model: str = None) -> run.Config[pl.LightningModule]: + """ + Factory function to create a Mamba2 Hybrid 8B model configuration. + + Returns: + run.Config[pl.LightningModule]: Configuration for the Mamba2 Hybrid 8B model. + + Examples: + CLI usage: + $ nemo llm pretrain model=mamba2_hybrid_8b ... + + Python API usage: + >>> model_config = model() + >>> print(model_config) + """ + return run.Config( + llm.GPTModel, + config=run.Config(llm.NVIDIAMambaHybridConfig8B), + tokenizer=tokenizer(tokenizer_model=tokenizer_model), + ) + + +def trainer( + tensor_parallelism: int = 8, + pipeline_parallelism: int = 1, + pipeline_parallelism_type: Optional[torch.dtype] = None, + virtual_pipeline_parallelism: Optional[int] = None, + context_parallelism: int = 1, + sequence_parallelism: bool = False, + num_nodes: int = 1, + num_gpus_per_node: int = 8, + max_steps: int = 1168251, + callbacks: Optional[list[run.Config[Callback]]] = None, +) -> run.Config[nl.Trainer]: + """ + Configure the NeMo Lightning Trainer for Mamba2 Hybrid 8B model. + + This function sets up the distributed training strategy and other training parameters. + + Args: + tensor_parallelism (int): Degree of tensor model parallelism. + pipeline_parallelism (int): Degree of pipeline model parallelism. + pipeline_parallelism_type (Optional[torch.dtype]): Data type for pipeline parallelism. + virtual_pipeline_parallelism (Optional[int]): Size of virtual pipeline parallelism. + context_parallelism (int): Degree of context parallelism. + sequence_parallelism (bool): Whether to use sequence parallelism. + num_nodes (int): Number of compute nodes to use. + num_gpus_per_node (int): Number of GPUs per node. + max_steps (int): Maximum number of training steps. + callbacks (Optional[list[run.Config[Callback]]]): List of callback configurations. + + Returns: + run.Config[nl.Trainer]: Configuration for the NeMo Lightning Trainer. 
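One detail in the tokenizer factory above: it passes library='megatronNVIDIAMambaConfig8B' to get_nmt_tokenizer, which selects the tokenizer backend by library name. A hedged sketch of what is presumably intended, assuming the same 'megatron' backend that mamba2_8b.py uses; the helper name below is hypothetical.

# Sketch (assumption): mirror of the factory above with the library string normalized to
# 'megatron', matching mamba2_8b.py; 'megatronNVIDIAMambaConfig8B' reads like a concatenation.
import nemo_run as run
import pytorch_lightning as pl
from nemo.collections.nlp.modules.common.tokenizer_utils import get_nmt_tokenizer


def hybrid_tokenizer(tokenizer_model: str = None) -> run.Config[pl.LightningModule]:
    return run.Config(
        get_nmt_tokenizer,
        library='megatron',
        model_name="GPTSentencePieceTokenizer",
        tokenizer_model=tokenizer_model,
        use_fast=True,
    )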
+ + Examples: + CLI usage: + $ nemo llm pretrain trainer=mamba2_hybrid_8b ... + + Python API usage: + >>> trainer_config = trainer(num_nodes=1, num_gpus_per_node=1) + >>> print(trainer_config) + + Note: + For more information on distributed training strategies, refer to the + NeMo documentation on multi-GPU and multi-node training. + """ + strategy = run.Config( + nl.MegatronStrategy, + tensor_model_parallel_size=tensor_parallelism, + pipeline_model_parallel_size=pipeline_parallelism, + pipeline_dtype=pipeline_parallelism_type, + virtual_pipeline_model_parallel_size=virtual_pipeline_parallelism, + context_parallel_size=context_parallelism, + sequence_parallel=sequence_parallelism, + gradient_as_bucket_view=True, + ckpt_async_save=False, + ckpt_parallel_load=True, + ddp=run.Config( + DistributedDataParallelConfig, + check_for_nan_in_grad=True, + grad_reduce_in_fp32=True, + overlap_grad_reduce=True, + overlap_param_gather=True, + ), + ) + + trainer = run.Config( + nl.Trainer, + accelerator="gpu", + accumulate_grad_batches=1, + callbacks=callbacks, + devices=num_gpus_per_node, + limit_test_batches=50, + limit_val_batches=32, + log_every_n_steps=10, + max_steps=max_steps, + num_nodes=num_nodes, + plugins=bf16_mixed(), + strategy=strategy, + use_distributed_sampler=False, + val_check_interval=2000, + ) + + return trainer + + +@run.cli.factory(target=pretrain, name=NAME) +def pretrain_recipe( + dir: Optional[str] = None, + name: str = "default", + tokenizer_model: str = None, + num_nodes: int = 1, + num_gpus_per_node: int = 8, + fn=pretrain, +) -> run.Partial: + """ + Create a pre-training recipe for Mamba2 Hybrid 8B model. + + This function sets up a complete configuration for pre-training, including + model, trainer, data, logging, optimization, and resumption settings. + + Args: + dir (Optional[str]): Directory for saving logs and checkpoints. + name (str): Name of the pre-training run. + num_nodes (int): Number of compute nodes to use. + num_gpus_per_node (int): Number of GPUs per node. + fn (Callable): The pre-training function to use. + + Returns: + run.Partial: Partial configuration for pre-training. + + Examples: + CLI usage: + $ nemo llm pretrain --factory mamba2_hybrid_8b + $ nemo llm pretrain --factory "mamba2_hybrid_8b(num_nodes=1, name='my_pretrain')" + + Python API usage: + >>> recipe = pretrain_recipe(name="mamba2_hybrid_8b_pretrain", num_nodes=1) + >>> print(recipe) + + Note: + For more details on pre-training LLMs with NeMo, see the pre-training + guide in the `examples/llm/pretrain/` directory. + """ + return run.Partial( + fn, + model=model(), + trainer=trainer( + num_nodes=num_nodes, + num_gpus_per_node=num_gpus_per_node, + callbacks=[run.Config(TimingCallback)], + ), + data=run.Config( + MockDataModule, + seq_length=4096, + global_batch_size=8, + micro_batch_size=1, + tokenizer=tokenizer(tokenizer_model=tokenizer_model), + ), + log=default_log(dir=dir, name=name, tensorboard_logger=tensorboard_logger(name=name)), + optim=distributed_fused_adam_with_cosine_annealing(max_lr=3e-4), + resume=default_resume(), + ) + + +@run.cli.factory(target=finetune, name=NAME) +def finetune_recipe( + resume_path, + tokenizer_model, + dir: Optional[str] = None, + name: str = "default", + num_nodes: int = 1, + num_gpus_per_node: int = 8, + gbs: int = 8, + mbs: int = 1, + peft_scheme: Optional[str] = 'none', +) -> run.Partial: + """ + Create a fine-tuning recipe for Mamba2 Hybrid 8B model. 
+ + This function sets up a complete configuration for fine-tuning, including + model, trainer, data, logging, optimization, and resumption settings. + + Args: + dir (Optional[str]): Directory for saving logs and checkpoints. + name (str): Name of the fine-tuning run. + resume_path (str): Path to the NeMo checkpoint (refer to notes below + on how to convert a pytorch checkpoint to NeMo) + tokenizer_model (str): Path to tokenizer model (defaults to None) + num_nodes (int): Number of compute nodes to use. + num_gpus_per_node (int): Number of GPUs per node. + Returns: + run.Partial: Partial configuration for fine-tuning. + + Examples: + CLI usage: + $ nemo llm finetune --factory mamba2_hybrid_8b + + Python API usage: + >>> recipe = finetune_recipe(name="mamba2_hybrid_8b_finetune", num_nodes=1) + >>> print(recipe) + + Note: + This recipe uses the SQuAD dataset for fine-tuning. For more information + on fine-tuning LLMs with NeMo, see the fine-tuning guide in the + `examples/llm/finetune/` directory. + For converting an SSM pytorch checkpoint, use the following line of python code: + + llm.GPTModel(llm.NVIDIAMambaHybridConfig8B(), tokenizer=tokenizer(tokenizer_model=tokenizer_model)).import_ckpt( + path="pytorch://ABSOLUTE_PATH_TO_CKPT/your_pytorch_state_dict_file", + model_config=llm.NVIDIAMambaHybridConfig8B()) + This line will cache the nemo checkpoint to following directory: + /root/.cache/nemo/models/your_pytorch_state_dict_file + + """ + nemo_resume = run.Config( + nl.AutoResume, + restore_config=run.Config(nl.RestoreConfig, path=resume_path), + ) + strategy = run.Config( + nl.MegatronStrategy, + tensor_model_parallel_size=8, + pipeline_model_parallel_size=1, + gradient_as_bucket_view=True, + ckpt_load_optimizer=False, + ckpt_save_optimizer=False, + ckpt_async_save=False, + ) + checkpoint_callback = run.Config( + nl.ModelCheckpoint, + every_n_train_steps=10, + dirpath=dir, + ) + trainer = run.Config( + nl.Trainer, + accelerator="gpu", + accumulate_grad_batches=1, + devices=num_gpus_per_node, + limit_test_batches=10, + limit_val_batches=10, + log_every_n_steps=20, + max_steps=100, + num_nodes=num_nodes, + plugins=run.Config( + nl.MegatronMixedPrecision, + precision="bf16-mixed", + params_dtype=torch.bfloat16, + ), + callbacks=[checkpoint_callback], + strategy=strategy, + use_distributed_sampler=False, + val_check_interval=20, + ) + recipe = run.Partial( + llm.finetune, + model=model(tokenizer_model=tokenizer_model), + trainer=trainer, + data=run.Config( + llm.SquadDataModule, + seq_length=2048, + global_batch_size=gbs, + micro_batch_size=mbs, + tokenizer=tokenizer(tokenizer_model=tokenizer_model), + ), + log=llm.default_log(dir=dir, name=name, tensorboard_logger=tensorboard_logger(name=name)), + optim=distributed_fused_adam_with_cosine_annealing(max_lr=1e-4, min_lr=0, warmup_steps=50), + resume=nemo_resume, + ) + if peft_scheme is None or peft_scheme.lower() == 'none': + recipe.trainer.strategy.tensor_model_parallel_size = 8 + recipe.optim.config.lr = 5e-6 + else: + raise ValueError(f"Unrecognized peft scheme: {peft_scheme}") + return recipe diff --git a/nemo/collections/llm/recipes/mistral.py b/nemo/collections/llm/recipes/mistral_7b.py similarity index 99% rename from nemo/collections/llm/recipes/mistral.py rename to nemo/collections/llm/recipes/mistral_7b.py index 2b8c42e54ee7..6e82df598140 100644 --- a/nemo/collections/llm/recipes/mistral.py +++ b/nemo/collections/llm/recipes/mistral_7b.py @@ -33,7 +33,7 @@ from nemo.collections.llm.recipes.precision.mixed_precision import bf16_mixed 
from nemo.utils.exp_manager import TimingCallback -NAME = "mistral" +NAME = "mistral_7b" @run.cli.factory(name=NAME) diff --git a/nemo/collections/llm/recipes/mistral_nemo_12b.py b/nemo/collections/llm/recipes/mistral_nemo_12b.py new file mode 100644 index 000000000000..e74fa5435b62 --- /dev/null +++ b/nemo/collections/llm/recipes/mistral_nemo_12b.py @@ -0,0 +1,285 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + + +from typing import Callable, Optional + +import nemo_run as run +import pytorch_lightning as pl +import torch +from megatron.core.distributed import DistributedDataParallelConfig +from pytorch_lightning.callbacks.callback import Callback + +from nemo import lightning as nl +from nemo.collections.llm.api import finetune, pretrain +from nemo.collections.llm.gpt.data.mock import MockDataModule +from nemo.collections.llm.gpt.data.squad import SquadDataModule +from nemo.collections.llm.gpt.model.mistral import MistralModel, MistralNeMoConfig12B +from nemo.collections.llm.peft.lora import LoRA +from nemo.collections.llm.recipes.finetune_default import default_finetune_recipe +from nemo.collections.llm.recipes.log.default import default_log, default_resume, tensorboard_logger +from nemo.collections.llm.recipes.optim.adam import distributed_fused_adam_with_cosine_annealing +from nemo.collections.llm.recipes.precision.mixed_precision import bf16_mixed +from nemo.lightning.pytorch.callbacks.megatron_comm_overlap import MegatronCommOverlapCallback +from nemo.utils.exp_manager import TimingCallback + +NAME = "mistral_nemo_base_12b" + + +@run.cli.factory(name=NAME) +def model() -> run.Config[pl.LightningModule]: + """ + Factory function to create a Mistral-Nemo-Base-12B model configuration. + + Returns: + run.Config[pl.LightningModule]: Configuration for the Mistral-Nemo-Base-12B model. + + Examples: + CLI usage: + $ nemo llm pretrain model=mistral_nemo_base_12b ... + + Python API usage: + >>> model_config = model() + >>> print(model_config) + """ + return run.Config(MistralModel, config=run.Config(MistralNeMoConfig12B)) + + +def trainer( + tensor_parallelism: int = 2, + pipeline_parallelism: int = 1, + pipeline_parallelism_type: Optional[torch.dtype] = None, + virtual_pipeline_parallelism: Optional[int] = None, + context_parallelism: int = 2, + sequence_parallelism: bool = True, + num_nodes: int = 1, + num_gpus_per_node: int = 8, + max_steps: int = 1168251, + callbacks: Optional[list[run.Config[Callback]]] = None, +) -> run.Config[nl.Trainer]: + """ + Configure the NeMo Lightning Trainer for Mistral-Nemo-Base-12B model. + + This function sets up the distributed training strategy and other training parameters. + + Args: + tensor_parallelism (int): Degree of tensor model parallelism. + pipeline_parallelism (int): Degree of pipeline model parallelism. + pipeline_parallelism_type (Optional[torch.dtype]): Data type for pipeline parallelism. + virtual_pipeline_parallelism (Optional[int]): Size of virtual pipeline parallelism. 
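Returning to the rename of nemo/collections/llm/recipes/mistral.py to mistral_7b.py above: the registered factory name changes from 'mistral' to 'mistral_7b', so callers need the new module and factory name. A sketch, assuming the module's existing recipe factories are otherwise unchanged by the 99%-similarity rename:

# Sketch: the Mistral 7B recipe module and CLI factory are now named mistral_7b.
# The old import path nemo.collections.llm.recipes.mistral no longer exists after this change.
from nemo.collections.llm.recipes import mistral_7b

recipe = mistral_7b.pretrain_recipe(name="mistral_7b_pretrain", num_nodes=1, num_gpus_per_node=8)
# CLI equivalent (assumed, following the pattern used by the other recipes in this diff):
#   nemo llm pretrain --factory mistral_7b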
+ context_parallelism (int): Degree of context parallelism. + sequence_parallelism (bool): Whether to use sequence parallelism. + num_nodes (int): Number of compute nodes to use. + num_gpus_per_node (int): Number of GPUs per node. + max_steps (int): Maximum number of training steps. + callbacks (Optional[list[run.Config[Callback]]]): List of callback configurations. + + Returns: + run.Config[nl.Trainer]: Configuration for the NeMo Lightning Trainer. + + Examples: + CLI usage: + $ nemo llm pretrain trainer=mistral_nemo_base_12b ... + + Python API usage: + >>> trainer_config = trainer(num_nodes=2, num_gpus_per_node=8) + >>> print(trainer_config) + + Note: + For more information on distributed training strategies, refer to the + NeMo documentation on multi-GPU and multi-node training. + """ + strategy = run.Config( + nl.MegatronStrategy, + tensor_model_parallel_size=tensor_parallelism, + pipeline_model_parallel_size=pipeline_parallelism, + pipeline_dtype=pipeline_parallelism_type, + virtual_pipeline_model_parallel_size=virtual_pipeline_parallelism, + context_parallel_size=context_parallelism, + sequence_parallel=sequence_parallelism, + gradient_as_bucket_view=True, + ckpt_async_save=True, + ckpt_parallel_load=True, + ddp=run.Config( + DistributedDataParallelConfig, + check_for_nan_in_grad=True, + grad_reduce_in_fp32=True, + overlap_grad_reduce=True, + overlap_param_gather=True, + ), + ) + + trainer = run.Config( + nl.Trainer, + accelerator="gpu", + accumulate_grad_batches=1, + callbacks=callbacks, + devices=num_gpus_per_node, + limit_test_batches=50, + limit_val_batches=32, + log_every_n_steps=10, + max_steps=max_steps, + num_nodes=num_nodes, + plugins=bf16_mixed(), + strategy=strategy, + use_distributed_sampler=False, + val_check_interval=2000, + ) + + return trainer + + +@run.cli.factory(target=pretrain, name=NAME) +def pretrain_recipe( + dir: Optional[str] = None, name: str = "default", num_nodes: int = 1, num_gpus_per_node: int = 8, fn=pretrain +) -> run.Partial: + """ + Create a pre-training recipe for Mistral-Nemo-Base-12B model. + + This function sets up a complete configuration for pre-training, including + model, trainer, data, logging, optimization, and resumption settings. + + Args: + dir (Optional[str]): Directory for saving logs and checkpoints. + name (str): Name of the pre-training run. + num_nodes (int): Number of compute nodes to use. + num_gpus_per_node (int): Number of GPUs per node. + fn (Callable): The pre-training function to use. + + Returns: + run.Partial: Partial configuration for pre-training. + + Examples: + CLI usage: + $ nemo llm pretrain --factory mistral_nemo_base_12b + $ nemo llm pretrain --factory "mistral_nemo_base_12b(num_nodes=2, name='my_pretrain')" + + Python API usage: + >>> recipe = pretrain_recipe(name="mistral_nemo_base_12b", num_nodes=2) + >>> print(recipe) + + Note: + For more details on pre-training LLMs with NeMo, see the pre-training + guide in the `examples/llm/pretrain/` directory. 
+ """ + return run.Partial( + fn, + model=model(), + trainer=trainer( + num_nodes=num_nodes, + num_gpus_per_node=num_gpus_per_node, + callbacks=[run.Config(TimingCallback)], + ), + data=run.Config(MockDataModule, seq_length=8192, global_batch_size=512, micro_batch_size=1), + log=default_log(dir=dir, name=name, tensorboard_logger=tensorboard_logger(name=name)), + optim=distributed_fused_adam_with_cosine_annealing(max_lr=3e-4), + resume=default_resume(), + ) + + +@run.cli.factory(target=pretrain, name=NAME + "_optimized") +def pretrain_recipe_performance( + dir: Optional[str] = None, + name: str = "default", + num_nodes: int = 1, + num_gpus_per_node: int = 8, + fn: Callable = pretrain, +) -> run.Partial: + """ + Create a performance-optimized pre-training recipe for Mistral-Nemo-Base-12B model. + + This recipe enables performance optimizations that may not be suitable for all use cases. + It builds upon the standard pre-training recipe and adds additional performance enhancements. + + Args: + dir (Optional[str]): Directory for saving logs and checkpoints. + name (str): Name of the pre-training run. + num_nodes (int): Number of compute nodes to use. + num_gpus_per_node (int): Number of GPUs per node. + fn (Callable): The pre-training function to use. + + Returns: + run.Partial: Partial configuration for performance-optimized pre-training. + + Examples: + $ nemo llm pretrain --factory mistral_nemo_base_12b_optimized + + Python API usage: + >>> recipe = pretrain_recipe_performance(name="mistral_nemo_base_12b_perf", num_nodes=4) + >>> print(recipe) + + Note: + Use this recipe with caution and only when you need maximum performance. + It may not be suitable for all hardware configurations or use cases. + """ + recipe = pretrain_recipe(name=name, dir=dir, num_nodes=num_nodes, num_gpus_per_node=num_gpus_per_node, fn=fn) + + recipe.trainer.callbacks.append( + run.Config( + MegatronCommOverlapCallback, + tp_comm_overlap=True, + ) + ) + return recipe + + +@run.cli.factory(target=finetune, name=NAME) +def finetune_recipe( + dir: Optional[str] = None, + name: str = "default", + num_nodes: int = 1, + num_gpus_per_node: int = 8, + peft_scheme: Optional[str] = 'lora', +) -> run.Partial: + """ + Create a fine-tuning recipe for Mistral-Nemo-Base-12B model. + + This function sets up a complete configuration for fine-tuning, including + model, trainer, data, logging, optimization, and resumption settings. + The recipe uses LoRA (Low-Rank Adaptation) for efficient fine-tuning, unless peft_scheme is set to None. + + Args: + dir (Optional[str]): Directory for saving logs and checkpoints. + name (str): Name of the fine-tuning run. + num_nodes (int): Number of compute nodes to use. + num_gpus_per_node (int): Number of GPUs per node. + peft_scheme (Optional[str]): Name of the peft scheme to use for fine-tuning. Allowed values: 'lora', 'none'/None. + + Returns: + run.Partial: Partial configuration for fine-tuning. + + Examples: + CLI usage: + $ nemo llm finetune --factory mistral_nemo_base_12b + + Python API usage: + >>> recipe = finetune_recipe(name="mistral_nemo_base_12b_finetune", num_nodes=2) + >>> print(recipe) + + Note: + This recipe uses the SQuAD dataset for fine-tuning. For more information + on fine-tuning LLMs with NeMo, see the fine-tuning guide in the + `examples/llm/finetune/` directory. 
+ """ + recipe = default_finetune_recipe( + model(), "mistralai/Mistral-Nemo-Base-2407", dir, name, num_nodes, num_gpus_per_node + ) + if peft_scheme is None or peft_scheme.lower() == 'none': + recipe.optim.config.lr = 5e-6 + elif peft_scheme.lower() == 'lora': + recipe.peft = run.Config(LoRA, target_modules=['linear_qkv', 'linear_proj'], dim=32) + recipe.optim.config.lr = 1e-4 + else: + raise ValueError(f"Unrecognized peft scheme: {peft_scheme}") + return recipe diff --git a/nemo/collections/llm/recipes/mixtral_8x22b.py b/nemo/collections/llm/recipes/mixtral_8x22b.py index 222a37d7a0c5..1bfef9be5582 100644 --- a/nemo/collections/llm/recipes/mixtral_8x22b.py +++ b/nemo/collections/llm/recipes/mixtral_8x22b.py @@ -13,7 +13,7 @@ # limitations under the License. -from typing import Optional +from typing import Callable, Optional import nemo_run as run import pytorch_lightning as pl @@ -117,6 +117,9 @@ def trainer( DistributedDataParallelConfig, check_for_nan_in_grad=True, grad_reduce_in_fp32=True, + overlap_grad_reduce=True, + overlap_param_gather=True, + average_in_collective=True, ), ) @@ -142,7 +145,12 @@ def trainer( @run.cli.factory(target=pretrain, name=NAME) def pretrain_recipe( - dir: Optional[str] = None, name: str = "default", num_nodes: int = 16, num_gpus_per_node: int = 8, fn=pretrain + dir: Optional[str] = None, + name: str = "default", + num_nodes: int = 16, + num_gpus_per_node: int = 8, + performance_mode: bool = False, + fn: Callable = pretrain, ) -> run.Partial: """ Create a pre-training recipe for Mixtral 8x22B model. @@ -155,6 +163,7 @@ def pretrain_recipe( name (str): Name of the pre-training run. num_nodes (int): Number of compute nodes to use. num_gpus_per_node (int): Number of GPUs per node. + performance_mode (bool): If true, enables optimizations for maximum performance. fn (Callable): The pre-training function to use. Returns: @@ -169,7 +178,7 @@ def pretrain_recipe( >>> recipe = pretrain_recipe(name="mixtral_pretrain", num_nodes=16) >>> print(recipe) """ - return run.Partial( + recipe = run.Partial( fn, model=model(), trainer=trainer( @@ -181,44 +190,44 @@ def pretrain_recipe( resume=default_resume(), ) + if performance_mode: + recipe = pretrain_performance_optimizations(recipe) -@run.cli.factory(target=pretrain, name=NAME + "_performance") -def pretrain_recipe_performance( - dir: Optional[str] = None, name: str = "default", num_nodes: int = 8, num_gpus_per_node: int = 8, fn=pretrain -) -> run.Partial: + return recipe + + +def pretrain_performance_optimizations(recipe: run.Partial) -> run.Partial: """ Create a performance-optimized pre-training recipe for Mixtral 8x22B model. - This recipe enables performance optimizations that may not be suitable for all use cases. + This method enables performance optimizations that may not be suitable for all use cases. It builds upon the standard pre-training recipe and adds additional performance enhancements. Args: - dir (Optional[str]): Directory for saving logs and checkpoints. - name (str): Name of the pre-training run. - num_nodes (int): Number of compute nodes to use. - num_gpus_per_node (int): Number of GPUs per node. - fn (Callable): The pre-training function to use. + recipe (run.Partial): Base pre-train recipe to which performance optimizations will be added Returns: run.Partial: Partial configuration for performance-optimized pre-training. 
- Examples: - CLI usage: - $ nemo llm pretrain --factory "mixtral_8x22b.pretrain_recipe_performance(num_nodes=8, name='perf_pretrain')" - - Python API usage: - >>> recipe = pretrain_recipe_performance(name="mixtral_8x22b_perf", num_nodes=8) - >>> print(recipe) - Note: - Use this recipe with caution and only when you need maximum performance. + Use this method with caution and only when you need maximum performance. It may not be suitable for all hardware configurations or use cases. """ - recipe = pretrain_recipe(name=name, dir=dir, num_nodes=num_nodes, num_gpus_per_node=num_gpus_per_node, fn=fn) + + # 'overlap_param_gather_with_optimizer_step' and 'align_param_gather' params are set automatically + # by MegatronCommOverlapCallback. They are added here for user's knowledge. + # overlap_param_gather_with_optimizer_step- Overlap param all-gather of first bucket with optimizer step. + # align_param_gather- If true, all PP stages launch param all-gathers simultaneously, else + # each PP stage launches independently as needed. + recipe.trainer.callbacks.extend( [ - run.Config(MegatronTokenDropCallback), - run.Config(MegatronCommOverlapCallback), + run.Config( + MegatronTokenDropCallback, + ), + run.Config( + MegatronCommOverlapCallback, overlap_param_gather_with_optimizer_step=True, align_param_gather=True + ), ] ) diff --git a/nemo/collections/llm/recipes/mixtral_8x7b.py b/nemo/collections/llm/recipes/mixtral_8x7b.py index d0609761feea..8e39e73aab76 100644 --- a/nemo/collections/llm/recipes/mixtral_8x7b.py +++ b/nemo/collections/llm/recipes/mixtral_8x7b.py @@ -13,7 +13,7 @@ # limitations under the License. -from typing import Optional +from typing import Callable, Optional import nemo_run as run import pytorch_lightning as pl @@ -116,6 +116,7 @@ def trainer( grad_reduce_in_fp32=True, overlap_grad_reduce=True, overlap_param_gather=True, + average_in_collective=True, ), ) @@ -141,7 +142,12 @@ def trainer( @run.cli.factory(target=pretrain, name=NAME) def pretrain_recipe( - dir: Optional[str] = None, name: str = "default", num_nodes: int = 8, num_gpus_per_node: int = 8, fn=pretrain + dir: Optional[str] = None, + name: str = "default", + num_nodes: int = 8, + num_gpus_per_node: int = 8, + performance_mode: bool = False, + fn: Callable = pretrain, ) -> run.Partial: """ Create a pre-training recipe for Mixtral 8x7B model. @@ -154,6 +160,7 @@ def pretrain_recipe( name (str): Name of the pre-training run. num_nodes (int): Number of compute nodes to use. num_gpus_per_node (int): Number of GPUs per node. + performance_mode (bool): If true, enables optimizations for maximum performance. fn (Callable): The pre-training function to use. Returns: @@ -168,7 +175,7 @@ def pretrain_recipe( >>> recipe = pretrain_recipe(name="mixtral_8x7b_pretrain", num_nodes=8) >>> print(recipe) """ - return run.Partial( + recipe = run.Partial( fn, model=model(), trainer=trainer( @@ -180,44 +187,44 @@ def pretrain_recipe( resume=default_resume(), ) + if performance_mode: + recipe = pretrain_performance_optimizations(recipe) -@run.cli.factory(target=pretrain, name=NAME + "_performance") -def pretrain_recipe_performance( - dir: Optional[str] = None, name: str = "default", num_nodes: int = 8, num_gpus_per_node: int = 8, fn=pretrain -) -> run.Partial: + return recipe + + +def pretrain_performance_optimizations(recipe: run.Partial) -> run.Partial: """ Create a performance-optimized pre-training recipe for Mixtral 8x7B model. - This recipe enables performance optimizations that may not be suitable for all use cases. 
+ This method enables performance optimizations that may not be suitable for all use cases. It builds upon the standard pre-training recipe and adds additional performance enhancements. Args: - dir (Optional[str]): Directory for saving logs and checkpoints. - name (str): Name of the pre-training run. - num_nodes (int): Number of compute nodes to use. - num_gpus_per_node (int): Number of GPUs per node. - fn (Callable): The pre-training function to use. + recipe (run.Partial): Base pre-train recipe to which performance optimizations will be added Returns: run.Partial: Partial configuration for performance-optimized pre-training. - Examples: - CLI usage: - $ nemo llm pretrain --factory "mixtral_8x3b.pretrain_recipe_performance(num_nodes=8, name='perf_pretrain')" - - Python API usage: - >>> recipe = pretrain_recipe_performance(name="mixtral_8x7b_perf", num_nodes=8) - >>> print(recipe) - Note: - Use this recipe with caution and only when you need maximum performance. + Use this method with caution and only when you need maximum performance. It may not be suitable for all hardware configurations or use cases. """ - recipe = pretrain_recipe(name=name, dir=dir, num_nodes=num_nodes, num_gpus_per_node=num_gpus_per_node, fn=fn) + + # 'overlap_param_gather_with_optimizer_step' and 'align_param_gather' params are set automatically + # by MegatronCommOverlapCallback. They are added here for user's knowledge. + # overlap_param_gather_with_optimizer_step- Overlap param all-gather of first bucket with optimizer step. + # align_param_gather- If true, all PP stages launch param all-gathers simultaneously, else + # each PP stage launches independently as needed. + recipe.trainer.callbacks.extend( [ run.Config(MegatronTokenDropCallback), - run.Config(MegatronCommOverlapCallback), + run.Config( + MegatronCommOverlapCallback, + overlap_param_gather_with_optimizer_step=True, + align_param_gather=True, + ), ] ) diff --git a/nemo/collections/llm/recipes/mixtral_8x7b_16k.py b/nemo/collections/llm/recipes/mixtral_8x7b_16k.py index 8b26a8c7c3e3..7cbfaf723544 100644 --- a/nemo/collections/llm/recipes/mixtral_8x7b_16k.py +++ b/nemo/collections/llm/recipes/mixtral_8x7b_16k.py @@ -51,7 +51,7 @@ def model() -> run.Config[pl.LightningModule]: def trainer( - num_nodes: int = 2, + num_nodes: int = 4, num_gpus_per_node: int = 8, ) -> run.Config: """ @@ -60,8 +60,8 @@ def trainer( This function sets up the distributed training strategy optimized for longer sequences. Args: - num_nodes (int): Number of compute nodes to use. - num_gpus_per_node (int): Number of GPUs per node. + num_nodes (int, optional): Number of compute nodes to use. Defaults to 4. + num_gpus_per_node (int, optional): Number of GPUs per node. Defaults to 8. Returns: run.Config: Configuration for the NeMo Lightning Trainer. @@ -71,17 +71,17 @@ def trainer( $ nemo llm pretrain trainer=mixtral_8x7b_16k ... Python API usage: - >>> trainer_config = trainer(num_nodes=2, num_gpus_per_node=8) + >>> trainer_config = trainer(num_nodes=4, num_gpus_per_node=8) >>> print(trainer_config) Note: This configuration uses increased parallelism to handle the longer sequence length efficiently. 
""" return mixtral_8x7b.trainer( - tensor_parallelism=2, - pipeline_parallelism=4, + tensor_parallelism=4, + pipeline_parallelism=2, pipeline_parallelism_type=torch.bfloat16, - virtual_pipeline_parallelism=8, + virtual_pipeline_parallelism=None, context_parallelism=4, sequence_parallelism=True, expert_parallelism=1, @@ -95,7 +95,7 @@ def trainer( def pretrain_recipe( dir: Optional[str] = None, name: str = "default", - num_nodes: int = 2, + num_nodes: int = 4, num_gpus_per_node: int = 8, ) -> run.Partial: """ @@ -107,8 +107,8 @@ def pretrain_recipe( Args: dir (Optional[str]): Directory for saving logs and checkpoints. name (str): Name of the pre-training run. - num_nodes (int): Number of compute nodes to use. - num_gpus_per_node (int): Number of GPUs per node. + num_nodes (int, optional): Number of compute nodes to use. Defaults to 4. + num_gpus_per_node (int, optional): Number of GPUs per node. Defaults to 8. Returns: run.Partial: Partial configuration for pre-training. @@ -116,10 +116,10 @@ def pretrain_recipe( Examples: CLI usage: $ nemo llm pretrain --factory mixtral_8x7b_16k - $ nemo llm pretrain --factory "mixtral_8x7b_16k(num_nodes=2, name='my_16k_pretrain')" + $ nemo llm pretrain --factory "mixtral_8x7b_16k(num_nodes=4, name='my_16k_pretrain')" Python API usage: - >>> recipe = pretrain_recipe(name="mixtral_8x7b_16k_pretrain", num_nodes=2) + >>> recipe = pretrain_recipe(name="mixtral_8x7b_16k_pretrain", num_nodes=4) >>> print(recipe) """ recipe = mixtral_8x7b.pretrain_recipe(name=name, dir=dir, num_nodes=num_nodes, num_gpus_per_node=num_gpus_per_node) diff --git a/nemo/collections/llm/recipes/mixtral_8x7b_64k.py b/nemo/collections/llm/recipes/mixtral_8x7b_64k.py index 6c8f7077fba3..3606be5ec12b 100644 --- a/nemo/collections/llm/recipes/mixtral_8x7b_64k.py +++ b/nemo/collections/llm/recipes/mixtral_8x7b_64k.py @@ -21,7 +21,6 @@ from nemo.collections.llm.api import finetune, pretrain from nemo.collections.llm.gpt.data.mock import MockDataModule -from nemo.collections.llm.gpt.data.squad import SquadDataModule from nemo.collections.llm.recipes import mixtral_8x7b from nemo.utils.exp_manager import TimingCallback @@ -59,8 +58,8 @@ def trainer( This function sets up the distributed training strategy optimized for very long sequences. Args: - num_nodes (int): Number of compute nodes to use. - num_gpus_per_node (int): Number of GPUs per node. + num_nodes (int, optional): Number of compute nodes to use. Defaults to 8. + num_gpus_per_node (int, optional): Number of GPUs per node. Defaults to 8. Returns: run.Config: Configuration for the NeMo Lightning Trainer. @@ -78,11 +77,11 @@ def trainer( It requires a substantial amount of computational resources. """ return mixtral_8x7b.trainer( - tensor_parallelism=4, + tensor_parallelism=8, pipeline_parallelism=4, pipeline_parallelism_type=torch.bfloat16, - virtual_pipeline_parallelism=4, - context_parallelism=8, + virtual_pipeline_parallelism=None, + context_parallelism=4, sequence_parallelism=True, expert_parallelism=1, num_nodes=num_nodes, @@ -107,8 +106,8 @@ def pretrain_recipe( Args: dir (Optional[str]): Directory for saving logs and checkpoints. name (str): Name of the pre-training run. - num_nodes (int): Number of compute nodes to use. - num_gpus_per_node (int): Number of GPUs per node. + num_nodes (int, optional): Number of compute nodes to use. Defaults to 16. + num_gpus_per_node (int, optional): Number of GPUs per node. Defaults to 8. Returns: run.Partial: Partial configuration for pre-training. 
diff --git a/nemo/collections/llm/recipes/nemotron.py b/nemo/collections/llm/recipes/nemotron.py index 1dd1ef2f83bc..aedf3fcf2954 100644 --- a/nemo/collections/llm/recipes/nemotron.py +++ b/nemo/collections/llm/recipes/nemotron.py @@ -17,6 +17,7 @@ import nemo_run as run import pytorch_lightning as pl import torch +from megatron.core.distributed import DistributedDataParallelConfig from pytorch_lightning.callbacks.callback import Callback from nemo import lightning as nl @@ -124,6 +125,14 @@ def nemotron_trainer( ckpt_include_optimizer=True, ckpt_async_save=True, ckpt_parallel_load=True, + ddp=run.Config( + DistributedDataParallelConfig, + check_for_nan_in_grad=True, + grad_reduce_in_fp32=True, + overlap_grad_reduce=True, + overlap_param_gather=True, + average_in_collective=True, + ), ) precision_plugin = None diff --git a/nemo/collections/llm/recipes/nemotron3_8b.py b/nemo/collections/llm/recipes/nemotron3_8b.py index 3cdb647b5f84..7dcebe17f872 100644 --- a/nemo/collections/llm/recipes/nemotron3_8b.py +++ b/nemo/collections/llm/recipes/nemotron3_8b.py @@ -12,7 +12,7 @@ # See the License for the specific language governing permissions and # limitations under the License. -from typing import Optional +from typing import Callable, Optional import nemo_run as run import pytorch_lightning as pl @@ -26,6 +26,7 @@ from nemo.collections.llm.recipes.log.default import default_log, default_resume, tensorboard_logger from nemo.collections.llm.recipes.nemotron import nemotron_model, nemotron_trainer from nemo.collections.llm.recipes.optim.adam import distributed_fused_adam_with_cosine_annealing +from nemo.lightning.pytorch.callbacks.megatron_comm_overlap import MegatronCommOverlapCallback from nemo.utils.exp_manager import TimingCallback NAME = "nemotron3_8b" @@ -82,6 +83,7 @@ def pretrain_recipe( constant_steps=0, min_lr=3.0e-5, max_lr=3e-4, + performance_mode: bool = False, # Training function fn=pretrain, ) -> run.Partial: @@ -117,6 +119,7 @@ def pretrain_recipe( constant_steps (int): Number of constant steps. min_lr (float): Minimum learning rate. max_lr (float): Maximum learning rate. + performance_mode (bool): If true, enables optimizations for maximum performance. fn (Callable): The pre-training function to use. Returns: @@ -134,7 +137,7 @@ def pretrain_recipe( Note: This recipe uses a mock dataset, look for the finetune examples to see how to change the dataset. """ - return run.Partial( + recipe = run.Partial( fn, model=model(), trainer=nemotron_trainer( @@ -173,6 +176,38 @@ def pretrain_recipe( resume=default_resume(), ) + if performance_mode: + recipe = pretrain_performance_optimizations(recipe) + + return recipe + + +def pretrain_performance_optimizations(recipe: run.Partial) -> run.Partial: + """ + Create a performance-optimized pre-training recipe for Nemotron3 8B model. + + This method enables performance optimizations that may not be suitable for all use cases. + It builds upon the standard pre-training recipe and adds additional performance enhancements. + + Args: + recipe (run.Partial): Base pre-train recipe to which performance optimizations will be added + + Returns: + run.Partial: Partial configuration for performance-optimized pre-training. + + Note: + Use this method with caution and only when you need maximum performance. + It may not be suitable for all hardware configurations or use cases. 
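+
+        This helper is normally reached through the recipe factory rather than called
+        directly; an illustrative entry point (argument value is an example only):
+
+        >>> recipe = pretrain_recipe(performance_mode=True)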
+ """ + + recipe.trainer.callbacks.append( + run.Config( + MegatronCommOverlapCallback, + tp_comm_overlap=True, + ) + ) + return recipe + @run.cli.factory(name=NAME + "_nemo") def nemo_resume() -> run.Config[nl.AutoResume]: diff --git a/nemo/collections/llm/recipes/nemotron4_15b.py b/nemo/collections/llm/recipes/nemotron4_15b.py index c0acae6b13f0..16ae7b2b1e79 100644 --- a/nemo/collections/llm/recipes/nemotron4_15b.py +++ b/nemo/collections/llm/recipes/nemotron4_15b.py @@ -12,7 +12,7 @@ # See the License for the specific language governing permissions and # limitations under the License. -from typing import Optional +from typing import Callable, Optional import nemo_run as run import pytorch_lightning as pl @@ -23,6 +23,7 @@ from nemo.collections.llm.recipes.log.default import default_log, default_resume, tensorboard_logger from nemo.collections.llm.recipes.nemotron import nemotron_model, nemotron_trainer from nemo.collections.llm.recipes.optim.adam import distributed_fused_adam_with_cosine_annealing +from nemo.lightning.pytorch.callbacks.megatron_comm_overlap import MegatronCommOverlapCallback from nemo.utils.exp_manager import TimingCallback NAME = "nemotron4_15b" @@ -79,6 +80,7 @@ def pretrain_recipe( constant_steps=0, min_lr=4.5e-5, max_lr=4.5e-5, + performance_mode: bool = False, # Training function fn=pretrain, ) -> run.Partial: @@ -114,6 +116,7 @@ def pretrain_recipe( constant_steps (int): Number of constant steps. min_lr (float): Minimum learning rate. max_lr (float): Maximum learning rate. + performance_mode (bool): If true, enables optimizations for maximum performance. fn (Callable): The pre-training function to use. Returns: @@ -131,7 +134,7 @@ def pretrain_recipe( Note: This recipe uses a mock dataset, look for the finetune examples to see how to change the dataset. """ - return run.Partial( + recipe = run.Partial( fn, model=model(), trainer=nemotron_trainer( @@ -169,3 +172,34 @@ def pretrain_recipe( ), resume=default_resume(), ) + + if performance_mode: + recipe = pretrain_performance_optimizations(recipe) + + return recipe + + +def pretrain_performance_optimizations(recipe: run.Partial) -> run.Partial: + """ + Create a performance-optimized pre-training recipe for Nemotron4 15B model. + + This method enables performance optimizations that may not be suitable for all use cases. + It builds upon the standard pre-training recipe and adds additional performance enhancements. + + Args: + recipe (run.Partial): Base pre-train recipe to which performance optimizations will be added + + Returns: + run.Partial: Partial configuration for performance-optimized pre-training. + + Note: + Use this method with caution and only when you need maximum performance. + It may not be suitable for all hardware configurations or use cases. 
+ """ + recipe.trainer.callbacks.append( + run.Config( + MegatronCommOverlapCallback, + tp_comm_overlap=True, + ) + ) + return recipe diff --git a/nemo/collections/llm/recipes/nemotron4_15b_16k.py b/nemo/collections/llm/recipes/nemotron4_15b_16k.py index d0e9d939d8e7..75eced72761f 100644 --- a/nemo/collections/llm/recipes/nemotron4_15b_16k.py +++ b/nemo/collections/llm/recipes/nemotron4_15b_16k.py @@ -56,7 +56,7 @@ def pretrain_recipe( # Trainer tensor_parallelism: int = 2, pipeline_parallelism: int = 2, - pipeline_parallelism_type: Optional[torch.dtype] = None, + pipeline_parallelism_type: Optional[torch.dtype] = torch.bfloat16, virtual_pipeline_parallelism: Optional[int] = None, context_parallelism: int = 2, sequence_parallelism: bool = True, diff --git a/nemo/collections/llm/recipes/nemotron4_15b_64k.py b/nemo/collections/llm/recipes/nemotron4_15b_64k.py index c3f4575a1fd6..8286778aa7ba 100644 --- a/nemo/collections/llm/recipes/nemotron4_15b_64k.py +++ b/nemo/collections/llm/recipes/nemotron4_15b_64k.py @@ -56,7 +56,7 @@ def pretrain_recipe( # Trainer tensor_parallelism: int = 4, pipeline_parallelism: int = 2, - pipeline_parallelism_type: Optional[torch.dtype] = None, + pipeline_parallelism_type: Optional[torch.dtype] = torch.bfloat16, virtual_pipeline_parallelism: Optional[int] = None, context_parallelism: int = 4, sequence_parallelism: bool = True, diff --git a/nemo/collections/llm/recipes/nemotron4_22b.py b/nemo/collections/llm/recipes/nemotron4_22b.py index ba07bae241d8..a20afedfea56 100644 --- a/nemo/collections/llm/recipes/nemotron4_22b.py +++ b/nemo/collections/llm/recipes/nemotron4_22b.py @@ -12,7 +12,7 @@ # See the License for the specific language governing permissions and # limitations under the License. -from typing import Optional +from typing import Callable, Optional import nemo_run as run import pytorch_lightning as pl @@ -23,6 +23,7 @@ from nemo.collections.llm.recipes.log.default import default_log, default_resume, tensorboard_logger from nemo.collections.llm.recipes.nemotron import nemotron_model, nemotron_trainer from nemo.collections.llm.recipes.optim.adam import distributed_fused_adam_with_cosine_annealing +from nemo.lightning.pytorch.callbacks.megatron_comm_overlap import MegatronCommOverlapCallback from nemo.utils.exp_manager import TimingCallback NAME = "nemotron4_22b" @@ -56,7 +57,7 @@ def pretrain_recipe( # Trainer tensor_parallelism: int = 2, pipeline_parallelism: int = 4, - pipeline_parallelism_type: Optional[torch.dtype] = None, + pipeline_parallelism_type: Optional[torch.dtype] = torch.bfloat16, virtual_pipeline_parallelism: Optional[int] = 10, context_parallelism: int = 1, sequence_parallelism: bool = False, @@ -79,6 +80,7 @@ def pretrain_recipe( constant_steps=0, min_lr=1e-5, max_lr=1e-4, + performance_mode: bool = False, # Training function fn=pretrain, ) -> run.Partial: @@ -114,6 +116,7 @@ def pretrain_recipe( constant_steps (int): Number of constant steps. min_lr (float): Minimum learning rate. max_lr (float): Maximum learning rate. + performance_mode (bool): If true, enables optimizations for maximum performance. fn (Callable): The pre-training function to use. Returns: @@ -131,7 +134,7 @@ def pretrain_recipe( Note: This recipe uses a mock dataset, look for the finetune examples to see how to change the dataset. 
""" - return run.Partial( + recipe = run.Partial( fn, model=model(), trainer=nemotron_trainer( @@ -169,3 +172,45 @@ def pretrain_recipe( ), resume=default_resume(), ) + + if performance_mode: + recipe = pretrain_performance_optimizations(recipe) + + return recipe + + +def pretrain_performance_optimizations(recipe: run.Partial) -> run.Partial: + """ + Create a performance-optimized pre-training recipe for Nemotron4 22B model. + + This method enables performance optimizations that may not be suitable for all use cases. + It builds upon the standard pre-training recipe and adds additional performance enhancements. + + Args: + recipe (run.Partial): Base pre-train recipe to which performance optimizations will be added + + Returns: + run.Partial: Partial configuration for performance-optimized pre-training. + + Note: + Use this method with caution and only when you need maximum performance. + It may not be suitable for all hardware configurations or use cases. + """ + + # 'overlap_param_gather_with_optimizer_step' and 'align_param_gather' params are set automatically + # by MegatronCommOverlapCallback. They are added here for user's knowledge. + # overlap_param_gather_with_optimizer_step- Overlap param all-gather of first bucket with optimizer step. + # align_param_gather- If true, all PP stages launch param all-gathers simultaneously, else + # each PP stage launches independently as needed. + + recipe.trainer.callbacks.append( + run.Config( + MegatronCommOverlapCallback, + tp_comm_overlap=True, + defer_embedding_wgrad_compute=True, + wgrad_deferral_limit=22, + overlap_param_gather_with_optimizer_step=True, + align_param_gather=True, + ) + ) + return recipe diff --git a/nemo/collections/llm/recipes/nemotron4_22b_16k.py b/nemo/collections/llm/recipes/nemotron4_22b_16k.py index 614004d12aa3..42f258c6057d 100644 --- a/nemo/collections/llm/recipes/nemotron4_22b_16k.py +++ b/nemo/collections/llm/recipes/nemotron4_22b_16k.py @@ -57,7 +57,7 @@ def pretrain_recipe( tensor_parallelism: int = 4, pipeline_parallelism: int = 1, pipeline_parallelism_type: Optional[torch.dtype] = None, - virtual_pipeline_parallelism: Optional[int] = 10, + virtual_pipeline_parallelism: Optional[int] = None, context_parallelism: int = 2, sequence_parallelism: bool = True, num_nodes: int = 1, diff --git a/nemo/collections/llm/recipes/nemotron4_22b_64k.py b/nemo/collections/llm/recipes/nemotron4_22b_64k.py index 57211e5dddc1..67d60a6e1c90 100644 --- a/nemo/collections/llm/recipes/nemotron4_22b_64k.py +++ b/nemo/collections/llm/recipes/nemotron4_22b_64k.py @@ -56,9 +56,9 @@ def pretrain_recipe( # Trainer tensor_parallelism: int = 4, pipeline_parallelism: int = 2, - pipeline_parallelism_type: Optional[torch.dtype] = None, - virtual_pipeline_parallelism: Optional[int] = 10, - context_parallelism: int = 2, + pipeline_parallelism_type: Optional[torch.dtype] = torch.bfloat16, + virtual_pipeline_parallelism: Optional[int] = None, + context_parallelism: int = 4, sequence_parallelism: bool = True, num_nodes: int = 4, num_gpus_per_node: int = 8, @@ -122,10 +122,10 @@ def pretrain_recipe( Examples: CLI usage: $ nemo llm pretrain --factory nemotron4_22b_64k - $ nemo llm pretrain --factory "nemotron4_22b_64k(num_nodes=1, name='my_nemotron_pretrain')" + $ nemo llm pretrain --factory "nemotron4_22b_64k(num_nodes=2, name='my_nemotron_pretrain')" Python API usage: - >>> recipe = pretrain_recipe(name="nemotron_pretrain", num_nodes=1) + >>> recipe = pretrain_recipe(name="nemotron_pretrain", num_nodes=2) >>> print(recipe) Note: diff --git 
a/nemo/collections/llm/recipes/nemotron4_340b.py b/nemo/collections/llm/recipes/nemotron4_340b.py index 238acb0dac3c..8268b2a87791 100644 --- a/nemo/collections/llm/recipes/nemotron4_340b.py +++ b/nemo/collections/llm/recipes/nemotron4_340b.py @@ -12,7 +12,7 @@ # See the License for the specific language governing permissions and # limitations under the License. -from typing import Optional +from typing import Callable, Optional import nemo_run as run import pytorch_lightning as pl @@ -26,6 +26,7 @@ from nemo.collections.llm.recipes.log.default import default_log, default_resume, tensorboard_logger from nemo.collections.llm.recipes.nemotron import nemotron_model, nemotron_trainer from nemo.collections.llm.recipes.optim.adam import distributed_fused_adam_with_cosine_annealing +from nemo.lightning.pytorch.callbacks.megatron_comm_overlap import MegatronCommOverlapCallback from nemo.utils.exp_manager import TimingCallback NAME = "nemotron4_340b" @@ -41,7 +42,7 @@ def model() -> run.Config[pl.LightningModule]: Examples: CLI usage: - $ nemo llm pretrain model=nemotron4_340 ... + $ nemo llm pretrain model=nemotron4_340b ... Python API usage: >>> model_config = model() @@ -59,7 +60,7 @@ def pretrain_recipe( # Trainer tensor_parallelism: int = 8, pipeline_parallelism: int = 12, - pipeline_parallelism_type: Optional[torch.dtype] = None, + pipeline_parallelism_type: Optional[torch.dtype] = torch.bfloat16, virtual_pipeline_parallelism: Optional[int] = 8, context_parallelism: int = 1, sequence_parallelism: bool = False, @@ -82,6 +83,7 @@ def pretrain_recipe( constant_steps=0, min_lr=1.0e-5, max_lr=1.0e-4, + performance_mode: bool = False, # Training function fn=pretrain, ) -> run.Partial: @@ -117,6 +119,7 @@ def pretrain_recipe( constant_steps (int): Number of constant steps. min_lr (float): Minimum learning rate. max_lr (float): Maximum learning rate. + performance_mode (bool): If true, enables optimizations for maximum performance. fn (Callable): The pre-training function to use. Returns: @@ -124,8 +127,8 @@ def pretrain_recipe( Examples: CLI usage: - $ nemo llm pretrain --factory nemotron4_340 - $ nemo llm pretrain --factory "nemotron4_340(num_nodes=1, name='my_nemotron_pretrain')" + $ nemo llm pretrain --factory nemotron4_340b + $ nemo llm pretrain --factory "nemotron4_340b(num_nodes=1, name='my_nemotron_pretrain')" Python API usage: >>> recipe = pretrain_recipe(name="nemotron_pretrain", num_nodes=1) @@ -134,7 +137,7 @@ def pretrain_recipe( Note: This recipe uses a mock dataset, look for the finetune examples to see how to change the dataset. """ - return run.Partial( + recipe = run.Partial( fn, model=model(), trainer=nemotron_trainer( @@ -173,6 +176,48 @@ def pretrain_recipe( resume=default_resume(), ) + if performance_mode: + recipe = pretrain_performance_optimizations(recipe) + + return recipe + + +def pretrain_performance_optimizations(recipe: run.Partial) -> run.Partial: + """ + Create a performance-optimized pre-training recipe for Nemotron4 340B model. + + This method enables performance optimizations that may not be suitable for all use cases. + It builds upon the standard pre-training recipe and adds additional performance enhancements. + + Args: + recipe (run.Partial): Base pre-train recipe to which performance optimizations will be added + + Returns: + run.Partial: Partial configuration for performance-optimized pre-training. + + Note: + Use this method with caution and only when you need maximum performance. + It may not be suitable for all hardware configurations or use cases. 
+ """ + + # 'overlap_param_gather_with_optimizer_step' and 'align_param_gather' params are set automatically + # by MegatronCommOverlapCallback. They are added here for user's knowledge. + # overlap_param_gather_with_optimizer_step- Overlap param all-gather of first bucket with optimizer step. + # align_param_gather- If true, all PP stages launch param all-gathers simultaneously, else + # each PP stage launches independently as needed. + + recipe.trainer.callbacks.append( + run.Config( + MegatronCommOverlapCallback, + tp_comm_overlap=True, + defer_embedding_wgrad_compute=True, + wgrad_deferral_limit=22, + overlap_param_gather_with_optimizer_step=True, + align_param_gather=True, + ) + ) + return recipe + @run.cli.factory(name=NAME + "_nemo") def nemo_resume() -> run.Config[nl.AutoResume]: @@ -207,7 +252,7 @@ def finetune_recipe( # Trainer tensor_parallelism: int = 8, pipeline_parallelism: int = 12, - pipeline_parallelism_type: Optional[torch.dtype] = None, + pipeline_parallelism_type: Optional[torch.dtype] = torch.bfloat16, virtual_pipeline_parallelism: Optional[int] = 8, context_parallelism: int = 1, sequence_parallelism: bool = False, @@ -272,8 +317,8 @@ def finetune_recipe( Examples: CLI usage: - $ nemo llm finetune --factory nemotron4_340 - $ nemo llm finetune --factory "nemotron4_340(name='my_nemotron4_340_finetune', num_nodes=4)" + $ nemo llm finetune --factory nemotron4_340b + $ nemo llm finetune --factory "nemotron4_340b(name='my_nemotron4_340_finetune', num_nodes=4)" Python API usage: >>> recipe = finetune_recipe(name="my_nemotron4_340_finetune", num_nodes=4) diff --git a/nemo/collections/llm/recipes/optim/adam.py b/nemo/collections/llm/recipes/optim/adam.py index 5be87ac71e9d..c6510577711d 100644 --- a/nemo/collections/llm/recipes/optim/adam.py +++ b/nemo/collections/llm/recipes/optim/adam.py @@ -25,6 +25,8 @@ def distributed_fused_adam_with_cosine_annealing( precision: str = "bf16-mixed", # or "16-mixed" warmup_steps: int = 2000, constant_steps: int = 0, + adam_beta1: float = 0.9, + adam_beta2: float = 0.95, max_lr: float = 1e-4, min_lr: Optional[float] = None, clip_grad: float = 1.0, @@ -37,14 +39,14 @@ def distributed_fused_adam_with_cosine_annealing( weight_decay=0.1, bf16=precision == "bf16-mixed", fp16=precision == "16-mixed", - adam_beta1=0.9, - adam_beta2=0.95, + adam_beta1=adam_beta1, + adam_beta2=adam_beta2, adam_eps=1e-5, use_distributed_optimizer=True, clip_grad=clip_grad, ) - min_lr = min_lr or (0.1 * max_lr) + min_lr = min_lr if min_lr is not None else (0.1 * max_lr) sched = run.Config( CosineAnnealingScheduler, warmup_steps=warmup_steps, diff --git a/nemo/collections/llm/t5/data/fine_tuning.py b/nemo/collections/llm/t5/data/fine_tuning.py index b1315f7a708a..9326dabe7b84 100644 --- a/nemo/collections/llm/t5/data/fine_tuning.py +++ b/nemo/collections/llm/t5/data/fine_tuning.py @@ -61,8 +61,6 @@ def __init__( from nemo.collections.nlp.modules.common.tokenizer_utils import get_nmt_tokenizer self.tokenizer = tokenizer or get_nmt_tokenizer("megatron", "BertWordPieceCase") - additional_tokens = {'additional_special_tokens': [f'' for i in range(100)]} - self.tokenizer.add_special_tokens(additional_tokens) self.memmap_workers = memmap_workers self.num_workers = num_workers diff --git a/nemo/collections/llm/t5/data/pre_training.py b/nemo/collections/llm/t5/data/pre_training.py index 2c73e0b78b11..e6f619972284 100644 --- a/nemo/collections/llm/t5/data/pre_training.py +++ b/nemo/collections/llm/t5/data/pre_training.py @@ -130,10 +130,6 @@ def __init__( # add additional tokens 
for T5 tokenizer from nemo.collections.nlp.modules.common.tokenizer_utils import get_nmt_tokenizer - self.tokenizer = tokenizer or get_nmt_tokenizer("megatron", "BertWordPieceCase") - additional_tokens = {'additional_special_tokens': [f'' for i in range(100)]} - self.tokenizer.add_special_tokens(additional_tokens) - self.data_sampler = MegatronDataSampler( seq_len=self.seq_length, micro_batch_size=micro_batch_size, diff --git a/nemo/collections/nlp/data/language_modeling/text_memmap_dataset.py b/nemo/collections/nlp/data/language_modeling/text_memmap_dataset.py index 4882708f698f..f62613db891b 100644 --- a/nemo/collections/nlp/data/language_modeling/text_memmap_dataset.py +++ b/nemo/collections/nlp/data/language_modeling/text_memmap_dataset.py @@ -127,7 +127,7 @@ def __init__( index_mapping_dir=index_mapping_dir, ) - if is_distributed: + if is_distributed and not _lightning_prepare_data(): torch.distributed.barrier() if is_distributed and AppState().local_rank == 0: @@ -152,7 +152,7 @@ def __init__( index_mapping_dir=index_mapping_dir, ) - if is_distributed: + if is_distributed and not _lightning_prepare_data(): torch.distributed.barrier() logging.info(f"Loading data files") @@ -260,7 +260,8 @@ def load_file(self, fn, index_mapping_dir: Optional[str] = None): raise RuntimeError(f"Missing header, expected {self._header_lines} header lines") # load meta info - idx_info_dict = pickle.load(open(idx_fn + ".info", "rb")) + with open(idx_fn + ".info", "rb") as fp: + idx_info_dict = pickle.load(fp) # test for mismatch in expected newline_int if "newline_int" in idx_info_dict: newline_int = idx_info_dict["newline_int"] @@ -378,9 +379,7 @@ def __init__( self._data_sep = data_sep def _build_data_from_text(self, text: str): - """ - - """ + """ """ _build_data_from_text = super()._build_data_from_text data = {} text_fields = text.split(self._data_sep) @@ -513,7 +512,11 @@ def _build_memmap_index_files(newline_int, build_index_fn, fn, index_mapping_dir def build_index_files( - dataset_paths, newline_int, workers=None, build_index_fn=_build_index_from_memdata, index_mapping_dir: str = None, + dataset_paths, + newline_int, + workers=None, + build_index_fn=_build_index_from_memdata, + index_mapping_dir: str = None, ): """Auxiliary method to build multiple index files""" if len(dataset_paths) < 1: @@ -528,7 +531,12 @@ def build_index_files( ctx = mp.get_context("fork") with ctx.Pool(workers) as p: build_status = p.map( - partial(_build_memmap_index_files, newline_int, build_index_fn, index_mapping_dir=index_mapping_dir,), + partial( + _build_memmap_index_files, + newline_int, + build_index_fn, + index_mapping_dir=index_mapping_dir, + ), dataset_paths, ) @@ -741,3 +749,19 @@ def get_sample_block(self, block_idx: int) -> np.ndarray: sample_block = sample_block % self.dataset_size return sample_block + + +def _lightning_prepare_data(): + """ + This function checks whether it is invoked in lightning's hook "prepare_data", which is run only on rank 0. + TextMemMapDataset contains a torch.distributed.barrier operation, so when run inside the single-process hook + prepare_data, the barrier operation would hang forever. 
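+
+    Returns:
+        bool: True if a ``prepare_data`` frame (here, the packed-sequence data preparation
+        path) is found in the current call stack, False otherwise.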
+ """ + import inspect + + return any( + [ + frame.function == 'prepare_data' and 'prepare_packed_sequence_data' in frame.code_context[0] + for frame in inspect.stack() + ] + ) diff --git a/nemo/collections/nlp/modules/common/text_generation_server.py b/nemo/collections/nlp/modules/common/text_generation_server.py index 6c257317b99f..3f8e34b94134 100644 --- a/nemo/collections/nlp/modules/common/text_generation_server.py +++ b/nemo/collections/nlp/modules/common/text_generation_server.py @@ -15,11 +15,17 @@ import json import threading +import time +import uuid import torch from flask import Flask, jsonify, request from flask_restful import Api, Resource +from nemo.collections.nlp.data.language_modeling.megatron.gpt_sft_chat_dataset import ( + _get_header_conversation_type_mask_role, + get_prompt_template_example, +) from nemo.collections.nlp.modules.common.retro_inference_strategies import ( RetroModelTextGenerationStrategy, RetroQAModelTextGenerationStrategy, @@ -61,6 +67,189 @@ def send_do_generate(): choice = torch.cuda.LongTensor([GENERATE_NUM]) torch.distributed.broadcast(choice, 0) + def convert_messages(self, input_list): + output_dict = { + 'system': '', + 'conversations': [], + 'mask': 'User', + 'type': 'VALUE_TO_TEXT', + } + + # Extract the system message + for msg in input_list: + if msg['role'] == 'system': + output_dict['system'] = msg['content'] + break # Assuming only one system message + + # Build the conversations list + for msg in input_list: + if msg['role'] != 'system': + conversation_entry = { + 'from': msg['role'].capitalize(), # Capitalize 'user' and 'assistant' + 'value': msg['content'], + 'label': None, + } + output_dict['conversations'].append(conversation_entry) + + return output_dict + + def completion(self, data): + output_sentence = "" + with lock: # Need to get lock to keep multiple threads from hitting code + MegatronGenerate.send_do_generate() # Tell other ranks we're doing generate + extra = {} + if self.inference_strategy is not None: + extra['strategy'] = self.inference_strategy + + all_probs = False + add_BOS = False + top_p = data.get("top_p", 1.0) + top_k = data.get("top_k", 0) + max_tokens = data.get("max_tokens", 32) + temperature = data.get("temperature", 0.0) + logprobs = data.get("logprobs", False) + greedy = temperature == 0.0 + end_strings = ['<|endoftext|>'] + data.get("end_strings", []) + prompt = data["prompt"] + random_seed = data.get("seed", 1234) + + output = generate( + self.model, + [prompt], + tokens_to_generate=max_tokens, + all_probs=all_probs, + temperature=temperature, + add_BOS=add_BOS, + top_k=top_k, + top_p=top_p, + greedy=greedy, + repetition_penalty=1.0, + end_strings=end_strings, + min_tokens_to_generate=0, + compute_logprob=logprobs, + random_seed=random_seed, + **extra, + ) + for k in output: + if isinstance(output[k], torch.Tensor): + output[k] = output[k].tolist() + + output_sentence = output['sentences'][0][len(prompt) :] + tokens = output['tokens'][0] + logprobs = output['logprob'][0] if output['logprob'] is not None else None + num_prompt_tokens = len(prompt.split()) + num_output_sentence = len(output_sentence.split()) + + return jsonify( + { + "choices": [ + { + "finish_reason": "", + "index": 0, + "logprobs": logprobs, + "text": output_sentence, + "tokens": tokens, + } + ], + "created": int(time.time()), + "id": f"cmpl-{uuid.uuid4()}", + "model": "nemo model", + "object": "text_completion", + "usage": { + "completion_tokens": num_output_sentence, + "prompt_tokens": num_prompt_tokens, + "total_tokens": 
num_output_sentence + num_prompt_tokens, + }, + } + ) + + def chat_completion(self, data): + data['messages'] = data['messages'] + [ + {'role': 'assistant', 'content': ''} + ] # adding trailing assistant message so that prompt ends with Assistant tag. + special_tokens = self.model.cfg.data.chat_prompt_tokens + nemo_source = self.convert_messages(data['messages']) + header, conversation, data_type, mask_role = _get_header_conversation_type_mask_role( + nemo_source, special_tokens + ) + len_strip = len(special_tokens['end_of_turn'] + special_tokens['turn_start']) + conversation = conversation[:-len_strip] + # Return a response mimicking the OpenAI ChatCompletion API format + with lock: # Need to get lock to keep multiple threads from hitting code + MegatronGenerate.send_do_generate() # Tell other ranks we're doing generate + extra = {} + if self.inference_strategy is not None: + extra['strategy'] = self.inference_strategy + + all_probs = False + add_BOS = False + top_k = 0 + greedy = data['temperature'] == 0.0 + logprobs = data.get("logprobs", False) + end_strings = ['<|endoftext|>', special_tokens['turn_start'], special_tokens['label_start']] + random_seed = None + + output = generate( + self.model, + [conversation], + data.get('max_tokens', 32), + all_probs=all_probs, + temperature=data.get('temperature', 1.0), + add_BOS=add_BOS, + top_k=top_k, + top_p=data.get("top_p", 0.95), + greedy=greedy, + repetition_penalty=1.0, + end_strings=end_strings, + min_tokens_to_generate=0, + compute_logprob=logprobs, + random_seed=random_seed, + **extra, + ) + for k in output: + if isinstance(output[k], torch.Tensor): + output[k] = output[k].tolist() + + output_sentence = output['sentences'][0][len(conversation) :] + tokens = output['tokens'][0] + logprobs = output['logprob'][0] if output['logprob'] is not None else None + num_prompt_tokens = len(conversation.split()) # @adithyare only produces an approx. 
number of tokens
+        num_output_sentence = len(output_sentence.split())
+
+        return jsonify(
+            {
+                "id": f"chatcmpl-{uuid.uuid4()}",
+                "object": "chat.completion",
+                "created": int(time.time()),
+                "model": data.get("model", "nemo model"),
+                "choices": [
+                    {
+                        "index": 0,
+                        "message": {"role": "assistant", "content": output_sentence},
+                        "logprobs": logprobs,
+                        "tokens": tokens,
+                        "finish_reason": "",
+                    }
+                ],
+                "usage": {
+                    "prompt_tokens": num_prompt_tokens,
+                    "completion_tokens": num_output_sentence,
+                    "total_tokens": num_output_sentence + num_prompt_tokens,
+                },
+            }
+        )
+
+    def post(self):
+        # Access the request data if needed
+        if request.endpoint == "oai_completions":
+            data = request.get_json()
+            return self.completion(data)
+        elif request.endpoint == "oai_chat_completions":
+            data = request.get_json()
+            return self.chat_completion(data)
+        else:
+            raise RuntimeError("Unknown endpoint requested.")
+
     def put(self):
         logging.info("request IP: " + str(request.remote_addr))
         logging.info(json.dumps(request.get_json()))
@@ -135,7 +324,7 @@ def put(self):
         if not (0.0 <= top_p <= 1.0):
             return "top_p must be a positive number less than or equal to 1.0"
-        repetition_penalty = 1.2
+        repetition_penalty = 1.0
         if "repetition_penalty" in request.get_json():
             repetition_penalty = request.get_json()["repetition_penalty"]
             if not (type(repetition_penalty) == int or type(repetition_penalty) == float):
@@ -231,7 +420,24 @@ class MegatronServer(object):
     def __init__(self, model, inference_strategy=None):
         self.app = Flask(__name__, static_url_path='')
         api = Api(self.app)
-        api.add_resource(MegatronGenerate, '/generate', resource_class_args=[model, inference_strategy])
+        api.add_resource(
+            MegatronGenerate,
+            '/generate',
+            endpoint="generate",
+            resource_class_kwargs={"model": model, "inference_strategy": inference_strategy},
+        )
+        api.add_resource(
+            MegatronGenerate,
+            '/v1/completions',
+            endpoint="oai_completions",
+            resource_class_kwargs={"model": model, "inference_strategy": inference_strategy},
+        )
+        api.add_resource(
+            MegatronGenerate,
+            '/v1/chat/completions',
+            endpoint="oai_chat_completions",
+            resource_class_kwargs={"model": model, "inference_strategy": inference_strategy},
+        )

     def run(self, url, port=5000):
         self.app.run(url, threaded=True, port=port, debug=False)
diff --git a/nemo/collections/nlp/modules/common/tokenizer_utils.py b/nemo/collections/nlp/modules/common/tokenizer_utils.py
index 4e6f9e15b839..dfc55a6c9065 100644
--- a/nemo/collections/nlp/modules/common/tokenizer_utils.py
+++ b/nemo/collections/nlp/modules/common/tokenizer_utils.py
@@ -69,7 +69,8 @@ def get_tokenizer(
             To see the list of all HuggingFace pretrained models, use:
             nemo_nlp.modules.common.get_huggingface_pretrained_lm_models_list()
         tokenizer_model: tokenizer model file of sentencepiece
-        special_tokens: dict of special tokens.
+            For additional special tokens besides standard special tokens (bos, eos, pad, etc.), such as sentinel tokens for T5 (<extra_id_0>, <extra_id_1>, etc.), use key 'additional_special_tokens'
         vocab_file: path to vocab file
         use_fast: (only for HuggingFace AutoTokenizer) set to True to use fast HuggingFace tokenizer
         bpe_dropout: (experimental) BPE dropout tries to corrupt the standard segmentation
@@ -224,7 +225,11 @@ def get_nmt_tokenizer(
             f'Getting Megatron tokenizer for pretrained model name: {model_name}, custom vocab file: {vocab_file}, and merges file: {merges_file}'
         )
         return get_tokenizer(
-            tokenizer_name=model_name, vocab_file=vocab_file, merges_file=merges_file, chat_template=chat_template
+            tokenizer_name=model_name,
+            vocab_file=vocab_file,
+            merges_file=merges_file,
+            special_tokens=special_tokens_dict,
+            chat_template=chat_template,
         )
     elif library == 'tabular':
         from nemo.collections.common.tokenizers.tabular_tokenizer import TabularTokenizer
diff --git a/nemo/collections/tts/data/dataset.py b/nemo/collections/tts/data/dataset.py
index 83d2b969ea91..901b4168130f 100644
--- a/nemo/collections/tts/data/dataset.py
+++ b/nemo/collections/tts/data/dataset.py
@@ -204,7 +204,8 @@ def __init__(
             self.text_normalizer_call = None
         elif not PYNINI_AVAILABLE:
             raise ImportError(
-                "`nemo_text_processing` is not installed, see https://github.com/NVIDIA/NeMo-text-processing for details"
+                "`nemo_text_processing` is not installed, see https://github.com/NVIDIA/NeMo-text-processing for details. "
+                "If you wish to continue without text normalization, please remove the text_normalizer part in your TTS yaml file."
             )
         else:
             self.text_normalizer_call = (
diff --git a/nemo/collections/tts/models/aligner.py b/nemo/collections/tts/models/aligner.py
index 72d023e9ee10..d8e65d6e6821 100644
--- a/nemo/collections/tts/models/aligner.py
+++ b/nemo/collections/tts/models/aligner.py
@@ -24,6 +24,7 @@ from torch import nn
 from nemo.collections.tts.losses.aligner_loss import BinLoss, ForwardSumLoss
+from nemo.collections.tts.models.base import NeedsNormalizer
 from nemo.collections.tts.parts.utils.helpers import (
     binarize_attention,
     g2p_backward_compatible_support,
@@ -41,7 +42,7 @@
     HAVE_WANDB = False
-class AlignerModel(ModelPT):
+class AlignerModel(NeedsNormalizer, ModelPT):
     """Speech-to-text alignment model (https://arxiv.org/pdf/2108.10447.pdf) that is used to learn alignments between mel spectrogram and text."""
     def __init__(self, cfg: DictConfig, trainer: 'Trainer' = None):
@@ -77,29 +78,6 @@ def __init__(self, cfg: DictConfig, trainer: 'Trainer' = None):
         self.bin_loss_start_ratio = cfg.bin_loss_start_ratio
         self.bin_loss_warmup_epochs = cfg.bin_loss_warmup_epochs
-    def _setup_normalizer(self, cfg):
-        if "text_normalizer" in cfg:
-            normalizer_kwargs = {}
-
-            if "whitelist" in cfg.text_normalizer:
-                normalizer_kwargs["whitelist"] = self.register_artifact(
-                    'text_normalizer.whitelist', cfg.text_normalizer.whitelist
-                )
-
-            try:
-                import nemo_text_processing
-
-                self.normalizer = instantiate(cfg.text_normalizer, **normalizer_kwargs)
-            except Exception as e:
-                logging.error(e)
-                raise ImportError(
-                    "`nemo_text_processing` not installed, see https://github.com/NVIDIA/NeMo-text-processing for more details"
-                )
-
-        self.text_normalizer_call = self.normalizer.normalize
-        if "text_normalizer_call_kwargs" in cfg:
-            self.text_normalizer_call_kwargs = cfg.text_normalizer_call_kwargs
-
     def _setup_tokenizer(self, cfg):
         text_tokenizer_kwargs = {}
         if "g2p" in cfg.text_tokenizer:
diff --git a/nemo/collections/tts/models/base.py
b/nemo/collections/tts/models/base.py index fe19ae75a3b3..b4b0ea9c43fa 100644 --- a/nemo/collections/tts/models/base.py +++ b/nemo/collections/tts/models/base.py @@ -18,6 +18,7 @@ from typing import List, Optional import torch +from hydra.utils import instantiate from omegaconf import DictConfig from tqdm import tqdm @@ -28,9 +29,39 @@ from nemo.core.neural_types.neural_type import NeuralType from nemo.utils import logging, model_utils +PYNINI_AVAILABLE = True +try: + import nemo_text_processing +except (ImportError, ModuleNotFoundError): + PYNINI_AVAILABLE = False -class SpectrogramGenerator(ModelPT, ABC): - """ Base class for all TTS models that turn text into a spectrogram """ + +class NeedsNormalizer: + """Base class for all TTS models that needs text normalization(TN)""" + + def _setup_normalizer(self, cfg): + if "text_normalizer" in cfg: + if not PYNINI_AVAILABLE: + logging.error( + "`nemo_text_processing` not installed, see https://github.com/NVIDIA/NeMo-text-processing for more details." + ) + logging.error("The normalizer will be disabled.") + return + normalizer_kwargs = {} + + if "whitelist" in cfg.text_normalizer: + normalizer_kwargs["whitelist"] = self.register_artifact( + 'text_normalizer.whitelist', cfg.text_normalizer.whitelist + ) + + self.normalizer = instantiate(cfg.text_normalizer, **normalizer_kwargs) + self.text_normalizer_call = self.normalizer.normalize + if "text_normalizer_call_kwargs" in cfg: + self.text_normalizer_call_kwargs = cfg.text_normalizer_call_kwargs + + +class SpectrogramGenerator(NeedsNormalizer, ModelPT, ABC): + """Base class for all TTS models that turn text into a spectrogram""" @abstractmethod def parse(self, str_input: str, **kwargs) -> 'torch.tensor': @@ -115,7 +146,7 @@ def list_available_models(cls) -> 'List[PretrainedModelInfo]': class GlowVocoder(Vocoder): - """ Base class for all Vocoders that use a Glow or reversible Flow-based setup. All child class are expected + """Base class for all Vocoders that use a Glow or reversible Flow-based setup. All child class are expected to have a parameter called audio_to_melspec_precessor that is an instance of nemo.collections.asr.parts.FilterbankFeatures""" @@ -175,7 +206,11 @@ def yet_another_patch(audio, n_fft, hop_length, win_length, window): return torch.sqrt(spec.pow(2).sum(-1)), torch.atan2(spec[..., -1], spec[..., 0]) self.stft = lambda x: yet_another_patch( - x, n_fft=n_fft, hop_length=hop_length, win_length=win_length, window=window, + x, + n_fft=n_fft, + hop_length=hop_length, + win_length=win_length, + window=window, ) self.istft = lambda x, y: torch.istft( torch.complex(x * torch.cos(y), x * torch.sin(y)), @@ -252,15 +287,15 @@ def list_available_models(cls) -> 'List[PretrainedModelInfo]': return list_of_models -class TextToWaveform(ModelPT, ABC): - """ Base class for all end-to-end TTS models that generate a waveform from text """ +class TextToWaveform(NeedsNormalizer, ModelPT, ABC): + """Base class for all end-to-end TTS models that generate a waveform from text""" @abstractmethod def parse(self, str_input: str, **kwargs) -> 'torch.tensor': """ - A helper function that accepts a raw python string and turns it into a tensor. The tensor should have 2 - dimensions. The first is the batch, which should be of size 1. The second should represent time. The tensor - should represent either tokenized or embedded text, depending on the model. + A helper function that accepts a raw python string and turns it into a tensor. The tensor should have 2 + dimensions. 
The first is the batch, which should be of size 1. The second should represent time. The tensor + should represent either tokenized or embedded text, depending on the model. """ @abstractmethod @@ -299,7 +334,6 @@ def convert_graphemes_to_phonemes( num_workers: int = 0, pred_field: Optional[str] = "pred_text", ) -> List[str]: - """ Main function for Inference. Converts grapheme entries from the manifest "graheme_field" to phonemes Args: diff --git a/nemo/collections/tts/models/fastpitch.py b/nemo/collections/tts/models/fastpitch.py index 3235a096a04b..b1e702c89124 100644 --- a/nemo/collections/tts/models/fastpitch.py +++ b/nemo/collections/tts/models/fastpitch.py @@ -200,28 +200,6 @@ def _get_default_text_tokenizer_conf(self): text_tokenizer: TextTokenizerConfig = TextTokenizerConfig() return OmegaConf.create(OmegaConf.to_yaml(text_tokenizer)) - def _setup_normalizer(self, cfg): - if "text_normalizer" in cfg: - normalizer_kwargs = {} - - if "whitelist" in cfg.text_normalizer: - normalizer_kwargs["whitelist"] = self.register_artifact( - 'text_normalizer.whitelist', cfg.text_normalizer.whitelist - ) - try: - import nemo_text_processing - - self.normalizer = instantiate(cfg.text_normalizer, **normalizer_kwargs) - except Exception as e: - logging.error(e) - raise ImportError( - "`nemo_text_processing` not installed, see https://github.com/NVIDIA/NeMo-text-processing for more details" - ) - - self.text_normalizer_call = self.normalizer.normalize - if "text_normalizer_call_kwargs" in cfg: - self.text_normalizer_call_kwargs = cfg.text_normalizer_call_kwargs - def _setup_tokenizer(self, cfg): text_tokenizer_kwargs = {} @@ -240,12 +218,14 @@ def _setup_tokenizer(self, cfg): if "phoneme_dict" in cfg.text_tokenizer.g2p: g2p_kwargs["phoneme_dict"] = self.register_artifact( - 'text_tokenizer.g2p.phoneme_dict', cfg.text_tokenizer.g2p.phoneme_dict, + 'text_tokenizer.g2p.phoneme_dict', + cfg.text_tokenizer.g2p.phoneme_dict, ) if "heteronyms" in cfg.text_tokenizer.g2p: g2p_kwargs["heteronyms"] = self.register_artifact( - 'text_tokenizer.g2p.heteronyms', cfg.text_tokenizer.g2p.heteronyms, + 'text_tokenizer.g2p.heteronyms', + cfg.text_tokenizer.g2p.heteronyms, ) # for backward compatability @@ -478,16 +458,25 @@ def training_step(self, batch, batch_idx): ) spec_predict = mels_pred[0].data.cpu().float().numpy() self.tb_logger.add_image( - "train_mel_predicted", plot_spectrogram_to_numpy(spec_predict), self.global_step, dataformats="HWC", + "train_mel_predicted", + plot_spectrogram_to_numpy(spec_predict), + self.global_step, + dataformats="HWC", ) if self.learn_alignment: attn = attn_hard[0].data.cpu().float().numpy().squeeze() self.tb_logger.add_image( - "train_attn", plot_alignment_to_numpy(attn.T), self.global_step, dataformats="HWC", + "train_attn", + plot_alignment_to_numpy(attn.T), + self.global_step, + dataformats="HWC", ) soft_attn = attn_soft[0].data.cpu().float().numpy().squeeze() self.tb_logger.add_image( - "train_soft_attn", plot_alignment_to_numpy(soft_attn.T), self.global_step, dataformats="HWC", + "train_soft_attn", + plot_alignment_to_numpy(soft_attn.T), + self.global_step, + dataformats="HWC", ) return loss @@ -527,7 +516,20 @@ def validation_step(self, batch, batch_idx): ) # Calculate val loss on ground truth durations to better align L2 loss in time - (mels_pred, _, _, log_durs_pred, pitch_pred, _, _, _, attn_hard_dur, pitch, energy_pred, energy_tgt,) = self( + ( + mels_pred, + _, + _, + log_durs_pred, + pitch_pred, + _, + _, + _, + attn_hard_dur, + pitch, + energy_pred, + energy_tgt, + ) = 
self( text=text, durs=durs, pitch=pitch, @@ -587,7 +589,10 @@ def on_validation_epoch_end(self): ) spec_predict = spec_predict[0].data.cpu().float().numpy() self.tb_logger.add_image( - "val_mel_predicted", plot_spectrogram_to_numpy(spec_predict), self.global_step, dataformats="HWC", + "val_mel_predicted", + plot_spectrogram_to_numpy(spec_predict), + self.global_step, + dataformats="HWC", ) self.log_train_images = True self.validation_step_outputs.clear() # free memory) @@ -598,7 +603,10 @@ def _setup_train_dataloader(self, cfg): phon_mode = self.vocab.set_phone_prob(self.vocab.phoneme_probability) with phon_mode: - dataset = instantiate(cfg.dataset, text_tokenizer=self.vocab,) + dataset = instantiate( + cfg.dataset, + text_tokenizer=self.vocab, + ) sampler = dataset.get_sampler(cfg.dataloader_params.batch_size, world_size=self.trainer.world_size) return torch.utils.data.DataLoader( @@ -611,7 +619,10 @@ def _setup_test_dataloader(self, cfg): phon_mode = self.vocab.set_phone_prob(0.0) with phon_mode: - dataset = instantiate(cfg.dataset, text_tokenizer=self.vocab,) + dataset = instantiate( + cfg.dataset, + text_tokenizer=self.vocab, + ) return torch.utils.data.DataLoader(dataset, collate_fn=dataset.collate_fn, **cfg.dataloader_params) diff --git a/nemo/collections/tts/models/mixer_tts.py b/nemo/collections/tts/models/mixer_tts.py index 1a44cd5b31c8..c260df22e3c0 100644 --- a/nemo/collections/tts/models/mixer_tts.py +++ b/nemo/collections/tts/models/mixer_tts.py @@ -123,29 +123,6 @@ def __init__(self, cfg: DictConfig, trainer: 'Trainer' = None): self.decoder = instantiate(cfg.decoder) self.proj = nn.Linear(self.decoder.d_model, cfg.n_mel_channels) - def _setup_normalizer(self, cfg): - if "text_normalizer" in cfg: - normalizer_kwargs = {} - - if "whitelist" in cfg.text_normalizer: - normalizer_kwargs["whitelist"] = self.register_artifact( - 'text_normalizer.whitelist', cfg.text_normalizer.whitelist - ) - - try: - import nemo_text_processing - - self.normalizer = instantiate(cfg.text_normalizer, **normalizer_kwargs) - except Exception as e: - logging.error(e) - raise ImportError( - "`nemo_text_processing` not installed, see https://github.com/NVIDIA/NeMo-text-processing for more details" - ) - - self.text_normalizer_call = self.normalizer.normalize - if "text_normalizer_call_kwargs" in cfg: - self.text_normalizer_call_kwargs = cfg.text_normalizer_call_kwargs - def _setup_tokenizer(self, cfg): text_tokenizer_kwargs = {} if "g2p" in cfg.text_tokenizer: @@ -163,12 +140,14 @@ def _setup_tokenizer(self, cfg): if "phoneme_dict" in cfg.text_tokenizer.g2p: g2p_kwargs["phoneme_dict"] = self.register_artifact( - 'text_tokenizer.g2p.phoneme_dict', cfg.text_tokenizer.g2p.phoneme_dict, + 'text_tokenizer.g2p.phoneme_dict', + cfg.text_tokenizer.g2p.phoneme_dict, ) if "heteronyms" in cfg.text_tokenizer.g2p: g2p_kwargs["heteronyms"] = self.register_artifact( - 'text_tokenizer.g2p.heteronyms', cfg.text_tokenizer.g2p.heteronyms, + 'text_tokenizer.g2p.heteronyms', + cfg.text_tokenizer.g2p.heteronyms, ) text_tokenizer_kwargs["g2p"] = instantiate(cfg.text_tokenizer.g2p, **g2p_kwargs) @@ -269,7 +248,10 @@ def _metrics( def run_aligner(self, text, text_len, text_mask, spect, spect_len, attn_prior): text_emb = self.symbol_emb(text) attn_soft, attn_logprob = self.aligner( - spect, text_emb.permute(0, 2, 1), mask=text_mask == 0, attn_prior=attn_prior, + spect, + text_emb.permute(0, 2, 1), + mask=text_mask == 0, + attn_prior=attn_prior, ) attn_hard = binarize_attention_parallel(attn_soft, text_len, spect_len) attn_hard_dur 
= attn_hard.sum(2)[:, 0, :] @@ -444,7 +426,16 @@ def training_step(self, batch, batch_idx): pitch = (pitch - self.pitch_mean) / self.pitch_std pitch[zero_pitch_idx] = 0.0 - (pred_spect, _, pred_log_durs, pred_pitch, attn_soft, attn_logprob, attn_hard, attn_hard_dur,) = self( + ( + pred_spect, + _, + pred_log_durs, + pred_pitch, + attn_soft, + attn_logprob, + attn_hard, + attn_hard_dur, + ) = self( text=text, text_len=text_len, pitch=pitch, @@ -454,7 +445,17 @@ def training_step(self, batch, batch_idx): lm_tokens=lm_tokens, ) - (loss, durs_loss, acc, acc_dist_1, acc_dist_3, pitch_loss, mel_loss, ctc_loss, bin_loss,) = self._metrics( + ( + loss, + durs_loss, + acc, + acc_dist_1, + acc_dist_3, + pitch_loss, + mel_loss, + ctc_loss, + bin_loss, + ) = self._metrics( pred_durs=pred_log_durs, pred_pitch=pred_pitch, true_durs=attn_hard_dur, @@ -496,7 +497,16 @@ def validation_step(self, batch, batch_idx): pitch = (pitch - self.pitch_mean) / self.pitch_std pitch[zero_pitch_idx] = 0.0 - (pred_spect, _, pred_log_durs, pred_pitch, attn_soft, attn_logprob, attn_hard, attn_hard_dur,) = self( + ( + pred_spect, + _, + pred_log_durs, + pred_pitch, + attn_soft, + attn_logprob, + attn_hard, + attn_hard_dur, + ) = self( text=text, text_len=text_len, pitch=pitch, @@ -506,7 +516,17 @@ def validation_step(self, batch, batch_idx): lm_tokens=lm_tokens, ) - (loss, durs_loss, acc, acc_dist_1, acc_dist_3, pitch_loss, mel_loss, ctc_loss, bin_loss,) = self._metrics( + ( + loss, + durs_loss, + acc, + acc_dist_1, + acc_dist_3, + pitch_loss, + mel_loss, + ctc_loss, + bin_loss, + ) = self._metrics( pred_durs=pred_log_durs, pred_pitch=pred_pitch, true_durs=attn_hard_dur, @@ -605,7 +625,9 @@ def validation_step(self, batch, batch_idx): "raw_texts": [NeuralType(optional=True)], "lm_model": NeuralType(optional=True), }, - output_types={"spect": NeuralType(('B', 'D', 'T_spec'), MelSpectrogramType()),}, + output_types={ + "spect": NeuralType(('B', 'D', 'T_spec'), MelSpectrogramType()), + }, ) def generate_spectrogram( self, @@ -694,7 +716,9 @@ def _loader(self, cfg): text_tokenizer=self.tokenizer, ) return torch.utils.data.DataLoader( # noqa - dataset=dataset, collate_fn=dataset.collate_fn, **cfg.dataloader_params, + dataset=dataset, + collate_fn=dataset.collate_fn, + **cfg.dataloader_params, ) def setup_training_data(self, cfg): @@ -749,7 +773,11 @@ def output_types(self): def input_example(self, max_text_len=10, max_lm_tokens_len=10): text = torch.randint( - low=0, high=len(self.tokenizer.tokens), size=(1, max_text_len), device=self.device, dtype=torch.long, + low=0, + high=len(self.tokenizer.tokens), + size=(1, max_text_len), + device=self.device, + dtype=torch.long, ) inputs = {'text': text} diff --git a/nemo/collections/tts/models/radtts.py b/nemo/collections/tts/models/radtts.py index 959720910f11..82f85d1ed6a2 100644 --- a/nemo/collections/tts/models/radtts.py +++ b/nemo/collections/tts/models/radtts.py @@ -296,7 +296,9 @@ def _loader(self, cfg): text_tokenizer=self.tokenizer, ) return torch.utils.data.DataLoader( # noqa - dataset=dataset, collate_fn=dataset.collate_fn, **cfg.dataloader_params, + dataset=dataset, + collate_fn=dataset.collate_fn, + **cfg.dataloader_params, ) def setup_training_data(self, cfg): @@ -315,7 +317,9 @@ def setup_test_data(self, cfg): "speaker": NeuralType(('B'), Index(), optional=True), "sigma": NeuralType(optional=True), }, - output_types={"spect": NeuralType(('B', 'D', 'T_spec'), MelSpectrogramType()),}, + output_types={ + "spect": NeuralType(('B', 'D', 'T_spec'), MelSpectrogramType()), + }, ) 
def generate_spectrogram(self, tokens: 'torch.tensor', speaker: int = 0, sigma: float = 1.0) -> torch.tensor: self.eval() @@ -350,12 +354,14 @@ def _setup_tokenizer(self, cfg): if "phoneme_dict" in cfg.text_tokenizer.g2p: g2p_kwargs["phoneme_dict"] = self.register_artifact( - 'text_tokenizer.g2p.phoneme_dict', cfg.text_tokenizer.g2p.phoneme_dict, + 'text_tokenizer.g2p.phoneme_dict', + cfg.text_tokenizer.g2p.phoneme_dict, ) if "heteronyms" in cfg.text_tokenizer.g2p: g2p_kwargs["heteronyms"] = self.register_artifact( - 'text_tokenizer.g2p.heteronyms', cfg.text_tokenizer.g2p.heteronyms, + 'text_tokenizer.g2p.heteronyms', + cfg.text_tokenizer.g2p.heteronyms, ) text_tokenizer_kwargs["g2p"] = instantiate(cfg.text_tokenizer.g2p, **g2p_kwargs) @@ -374,29 +380,6 @@ def _setup_tokenizer(self, cfg): self.text_tokenizer_pad_id = text_tokenizer_pad_id self.tokens = tokens - def _setup_normalizer(self, cfg): - if "text_normalizer" in cfg: - normalizer_kwargs = {} - - if "whitelist" in cfg.text_normalizer: - normalizer_kwargs["whitelist"] = self.register_artifact( - 'text_normalizer.whitelist', cfg.text_normalizer.whitelist - ) - - try: - import nemo_text_processing - - self.normalizer = instantiate(cfg.text_normalizer, **normalizer_kwargs) - self.text_normalizer_call = self.normalizer.normalize - except Exception as e: - logging.error(e) - raise ImportError( - "`nemo_text_processing` not installed, see https://github.com/NVIDIA/NeMo-text-processing for more details" - ) - self.text_normalizer_call = self.normalizer.normalize - if "text_normalizer_call_kwargs" in cfg: - self.text_normalizer_call_kwargs = cfg.text_normalizer_call_kwargs - def parse(self, text: str, normalize=False) -> torch.Tensor: if self.training: logging.warning("parse() is meant to be called in eval mode.") @@ -479,7 +462,11 @@ def input_example(self, max_batch=1, max_dim=400): inp[inp == pad_id] = pad_id - 1 if pad_id > 0 else pad_id + 1 inputs.update( - {'speaker_id': speaker, 'speaker_id_text': speaker, 'speaker_id_attributes': speaker,} + { + 'speaker_id': speaker, + 'speaker_id_text': speaker, + 'speaker_id_attributes': speaker, + } ) new_inputs = { 'text': inp, @@ -495,11 +482,24 @@ def input_example(self, max_batch=1, max_dim=400): return (new_inputs,) def forward_for_export( - self, text, batch_lengths, speaker_id, speaker_id_text, speaker_id_attributes, pitch, pace, volume, + self, + text, + batch_lengths, + speaker_id, + speaker_id_text, + speaker_id_attributes, + pitch, + pace, + volume, ): if self.export_config["enable_ragged_batches"]: text, pitch, pace, volume_tensor, lens = batch_from_ragged( - text, pitch, pace, batch_lengths=batch_lengths, padding_idx=self.tokenizer_pad, volume=volume, + text, + pitch, + pace, + batch_lengths=batch_lengths, + padding_idx=self.tokenizer_pad, + volume=volume, ) if volume is not None: volume = volume_tensor diff --git a/nemo/collections/tts/models/tacotron2.py b/nemo/collections/tts/models/tacotron2.py index 3fcdee9832ef..2fb005d80ca6 100644 --- a/nemo/collections/tts/models/tacotron2.py +++ b/nemo/collections/tts/models/tacotron2.py @@ -322,29 +322,6 @@ def on_validation_epoch_end(self): self.log('val_loss', avg_loss) self.validation_step_outputs.clear() # free memory - def _setup_normalizer(self, cfg): - if "text_normalizer" in cfg: - normalizer_kwargs = {} - - if "whitelist" in cfg.text_normalizer: - normalizer_kwargs["whitelist"] = self.register_artifact( - 'text_normalizer.whitelist', cfg.text_normalizer.whitelist - ) - - try: - import nemo_text_processing - - self.normalizer = 
instantiate(cfg.text_normalizer, **normalizer_kwargs) - except Exception as e: - logging.error(e) - raise ImportError( - "`nemo_text_processing` not installed, see https://github.com/NVIDIA/NeMo-text-processing for more details" - ) - - self.text_normalizer_call = self.normalizer.normalize - if "text_normalizer_call_kwargs" in cfg: - self.text_normalizer_call_kwargs = cfg.text_normalizer_call_kwargs - def _setup_tokenizer(self, cfg): text_tokenizer_kwargs = {} if "g2p" in cfg.text_tokenizer and cfg.text_tokenizer.g2p is not None: @@ -362,12 +339,14 @@ def _setup_tokenizer(self, cfg): if "phoneme_dict" in cfg.text_tokenizer.g2p: g2p_kwargs["phoneme_dict"] = self.register_artifact( - 'text_tokenizer.g2p.phoneme_dict', cfg.text_tokenizer.g2p.phoneme_dict, + 'text_tokenizer.g2p.phoneme_dict', + cfg.text_tokenizer.g2p.phoneme_dict, ) if "heteronyms" in cfg.text_tokenizer.g2p: g2p_kwargs["heteronyms"] = self.register_artifact( - 'text_tokenizer.g2p.heteronyms', cfg.text_tokenizer.g2p.heteronyms, + 'text_tokenizer.g2p.heteronyms', + cfg.text_tokenizer.g2p.heteronyms, ) text_tokenizer_kwargs["g2p"] = instantiate(cfg.text_tokenizer.g2p, **g2p_kwargs) diff --git a/nemo/collections/tts/models/vits.py b/nemo/collections/tts/models/vits.py index 319221d04ee0..4a891fa8823e 100644 --- a/nemo/collections/tts/models/vits.py +++ b/nemo/collections/tts/models/vits.py @@ -92,28 +92,6 @@ def __init__(self, cfg: DictConfig, trainer: 'Trainer' = None): self.automatic_optimization = False - def _setup_normalizer(self, cfg): - if "text_normalizer" in cfg: - normalizer_kwargs = {} - - if "whitelist" in cfg.text_normalizer: - normalizer_kwargs["whitelist"] = self.register_artifact( - 'text_normalizer.whitelist', cfg.text_normalizer.whitelist - ) - - try: - import nemo_text_processing - - self.normalizer = instantiate(cfg.text_normalizer, **normalizer_kwargs) - self.text_normalizer_call = self.normalizer.normalize - except Exception as e: - logging.error(e) - raise ImportError( - "`nemo_text_processing` not installed, see https://github.com/NVIDIA/NeMo-text-processing for more details" - ) - if "text_normalizer_call_kwargs" in cfg: - self.text_normalizer_call_kwargs = cfg.text_normalizer_call_kwargs - def _setup_tokenizer(self, cfg): text_tokenizer_kwargs = {} if "g2p" in cfg.text_tokenizer and cfg.text_tokenizer.g2p is not None: @@ -131,12 +109,14 @@ def _setup_tokenizer(self, cfg): if "phoneme_dict" in cfg.text_tokenizer.g2p: g2p_kwargs["phoneme_dict"] = self.register_artifact( - 'text_tokenizer.g2p.phoneme_dict', cfg.text_tokenizer.g2p.phoneme_dict, + 'text_tokenizer.g2p.phoneme_dict', + cfg.text_tokenizer.g2p.phoneme_dict, ) if "heteronyms" in cfg.text_tokenizer.g2p: g2p_kwargs["heteronyms"] = self.register_artifact( - 'text_tokenizer.g2p.heteronyms', cfg.text_tokenizer.g2p.heteronyms, + 'text_tokenizer.g2p.heteronyms', + cfg.text_tokenizer.g2p.heteronyms, ) text_tokenizer_kwargs["g2p"] = instantiate(cfg.text_tokenizer.g2p, **g2p_kwargs) @@ -164,8 +144,14 @@ def configure_optimizers(self): sched_config = optim_config.pop("sched", None) OmegaConf.set_struct(optim_config, True) - optim_g = instantiate(optim_config, params=self.net_g.parameters(),) - optim_d = instantiate(optim_config, params=self.net_d.parameters(),) + optim_g = instantiate( + optim_config, + params=self.net_g.parameters(), + ) + optim_d = instantiate( + optim_config, + params=self.net_d.parameters(), + ) if sched_config is not None: if sched_config.name == 'ExponentialLR': @@ -173,10 +159,14 @@ def configure_optimizers(self): scheduler_g = 
torch.optim.lr_scheduler.ExponentialLR(optim_g, gamma=sched_config.lr_decay) elif sched_config.name == 'CosineAnnealing': scheduler_g = CosineAnnealing( - optimizer=optim_g, max_steps=sched_config.max_steps, min_lr=sched_config.min_lr, + optimizer=optim_g, + max_steps=sched_config.max_steps, + min_lr=sched_config.min_lr, ) scheduler_d = CosineAnnealing( - optimizer=optim_d, max_steps=sched_config.max_steps, min_lr=sched_config.min_lr, + optimizer=optim_d, + max_steps=sched_config.max_steps, + min_lr=sched_config.min_lr, ) else: raise ValueError("Unknown optimizer.") @@ -362,7 +352,9 @@ def _loader(self, cfg): text_tokenizer=self.tokenizer, ) return torch.utils.data.DataLoader( # noqa - dataset=dataset, collate_fn=dataset.collate_fn, **cfg.dataloader_params, + dataset=dataset, + collate_fn=dataset.collate_fn, + **cfg.dataloader_params, ) def train_dataloader(self): @@ -377,7 +369,10 @@ def train_dataloader(self): train_sampler = DistributedBucketSampler(dataset, **self.cfg.train_ds.batch_sampler) dataloader = torch.utils.data.DataLoader( - dataset, collate_fn=dataset.collate_fn, batch_sampler=train_sampler, **self.cfg.train_ds.dataloader_params, + dataset, + collate_fn=dataset.collate_fn, + batch_sampler=train_sampler, + **self.cfg.train_ds.dataloader_params, ) return dataloader @@ -412,7 +407,9 @@ def list_available_models(cls) -> 'List[PretrainedModelInfo]': return list_of_models @typecheck( - input_types={"tokens": NeuralType(('B', 'T_text'), TokenIndex(), optional=True),}, + input_types={ + "tokens": NeuralType(('B', 'T_text'), TokenIndex(), optional=True), + }, output_types={"audio": NeuralType(('B', 'T_audio'), AudioSignal())}, ) def convert_text_to_waveform(self, *, tokens, speakers=None): diff --git a/nemo/export/tensorrt_llm.py b/nemo/export/tensorrt_llm.py index a7107974fbaa..fb43224d59a9 100644 --- a/nemo/export/tensorrt_llm.py +++ b/nemo/export/tensorrt_llm.py @@ -176,6 +176,7 @@ def export( multiple_profiles: bool = False, gpt_attention_plugin: str = "auto", gemm_plugin: str = "auto", + use_mcore_path: bool = False, reduce_fusion: bool = True, fp8_quantized: Optional[bool] = None, fp8_kvcache: Optional[bool] = None, @@ -213,11 +214,11 @@ def export( multiple_profiles: (bool): enables multiple profiles feature of TRT-LLM. Default = False gpt_attention_plugin (str): enable the gpt attention plugin. Default = "auto" gemm_plugin (str): enable the gpt plugin. Default = "auto" + use_mcore_path (bool) : Use the more recent mcore path for export reduce_fusion (bool): enables fusing extra kernels after custom TRT-LLM allReduce fp8_quantized (Optional[bool]): enables exporting to FP8 TRT-LLM checkpoints. If not set, autodetects the type. fp8_kvcache (Optional[bool]): enables FP8 KV-cache quantization. If not set, autodetects the type. """ - if n_gpus is not None: warnings.warn( "Parameter n_gpus is deprecated and will be removed in the next release. 
" @@ -326,53 +327,169 @@ def export( "Supported model types are: {1}.".format(model_type, self.get_supported_models_list) ) - if model_type == "gpt" or model_type == "starcoder": - model_type = "gptnext" + model, model_configs, self.tokenizer = load_nemo_model(nemo_checkpoint_path, nemo_export_dir) + if use_mcore_path: + from megatron.core.export.data_type import DataType + from megatron.core.export.export_config import ExportConfig + from megatron.core.export.model_type import ModelType + from megatron.core.export.trtllm.model_to_trllm_mapping.default_conversion_dict import ( + DEFAULT_CONVERSION_DICT, + ) + from megatron.core.export.trtllm.trtllm_helper import TRTLLMHelper + from megatron.core.transformer.transformer_config import TransformerConfig + from tensorrt_llm.layers import MoeConfig + + def get_transformer_config(nemo_model_config): + normalization = nemo_model_config.get('normalization', 'layernorm') + transformer_config_normalization = 'LayerNorm' + layernorm_zero_centered_gamma = False + if normalization == 'layernorm1p': + layernorm_zero_centered_gamma = True + elif normalization == 'rmsnorm': + transformer_config_normalization = 'RMSNorm' + + conf = TransformerConfig( + num_layers=nemo_model_config.get('num_layers'), + moe_router_topk=nemo_model_config.get('moe_router_topk', 0), + num_attention_heads=nemo_model_config.get('num_attention_heads'), + num_query_groups=nemo_model_config.get( + 'num_query_groups', nemo_model_config['num_attention_heads'] + ), + kv_channels=nemo_model_config.get("kv_channels", None), + hidden_size=nemo_model_config.get('hidden_size'), + ffn_hidden_size=nemo_model_config.get('ffn_hidden_size'), + layernorm_epsilon=nemo_model_config.get('layernorm_epsilon'), + add_bias_linear=nemo_model_config.get('bias'), + num_moe_experts=nemo_model_config.get('num_moe_experts', None), + normalization=transformer_config_normalization, + layernorm_zero_centered_gamma=layernorm_zero_centered_gamma, + ) - if model_type == "mixtral": - model_type = "llama" + return conf + + # We build the transformer config using the nemo model config. + transformer_config = get_transformer_config(model_configs) + input_model_type = getattr(ModelType, model_type) + + # MCore export supports some default conversion dictionaries + mcore_model_conversion_dict = DEFAULT_CONVERSION_DICT[input_model_type] + # All Mcore conversion dicts start with "decoder.layers.4.blah.blah" , while nemo models start with "model.decoder.layers.4.blahblah". so we append model. 
to the keys + nemo_model_conversion_dict = { + f'model.{key}': value for key, value in mcore_model_conversion_dict.items() + } + + trtllm_helper = TRTLLMHelper( + transformer_config=transformer_config, + model_type=input_model_type, + trtllm_conversion_dict=nemo_model_conversion_dict, + position_embedding_type=model_configs.get('position_embedding_type'), + max_position_embeddings=model_configs.get('max_position_embeddings'), + rotary_percentage=model_configs.get('rotary_percentage', 1.0), + rotary_base=model_configs.get('rotary_base', 10000), + moe_tp_mode=model_configs.get('moe_tp_mode', 2), + multi_query_mode=model_configs.get("multi_query_mode", False), + activation=model_configs.get('activation', "gelu"), + seq_len_interpolation_factor=model_configs.get("seq_len_interpolation_factor"), + moe_renorm_mode=model_configs.get( + 'moe_renorm_mode', MoeConfig.ExpertScaleNormalizationMode.RENORMALIZE + ), + share_embeddings_and_output_weights=model_configs.get( + "share_embeddings_and_output_weights", False + ), + ) - model, model_configs, self.tokenizer = load_nemo_model(nemo_checkpoint_path, nemo_export_dir) - weights_dicts, model_configs = model_to_trtllm_ckpt( - model=model, - nemo_model_config=model_configs, - nemo_export_dir=nemo_export_dir, - decoder_type=model_type, - dtype=dtype, - tensor_parallel_size=tensor_parallelism_size, - pipeline_parallel_size=pipeline_parallelism_size, - gpus_per_node=gpus_per_node, - use_parallel_embedding=use_parallel_embedding, - use_embedding_sharing=use_embedding_sharing, - fp8_quantized=fp8_quantized, - fp8_kvcache=fp8_kvcache, - ) + input_dtype = getattr(DataType, dtype) + export_config = ExportConfig( + tensor_parallelism_size, + pipeline_parallelism_size, + use_parallel_embedding, + use_embedding_sharing, + ) - for weight_dict, model_config in zip(weights_dicts, model_configs): - build_and_save_engine( - max_input_len=max_input_len, - max_output_len=max_output_len, - max_batch_size=max_batch_size, - model_config=model_config, - model_weights=weight_dict, - model_dir=self.model_dir, - model_type=model_type, - lora_ckpt_list=self.lora_ckpt_list, - use_lora_plugin=use_lora_plugin, - max_lora_rank=max_lora_rank, - lora_target_modules=lora_target_modules, - max_prompt_embedding_table_size=max_prompt_embedding_table_size, - paged_kv_cache=paged_kv_cache, - remove_input_padding=remove_input_padding, - paged_context_fmha=paged_context_fmha, - max_num_tokens=max_num_tokens, - opt_num_tokens=opt_num_tokens, - max_seq_len=max_seq_len, - multiple_profiles=multiple_profiles, - gpt_attention_plugin=gpt_attention_plugin, - gemm_plugin=gemm_plugin, + trtllm_model_weights_list, trtllm_model_config_list = ( + trtllm_helper.get_trtllm_pretrained_config_and_model_weights( + model_state_dict=model, + export_config=export_config, + dtype=input_dtype, + state_dict_split_by_layer_numbers=False, + ) + ) + + for trtllm_model_weights, trtllm_model_config in zip( + trtllm_model_weights_list, trtllm_model_config_list + ): + trtllm_helper.build_and_save_engine( + max_input_len=max_input_len, + max_output_len=max_output_len, + max_batch_size=max_batch_size, + engine_dir=self.model_dir, + trtllm_model_weights=trtllm_model_weights, + trtllm_model_config=trtllm_model_config, + lora_ckpt_list=self.lora_ckpt_list, + use_lora_plugin=use_lora_plugin, + max_lora_rank=max_lora_rank, + lora_target_modules=lora_target_modules, + max_prompt_embedding_table_size=max_prompt_embedding_table_size, + paged_kv_cache=paged_kv_cache, + remove_input_padding=remove_input_padding, + 
paged_context_fmha=paged_context_fmha, + use_refit=False, + max_num_tokens=max_num_tokens, + max_seq_len=max_seq_len, + opt_num_tokens=opt_num_tokens, + max_beam_width=1, + tokens_per_block=128, + multiple_profiles=multiple_profiles, + gpt_attention_plugin=gpt_attention_plugin, + gemm_plugin=gemm_plugin, + ) + else: + if model_type == "gpt" or model_type == "starcoder": + model_type = "gptnext" + + if model_type == "mixtral": + model_type = "llama" + + weights_dicts, model_configs = model_to_trtllm_ckpt( + model=model, + nemo_model_config=model_configs, + nemo_export_dir=nemo_export_dir, + decoder_type=model_type, + dtype=dtype, + tensor_parallel_size=tensor_parallelism_size, + pipeline_parallel_size=pipeline_parallelism_size, + gpus_per_node=gpus_per_node, + use_parallel_embedding=use_parallel_embedding, + use_embedding_sharing=use_embedding_sharing, + fp8_quantized=fp8_quantized, + fp8_kvcache=fp8_kvcache, ) + for weight_dict, model_config in zip(weights_dicts, model_configs): + build_and_save_engine( + max_input_len=max_input_len, + max_output_len=max_output_len, + max_batch_size=max_batch_size, + model_config=model_config, + model_weights=weight_dict, + model_dir=self.model_dir, + model_type=model_type, + lora_ckpt_list=self.lora_ckpt_list, + use_lora_plugin=use_lora_plugin, + max_lora_rank=max_lora_rank, + lora_target_modules=lora_target_modules, + max_prompt_embedding_table_size=max_prompt_embedding_table_size, + paged_kv_cache=paged_kv_cache, + remove_input_padding=remove_input_padding, + paged_context_fmha=paged_context_fmha, + max_num_tokens=max_num_tokens, + opt_num_tokens=opt_num_tokens, + max_seq_len=max_seq_len, + multiple_profiles=multiple_profiles, + gpt_attention_plugin=gpt_attention_plugin, + gemm_plugin=gemm_plugin, + ) + tokenizer_path = os.path.join(nemo_export_dir, "tokenizer.model") tokenizer_path_nemo2 = os.path.join(nemo_export_dir, "nemo_context") if os.path.exists(tokenizer_path): @@ -454,7 +571,6 @@ def convert_to_safe_tensors( weight_dict[k] = numpy_to_torch(v) safetensors.torch.save_file(weight_dict, os.path.join(self.model_dir, f'rank{rank}.safetensors')) - model_configs[0].to_json_file(os.path.join(self.model_dir, 'config.json')) tokenizer_path = os.path.join(nemo_export_dir, "tokenizer.model") diff --git a/nemo/lightning/io/artifact/base.py b/nemo/lightning/io/artifact/base.py index ec451de9753b..7d2d608c4149 100644 --- a/nemo/lightning/io/artifact/base.py +++ b/nemo/lightning/io/artifact/base.py @@ -6,10 +6,10 @@ class Artifact(ABC, Generic[ValueT]): - def __init__(self, attr: str, required: bool = True): + def __init__(self, attr: str, required: bool = True, skip: bool = False): self.attr = attr self.required = required - self.skip = False + self.skip = skip @abstractmethod def dump(self, value: ValueT, absolute_dir: Path, relative_dir: Path) -> ValueT: @@ -18,3 +18,6 @@ def dump(self, value: ValueT, absolute_dir: Path, relative_dir: Path) -> ValueT: @abstractmethod def load(self, path: Path) -> ValueT: pass + + def __repr__(self): + return f"{type(self).__name__}(skip= {self.skip}, attr= {self.attr}, required= {self.required})" diff --git a/nemo/lightning/io/artifact/file.py b/nemo/lightning/io/artifact/file.py index 1364468cde0a..1cd63b706c9a 100644 --- a/nemo/lightning/io/artifact/file.py +++ b/nemo/lightning/io/artifact/file.py @@ -2,6 +2,7 @@ import shutil from pathlib import Path from typing import Union +import fiddle as fdl from nemo.lightning.io.artifact.base import Artifact @@ -19,8 +20,7 @@ class FileArtifact(Artifact[str]): def dump(self, 
value: str, absolute_dir: Path, relative_dir: Path) -> str: if not pathize(value).exists(): # This is Artifact is just a string. - self.skip = True - return value + return fdl.Config(FileArtifact, attr=value, skip=True) new_value = copy_file(value, absolute_dir, relative_dir) return str(new_value) @@ -65,8 +65,7 @@ class DirOrStringArtifact(DirArtifact): def dump(self, value: str, absolute_dir: Path, relative_dir: Path) -> str: if not pathize(value).exists(): # This is Artifact is just a string. - self.skip = True - return value + return fdl.Config(DirOrStringArtifact, attr=value, skip=True) return super().dump(value, absolute_dir, relative_dir) def load(self, path: str) -> str: diff --git a/nemo/lightning/io/connector.py b/nemo/lightning/io/connector.py index 38fbda42c67d..e7ba67b277f8 100644 --- a/nemo/lightning/io/connector.py +++ b/nemo/lightning/io/connector.py @@ -134,7 +134,9 @@ class ModelConnector(Connector, Generic[SourceT, TargetT]): Loads a model from the specified path, optionally using a CPU-focused strategy, and returns the model and trainer. """ - def nemo_setup(self, model: pl.LightningModule, trainer: Optional[pl.Trainer] = None) -> pl.Trainer: + def nemo_setup( + self, model: pl.LightningModule, trainer: Optional[pl.Trainer] = None, *args, **kwargs + ) -> pl.Trainer: """ Sets up the model and trainer using a specified strategy, preparing it for training or inference. @@ -150,7 +152,7 @@ def nemo_setup(self, model: pl.LightningModule, trainer: Optional[pl.Trainer] = _trainer = trainer or Trainer( devices=1, accelerator="cpu", - strategy=MegatronStrategy(ckpt_save_optimizer=False, always_save_context=True), + strategy=MegatronStrategy(ckpt_save_optimizer=False, always_save_context=True, *args, **kwargs), ) # Note: set trainer to fitting state to avoid the following code path. Feel free to refactor if we no longer # need to avoid this: diff --git a/nemo/lightning/io/mixin.py b/nemo/lightning/io/mixin.py index aa74e2cf174c..27cb3b18b55b 100644 --- a/nemo/lightning/io/mixin.py +++ b/nemo/lightning/io/mixin.py @@ -584,8 +584,12 @@ def _io_path_elements_fn(x): def _artifact_transform_save(cfg: fdl.Config, output_path: Path, relative_dir: Path = "."): for artifact in getattr(cfg.__fn_or_cls__, "__io_artifacts__", []): # Allow optional artifacts - if artifact.skip: + if artifact.skip or (not hasattr(cfg, artifact.attr) and not artifact.required): continue + + if not hasattr(cfg, artifact.attr) and artifact.required: + raise ValueError(f"Artifact '{artifact.attr}' is required but not provided") + current_val = getattr(cfg, artifact.attr) if current_val is None: if artifact.required: @@ -605,6 +609,15 @@ def _artifact_transform_load(cfg: fdl.Config, path: Path): for artifact in getattr(cfg.__fn_or_cls__, "__io_artifacts__", []): + # We expect an artifact.attr to be a string or a fdl.Config. + # Some parameters can be a string or a filepath. When those parameters are just strings, + # we will represent them with a fdl.Config, and will skip the rest of the loop (base-dir adjustment). + current_val = getattr(cfg, artifact.attr) + if isinstance(current_val, fdl.Config): + # artifact.attr is a string, not a path.
+ setattr(cfg, artifact.attr, fdl.build(current_val).attr) + continue + if artifact.skip: continue current_val = getattr(cfg, artifact.attr) diff --git a/nemo/lightning/pytorch/callbacks/model_checkpoint.py b/nemo/lightning/pytorch/callbacks/model_checkpoint.py index 5244939eb5fb..adf890a8fb11 100644 --- a/nemo/lightning/pytorch/callbacks/model_checkpoint.py +++ b/nemo/lightning/pytorch/callbacks/model_checkpoint.py @@ -49,7 +49,7 @@ class ModelCheckpoint(PTLModelCheckpoint): ``every_n_epochs`` or ``every_n_train_steps``. save_on_train_epoch_end: Whether to run checkpointing at the end of the training epoch save_optim_on_train_end: Whether to include the optimizer states in the final checkpoint - at the end of training. Only applicable when save_weights_only is ``True``. + at the end of training. Only applicable when save_weights_only is ``False``. always_save_context: Whether to dump the artifacts needed to reinintialize the current model, trainer, and dataloader to allow for reproducibility of experiments. save_context_on_train_end: Whether to dump the artifacts on_train_end regardless of whether diff --git a/nemo/lightning/pytorch/callbacks/peft.py b/nemo/lightning/pytorch/callbacks/peft.py index c45281443136..2c29ac44124b 100644 --- a/nemo/lightning/pytorch/callbacks/peft.py +++ b/nemo/lightning/pytorch/callbacks/peft.py @@ -91,12 +91,26 @@ def __call__(self, model: nn.Module) -> nn.Module: Returns: nn.Module: The transformed model with PEFT applied. """ - - model.freeze() + self.freeze_model(model) model.walk(self.transform) return model + def freeze_model(self, model: nn.Module) -> None: + """Apply a default freeze method to the model. + + This method freezes all the model parameters. It can be overridden by subclasses to + implement custom freeze strategies (e.g. freeze only parts of the model). + + Args: + model (nn.Module): The model to be fine-tuned. + + Returns: + None. The model is modified in place.
+ """ + model.freeze() + model.train(mode=True) + def setup(self, trainer: pl.Trainer, pl_module: pl.LightningModule, stage: str) -> None: super().setup(trainer, pl_module, stage=stage) diff --git a/nemo/lightning/run/plugins.py b/nemo/lightning/run/plugins.py index 45905729b8b1..dfcc7c1650ce 100644 --- a/nemo/lightning/run/plugins.py +++ b/nemo/lightning/run/plugins.py @@ -287,16 +287,16 @@ def setup(self, task: run.Partial | run.Script, executor: run.Executor): tp_size = task.trainer.strategy.tensor_model_parallel_size cp_size = task.trainer.strategy.context_parallel_size if tp_size > 1 and cp_size > 1: - executor.env_vars["CUDA_DEVICE_MAX_CONNECTIONS"] = 1 + executor.env_vars["CUDA_DEVICE_MAX_CONNECTIONS"] = "1" # Set LayerNorm SM margin to support the overlap with LayerNorm kernel if self.enable_layernorm_sm_margin: - executor.env_vars["NVTE_FWD_LAYERNORM_SM_MARGIN"] = self.layernorm_sm_margin - executor.env_vars["NVTE_BWD_LAYERNORM_SM_MARGIN"] = self.layernorm_sm_margin + executor.env_vars["NVTE_FWD_LAYERNORM_SM_MARGIN"] = str(self.layernorm_sm_margin) + executor.env_vars["NVTE_BWD_LAYERNORM_SM_MARGIN"] = str(self.layernorm_sm_margin) # Force Transformer Engine to use cuDNN attention over HazyResearch's Flash Attention - executor.env_vars["NVTE_FLASH_ATTN"] = 0 - executor.env_vars["NVTE_FUSED_ATTN"] = 1 + executor.env_vars["NVTE_FLASH_ATTN"] = "0" + executor.env_vars["NVTE_FUSED_ATTN"] = "1" # Improve perf by steering power to tensor cores, may not work on all systems if self.enable_vboost and isinstance(executor, run.SlurmExecutor): diff --git a/nemo/utils/exp_manager.py b/nemo/utils/exp_manager.py index 3d4b7189f56e..2bfb40e89e15 100644 --- a/nemo/utils/exp_manager.py +++ b/nemo/utils/exp_manager.py @@ -1169,6 +1169,16 @@ def configure_checkpointing( params.filename = f'{name}--{{{params.monitor}:.4f}}-{{epoch}}' if params.prefix is None: params.prefix = name + if params.always_save_nemo: + app_state = AppState() + if (app_state.tensor_model_parallel_size is not None and app_state.tensor_model_parallel_size > 1) or (app_state.pipeline_model_parallel_size is not None and app_state.pipeline_model_parallel_size > 1) or (app_state.context_parallel_size is not None and app_state.context_parallel_size > 1): + raise LoggerMisconfigurationError( + "always_save_nemo is set to True, please ensure that model parallel is not used." + f"tensor_model_parallel_size: {app_state.tensor_model_parallel_size}," + f"pipeline_model_parallel_size: {app_state.pipeline_model_parallel_size}," + f"context_parallel_size: {app_state.context_parallel_size}," + ) + NeMoModelCheckpoint.CHECKPOINT_NAME_LAST = params.filename + '-last' logging.debug(params.dirpath) diff --git a/nemo/utils/nemo_logging.py b/nemo/utils/nemo_logging.py index 95e17e5c5f6c..bcc7ad199603 100644 --- a/nemo/utils/nemo_logging.py +++ b/nemo/utils/nemo_logging.py @@ -76,7 +76,7 @@ def __init__(self, capture_warnings=True): self.rank = 0 if is_global_rank_zero() else "UNK" def _define_logger(self, capture_warnings=True): - """ Creates the logger if not already created. Called in init""" + """Creates the logger if not already created. Called in init""" # Use double-checked locking to avoid taking lock unnecessarily. 
if self._logger is not None: @@ -126,7 +126,7 @@ def record_factory(*args, **kwargs): self._logger.propagate = False def remove_stream_handlers(self): - """ Removes StreamHandler that log to stdout and stderr from the logger.""" + """Removes StreamHandler that log to stdout and stderr from the logger.""" if self._logger is None: raise RuntimeError("Impossible to set handlers if the Logger is not predefined") @@ -236,7 +236,7 @@ def set_verbosity(self, verbosity_level): @contextmanager def patch_stderr_handler(self, stream): - """ Sends messages that should log to stderr to stream instead. Useful for unittests """ + """Sends messages that should log to stderr to stream instead. Useful for unittests""" if self._logger is not None: try: old_stream = self._handlers["stream_stderr"].stream @@ -268,7 +268,7 @@ def patch_stderr_handler(self, stream): @contextmanager def patch_stdout_handler(self, stream): - """ Sends messages that should log to stdout to stream instead. Useful for unittests """ + """Sends messages that should log to stdout to stream instead. Useful for unittests""" if self._logger is not None: try: old_stream = self._handlers["stream_stdout"].stream @@ -339,6 +339,16 @@ def captureWarnings(self, capture): warnings.showwarning = self.old_warnings_showwarning self.old_warnings_showwarning = None + def _warning_is_ignored(self, category): + from warnings import filters + + # Search the filters + for action, msg, cat, mod, ln in filters: + # least-common denominator if multiple filters exist for the same class. + if cat == category and action == 'ignore': + return True + return False + def _showwarning(self, message, category, filename, lineno, file=None, line=None): """ Implementation of showwarnings which redirects to logging. @@ -346,6 +356,8 @@ def _showwarning(self, message, category, filename, lineno, file=None, line=None with level logging.WARNING. """ s = warnings.formatwarning(message, category, filename, lineno, line) + if self._warning_is_ignored(category): + return self.warning("%s", s) def _logged_once(self, msg, mode): diff --git a/requirements/requirements_multimodal.txt b/requirements/requirements_multimodal.txt index 8b56c3974a25..18abe82c9f96 100644 --- a/requirements/requirements_multimodal.txt +++ b/requirements/requirements_multimodal.txt @@ -1,6 +1,6 @@ addict clip -decord +decord; sys_platform == 'linux' diffusers>=0.19.3 einops_exts imageio diff --git a/requirements/requirements_nlp.txt b/requirements/requirements_nlp.txt index 3d168ad3b12a..7ef03689b9b5 100644 --- a/requirements/requirements_nlp.txt +++ b/requirements/requirements_nlp.txt @@ -9,7 +9,7 @@ gdown h5py ijson jieba -mamba-ssm==2.2.2 +mamba-ssm==2.2.2; sys_platform == 'linux' markdown2 matplotlib>=3.3.2 #megatron_core>0.6.0 # add back once mcore on pypi is compatible again diff --git a/scripts/checkpoint_converters/convert_llama_hf_to_nemo_load.py b/scripts/checkpoint_converters/convert_llama_hf_to_nemo_load.py index f395e34765d0..42d3e77ce4c8 100644 --- a/scripts/checkpoint_converters/convert_llama_hf_to_nemo_load.py +++ b/scripts/checkpoint_converters/convert_llama_hf_to_nemo_load.py @@ -15,7 +15,7 @@ r""" Conversion script to convert Huggingface LLaMA checkpoints into nemo checkpoint.
Example to run this conversion script: - python convert_llama_hf_to_nemo.py \ + python convert_llama_hf_to_nemo_load.py \ --input_name_or_path \ --input_state_dict \ --output_path \ diff --git a/scripts/checkpoint_converters/convert_llama_hf_to_nemo_save_dict.py b/scripts/checkpoint_converters/convert_llama_hf_to_nemo_save_dict.py index 940a9df5f9a8..f7096996e5b1 100644 --- a/scripts/checkpoint_converters/convert_llama_hf_to_nemo_save_dict.py +++ b/scripts/checkpoint_converters/convert_llama_hf_to_nemo_save_dict.py @@ -15,7 +15,7 @@ r""" Conversion script to convert Huggingface LLaMA checkpoints into nemo checkpoint. Example to run this conversion script: - python convert_llama_hf_to_nemo.py \ + python convert_llama_hf_to_nemo_save_dict.py \ --input_name_or_path \ --output_path --precision bf16 diff --git a/scripts/checkpoint_converters/convert_mistral_7b_nemo_to_hf.py b/scripts/checkpoint_converters/convert_mistral_7b_nemo_to_hf.py index ba9012de01a8..796819c38ba4 100644 --- a/scripts/checkpoint_converters/convert_mistral_7b_nemo_to_hf.py +++ b/scripts/checkpoint_converters/convert_mistral_7b_nemo_to_hf.py @@ -65,6 +65,7 @@ def load_config(hf_model_name, nemo_config): logging.warning(f"Got unknown activation function {nemo_config.activation}") hf_config.rope_theta = nemo_config['rotary_base'] + hf_config.tie_word_embeddings = getattr(nemo_config, "share_embeddings_and_output_weights", False) return hf_config @@ -213,7 +214,13 @@ def convert(in_file, precision=None, cpu_only=True) -> None: output_layer_base_name = 'model.output_layer.weight' else: output_layer_base_name = 'model.language_model.output_layer.weight' - state_dict[hf_output_layer_weight_name] = param_to_weights(ckpt[output_layer_base_name]) + + if getattr(nemo_config, "share_embeddings_and_output_weights", False): + # tie_word_embeddings: True + state_dict[hf_output_layer_weight_name] = state_dict[embed_weights_base_name] + else: + # tie_word_embeddings: False + state_dict[hf_output_layer_weight_name] = param_to_weights(ckpt[output_layer_base_name]) return state_dict, nemo_config, dtype diff --git a/scripts/checkpoint_converters/convert_zarr_to_torch_dist.py b/scripts/checkpoint_converters/convert_zarr_to_torch_dist.py index 29b56aa706fa..eeaee9aba461 100644 --- a/scripts/checkpoint_converters/convert_zarr_to_torch_dist.py +++ b/scripts/checkpoint_converters/convert_zarr_to_torch_dist.py @@ -16,14 +16,13 @@ Conversion script to convert zarr checkpoints into torch distributed checkpoint. Example to run this conversion script: python -m torch.distributed.launch --nproc_per_node= * \ - megatron_zarr_ckpt_to_torch_dist.py \ + convert_zarr_to_torch_dist.py \ --model_type \ --checkpoint_folder \ --checkpoint_name \ --path_to_save \ --tensor_model_parallel_size \ --pipeline_model_parallel_size \ - --hparams_file \ --gpus_per_node """ @@ -64,12 +63,14 @@ def get_args(): "--hparams_file", type=str, default=None, - required=True, + required=False, help="Path config for restoring. It's created during training and may need to be modified during restore if restore environment is different than training. 
Ex: /raid/nemo_experiments/megatron_gpt/hparams.yaml", ) parser.add_argument("--path_to_save", type=str, default=None, required=True, help="Path to output ckpt files.") parser.add_argument( - "--save_to_nemo", action="store_true", help="If passed, output will be written as .nemo file.", + "--save_to_nemo", + action="store_true", + help="If passed, output will be written as .nemo file.", ) parser.add_argument("--gpus_per_node", type=int, required=True, default=None) parser.add_argument("--tensor_model_parallel_size", type=int, required=True, default=None) @@ -81,7 +82,7 @@ def get_args(): default=None, help="If pipeline parallel size > 1, this is the rank at which the encoder ends and the decoder begins.", ) - parser.add_argument("--local_rank", type=int, required=False, default=os.getenv('LOCAL_RANK', -1)) + parser.add_argument("--local-rank", type=int, required=False, default=os.getenv('LOCAL_RANK', -1)) parser.add_argument("--cluster_type", required=False, default=None, help="Whether on BCP platform") parser.add_argument( "--precision", @@ -93,7 +94,18 @@ def get_args(): ) parser.add_argument( - "--model_type", type=str, required=True, default="gpt", choices=["gpt", "sft", "bert"], + "--model_type", + type=str, + required=True, + default="gpt", + choices=["gpt", "sft", "bert"], + ), + parser.add_argument( + "--ckpt_format", + type=str, + required=False, + default="torch_dist", + choices=["zarr", "torch_dist"], ) args = parser.parse_args() @@ -114,7 +126,7 @@ def convert(local_rank, rank, world_size, args): 'precision': args.precision, }, 'model': { - 'native_amp_init_scale': 2 ** 32, + 'native_amp_init_scale': 2**32, 'native_amp_growth_interval': 1000, 'hysteresis': 2, 'gradient_as_bucket_view': True, @@ -167,7 +179,7 @@ def convert(local_rank, rank, world_size, args): ) with open_dict(model.cfg): - model.cfg.torch_distributed_checkpoint = True + model.cfg.dist_ckpt_format = args.ckpt_format model._save_restore_connector = NLPSaveRestoreConnector() save_file_path = args.path_to_save diff --git a/tests/collections/asr/decoding/test_cuda_graph_rnnt_greedy_decoding.py b/tests/collections/asr/decoding/test_cuda_graph_rnnt_greedy_decoding.py index 31fe822573ce..4715f4826493 100644 --- a/tests/collections/asr/decoding/test_cuda_graph_rnnt_greedy_decoding.py +++ b/tests/collections/asr/decoding/test_cuda_graph_rnnt_greedy_decoding.py @@ -53,8 +53,8 @@ def stt_en_fastconformer_transducer_large(): 8, True, marks=pytest.mark.xfail( - reason="""Cannot instantiate the -body cuda graph of a conditional node with a persistent kernel (in this case, + reason="""Cannot instantiate the +body cuda graph of a conditional node with a persistent kernel (in this case, a persistent LSTM), which is triggered in cudnn by using a batch size of 8.""" ), ), diff --git a/tests/collections/asr/decoding/rnnt_alignments_check.py b/tests/collections/asr/decoding/test_rnnt_alignments.py similarity index 94% rename from tests/collections/asr/decoding/rnnt_alignments_check.py rename to tests/collections/asr/decoding/test_rnnt_alignments.py index ec0656cbce49..5c43af28b1d4 100644 --- a/tests/collections/asr/decoding/rnnt_alignments_check.py +++ b/tests/collections/asr/decoding/test_rnnt_alignments.py @@ -13,10 +13,6 @@ # limitations under the License. 
-# NOTE: the file name does not contain "test" on purpose to avoid executing -# these tests outside of the CI machines environment, where test data is -# stored - from pathlib import Path from typing import Union @@ -27,6 +23,7 @@ from nemo.collections.asr.models import EncDecRNNTBPEModel from nemo.collections.asr.parts.utils.manifest_utils import read_manifest, write_manifest +from nemo.collections.asr.parts.utils.rnnt_utils import Hypothesis from nemo.collections.asr.parts.utils.transcribe_utils import prepare_audio_data DEVICES = [] @@ -65,7 +62,7 @@ def get_rnnt_alignments( loop_labels: bool = True, use_cuda_graph_decoder=False, device="cuda", -): +) -> list[Hypothesis]: cfg = OmegaConf.structured(TranscriptionConfig()) cfg.rnnt_decoding.confidence_cfg.preserve_frame_confidence = True cfg.rnnt_decoding.preserve_alignments = True @@ -74,12 +71,13 @@ def get_rnnt_alignments( cfg.rnnt_decoding.greedy.loop_labels = loop_labels cfg.rnnt_decoding.greedy.use_cuda_graph_decoder = use_cuda_graph_decoder cfg.dataset_manifest = str(manifest_path) - filepaths = prepare_audio_data(cfg)[0][:10] # selecting 10 files only + filepaths = prepare_audio_data(cfg)[0][:8] # selecting 8 files only + # NB: 9th file has the same transcription but a bit different alignment for batched/non-batched decoding model = model.to(device) model.change_decoding_strategy(cfg.rnnt_decoding) - transcriptions = model.transcribe( + transcriptions: list[Hypothesis] = model.transcribe( audio=filepaths, batch_size=cfg.batch_size, num_workers=cfg.num_workers, diff --git a/tests/collections/llm/gpt/model/megatron_ssm_finetuning.py b/tests/collections/llm/gpt/model/megatron_ssm_finetuning.py index 67174974f9a3..e0b9862f23e1 100644 --- a/tests/collections/llm/gpt/model/megatron_ssm_finetuning.py +++ b/tests/collections/llm/gpt/model/megatron_ssm_finetuning.py @@ -59,6 +59,7 @@ def get_args(): strategy=nl.MegatronStrategy( ckpt_load_optimizer=False, ckpt_save_optimizer=False, + ckpt_async_save=False, tensor_model_parallel_size=1, ), plugins=nl.MegatronMixedPrecision( diff --git a/tests/collections/llm/gpt/model/test_mistral.py b/tests/collections/llm/gpt/model/test_mistral.py index 365bb35b2725..025ea35dd6e9 100644 --- a/tests/collections/llm/gpt/model/test_mistral.py +++ b/tests/collections/llm/gpt/model/test_mistral.py @@ -1,6 +1,6 @@ import torch.nn.functional as F -from nemo.collections.llm.gpt.model.mistral import MistralConfig7B, MistralNeMo2407Config12B, MistralNeMo2407Config123B +from nemo.collections.llm.gpt.model.mistral import MistralConfig7B, MistralNeMoConfig12B, MistralNeMoConfig123B def test_mistral_config7b(): @@ -25,7 +25,7 @@ def test_mistral_config7b(): def test_mistral_nemo_config_12b(): - config = MistralNeMo2407Config12B() + config = MistralNeMoConfig12B() assert config.normalization == "RMSNorm" assert config.activation_func == F.silu assert config.position_embedding_type == "rope" @@ -49,7 +49,7 @@ def test_mistral_nemo_config_12b(): def test_mistral_nemo_config_123b(): - config = MistralNeMo2407Config123B() + config = MistralNeMoConfig123B() assert config.normalization == "RMSNorm" assert config.activation_func == F.silu assert config.position_embedding_type == "rope" diff --git a/tests/collections/llm/gpt_finetuning.py b/tests/collections/llm/gpt_finetuning.py index 9eca287669cd..7eaa7744729c 100644 --- a/tests/collections/llm/gpt_finetuning.py +++ b/tests/collections/llm/gpt_finetuning.py @@ -19,6 +19,7 @@ from nemo import lightning as nl from nemo.collections import llm +from 
nemo.collections.llm.gpt.data.packed_sequence import PackedSequenceSpecs from nemo.collections.nlp.modules.common.tokenizer_utils import get_nmt_tokenizer ## NOTE: This script is present for github-actions testing only. @@ -43,6 +44,7 @@ def get_args(): parser.add_argument('--mbs', type=int, default=1, help="micro batch size") parser.add_argument('--tp_size', type=int, default=1, help="tensor parallel size") parser.add_argument('--pp_size', type=int, default=1, help="pipeline parallel size") + parser.add_argument('--packed', action='store_true', help="use packed sequence dataset") return parser.parse_args() @@ -97,7 +99,16 @@ def get_args(): else: peft = None - squad = llm.SquadDataModule(seq_length=2048, micro_batch_size=args.mbs, global_batch_size=8, num_workers=0) + packed_sequence_specs = ( + PackedSequenceSpecs(packed_sequence_size=2048, tokenizer_model_name="dummy_tokenizer") if args.packed else None + ) + dolly = llm.DollyDataModule( + seq_length=2048, + micro_batch_size=args.mbs, + global_batch_size=8, + num_workers=0, + packed_sequence_specs=packed_sequence_specs, + ) tokenizer = get_nmt_tokenizer(tokenizer_model=os.path.join(args.restore_path, "dummy_tokenizer.model")) llama3_8b = llm.LlamaModel(Llama3ConfigCI(), tokenizer=tokenizer) @@ -109,7 +120,7 @@ def get_args(): llm.finetune( model=llama3_8b, - data=squad, + data=dolly, trainer=trainer, peft=peft, log=logger, diff --git a/tests/collections/llm/megatron_t5_finetuning.py b/tests/collections/llm/megatron_t5_finetuning.py index 76a23d36975b..f54e858cfb43 100644 --- a/tests/collections/llm/megatron_t5_finetuning.py +++ b/tests/collections/llm/megatron_t5_finetuning.py @@ -21,6 +21,7 @@ def get_args(): parser = argparse.ArgumentParser(description='Train a small T5 model using NeMo 2.0') parser.add_argument('--devices', type=int, help="Number of devices to use for training") parser.add_argument('--max-steps', type=int, help="Number of steps to train for") + parser.add_argument('--peft', type=str, default='none', help="none | lora") parser.add_argument('--experiment-dir', type=str, help="directory to write results and checkpoints to") parser.add_argument('--experiment-name', type=str, help="name of experiment") parser.add_argument('--wandb-project', type=str, default=None, help="wandb project name") @@ -34,9 +35,12 @@ def get_args(): args = get_args() + special_tokens = {} + special_tokens['additional_special_tokens'] = [f'' for i in range(100)] tokenizer = get_nmt_tokenizer( "megatron", "BertWordPieceCase", + special_tokens=special_tokens, ) data = SquadDataModule( @@ -69,7 +73,6 @@ def get_args(): pipeline_model_parallel_size=1, pipeline_dtype=torch.float32, ckpt_load_optimizer=False, - # ckpt_load_optimizer=True, ) checkpoint_callback = ModelCheckpoint( every_n_train_steps=5000, @@ -93,6 +96,11 @@ def get_args(): config=opt_config, ) + if args.peft == 'lora': + peft = llm.peft.LoRA() + else: + peft = None + trainer = nl.Trainer( devices=args.devices, max_steps=args.max_steps, @@ -125,6 +133,7 @@ def get_args(): resume=resume, data=data, trainer=trainer, + peft=peft, log=nemo_logger, optim=opt, ) diff --git a/tests/collections/llm/megatron_t5_pretraining.py b/tests/collections/llm/megatron_t5_pretraining.py index 5d8f55a7f26f..a5460be3d154 100644 --- a/tests/collections/llm/megatron_t5_pretraining.py +++ b/tests/collections/llm/megatron_t5_pretraining.py @@ -50,10 +50,13 @@ def get_args(): args = get_args() + special_tokens = {} + special_tokens['additional_special_tokens'] = [f'' for i in range(100)] tokenizer = 
get_nmt_tokenizer( "megatron", "BertWordPieceCase", vocab_file=args.vocab_path, + special_tokens=special_tokens, ) data = PreTrainingDataModule( paths=args.data_path, diff --git a/tests/collections/llm/recipes/test_llama3_70b.py b/tests/collections/llm/recipes/test_llama3_70b.py index a842975846dd..d47b674b7b70 100644 --- a/tests/collections/llm/recipes/test_llama3_70b.py +++ b/tests/collections/llm/recipes/test_llama3_70b.py @@ -31,7 +31,7 @@ def test_trainer(self, recipe_module): assert trainer_config.__fn_or_cls__ == Trainer assert trainer_config.accelerator == "gpu" assert trainer_config.devices == 8 - assert trainer_config.num_nodes == 1 + assert trainer_config.num_nodes == 4 # Check strategy configuration assert isinstance(trainer_config.strategy, run.Config) @@ -79,10 +79,8 @@ def test_pretrain_recipe_with_different_configurations(self, recipe_module, num_ assert recipe.trainer.num_nodes == num_nodes assert recipe.trainer.devices == num_gpus_per_node - def test_pretrain_recipe_performance(self, recipe_module): - recipe = recipe_module.pretrain_recipe_performance( - name="test_perf", dir="/tmp", num_nodes=1, num_gpus_per_node=8 - ) + def test_pretrain_performance_optimizations(self, recipe_module): + recipe = recipe_module.pretrain_recipe(performance_mode=True) assert any( isinstance(cb, run.Config) and cb.__fn_or_cls__ == MegatronCommOverlapCallback for cb in recipe.trainer.callbacks diff --git a/tests/collections/llm/recipes/test_llama3_70b_16k.py b/tests/collections/llm/recipes/test_llama3_70b_16k.py index 60940b062a87..17f0ec5ebd99 100644 --- a/tests/collections/llm/recipes/test_llama3_70b_16k.py +++ b/tests/collections/llm/recipes/test_llama3_70b_16k.py @@ -29,15 +29,15 @@ def test_trainer(self, recipe_module): assert trainer_config.__fn_or_cls__ == Trainer assert trainer_config.accelerator == "gpu" assert trainer_config.devices == 8 - assert trainer_config.num_nodes == 2 + assert trainer_config.num_nodes == 4 # Check strategy configuration assert isinstance(trainer_config.strategy, run.Config) assert trainer_config.strategy.__fn_or_cls__.__name__ == "MegatronStrategy" - assert trainer_config.strategy.tensor_model_parallel_size == 2 - assert trainer_config.strategy.pipeline_model_parallel_size == 4 + assert trainer_config.strategy.tensor_model_parallel_size == 8 + assert trainer_config.strategy.pipeline_model_parallel_size == 2 assert trainer_config.strategy.pipeline_dtype == torch.bfloat16 - assert trainer_config.strategy.virtual_pipeline_model_parallel_size == 5 + assert trainer_config.strategy.virtual_pipeline_model_parallel_size is None assert trainer_config.strategy.context_parallel_size == 2 assert trainer_config.strategy.sequence_parallel is True @@ -61,14 +61,37 @@ def test_pretrain_recipe_with_different_configurations(self, recipe_module, num_ assert recipe.trainer.num_nodes == num_nodes assert recipe.trainer.devices == num_gpus_per_node - def test_trainer_parallelism_options(self, recipe_module): + def test_valid_trainer_parallelism(self, recipe_module): trainer_config = recipe_module.trainer() - assert trainer_config.strategy.tensor_model_parallel_size == 2 - assert trainer_config.strategy.pipeline_model_parallel_size == 4 - assert trainer_config.strategy.pipeline_dtype == torch.bfloat16 - assert trainer_config.strategy.virtual_pipeline_model_parallel_size == 5 - assert trainer_config.strategy.context_parallel_size == 2 - assert trainer_config.strategy.sequence_parallel is True + + assert isinstance(trainer_config.strategy, run.Config) + assert 
trainer_config.strategy.__fn_or_cls__.__name__ == "MegatronStrategy" + + assert trainer_config.strategy.expert_model_parallel_size == 1 + + assert ( + trainer_config.strategy.tensor_model_parallel_size + * trainer_config.strategy.pipeline_model_parallel_size + * trainer_config.strategy.context_parallel_size + * trainer_config.strategy.expert_model_parallel_size + % trainer_config.devices + == 0 + ) + assert ( + trainer_config.strategy.tensor_model_parallel_size + * trainer_config.strategy.pipeline_model_parallel_size + * trainer_config.strategy.context_parallel_size + * trainer_config.strategy.expert_model_parallel_size + / trainer_config.devices + % trainer_config.num_nodes + == 0 + ) + + if trainer_config.strategy.pipeline_model_parallel_size != 1: + assert trainer_config.strategy.pipeline_dtype is not None + + if trainer_config.strategy.tensor_model_parallel_size == 1: + assert trainer_config.strategy.sequence_parallel is False def test_model_config_parameters(self, recipe_module): model_config = recipe_module.model() diff --git a/tests/collections/llm/recipes/test_llama3_70b_64k.py b/tests/collections/llm/recipes/test_llama3_70b_64k.py index 89813162fae1..e9f496dfdd2e 100644 --- a/tests/collections/llm/recipes/test_llama3_70b_64k.py +++ b/tests/collections/llm/recipes/test_llama3_70b_64k.py @@ -38,7 +38,7 @@ def test_trainer(self, recipe_module): assert trainer_config.strategy.tensor_model_parallel_size == 8 assert trainer_config.strategy.pipeline_model_parallel_size == 4 assert trainer_config.strategy.pipeline_dtype == torch.bfloat16 - assert trainer_config.strategy.virtual_pipeline_model_parallel_size == 5 + assert trainer_config.strategy.virtual_pipeline_model_parallel_size is None assert trainer_config.strategy.context_parallel_size == 8 assert trainer_config.strategy.sequence_parallel is True @@ -67,14 +67,37 @@ def test_pretrain_recipe_with_different_configurations(self, recipe_module, num_ assert recipe.trainer.num_nodes == num_nodes assert recipe.trainer.devices == num_gpus_per_node - def test_trainer_parallelism_options(self, recipe_module): + def test_valid_trainer_parallelism(self, recipe_module): trainer_config = recipe_module.trainer() - assert trainer_config.strategy.tensor_model_parallel_size == 8 - assert trainer_config.strategy.pipeline_model_parallel_size == 4 - assert trainer_config.strategy.pipeline_dtype == torch.bfloat16 - assert trainer_config.strategy.virtual_pipeline_model_parallel_size == 5 - assert trainer_config.strategy.context_parallel_size == 8 - assert trainer_config.strategy.sequence_parallel is True + + assert isinstance(trainer_config.strategy, run.Config) + assert trainer_config.strategy.__fn_or_cls__.__name__ == "MegatronStrategy" + + assert trainer_config.strategy.expert_model_parallel_size == 1 + + assert ( + trainer_config.strategy.tensor_model_parallel_size + * trainer_config.strategy.pipeline_model_parallel_size + * trainer_config.strategy.context_parallel_size + * trainer_config.strategy.expert_model_parallel_size + % trainer_config.devices + == 0 + ) + assert ( + trainer_config.strategy.tensor_model_parallel_size + * trainer_config.strategy.pipeline_model_parallel_size + * trainer_config.strategy.context_parallel_size + * trainer_config.strategy.expert_model_parallel_size + / trainer_config.devices + % trainer_config.num_nodes + == 0 + ) + + if trainer_config.strategy.pipeline_model_parallel_size != 1: + assert trainer_config.strategy.pipeline_dtype is not None + + if trainer_config.strategy.tensor_model_parallel_size == 1: + assert 
trainer_config.strategy.sequence_parallel is False def test_model_config_parameters(self, recipe_module): model_config = recipe_module.model() diff --git a/tests/collections/llm/recipes/test_llama3_8b.py b/tests/collections/llm/recipes/test_llama3_8b.py index df4f05eec2ae..88fab6d6325a 100644 --- a/tests/collections/llm/recipes/test_llama3_8b.py +++ b/tests/collections/llm/recipes/test_llama3_8b.py @@ -90,10 +90,8 @@ def test_pretrain_recipe_with_different_configurations(self, recipe_module, num_ assert recipe.trainer.num_nodes == num_nodes assert recipe.trainer.devices == num_gpus_per_node - def test_pretrain_recipe_performance(self, recipe_module): - recipe = recipe_module.pretrain_recipe_performance( - name="test_perf", dir="/tmp", num_nodes=1, num_gpus_per_node=8 - ) + def test_pretrain_performance_optimizations(self, recipe_module): + recipe = recipe_module.pretrain_recipe(performance_mode=True) assert any(cb.__fn_or_cls__.__name__ == "MegatronCommOverlapCallback" for cb in recipe.trainer.callbacks) def test_trainer_parallelism_options(self, recipe_module): diff --git a/tests/collections/llm/recipes/test_llama3_8b_16k.py b/tests/collections/llm/recipes/test_llama3_8b_16k.py index d7f3bd40ecb7..fe75f01236ab 100644 --- a/tests/collections/llm/recipes/test_llama3_8b_16k.py +++ b/tests/collections/llm/recipes/test_llama3_8b_16k.py @@ -29,15 +29,15 @@ def test_trainer(self, recipe_module): assert trainer_config.__fn_or_cls__ == Trainer assert trainer_config.accelerator == "gpu" assert trainer_config.devices == 8 - assert trainer_config.num_nodes == 1 + assert trainer_config.num_nodes == 2 # Check strategy configuration assert isinstance(trainer_config.strategy, run.Config) assert trainer_config.strategy.__fn_or_cls__.__name__ == "MegatronStrategy" - assert trainer_config.strategy.tensor_model_parallel_size == 2 - assert trainer_config.strategy.pipeline_model_parallel_size == 4 + assert trainer_config.strategy.tensor_model_parallel_size == 4 + assert trainer_config.strategy.pipeline_model_parallel_size == 2 assert trainer_config.strategy.pipeline_dtype == torch.bfloat16 - assert trainer_config.strategy.virtual_pipeline_model_parallel_size == 5 + assert trainer_config.strategy.virtual_pipeline_model_parallel_size is None assert trainer_config.strategy.context_parallel_size == 2 assert trainer_config.strategy.sequence_parallel is True @@ -61,14 +61,37 @@ def test_pretrain_recipe_with_different_configurations(self, recipe_module, num_ assert recipe.trainer.num_nodes == num_nodes assert recipe.trainer.devices == num_gpus_per_node - def test_trainer_parallelism_options(self, recipe_module): + def test_valid_trainer_parallelism(self, recipe_module): trainer_config = recipe_module.trainer() - assert trainer_config.strategy.tensor_model_parallel_size == 2 - assert trainer_config.strategy.pipeline_model_parallel_size == 4 - assert trainer_config.strategy.pipeline_dtype == torch.bfloat16 - assert trainer_config.strategy.virtual_pipeline_model_parallel_size == 5 - assert trainer_config.strategy.context_parallel_size == 2 - assert trainer_config.strategy.sequence_parallel is True + + assert isinstance(trainer_config.strategy, run.Config) + assert trainer_config.strategy.__fn_or_cls__.__name__ == "MegatronStrategy" + + assert trainer_config.strategy.expert_model_parallel_size == 1 + + assert ( + trainer_config.strategy.tensor_model_parallel_size + * trainer_config.strategy.pipeline_model_parallel_size + * trainer_config.strategy.context_parallel_size + * 
trainer_config.strategy.expert_model_parallel_size + % trainer_config.devices + == 0 + ) + assert ( + trainer_config.strategy.tensor_model_parallel_size + * trainer_config.strategy.pipeline_model_parallel_size + * trainer_config.strategy.context_parallel_size + * trainer_config.strategy.expert_model_parallel_size + / trainer_config.devices + % trainer_config.num_nodes + == 0 + ) + + if trainer_config.strategy.pipeline_model_parallel_size != 1: + assert trainer_config.strategy.pipeline_dtype is not None + + if trainer_config.strategy.tensor_model_parallel_size == 1: + assert trainer_config.strategy.sequence_parallel is False def test_model_config_parameters(self, recipe_module): model_config = recipe_module.model() diff --git a/tests/collections/llm/recipes/test_llama3_8b_64k.py b/tests/collections/llm/recipes/test_llama3_8b_64k.py index f489e12dc55f..0316b736341a 100644 --- a/tests/collections/llm/recipes/test_llama3_8b_64k.py +++ b/tests/collections/llm/recipes/test_llama3_8b_64k.py @@ -29,15 +29,15 @@ def test_trainer(self, recipe_module): assert trainer_config.__fn_or_cls__ == Trainer assert trainer_config.accelerator == "gpu" assert trainer_config.devices == 8 - assert trainer_config.num_nodes == 1 + assert trainer_config.num_nodes == 4 # Check strategy configuration assert isinstance(trainer_config.strategy, run.Config) assert trainer_config.strategy.__fn_or_cls__.__name__ == "MegatronStrategy" - assert trainer_config.strategy.tensor_model_parallel_size == 2 - assert trainer_config.strategy.pipeline_model_parallel_size == 4 + assert trainer_config.strategy.tensor_model_parallel_size == 4 + assert trainer_config.strategy.pipeline_model_parallel_size == 2 assert trainer_config.strategy.pipeline_dtype == torch.bfloat16 - assert trainer_config.strategy.virtual_pipeline_model_parallel_size == 5 + assert trainer_config.strategy.virtual_pipeline_model_parallel_size is None assert trainer_config.strategy.context_parallel_size == 4 assert trainer_config.strategy.sequence_parallel is True @@ -61,14 +61,37 @@ def test_pretrain_recipe_with_different_configurations(self, recipe_module, num_ assert recipe.trainer.num_nodes == num_nodes assert recipe.trainer.devices == num_gpus_per_node - def test_trainer_parallelism_options(self, recipe_module): + def test_valid_trainer_parallelism(self, recipe_module): trainer_config = recipe_module.trainer() - assert trainer_config.strategy.tensor_model_parallel_size == 2 - assert trainer_config.strategy.pipeline_model_parallel_size == 4 - assert trainer_config.strategy.pipeline_dtype == torch.bfloat16 - assert trainer_config.strategy.virtual_pipeline_model_parallel_size == 5 - assert trainer_config.strategy.context_parallel_size == 4 - assert trainer_config.strategy.sequence_parallel is True + + assert isinstance(trainer_config.strategy, run.Config) + assert trainer_config.strategy.__fn_or_cls__.__name__ == "MegatronStrategy" + + assert trainer_config.strategy.expert_model_parallel_size == 1 + + assert ( + trainer_config.strategy.tensor_model_parallel_size + * trainer_config.strategy.pipeline_model_parallel_size + * trainer_config.strategy.context_parallel_size + * trainer_config.strategy.expert_model_parallel_size + % trainer_config.devices + == 0 + ) + assert ( + trainer_config.strategy.tensor_model_parallel_size + * trainer_config.strategy.pipeline_model_parallel_size + * trainer_config.strategy.context_parallel_size + * trainer_config.strategy.expert_model_parallel_size + / trainer_config.devices + % trainer_config.num_nodes + == 0 + ) + + if 
trainer_config.strategy.pipeline_model_parallel_size != 1: + assert trainer_config.strategy.pipeline_dtype is not None + + if trainer_config.strategy.tensor_model_parallel_size == 1: + assert trainer_config.strategy.sequence_parallel is False def test_model_config_parameters(self, recipe_module): model_config = recipe_module.model() diff --git a/tests/collections/llm/recipes/test_mistral.py b/tests/collections/llm/recipes/test_mistral.py index 490f26a363fc..a7d83edcc370 100644 --- a/tests/collections/llm/recipes/test_mistral.py +++ b/tests/collections/llm/recipes/test_mistral.py @@ -6,7 +6,7 @@ from nemo.collections.llm.gpt.data.squad import SquadDataModule from nemo.collections.llm.gpt.model.mistral import MistralConfig7B, MistralModel from nemo.collections.llm.peft.lora import LoRA -from nemo.collections.llm.recipes import mistral +from nemo.collections.llm.recipes import mistral_7b as mistral from nemo.lightning import AutoResume, Trainer diff --git a/tests/collections/llm/recipes/test_mixtral_8x7b_16k.py b/tests/collections/llm/recipes/test_mixtral_8x7b_16k.py index 9f52b7117e82..62d6e0e31917 100644 --- a/tests/collections/llm/recipes/test_mixtral_8x7b_16k.py +++ b/tests/collections/llm/recipes/test_mixtral_8x7b_16k.py @@ -31,15 +31,15 @@ def test_trainer(self, recipe_module): assert trainer_config.__fn_or_cls__ == Trainer assert trainer_config.accelerator == "gpu" assert trainer_config.devices == 8 - assert trainer_config.num_nodes == 2 + assert trainer_config.num_nodes == 4 # Check strategy configuration assert isinstance(trainer_config.strategy, run.Config) assert trainer_config.strategy.__fn_or_cls__.__name__ == "MegatronStrategy" - assert trainer_config.strategy.tensor_model_parallel_size == 2 - assert trainer_config.strategy.pipeline_model_parallel_size == 4 + assert trainer_config.strategy.tensor_model_parallel_size == 4 + assert trainer_config.strategy.pipeline_model_parallel_size == 2 assert trainer_config.strategy.pipeline_dtype == torch.bfloat16 - assert trainer_config.strategy.virtual_pipeline_model_parallel_size == 8 + assert trainer_config.strategy.virtual_pipeline_model_parallel_size is None assert trainer_config.strategy.context_parallel_size == 4 assert trainer_config.strategy.sequence_parallel is True assert trainer_config.strategy.expert_model_parallel_size == 1 @@ -69,15 +69,35 @@ def test_pretrain_recipe_with_different_configurations(self, recipe_module, num_ assert recipe.trainer.num_nodes == num_nodes assert recipe.trainer.devices == num_gpus_per_node - def test_trainer_parallelism_options(self, recipe_module): + def test_valid_trainer_parallelism(self, recipe_module): trainer_config = recipe_module.trainer() - assert trainer_config.strategy.tensor_model_parallel_size == 2 - assert trainer_config.strategy.pipeline_model_parallel_size == 4 - assert trainer_config.strategy.pipeline_dtype == torch.bfloat16 - assert trainer_config.strategy.virtual_pipeline_model_parallel_size == 8 - assert trainer_config.strategy.context_parallel_size == 4 - assert trainer_config.strategy.sequence_parallel is True - assert trainer_config.strategy.expert_model_parallel_size == 1 + + assert isinstance(trainer_config.strategy, run.Config) + assert trainer_config.strategy.__fn_or_cls__.__name__ == "MegatronStrategy" + + assert ( + trainer_config.strategy.tensor_model_parallel_size + * trainer_config.strategy.pipeline_model_parallel_size + * trainer_config.strategy.context_parallel_size + * trainer_config.strategy.expert_model_parallel_size + % trainer_config.devices + == 0 + ) + assert ( 
+ trainer_config.strategy.tensor_model_parallel_size + * trainer_config.strategy.pipeline_model_parallel_size + * trainer_config.strategy.context_parallel_size + * trainer_config.strategy.expert_model_parallel_size + / trainer_config.devices + % trainer_config.num_nodes + == 0 + ) + + if trainer_config.strategy.pipeline_model_parallel_size != 1: + assert trainer_config.strategy.pipeline_dtype is not None + + if trainer_config.strategy.tensor_model_parallel_size == 1: + assert trainer_config.strategy.sequence_parallel is False def test_model_config_parameters(self, recipe_module): model_config = recipe_module.model() diff --git a/tests/collections/llm/recipes/test_mixtral_8x7b_64k.py b/tests/collections/llm/recipes/test_mixtral_8x7b_64k.py index f508e6dfd585..9ff93a89f438 100644 --- a/tests/collections/llm/recipes/test_mixtral_8x7b_64k.py +++ b/tests/collections/llm/recipes/test_mixtral_8x7b_64k.py @@ -35,11 +35,11 @@ def test_trainer(self, recipe_module): # Check strategy configuration assert isinstance(trainer_config.strategy, run.Config) assert trainer_config.strategy.__fn_or_cls__.__name__ == "MegatronStrategy" - assert trainer_config.strategy.tensor_model_parallel_size == 4 + assert trainer_config.strategy.tensor_model_parallel_size == 8 assert trainer_config.strategy.pipeline_model_parallel_size == 4 assert trainer_config.strategy.pipeline_dtype == torch.bfloat16 - assert trainer_config.strategy.virtual_pipeline_model_parallel_size == 4 - assert trainer_config.strategy.context_parallel_size == 8 + assert trainer_config.strategy.virtual_pipeline_model_parallel_size is None + assert trainer_config.strategy.context_parallel_size == 4 assert trainer_config.strategy.sequence_parallel is True assert trainer_config.strategy.expert_model_parallel_size == 1 @@ -63,15 +63,35 @@ def test_pretrain_recipe_with_different_configurations(self, recipe_module, num_ assert recipe.trainer.num_nodes == num_nodes assert recipe.trainer.devices == num_gpus_per_node - def test_trainer_parallelism_options(self, recipe_module): + def test_valid_trainer_parallelism(self, recipe_module): trainer_config = recipe_module.trainer() - assert trainer_config.strategy.tensor_model_parallel_size == 4 - assert trainer_config.strategy.pipeline_model_parallel_size == 4 - assert trainer_config.strategy.pipeline_dtype == torch.bfloat16 - assert trainer_config.strategy.virtual_pipeline_model_parallel_size == 4 - assert trainer_config.strategy.context_parallel_size == 8 - assert trainer_config.strategy.sequence_parallel is True - assert trainer_config.strategy.expert_model_parallel_size == 1 + + assert isinstance(trainer_config.strategy, run.Config) + assert trainer_config.strategy.__fn_or_cls__.__name__ == "MegatronStrategy" + + assert ( + trainer_config.strategy.tensor_model_parallel_size + * trainer_config.strategy.pipeline_model_parallel_size + * trainer_config.strategy.context_parallel_size + * trainer_config.strategy.expert_model_parallel_size + % trainer_config.devices + == 0 + ) + assert ( + trainer_config.strategy.tensor_model_parallel_size + * trainer_config.strategy.pipeline_model_parallel_size + * trainer_config.strategy.context_parallel_size + * trainer_config.strategy.expert_model_parallel_size + / trainer_config.devices + % trainer_config.num_nodes + == 0 + ) + + if trainer_config.strategy.pipeline_model_parallel_size != 1: + assert trainer_config.strategy.pipeline_dtype is not None + + if trainer_config.strategy.tensor_model_parallel_size == 1: + assert trainer_config.strategy.sequence_parallel is False def 
test_model_config_parameters(self, recipe_module): model_config = recipe_module.model() diff --git a/tests/collections/llm/recipes/test_nemotron4_15b_16k.py b/tests/collections/llm/recipes/test_nemotron4_15b_16k.py index e0b4e1f56eb8..6c1f5d90e160 100644 --- a/tests/collections/llm/recipes/test_nemotron4_15b_16k.py +++ b/tests/collections/llm/recipes/test_nemotron4_15b_16k.py @@ -47,3 +47,35 @@ def test_pretrain_recipe_with_different_configurations(self, recipe_module, num_ recipe = recipe_module.pretrain_recipe(num_nodes=num_nodes, num_gpus_per_node=num_gpus_per_node) assert recipe.trainer.num_nodes == num_nodes assert recipe.trainer.devices == num_gpus_per_node + + def test_valid_trainer_parallelism(self, recipe_module): + trainer_config = recipe_module.pretrain_recipe().trainer + + assert isinstance(trainer_config.strategy, run.Config) + assert trainer_config.strategy.__fn_or_cls__.__name__ == "MegatronStrategy" + + assert trainer_config.strategy.expert_model_parallel_size == 1 + + assert ( + trainer_config.strategy.tensor_model_parallel_size + * trainer_config.strategy.pipeline_model_parallel_size + * trainer_config.strategy.context_parallel_size + * trainer_config.strategy.expert_model_parallel_size + % trainer_config.devices + == 0 + ) + assert ( + trainer_config.strategy.tensor_model_parallel_size + * trainer_config.strategy.pipeline_model_parallel_size + * trainer_config.strategy.context_parallel_size + * trainer_config.strategy.expert_model_parallel_size + / trainer_config.devices + % trainer_config.num_nodes + == 0 + ) + + if trainer_config.strategy.pipeline_model_parallel_size != 1: + assert trainer_config.strategy.pipeline_dtype is not None + + if trainer_config.strategy.tensor_model_parallel_size == 1: + assert trainer_config.strategy.sequence_parallel is False diff --git a/tests/collections/llm/recipes/test_nemotron4_15b_64k.py b/tests/collections/llm/recipes/test_nemotron4_15b_64k.py index 9525039eb90e..8ed35fb81893 100644 --- a/tests/collections/llm/recipes/test_nemotron4_15b_64k.py +++ b/tests/collections/llm/recipes/test_nemotron4_15b_64k.py @@ -47,3 +47,35 @@ def test_pretrain_recipe_with_different_configurations(self, recipe_module, num_ recipe = recipe_module.pretrain_recipe(num_nodes=num_nodes, num_gpus_per_node=num_gpus_per_node) assert recipe.trainer.num_nodes == num_nodes assert recipe.trainer.devices == num_gpus_per_node + + def test_valid_trainer_parallelism(self, recipe_module): + trainer_config = recipe_module.pretrain_recipe().trainer + + assert isinstance(trainer_config.strategy, run.Config) + assert trainer_config.strategy.__fn_or_cls__.__name__ == "MegatronStrategy" + + assert trainer_config.strategy.expert_model_parallel_size == 1 + + assert ( + trainer_config.strategy.tensor_model_parallel_size + * trainer_config.strategy.pipeline_model_parallel_size + * trainer_config.strategy.context_parallel_size + * trainer_config.strategy.expert_model_parallel_size + % trainer_config.devices + == 0 + ) + assert ( + trainer_config.strategy.tensor_model_parallel_size + * trainer_config.strategy.pipeline_model_parallel_size + * trainer_config.strategy.context_parallel_size + * trainer_config.strategy.expert_model_parallel_size + / trainer_config.devices + % trainer_config.num_nodes + == 0 + ) + + if trainer_config.strategy.pipeline_model_parallel_size != 1: + assert trainer_config.strategy.pipeline_dtype is not None + + if trainer_config.strategy.tensor_model_parallel_size == 1: + assert trainer_config.strategy.sequence_parallel is False diff --git 
a/tests/collections/llm/recipes/test_nemotron4_22b_16k.py b/tests/collections/llm/recipes/test_nemotron4_22b_16k.py index 1e501b447d45..6b4a581348e0 100644 --- a/tests/collections/llm/recipes/test_nemotron4_22b_16k.py +++ b/tests/collections/llm/recipes/test_nemotron4_22b_16k.py @@ -47,3 +47,35 @@ def test_pretrain_recipe_with_different_configurations(self, recipe_module, num_ recipe = recipe_module.pretrain_recipe(num_nodes=num_nodes, num_gpus_per_node=num_gpus_per_node) assert recipe.trainer.num_nodes == num_nodes assert recipe.trainer.devices == num_gpus_per_node + + def test_valid_trainer_parallelism(self, recipe_module): + trainer_config = recipe_module.pretrain_recipe().trainer + + assert isinstance(trainer_config.strategy, run.Config) + assert trainer_config.strategy.__fn_or_cls__.__name__ == "MegatronStrategy" + + assert trainer_config.strategy.expert_model_parallel_size == 1 + + assert ( + trainer_config.strategy.tensor_model_parallel_size + * trainer_config.strategy.pipeline_model_parallel_size + * trainer_config.strategy.context_parallel_size + * trainer_config.strategy.expert_model_parallel_size + % trainer_config.devices + == 0 + ) + assert ( + trainer_config.strategy.tensor_model_parallel_size + * trainer_config.strategy.pipeline_model_parallel_size + * trainer_config.strategy.context_parallel_size + * trainer_config.strategy.expert_model_parallel_size + / trainer_config.devices + % trainer_config.num_nodes + == 0 + ) + + if trainer_config.strategy.pipeline_model_parallel_size != 1: + assert trainer_config.strategy.pipeline_dtype is not None + + if trainer_config.strategy.tensor_model_parallel_size == 1: + assert trainer_config.strategy.sequence_parallel is False diff --git a/tests/collections/llm/recipes/test_nemotron4_22b_64k.py b/tests/collections/llm/recipes/test_nemotron4_22b_64k.py index c37a45793aff..68a238a93338 100644 --- a/tests/collections/llm/recipes/test_nemotron4_22b_64k.py +++ b/tests/collections/llm/recipes/test_nemotron4_22b_64k.py @@ -47,3 +47,35 @@ def test_pretrain_recipe_with_different_configurations(self, recipe_module, num_ recipe = recipe_module.pretrain_recipe(num_nodes=num_nodes, num_gpus_per_node=num_gpus_per_node) assert recipe.trainer.num_nodes == num_nodes assert recipe.trainer.devices == num_gpus_per_node + + def test_valid_trainer_parallelism(self, recipe_module): + trainer_config = recipe_module.pretrain_recipe().trainer + + assert isinstance(trainer_config.strategy, run.Config) + assert trainer_config.strategy.__fn_or_cls__.__name__ == "MegatronStrategy" + + assert trainer_config.strategy.expert_model_parallel_size == 1 + + assert ( + trainer_config.strategy.tensor_model_parallel_size + * trainer_config.strategy.pipeline_model_parallel_size + * trainer_config.strategy.context_parallel_size + * trainer_config.strategy.expert_model_parallel_size + % trainer_config.devices + == 0 + ) + assert ( + trainer_config.strategy.tensor_model_parallel_size + * trainer_config.strategy.pipeline_model_parallel_size + * trainer_config.strategy.context_parallel_size + * trainer_config.strategy.expert_model_parallel_size + / trainer_config.devices + % trainer_config.num_nodes + == 0 + ) + + if trainer_config.strategy.pipeline_model_parallel_size != 1: + assert trainer_config.strategy.pipeline_dtype is not None + + if trainer_config.strategy.tensor_model_parallel_size == 1: + assert trainer_config.strategy.sequence_parallel is False diff --git a/tests/core/test_exp_manager.py b/tests/core/test_exp_manager.py index a0b69927ecc0..fa2eeae9b538 100644 --- 
a/tests/core/test_exp_manager.py +++ b/tests/core/test_exp_manager.py @@ -29,6 +29,7 @@ from nemo.collections.nlp.parts.nlp_overrides import NLPDDPStrategy from nemo.constants import NEMO_ENV_VARNAME_VERSION from nemo.core.classes import ModelPT +from nemo.utils.app_state import AppState from nemo.utils.callbacks import NeMoModelCheckpoint from nemo.utils.exp_manager import ( CheckpointMisconfigurationError, @@ -1097,3 +1098,74 @@ def test_doesnt_silently_start_from_scratch_dist(self, tmp_path): restored_trainer, {"resume_if_exists": True, "resume_ignore_no_checkpoint": True, "explicit_log_dir": str(test_dir)}, ) + + @pytest.mark.unit + def test_save_nemo_not_comp_with_model_parallel(self, tmp_path): + """ + Ensure that always_save_nemo is not compatible with model parallelism. + """ + + test_dir = tmp_path / "test" + + with pytest.raises(LoggerMisconfigurationError): + appstate = AppState() + appstate.tensor_model_parallel_size = 2 + appstate.pipeline_model_parallel_size = 1 + appstate.context_parallel_size = 1 + test_trainer = pl.Trainer(accelerator='cpu', enable_checkpointing=False, logger=False, max_epochs=1) + exp_manager( + test_trainer, + { + "checkpoint_callback_params": { + "always_save_nemo": True, + }, + "explicit_log_dir": str(test_dir), + } + ) + + with pytest.raises(LoggerMisconfigurationError): + appstate = AppState() + appstate.tensor_model_parallel_size = 1 + appstate.pipeline_model_parallel_size = 2 + appstate.context_parallel_size = 1 + test_trainer = pl.Trainer(accelerator='cpu', enable_checkpointing=False, logger=False, max_epochs=1) + exp_manager( + test_trainer, + { + "checkpoint_callback_params": { + "always_save_nemo": True, + }, + "explicit_log_dir": str(test_dir), + }, + ) + + with pytest.raises(LoggerMisconfigurationError): + appstate = AppState() + appstate.tensor_model_parallel_size = 1 + appstate.pipeline_model_parallel_size = 1 + appstate.context_parallel_size = 2 + test_trainer = pl.Trainer(accelerator='cpu', enable_checkpointing=False, logger=False, max_epochs=1) + exp_manager( + test_trainer, + { + "checkpoint_callback_params": { + "always_save_nemo": True, + }, + "explicit_log_dir": str(test_dir), + }, + ) + + appstate = AppState() + appstate.tensor_model_parallel_size = 1 + appstate.pipeline_model_parallel_size = 1 + appstate.context_parallel_size = 1 + test_trainer = pl.Trainer(accelerator='cpu', enable_checkpointing=False, logger=False, max_epochs=1) + exp_manager( + test_trainer, + { + "checkpoint_callback_params": { + "always_save_nemo": True, + }, + "explicit_log_dir": str(test_dir), + }, + ) diff --git a/tests/lightning/test_nemo_run.py b/tests/lightning/test_nemo_run.py index 8d7814bfe530..947930c84847 100644 --- a/tests/lightning/test_nemo_run.py +++ b/tests/lightning/test_nemo_run.py @@ -17,8 +17,8 @@ ("llama3_70b_16k", "pretrain_recipe", "llama3_70b_16k_pretrain"), ("llama3_70b_64k", "pretrain_recipe", "llama3_70b_64k_pretrain"), ("llama31_405b", "pretrain_recipe", "llama31_405b_pretrain"), - ("mistral", "pretrain_recipe", "mistral_pretrain"), - ("mistral", "finetune_recipe", "mistral_finetune"), + ("mistral_7b", "pretrain_recipe", "mistral_pretrain"), + ("mistral_7b", "finetune_recipe", "mistral_finetune"), ("mixtral_8x7b", "pretrain_recipe", "mixtral_8x7b_pretrain"), ("mixtral_8x7b", "finetune_recipe", "mixtral_8x7b_finetune"), ("mixtral_8x7b_16k", "pretrain_recipe", "mixtral_8x7b_16k_pretrain"), diff --git a/tutorials/nlp/Joint_Intent_and_Slot_Classification.ipynb b/tutorials/nlp/Joint_Intent_and_Slot_Classification.ipynb deleted file mode 
100644 index 608685254a0d..000000000000 --- a/tutorials/nlp/Joint_Intent_and_Slot_Classification.ipynb +++ /dev/null @@ -1,827 +0,0 @@ -{ - "cells": [ - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "pycharm": { - "name": "#%%\n" - } - }, - "outputs": [], - "source": [ - "\"\"\"\n", - "You can run either this notebook locally (if you have all the dependencies and a GPU) or on Google Colab.\n", - "\n", - "Instructions for setting up Colab are as follows:\n", - "1. Open a new Python 3 notebook.\n", - "2. Import this notebook from GitHub (File -> Upload Notebook -> \"GITHUB\" tab -> copy/paste GitHub URL)\n", - "3. Connect to an instance with a GPU (Runtime -> Change runtime type -> select \"GPU\" for hardware accelerator)\n", - "4. Run this cell to set up dependencies.\n", - "\"\"\"\n", - "# If you're using Google Colab and not running locally, run this cell\n", - "\n", - "# install NeMo\n", - "BRANCH = 'main'\n", - "!python -m pip install git+https://github.com/NVIDIA/NeMo.git@$BRANCH#egg=nemo_toolkit[nlp]" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "pycharm": { - "name": "#%%\n" - } - }, - "outputs": [], - "source": [ - "from nemo.collections import nlp as nemo_nlp\n", - "from nemo.utils.exp_manager import exp_manager\n", - "from nemo.utils import logging\n", - "\n", - "import os\n", - "import wget\n", - "import torch\n", - "import pytorch_lightning as pl\n", - "from omegaconf import OmegaConf" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "# Task Description\n", - "**Joint Intent and Slot classification** - is a task of classifying an Intent and detecting all relevant Slots (Entities)\n", - "for this Intent in a query.\n", - "For example, in the query: `What is the weather in Santa Clara tomorrow morning?`, we would like to classify the query\n", - "as a `weather` Intent, and detect `Santa Clara` as a `location` slot and `tomorrow morning` as a `date_time` slot.\n", - "Intents and Slots names are usually task specific and defined as labels in the training data.\n", - "This is a fundamental step that is executed in any task-driven Conversational Assistant.\n", - "\n", - "Our Bert based model implementation enables to train and then detect both of these tasks together.\n", - "\n", - "**Multi Label Joint Intent and Slot classification** - is very similar to the task above, but instead of only classifying a single Intent, the task can predict multiple different intents for each query. For example, for the query `Yes, please tell me the weather`, we might want the intents for this utterance to be `yes` and `weather`. You can skip to that tutorial [here](#multi-label)\n" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "# Dataset and NeMo data format\n", - "\n", - "In this tutorial we are going to use a virtual assistant interaction data set that can be downloaded from here: https://github.com/xliuhw/NLU-Evaluation-Data.\n", - "There are about 10K training and 1K testing queries which cover 64 various Intents and 55 Slots. \n", - "\n", - "To work with NeMo NLP classification model, this dataset should be first converted to the NeMo format, which requires next files:\n", - "- **dict.intents.csv** - list of all intent names in the data. One line per an intent name.\n", - "- **dict.slots.csv** - list of all slot names in the data. One line per a slot name. It is possible to use both: B- I- notations, for separating between first and intermediate tokens for multi token slots. 
Or just use one slot type for each token of multi token slot. Our recommendation is to use later one, since it is simpler and there is no visible degradation in performance.\n", - "- **train.tsv/test.tsv** - contain original queries, one per line, and intent number separated by tab. For example: `what alarms do i have set right now\t0`. Intent numbers are according to the intent line in the intent dictionary file (dict.intents.csv) starting from 0. First line of these files contains a header line: `sentence \\tab label`.\n", - "- **train_slot.tvs/test_slot.tsv** - contain one line per a query, where instead each token there is a number of the token from the slots dictionary file (dict.slots.csv), starting from 0. Last 'out-of scope' token is usually located in the last line of the dictionary. Example: `54 0 0 54 54 12 12` (numbers separated by space). No header line in these files.\n", - "\n", - "NeMo provides **import_dataset.py** converter for few reference datasets (Assistant / Atis / Snips) which converts them to the NeMo data format for the Intent and Slot classification model. If you have your own annotated dataset in a different format, you will need to write a data converter. Possible recommended format for your own annotation, is to have one text file per all examples of one intent. With one line per query in a form like: `did i set an alarm to [alarm_type : wake up] in the [timeofday : morning]`, using brackets to define slot names. This is very similar to the assistant format from this example and you can use its converter to NeMo format with small changes. \n", - "\n", - "You can run this utility as follows:\n", - "\n", - "**python examples/nlp/intent_slot_classification/data/import_datasets.py --dataset_name=assistant --source_data_dir=source_dir_name --target_data_dir=target_dir_name**\n" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "# Download, preprocess and explore the dataset\n", - "## Download the dataset and convert it to the NeMo format" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "# you can replace DATA_DIR and NEMO_DIR with your own locations\n", - "DATA_DIR = \".\"\n", - "NEMO_DIR = '.'\n", - "\n", - "# download the converter files from github for the purpose of this tutorial\n", - "wget.download(f'https://raw.githubusercontent.com/NVIDIA/NeMo/{BRANCH}/scripts/dataset_processing/nlp/intent_and_slot/import_datasets.py', NEMO_DIR)\n", - "wget.download(f'https://raw.githubusercontent.com/NVIDIA/NeMo/{BRANCH}/scripts/dataset_processing/nlp/intent_and_slot/assistant_utils.py', NEMO_DIR)" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "pycharm": { - "name": "#%%\n" - } - }, - "outputs": [], - "source": [ - "# download and unzip the example dataset from github\n", - "print('Downloading dataset...')\n", - "wget.download('https://github.com/xliuhw/NLU-Evaluation-Data/archive/master.zip', DATA_DIR)\n", - "! 
unzip {DATA_DIR}/NLU-Evaluation-Data-master.zip -d {DATA_DIR}" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "# convert the dataset to the NeMo format\n", - "!python {NEMO_DIR}/import_datasets.py --dataset_name=assistant --source_data_dir={DATA_DIR}/NLU-Evaluation-Data-master --target_data_dir={DATA_DIR}/nemo_format\n" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## Data exploration\n", - "You can see the dataset in both the original and NeMo's formats. We have here 65 different Intents and 55 Slots, which could be typical commands for virtual assistants. Out of scope slot has the name 'O' and is the last in the dictionary of Slots. And we can see examples of queries and also format of training intent and slot files. " - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "# list of queries divided by intent files in the original training dataset\n", - "! ls -l {DATA_DIR}/NLU-Evaluation-Data-master/dataset/trainset" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "# print all intents from the NeMo format intent dictionary\n", - "!echo 'Intents: ' $(wc -l < {DATA_DIR}/nemo_format/dict.intents.csv)\n", - "! cat {DATA_DIR}/nemo_format/dict.intents.csv" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "# print all slots from the NeMo format slot dictionary\n", - "!echo 'Slots: ' $(wc -l < {DATA_DIR}/nemo_format/dict.slots.csv)\n", - "! cat {DATA_DIR}/nemo_format/dict.slots.csv" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "# examples from the intent training file\n", - "! head -n 10 {DATA_DIR}/nemo_format/train.tsv" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "# examples from the slot training file\n", - "! head -n 10 {DATA_DIR}/nemo_format/train_slots.tsv" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "# Training model" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## Model configuration\n", - "\n", - "Our Joint Intent and Slot classification model is comprised of the pretrained [BERT](https://arxiv.org/pdf/1810.04805.pdf) model with an Intent and Slot Classification layer on top of it.\n", - "\n", - "All model and training parameters are defined in the **intent_slot_classification_config.yaml** config file. This file is located in the folder **examples/nlp/intent_slot_classification/conf/**. It contains 2 main sections:\n", - "- **model**: All arguments that are related to the Model - language model, token classifier, optimizer and schedulers, datasets and any other related information\n", - "\n", - "- **trainer**: Any argument to be passed to PyTorch Lightning\n", - "\n", - "We will download the config file from repository for the purpose of the tutorial. If you have a version of NeMo installed locally, you can use it from the above folder." 
- ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "# download the model config file from repository for the purpose of this example\n", - "wget.download(f'https://raw.githubusercontent.com/NVIDIA/NeMo/{BRANCH}/examples/nlp/intent_slot_classification/conf/intent_slot_classification_config.yaml', NEMO_DIR)\n", - "\n", - "# print content of the config file\n", - "config_file = \"intent_slot_classification_config.yaml\"\n", - "print(config_file)\n", - "config = OmegaConf.load(config_file)\n", - "print(OmegaConf.to_yaml(config))" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## Setting up Data within the config\n", - "\n", - "Among other things, the config file contains dictionaries called train_ds and validation_ds. These are configurations used to setup the Dataset and DataLoaders of the corresponding config.\n", - "\n", - "The converter utility creates both training and evaluation files in the same directory, so we need to specify `model.data_dir` parameter to this directory. Also notice that some config lines, including `model.data_dir`, have `???` in place of paths, this means that values for these fields are required to be specified by the user.\n", - "\n", - "`config.model.intent_loss_weight` parameter - is a balance of training loss between Intent and Slot losses, a number between 0 to 1. Its default value is 0.6 which gives slightly higher priority to the Intent loss and it empirically works quite well. You can experiment with this value if you like.\n", - "Also you can try to change `config.model.class_balancing` parameter to `weighted_loss` and see if you get better accuracy.\n", - "\n", - "Let's now add the data directory path to the config." - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "config.model.data_dir = f'{DATA_DIR}/nemo_format'" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## Building the PyTorch Lightning Trainer\n", - "\n", - "NeMo models are primarily PyTorch Lightning modules - and therefore are entirely compatible with the PyTorch Lightning ecosystem. `config.trainer.max_epochs` - param defines number of training epochs. Usually 50-100 epochs or less should be enough to train on your data. Let's instantiate the Trainer object." - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "# lets modify some trainer configs\n", - "# checks if we have GPU available and uses it\n", - "accelerator = 'gpu' if torch.cuda.is_available() else 'cpu'\n", - "config.trainer.devices = 1\n", - "config.trainer.accelerator = accelerator\n", - "\n", - "config.trainer.precision = 16 if torch.cuda.is_available() else 32\n", - "\n", - "# for mixed precision training, uncomment the line below (precision should be set to 16 and amp_level to O1):\n", - "# config.trainer.amp_level = O1\n", - "\n", - "# remove distributed training flags\n", - "config.trainer.strategy = 'auto'\n", - "\n", - "# setup a small number of epochs for demonstration purposes of this tutorial\n", - "config.trainer.max_epochs = 5\n", - "\n", - "trainer = pl.Trainer(**config.trainer)" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## Setting up a NeMo Experiment\n", - "\n", - "NeMo has an experiment manager that handles logging and checkpointing for us, so let's use it. Model check points during training will be saved in this directory. 
" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "exp_dir = exp_manager(trainer, config.get(\"exp_manager\", None))\n", - "# the exp_dir provides a path to the current experiment for easy access\n", - "print(str(exp_dir))" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## Initializing the model and Training\n", - "\n", - "Initial statistics of the dataset will be displayed at the beginning of the training and then Intent and Slot classification report will be displayed after each training epoch." - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "# initialize the model\n", - "model = nemo_nlp.models.IntentSlotClassificationModel(config.model, trainer=trainer)\n", - "\n", - "# train\n", - "trainer.fit(model)" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "After training for 5 epochs, which should take no more than few minutes, you can expect training precision for this data set to be around these numbers (the accuracy will gradually continue to improve for this dataset up to about 50 epochs of training): \n", - "```\n", - "Intents:\n", - " label precision recall f1 support \n", - " alarm_query (label_id: 0) 94.74 94.74 94.74 19\n", - " alarm_remove (label_id: 1) 100.00 100.00 100.00 11\n", - " alarm_set (label_id: 2) 85.71 94.74 90.00 19\n", - " audio_volume_down (label_id: 3) 0.00 0.00 0.00 8\n", - " audio_volume_mute (label_id: 4) 100.00 86.67 92.86 15\n", - " audio_volume_up (label_id: 5) 56.52 100.00 72.22 13\n", - " calendar_query (label_id: 6) 55.00 57.89 56.41 19\n", - " calendar_remove (label_id: 7) 88.89 84.21 86.49 19\n", - " calendar_set (label_id: 8) 81.25 68.42 74.29 19\n", - " cooking_recipe (label_id: 9) 86.36 100.00 92.68 19\n", - " datetime_convert (label_id: 10) 0.00 0.00 0.00 8\n", - " datetime_query (label_id: 11) 65.52 100.00 79.17 19\n", - " email_addcontact (label_id: 12) 100.00 12.50 22.22 8\n", - " email_query (label_id: 13) 83.33 78.95 81.08 19\n", - " email_querycontact (label_id: 14) 62.50 78.95 69.77 19\n", - " email_sendemail (label_id: 15) 70.83 89.47 79.07 19\n", - " general_affirm (label_id: 16) 95.00 100.00 97.44 19\n", - " general_commandstop (label_id: 17) 100.00 100.00 100.00 19\n", - " general_confirm (label_id: 18) 100.00 100.00 100.00 19\n", - " general_dontcare (label_id: 19) 100.00 100.00 100.00 19\n", - " general_explain (label_id: 20) 100.00 94.74 97.30 19\n", - " general_joke (label_id: 21) 100.00 100.00 100.00 12\n", - " general_negate (label_id: 22) 95.00 100.00 97.44 19\n", - " general_praise (label_id: 23) 100.00 94.74 97.30 19\n", - " general_quirky (label_id: 24) 40.00 10.53 16.67 19\n", - " general_repeat (label_id: 25) 100.00 100.00 100.00 19\n", - " iot_cleaning (label_id: 26) 84.21 100.00 91.43 16\n", - " iot_coffee (label_id: 27) 94.74 94.74 94.74 19\n", - " iot_hue_lightchange (label_id: 28) 94.44 89.47 91.89 19\n", - " iot_hue_lightdim (label_id: 29) 100.00 83.33 90.91 12\n", - " iot_hue_lightoff (label_id: 30) 89.47 89.47 89.47 19\n", - " iot_hue_lighton (label_id: 31) 0.00 0.00 0.00 3\n", - " iot_hue_lightup (label_id: 32) 81.25 92.86 86.67 14\n", - " iot_wemo_off (label_id: 33) 60.00 100.00 75.00 9\n", - " iot_wemo_on (label_id: 34) 100.00 14.29 25.00 7\n", - " lists_createoradd (label_id: 35) 78.95 78.95 78.95 19\n", - " lists_query (label_id: 36) 78.95 78.95 78.95 19\n", - " lists_remove (label_id: 37) 90.00 94.74 92.31 19\n", - " music_likeness 
(label_id: 38) 70.59 66.67 68.57 18\n", - " music_query (label_id: 39) 77.78 73.68 75.68 19\n", - " music_settings (label_id: 40) 0.00 0.00 0.00 7\n", - " news_query (label_id: 41) 77.78 73.68 75.68 19\n", - " play_audiobook (label_id: 42) 90.00 94.74 92.31 19\n", - " play_game (label_id: 43) 80.00 84.21 82.05 19\n", - " play_music (label_id: 44) 53.85 73.68 62.22 19\n", - " play_podcasts (label_id: 45) 89.47 89.47 89.47 19\n", - " play_radio (label_id: 46) 93.75 78.95 85.71 19\n", - " qa_currency (label_id: 47) 95.00 100.00 97.44 19\n", - " qa_definition (label_id: 48) 85.00 89.47 87.18 19\n", - " qa_factoid (label_id: 49) 45.16 73.68 56.00 19\n", - " qa_maths (label_id: 50) 100.00 100.00 100.00 14\n", - " qa_stock (label_id: 51) 95.00 100.00 97.44 19\n", - " recommendation_events (label_id: 52) 94.44 89.47 91.89 19\n", - " recommendation_locations (label_id: 53) 94.74 94.74 94.74 19\n", - " recommendation_movies (label_id: 54) 100.00 100.00 100.00 10\n", - " social_post (label_id: 55) 90.00 94.74 92.31 19\n", - " social_query (label_id: 56) 94.74 100.00 97.30 18\n", - " takeaway_order (label_id: 57) 93.75 78.95 85.71 19\n", - " takeaway_query (label_id: 58) 85.71 94.74 90.00 19\n", - " transport_query (label_id: 59) 83.33 78.95 81.08 19\n", - " transport_taxi (label_id: 60) 100.00 100.00 100.00 18\n", - " transport_ticket (label_id: 61) 89.47 89.47 89.47 19\n", - " transport_traffic (label_id: 62) 100.00 100.00 100.00 19\n", - " weather_query (label_id: 63) 100.00 89.47 94.44 19\n", - " -------------------\n", - " micro avg 85.04 85.04 85.04 1076\n", - " macro avg 81.13 80.81 79.36 1076\n", - " weighted avg 84.10 85.04 83.54 1076\n", - " \n", - "Slots:\n", - " label precision recall f1 support \n", - " alarm_type (label_id: 0) 0.00 0.00 0.00 0\n", - " app_name (label_id: 1) 0.00 0.00 0.00 6\n", - " artist_name (label_id: 2) 0.00 0.00 0.00 21\n", - " audiobook_author (label_id: 3) 0.00 0.00 0.00 1\n", - " audiobook_name (label_id: 4) 0.00 0.00 0.00 18\n", - " business_name (label_id: 5) 60.00 56.60 58.25 53\n", - " business_type (label_id: 6) 0.00 0.00 0.00 24\n", - " change_amount (label_id: 7) 0.00 0.00 0.00 25\n", - " coffee_type (label_id: 8) 0.00 0.00 0.00 4\n", - " color_type (label_id: 9) 0.00 0.00 0.00 12\n", - " cooking_type (label_id: 10) 0.00 0.00 0.00 0\n", - " currency_name (label_id: 11) 84.09 75.51 79.57 49\n", - " date (label_id: 12) 57.95 91.07 70.83 112\n", - " definition_word (label_id: 13) 0.00 0.00 0.00 20\n", - " device_type (label_id: 14) 74.55 51.25 60.74 80\n", - " drink_type (label_id: 15) 0.00 0.00 0.00 0\n", - " email_address (label_id: 16) 0.00 0.00 0.00 14\n", - " email_folder (label_id: 17) 0.00 0.00 0.00 1\n", - " event_name (label_id: 18) 100.00 13.24 23.38 68\n", - " food_type (label_id: 19) 51.72 69.77 59.41 43\n", - " game_name (label_id: 20) 60.00 14.29 23.08 21\n", - " game_type (label_id: 21) 0.00 0.00 0.00 0\n", - " general_frequency (label_id: 22) 0.00 0.00 0.00 9\n", - " house_place (label_id: 23) 93.33 42.42 58.33 33\n", - " ingredient (label_id: 24) 0.00 0.00 0.00 6\n", - " joke_type (label_id: 25) 0.00 0.00 0.00 4\n", - " list_name (label_id: 26) 0.00 0.00 0.00 21\n", - " meal_type (label_id: 27) 0.00 0.00 0.00 0\n", - " media_type (label_id: 28) 0.00 0.00 0.00 37\n", - " movie_name (label_id: 29) 0.00 0.00 0.00 0\n", - " movie_type (label_id: 30) 0.00 0.00 0.00 0\n", - " music_album (label_id: 31) 0.00 0.00 0.00 0\n", - " music_descriptor (label_id: 32) 0.00 0.00 0.00 3\n", - " music_genre (label_id: 33) 0.00 0.00 0.00 9\n", - " news_topic 
(label_id: 34) 0.00 0.00 0.00 17\n", - " order_type (label_id: 35) 0.00 0.00 0.00 17\n", - " person (label_id: 36) 44.86 92.31 60.38 52\n", - " personal_info (label_id: 37) 0.00 0.00 0.00 20\n", - " place_name (label_id: 38) 71.25 77.03 74.03 148\n", - " player_setting (label_id: 39) 0.00 0.00 0.00 1\n", - " playlist_name (label_id: 40) 0.00 0.00 0.00 1\n", - " podcast_descriptor (label_id: 41) 0.00 0.00 0.00 13\n", - " podcast_name (label_id: 42) 0.00 0.00 0.00 4\n", - " radio_name (label_id: 43) 66.67 10.53 18.18 38\n", - " relation (label_id: 44) 0.00 0.00 0.00 17\n", - " song_name (label_id: 45) 0.00 0.00 0.00 22\n", - " time (label_id: 46) 70.27 78.20 74.02 133\n", - " time_zone (label_id: 47) 0.00 0.00 0.00 9\n", - " timeofday (label_id: 48) 0.00 0.00 0.00 28\n", - " transport_agency (label_id: 49) 0.00 0.00 0.00 9\n", - " transport_descriptor (label_id: 50) 0.00 0.00 0.00 0\n", - " transport_name (label_id: 51) 0.00 0.00 0.00 4\n", - " transport_type (label_id: 52) 78.38 82.86 80.56 35\n", - " weather_descriptor (label_id: 53) 0.00 0.00 0.00 17\n", - " O (label_id: 54) 92.42 98.80 95.50 5920\n", - " -------------------\n", - " micro avg 89.10 89.10 89.10 7199\n", - " macro avg 21.86 18.56 18.18 7199\n", - " weighted avg 84.42 89.10 86.01 7199\n", - "```" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## Evaluation\n", - "To see how the model performs, we can evaluate the performance of the trained model on a test data file. Here we will reload the model from the `.nemo` file saved during training. By default, the `.nemo` file contains the final checkpoint. We will use the same trainer for testing." - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "# extract the path of the best checkpoint from the training, you may update it to any other saved checkpoint file\n", - "checkpoint_path = trainer.checkpoint_callback.best_model_path\n", - "\n", - "# load the model from this checkpoint\n", - "eval_model = nemo_nlp.models.IntentSlotClassificationModel.load_from_checkpoint(checkpoint_path=checkpoint_path)" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "# we will setup testing data reusing the same config (test section)\n", - "eval_model.setup_test_data(test_data_config=config.model.test_ds)\n", - "\n", - "# run the evaluation on the test dataset\n", - "trainer.test(model=eval_model, ckpt_path=None, verbose=False)" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## Inference from Examples\n", - "Next step to see how the trained model will classify Intents and Slots for given queries from this domain. 
To improve the predictions you may need to train the model for more than 5 epochs.\n" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "queries = [\n", - " 'set alarm for seven thirty am',\n", - " 'lower volume by fifty percent',\n", - " 'what is my schedule for tomorrow',\n", - "]\n", - "\n", - "pred_intents, pred_slots = eval_model.predict_from_examples(queries, config.model.test_ds)\n", - "\n", - "logging.info('The prediction results of some sample queries with the trained model:')\n", - "for query, intent, slots in zip(queries, pred_intents, pred_slots):\n", - " logging.info(f'Query : {query}')\n", - " logging.info(f'Predicted Intent: {intent}')\n", - " logging.info(f'Predicted Slots: {slots}')" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## Training Script\n", - "\n", - "If you have NeMo installed locally (eg. cloned from the Github), you can also train the model with the example script: `examples/nlp/intent_slot_classification/intent_slot_classification.py.`\n", - "This script contains an example on how to train, evaluate and perform inference with the IntentSlotClassificationModel.\n", - "\n", - "To run a training script, use:\n", - "\n", - "`cd examples/nlp/intent_slot_classification`\n", - "\n", - "`python intent_slot_classification.py model.data_dir=PATH_TO_DATA_DIR`\n", - "\n", - "By default, this script uses examples/nlp/intent_slot_classification/conf/intent_slot_classification_config.py config file, and you may update all the params inside of this config file or alternatively providing them in the command line." - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "\n", - "# Multi-Label Intent Classification\n", - "---\n", - "\n", - "As mentioned above, our multi-label model will be very similar the single intent classification model, with the added functionality of predicting multiple different intents for a single query. For example, the query `show all flights and fares from denver to san francisco` would have intents `atis_airfare` and `atis_flight`. 
From our list of intents found in `dict.intents.csv`, the model checks whether each individual intent is suitable for the given query.\n", - "\n", - "For this tutorial, we will be using the ATIS (Airline Travel Information System) dataset, converting it to a multi-label data format, and then using the new data to train our model.\n", - "\n" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## Download the dataset and convert it to the NeMo format" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "# download the converter files from github for the purpose of this tutorial\n", - "DATA_DIR = './multiatis'\n", - "NEMO_DIR = './atis'\n", - "\n", - "!mkdir {DATA_DIR}\n", - "!mkdir {NEMO_DIR}\n", - "\n", - "\n", - "files = [f'https://raw.githubusercontent.com/howl-anderson/ATIS_dataset/master/data/raw_data/ms-cntk-atis/atis.dict.intent.csv', \n", - " f'https://raw.githubusercontent.com/howl-anderson/ATIS_dataset/master/data/raw_data/ms-cntk-atis/atis.dict.slots.csv',\n", - " f'https://raw.githubusercontent.com/howl-anderson/ATIS_dataset/master/data/raw_data/ms-cntk-atis/atis.dict.vocab.csv',\n", - " f'https://raw.githubusercontent.com/howl-anderson/ATIS_dataset/master/data/raw_data/ms-cntk-atis/atis.test.intent.csv',\n", - " f'https://raw.githubusercontent.com/howl-anderson/ATIS_dataset/master/data/raw_data/ms-cntk-atis/atis.test.pkl', \n", - " f'https://raw.githubusercontent.com/howl-anderson/ATIS_dataset/master/data/raw_data/ms-cntk-atis/atis.test.query.csv',\n", - " f'https://raw.githubusercontent.com/howl-anderson/ATIS_dataset/master/data/raw_data/ms-cntk-atis/atis.test.slots.csv', \n", - " f'https://raw.githubusercontent.com/howl-anderson/ATIS_dataset/master/data/raw_data/ms-cntk-atis/atis.train.intent.csv',\n", - " f'https://raw.githubusercontent.com/howl-anderson/ATIS_dataset/master/data/raw_data/ms-cntk-atis/atis.train.pkl',\n", - " f'https://raw.githubusercontent.com/howl-anderson/ATIS_dataset/master/data/raw_data/ms-cntk-atis/atis.train.query.csv',\n", - " f'https://raw.githubusercontent.com/howl-anderson/ATIS_dataset/master/data/raw_data/ms-cntk-atis/atis.train.slots.csv']\n", - "\n", - " \n", - "for file in files:\n", - " wget.download(file, DATA_DIR)\n", - "\n", - "\n", - "# download the converter files from github for the purpose of this tutorial\n", - "wget.download(f'https://raw.githubusercontent.com/NVIDIA/NeMo/{BRANCH}/scripts/dataset_processing/nlp/intent_and_slot/import_datasets.py', NEMO_DIR)\n", - "wget.download(f'https://raw.githubusercontent.com/NVIDIA/NeMo/{BRANCH}/scripts/dataset_processing/nlp/intent_and_slot/assistant_utils.py', NEMO_DIR)\n", - "wget.download(f'https://raw.githubusercontent.com/NVIDIA/NeMo/{BRANCH}/scripts/dataset_processing/nlp/intent_and_slot/convert_datasets.py', NEMO_DIR)\n", - "\n", - "# Get original atis dataset\n", - "!python {NEMO_DIR}/import_datasets.py --dataset_name=atis --source_data_dir={DATA_DIR} --target_data_dir={DATA_DIR}/nemo_format\n", - "# Script will create new files at {DATA_DIR}/new_format\n", - "!mkdir {DATA_DIR}/new_format\n", - "!python {NEMO_DIR}/convert_datasets.py --source_data_dir={DATA_DIR}/nemo_format --target_data_dir={DATA_DIR}/new_format" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## Data Augmentation (Optional)\n", - "---\n", - "\n", - "In scenarios when we don't have many training examples with multiple intent labels, data augmentation can be very useful. 
This can be done by concatenating utterances together, and adding it to our training data. Some ways of concatenating include adding a period or \\\"and\\\" between the two utterances. A script has been provided below to help with augmentation, but it can be changed depending on your use case." - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "# download the data augmentation script\n", - "wget.download(f'https://raw.githubusercontent.com/NVIDIA/NeMo/{BRANCH}/scripts/dataset_processing/nlp/intent_and_slot/augment_training_data.py', NEMO_DIR)" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "The script augment_training_data.py allows for four command line arguments to be passed in: \n", - "\n", - "source_data_dir: directory that contains the original multi-label data
\n", - "target_data_dir: directory to store the new data directory
\n", - "num_mixed: number of new utterances to add to dataset per class pair (utterances with labels 1 and 2)
\n", - "link_string: string that is in between the two utterances (\".\", \"\", \"and\", \"with\")
" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "!python {NEMO_DIR}/augment_training_data.py --source_data_dir={DATA_DIR}/new_format --target_data_dir={DATA_DIR}/augmented_data --num_mixed=10" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## Training the Model" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "# download the model config file from repository for the purpose of this example\n", - "wget.download(f'https://raw.githubusercontent.com/NVIDIA/NeMo/{BRANCH}/examples/nlp/intent_slot_classification/conf/multi_label_intent_slot_classification_config.yaml', NEMO_DIR)\n", - "\n", - "# print content of the config file\n", - "config_file = f\"{NEMO_DIR}/multi_label_intent_slot_classification_config.yaml\"\n", - "print(config_file)\n", - "config = OmegaConf.load(config_file)\n", - "print(OmegaConf.to_yaml(config))" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "config.model.data_dir = f\"{DATA_DIR}/new_format\"\n", - "config.model.validation_ds.prefix = \"dev\"\n", - "config.model.test_ds.prefix = \"dev\"\n", - "config.model.class_balancing = \"weighted_loss\"\n", - "config.trainer.max_epochs = 5\n", - "run_name = \"test\"\n", - "\n", - "# checks if we have GPU available and uses it\n", - "accelerator = 'gpu' if torch.cuda.is_available() else 'cpu'\n", - "config.trainer.devices = 1\n", - "config.trainer.accelerator = accelerator\n", - "\n", - "# remove distributed training flags\n", - "config.trainer.strategy = 'auto'\n", - "\n", - "trainer = pl.Trainer(**config.trainer)\n", - "config.exp_manager.exp_dir = os.path.join(DATA_DIR, \"output/\" + run_name)\n", - "config.exp_manager.create_checkpoint_callback = True\n", - "\n", - "exp_dir = exp_manager(trainer, config.get(\"exp_manager\", None))\n", - "model = nemo_nlp.models.MultiLabelIntentSlotClassificationModel(config.model, trainer=trainer)\n", - "trainer.fit(model)" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## Evaluation" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "To see how the model performs, we can evaluate the performance of the trained model on a test data file. Here we will reload the model from the `.nemo` file saved during training. By default, the `.nemo` file contains the final checkpoint. We will use the same trainer for testing." - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "# specify checkpoint path with .nemo file\n", - "checkpoint_path = os.path.join(exp_dir, \"checkpoints\", \"MultiLabelIntentSlot.nemo\")\n", - "\n", - "# load the model from this checkpoint\n", - "eval_model = nemo_nlp.models.MultiLabelIntentSlotClassificationModel.restore_from(checkpoint_path)" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "### Optimizing Threshold\n", - "\n", - "As mentioned above, when classifying a given query such as `show all flights and fares from denver to san francisco`, our model checks whether each individual intent would be suitable. Before assigning the final labels for a query, the model assigns a probability an intent matches the query. For example, if our `dict.intents.csv` had 5 different intents, then the model could output for a given query \\[0.52, 0.38, 0.21, 0.67. 
0.80\\] where each value represents the probability that query matches that particular intent. \n", - "\n", - "We need to use these probabilities to generate final label predictions of 0 or 1 for each label. While we can use 0.5 as the probability threshold, it is usually the case that there is a better threshold to use depending on the metric we want to optimize. For this tutorial, we will be finding the threshold that gives us the best micro-F1 score on the validation set. After running the `optimize_threshold` method, the threshold attribute for our model will be updated." - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "eval_model.optimize_threshold(config.model.test_ds, 'dev')" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "eval_model.threshold" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "### Inference from Examples\n", - "Similar to the previous example we can run inference to see how the trained model will classify Intents and Slots for given queries from this domain. To improve the predictions you may need to train the model for more than 10 epochs.\n" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "queries = [\n", - " 'i would like to find a flight from charlotte to las vegas that makes a stop in st. louis',\n", - " 'on april first i need a ticket from tacoma to san jose departing before 7 am',\n", - " 'how much is the limousine service in boston',\n", - "]\n", - "\n", - "# We use the optimized threshold for predictions\n", - "pred_intents, pred_slots, pred_list = eval_model.predict_from_examples(queries, config.model.test_ds)\n", - "logging.info('The prediction results of some sample queries with the trained model:')\n", - " \n", - "for query, intent, slots in zip(queries, pred_intents, pred_slots):\n", - " logging.info(f'Query : {query}')\n", - " logging.info(f'Predicted Intents: {intent}')\n", - " logging.info(f'Predicted Slots: {slots}')" - ] - } - ], - "metadata": { - "kernelspec": { - "display_name": "Python 3 (ipykernel)", - "language": "python", - "name": "python3" - }, - "language_info": { - "codemirror_mode": { - "name": "ipython", - "version": 3 - }, - "file_extension": ".py", - "mimetype": "text/x-python", - "name": "python", - "nbconvert_exporter": "python", - "pygments_lexer": "ipython3", - "version": "3.8.12" - } - }, - "nbformat": 4, - "nbformat_minor": 1 -}
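
Note on the recipe test changes above: each `test_valid_trainer_parallelism` repeats the same parallelism-validity assertions. A minimal sketch of how those checks could be factored into one shared helper is given below; the helper name `assert_valid_parallelism` is hypothetical and not part of this PR, and it assumes `run` is NeMo-Run imported as `import nemo_run as run`, as the `run.Config` checks in the tests suggest.

```python
# Hypothetical shared helper (not part of this PR) collecting the parallelism
# assertions repeated across the recipe tests above.
import nemo_run as run


def assert_valid_parallelism(trainer_config) -> None:
    """Check that a recipe's MegatronStrategy settings fit its device layout."""
    strategy = trainer_config.strategy

    assert isinstance(strategy, run.Config)
    assert strategy.__fn_or_cls__.__name__ == "MegatronStrategy"

    model_parallel_size = (
        strategy.tensor_model_parallel_size
        * strategy.pipeline_model_parallel_size
        * strategy.context_parallel_size
        * strategy.expert_model_parallel_size
    )

    # Mirror the divisibility checks from the tests: the combined
    # model-parallel size must split evenly over the GPUs of a node and
    # over the nodes of the job.
    assert model_parallel_size % trainer_config.devices == 0
    assert (model_parallel_size / trainer_config.devices) % trainer_config.num_nodes == 0

    # Pipeline parallelism requires an explicit pipeline dtype; sequence
    # parallelism is only meaningful when tensor parallelism is enabled.
    if strategy.pipeline_model_parallel_size != 1:
        assert strategy.pipeline_dtype is not None
    if strategy.tensor_model_parallel_size == 1:
        assert strategy.sequence_parallel is False
```

With such a helper, each recipe test would reduce to a single call, e.g. `assert_valid_parallelism(recipe_module.pretrain_recipe().trainer)`.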