diff --git a/.github/workflows/cherry-pick-release-commit.yml b/.github/workflows/cherry-pick-release-commit.yml index 2084dd174e5d..67bc69b1f8a5 100644 --- a/.github/workflows/cherry-pick-release-commit.yml +++ b/.github/workflows/cherry-pick-release-commit.yml @@ -120,7 +120,7 @@ jobs: "type": "section", "text": { "type": "mrkdwn", - "text": ":alert: Cherrypick bot 🤖: Hey <@'$USERNAME'>: Cherry-pick of <'$URL'|#'$PR_ID'> failed (3-way merge impossible). Please resolve manually and create a PR.\n\ncc: <@${{ secrets.SLACK_WEBHOOK_ADMIN }}>" + "text": ":alert: Cherrypick bot 🤖: Hey <@'$USERNAME'>: Cherry-pick of <'$URL'|#'$PR_ID'> failed (3-way merge impossible). Please resolve manually and create a PR.\n\ncc: " } } ] diff --git a/.github/workflows/cicd-main.yml b/.github/workflows/cicd-main.yml index fe7daaac0c95..345482e9a1a8 100644 --- a/.github/workflows/cicd-main.yml +++ b/.github/workflows/cicd-main.yml @@ -131,16 +131,16 @@ jobs: ### \'\' # L0: GPU unit tests - OPTIONAL_L0_Unit_Tests_GPU_ASR: + L0_Unit_Tests_GPU_ASR: needs: [cicd-test-container-setup] uses: ./.github/workflows/_test_template.yml - if: contains(fromJSON(needs.cicd-test-container-setup.outputs.test_to_run), 'OPTIONAL_L0_Unit_Tests_GPU_ASR') || needs.cicd-test-container-setup.outputs.all == 'true' + if: contains(fromJSON(needs.cicd-test-container-setup.outputs.test_to_run), 'L0_Unit_Tests_GPU_ASR') || needs.cicd-test-container-setup.outputs.all == 'true' with: RUNNER: self-hosted-azure TIMEOUT: 20 + # TODO: remove this hack SCRIPT: | - NEMO_NUMBA_MINVER=0.53 pytest tests/collections/asr -m "not pleasefixme" --with_downloads - IS_OPTIONAL: true + python -c "from nemo.collections.asr.models import ASRModel" && NEMO_NUMBA_MINVER=0.53 pytest tests/collections/asr -m "not pleasefixme" --with_downloads L0_Unit_Tests_GPU_Audio: needs: [cicd-test-container-setup] @@ -1212,18 +1212,6 @@ jobs: matmul_precision=medium AFTER_SCRIPT: | rm -rf preds.json - - - # L2: Transducer alignment - OPTIONAL_L2_Transducer_alignment_Running_pytest: - needs: [cicd-test-container-setup] - uses: ./.github/workflows/_test_template.yml - if: contains(fromJSON(needs.cicd-test-container-setup.outputs.test_to_run), 'OPTIONAL_L2_Transducer_alignment_Running_pytest') || needs.cicd-test-container-setup.outputs.all == 'true' - with: - RUNNER: self-hosted-azure - SCRIPT: | - pytest tests/collections/asr/decoding/rnnt_alignments_check.py --durations=-1 --with_downloads - IS_OPTIONAL: true # L2: Segmentation Tool L2_Segmentation_Tool_Parallel_ctc_segmentation_test_L2_Eng_CitriNet_with_wav: @@ -1345,275 +1333,6 @@ jobs: pretrained_model=${OUTPUT_DIR}/HeteronymClassification/test/checkpoints/HeteronymClassification.nemo \ output_manifest=preds.json - # L2: Duplex Text Normalization - L2_Duplex_Text_Normalization_with_Tarred_dataset: - needs: [cicd-test-container-setup] - uses: ./.github/workflows/_test_template.yml - if: contains(fromJSON(needs.cicd-test-container-setup.outputs.test_to_run), 'L2_Duplex_Text_Normalization_with_Tarred_dataset') || needs.cicd-test-container-setup.outputs.all == 'true' - with: - RUNNER: self-hosted-azure - SCRIPT: | - cd examples/nlp/duplex_text_normalization && \ - python duplex_text_normalization_train.py \ - data.validation_ds.data_path=/home/TestData/nlp/duplex_text_norm/small_test.tsv \ - mode=tn \ - lang=en \ - tagger_model.do_training=false \ - decoder_model.transformer=t5-small \ - data.validation_ds.batch_size=2 \ - data.train_ds.use_cache=false \ - data.validation_ds.use_cache=false \ - data.test_ds.batch_size=2 \ - 
data.train_ds.decoder_data_augmentation=false \ - data.train_ds.num_workers=2 \ - decoder_trainer.devices=[0,1] \ - decoder_trainer.accelerator="gpu" \ - data.train_ds.use_tarred_dataset=true \ - +decoder_trainer.fast_dev_run=true \ - decoder_exp_manager.create_checkpoint_callback=false \ - data.train_ds.tar_metadata_file=/home/TestData/nlp/duplex_text_norm/tarred_small/metadata.json \ - data.test_ds.use_cache=false \ - data.test_ds.data_path=/home/TestData/nlp/duplex_text_norm/small_test.tsv - - # L2: Intent and Slot Classification Tasks - L2_Intent_and_Slot_Classification_Tasks_Intent_and_Slot_Classification: - needs: [cicd-test-container-setup] - uses: ./.github/workflows/_test_template.yml - if: contains(fromJSON(needs.cicd-test-container-setup.outputs.test_to_run), 'L2_Intent_and_Slot_Classification_Tasks_Intent_and_Slot_Classification') || needs.cicd-test-container-setup.outputs.all == 'true' - with: - RUNNER: self-hosted-azure-gpus-1 - SCRIPT: | - cd examples/nlp/intent_slot_classification && \ - python intent_slot_classification.py \ - model.data_dir=/home/TestData/nlp/retail \ - model.validation_ds.prefix=dev \ - model.test_ds.prefix=dev \ - trainer.devices=1 \ - trainer.accelerator="gpu" \ - +trainer.fast_dev_run=true \ - exp_manager.exp_dir=checkpoints - AFTER_SCRIPT: | - rm -rf checkpoints - - L2_Intent_and_Slot_Classification_Tasks_Multi-Label_Intent_and_Slot_Classification: - needs: [cicd-test-container-setup] - uses: ./.github/workflows/_test_template.yml - if: contains(fromJSON(needs.cicd-test-container-setup.outputs.test_to_run), 'L2_Intent_and_Slot_Classification_Tasks_Multi-Label_Intent_and_Slot_Classification') || needs.cicd-test-container-setup.outputs.all == 'true' - with: - RUNNER: self-hosted-azure-gpus-1 - SCRIPT: | - cd examples/nlp/intent_slot_classification && \ - python multi_label_intent_slot_classification.py \ - model.data_dir=/home/TestData/nlp/new_multiatis \ - model.validation_ds.prefix=dev \ - model.test_ds.prefix=dev \ - trainer.devices=1 \ - +trainer.fast_dev_run=true \ - exp_manager.exp_dir=checkpoints2 - AFTER_SCRIPT: | - rm -rf checkpoints2 - - # TODO: add when megatron-bert is supported again - # stage("L2: Model Parallel Size 2 Megatron Text Classification") { - # when { - # anyOf{ - # branch "main" - # changeRequest target: "main" - # } - # } - # failFast true - # steps{ - # cd examples/nlp/text_classification && \ - # python text_classification_with_bert.py \ - # trainer.devices=[0,1] \ - # trainer.accelerator="gpu" \ - # trainer.num_nodes=1 \ - # trainer.precision=16 \ - # trainer.gradient_clip_val=1.0 \ - # +trainer.fast_dev_run=true \ - # model.dataset.num_classes=6 \ - # model.train_ds.file_path=/home/TestData/nlp/retail_text_classification/train.tsv \ - # model.train_ds.batch_size=4 \ - # model.language_model.pretrained_model_name=megatron-bert-uncased \ - # model.language_model.config_file=/home/TestData/nlp/mp_2_bert_toy/config.json \ - # model.language_model.lm_checkpoint=/home/TestData/nlp/mp_2_bert_toy/iter_2000000 \ - # model.nemo_path=null \ - # ~model.infer_samples \ - # exp_manager=null - # } - # } - - # stage("L2: Model Parallel Size 2 Megatron Autoresume") { - # when { - # anyOf{ - # branch "main" - # changeRequest target: "main" - # } - # } - # failFast true - # steps{ - # cd examples/nlp/text_classification && \ - # python text_classification_with_bert.py \ - # trainer.devices=[0,1] \ - # trainer.accelerator="gpu" \ - # trainer.num_nodes=1 \ - # trainer.precision=16 \ - # trainer.gradient_clip_val=1.0 \ - # trainer.max_epochs=1 
\ - # +trainer.fast_dev_run=true \ - # model.dataset.num_classes=6 \ - # model.train_ds.file_path=/home/TestData/nlp/retail_text_classification/train.tsv \ - # model.train_ds.batch_size=4 \ - # model.language_model.pretrained_model_name=megatron-bert-uncased \ - # model.language_model.config_file=/home/TestData/nlp/mp_2_bert_toy/config.json \ - # model.language_model.lm_checkpoint=/home/TestData/nlp/mp_2_bert_toy/iter_2000000 \ - # model.nemo_path=null \ - # ~model.infer_samples \ - # +exp_manager.explicit_log_dir=/home/TestData/nlp/mp_autoresume \ - # +exp_manager.resume_if_exists=true - # } - # } - - # stage("L2: Model Parallel Size 2 Megatron Evaluation from .nemo") { - # when { - # anyOf{ - # branch "main" - # changeRequest target: "main" - # } - # } - # failFast true - # steps{ - # cd examples/nlp/text_classification && \ - # python model_parallel_text_classification_evaluation.py \ - # trainer.devices=[0,1] \ - # trainer.accelerator="gpu" \ - # trainer.num_nodes=1 \ - # model.dataset.num_classes=6 \ - # model.test_ds.file_path=/home/TestData/nlp/retail_text_classification/dev.tsv \ - # model.nemo_path=/home/TestData/nlp/mp_2_nemo/retail_text_class_350M.nemo \ - # exp_manager=null - # } - # } - - # stage("L2: Model Parallel Size 2 Megatron Train from .nemo") { - # when { - # anyOf{ - # branch "main" - # changeRequest target: "main" - # } - # } - # failFast true - # steps{ - # cd examples/nlp/token_classification && \ - # python token_classification_train.py \ - # pretrained_model=/home/TestData/nlp/mp_2_nemo/ner_350M.nemo \ - # model.dataset.data_dir=/home/TestData/nlp/ner/ \ - # model.train_ds.batch_size=2 \ - # model.dataset.use_cache=false \ - # trainer.devices=[0,1] \ - # trainer.accelerator="gpu" \ - # +trainer.fast_dev_run=true \ - # model.dataset.class_balancing="weighted_loss" \ - # exp_manager=null - # } - # } - - - # L2: Parallel NLP Examples 2 - L2_Parallel_NLP_Examples2_NER_finetuning_from_pretrained_Test: - needs: [cicd-test-container-setup] - uses: ./.github/workflows/_test_template.yml - if: contains(fromJSON(needs.cicd-test-container-setup.outputs.test_to_run), 'L2_Parallel_NLP_Examples2_NER_finetuning_from_pretrained_Test') || needs.cicd-test-container-setup.outputs.all == 'true' - with: - RUNNER: self-hosted-azure-gpus-1 - SCRIPT: | - cd examples/nlp/token_classification && \ - python token_classification_train.py \ - pretrained_model=ner_en_bert \ - model.dataset.data_dir=/home/TestData/nlp/ner/ \ - model.train_ds.batch_size=2 \ - model.dataset.use_cache=false \ - trainer.devices=1 \ - trainer.accelerator="gpu" \ - +trainer.fast_dev_run=true \ - model.dataset.class_balancing="weighted_loss" \ - exp_manager.exp_dir=null - - L2_Parallel_NLP_Examples2_Punctuation_and_capitalization_finetuning_from_pretrained_test: - needs: [cicd-test-container-setup] - uses: ./.github/workflows/_test_template.yml - if: contains(fromJSON(needs.cicd-test-container-setup.outputs.test_to_run), 'L2_Parallel_NLP_Examples2_Punctuation_and_capitalization_finetuning_from_pretrained_test') || needs.cicd-test-container-setup.outputs.all == 'true' - with: - RUNNER: self-hosted-azure-gpus-1 - SCRIPT: | - cd examples/nlp/token_classification && \ - data_dir="$(mktemp -d -p "$(pwd)")" && \ - cp /home/TestData/nlp/token_classification_punctuation/*.txt "${data_dir}"/ && \ - python punctuation_capitalization_train_evaluate.py \ - pretrained_model=punctuation_en_bert \ - model.train_ds.ds_item="${data_dir}" \ - model.validation_ds.ds_item="${data_dir}" \ - model.test_ds.ds_item="${data_dir}" \ - 
+model.train_ds.use_cache=false \ - +model.validation_ds.use_cache=false \ - +model.test_ds.use_cache=false \ - trainer.devices=1 \ - trainer.accelerator="gpu" \ - +trainer.fast_dev_run=true \ - exp_manager.exp_dir=null; - - rm -rf "${data_dir}" - - L2_Parallel_NLP_Examples2_NER_with_TurkuNLP__bert-base-finnish-cased-v1: - needs: [cicd-test-container-setup] - uses: ./.github/workflows/_test_template.yml - if: contains(fromJSON(needs.cicd-test-container-setup.outputs.test_to_run), 'L2_Parallel_NLP_Examples2_NER_with_TurkuNLP__bert-base-finnish-cased-v1') || needs.cicd-test-container-setup.outputs.all == 'true' - with: - RUNNER: self-hosted-azure-gpus-1 - SCRIPT: | - cd examples/nlp/token_classification && \ - python token_classification_train.py \ - model.dataset.data_dir=/home/TestData/nlp/token_classification_punctuation/ \ - trainer.devices=1 \ - trainer.accelerator="gpu" \ - +trainer.fast_dev_run=true \ - model.dataset.use_cache=false \ - model.language_model.pretrained_model_name="TurkuNLP/bert-base-finnish-cased-v1" \ - exp_manager.exp_dir=null - - L2_Parallel_NLP_Examples2_Evaluation_script_for_Token_Classification: - needs: [cicd-test-container-setup] - uses: ./.github/workflows/_test_template.yml - if: contains(fromJSON(needs.cicd-test-container-setup.outputs.test_to_run), 'L2_Parallel_NLP_Examples2_Evaluation_script_for_Token_Classification') || needs.cicd-test-container-setup.outputs.all == 'true' - with: - RUNNER: self-hosted-azure - SCRIPT: | - python examples/nlp/token_classification/token_classification_evaluate.py \ - model.dataset.data_dir=/home/TestData/nlp/ner/ \ - model.dataset.use_cache=false \ - pretrained_model=/home/TestData/nlp/pretrained_models/NER_Model_with_BERT_base_uncased.nemo - - L2_Parallel_NLP_Examples2_Evaluation_script_for_Punctuation: - needs: [cicd-test-container-setup] - uses: ./.github/workflows/_test_template.yml - if: contains(fromJSON(needs.cicd-test-container-setup.outputs.test_to_run), 'L2_Parallel_NLP_Examples2_Evaluation_script_for_Punctuation') || needs.cicd-test-container-setup.outputs.all == 'true' - with: - RUNNER: self-hosted-azure - SCRIPT: | - data_dir="$(mktemp -d -p "$(pwd)")" && \ - cp /home/TestData/nlp/token_classification_punctuation/*.txt "${data_dir}"/ && \ - python examples/nlp/token_classification/punctuation_capitalization_train_evaluate.py \ - +do_training=false \ - +do_testing=true \ - model.test_ds.ds_item="${data_dir}" \ - ~model.train_ds \ - ~model.validation_ds \ - +model.test_ds.use_cache=false \ - pretrained_model=/home/TestData/nlp/pretrained_models/Punctuation_Capitalization_with_DistilBERT_base_uncased.nemo; - - rm -rf "${data_dir}" - - # L2: Parallel Pretraining BERT pretraining from Text/Preprocessed L2_Pretraining_BERT_pretraining_from_Text: needs: [cicd-test-container-setup] @@ -1990,313 +1709,6 @@ jobs: model.decoder_tokenizer.model=/home/TestData/nlp/nmt/toy_data/spm_64k_all_langs_plus_en.model AFTER_SCRIPT: | rm -rf examples/nlp/machine_translation/megatron_nmt_results - - L2_Megatron_BART_Perceiver_MIM_Training_TP2: - needs: [cicd-test-container-setup] - uses: ./.github/workflows/_test_template.yml - if: contains(fromJSON(needs.cicd-test-container-setup.outputs.test_to_run), 'L2_Megatron_BART_Perceiver_MIM_Training_TP2') || needs.cicd-test-container-setup.outputs.all == 'true' - with: - RUNNER: self-hosted-azure - SCRIPT: | - python examples/nlp/language_modeling/megatron_bart_pretraining.py \ - trainer.devices=2 \ - trainer.accelerator=gpu \ - trainer.log_every_n_steps=1 \ - trainer.val_check_interval=10 \ 
- trainer.limit_val_batches=2 \ - trainer.accumulate_grad_batches=1 \ - trainer.max_steps=10 \ - trainer.precision=16 \ - trainer.gradient_clip_val=1.0 \ - exp_manager.exp_dir=examples/nlp/language_modeling/megatron_mim_results \ - model.tensor_model_parallel_size=2 \ - model.seq_length=128 \ - model.encoder.num_layers=4 \ - model.encoder.hidden_size=64 \ - model.encoder.arch=perceiver \ - model.encoder.num_attention_heads=8 \ - model.encoder.activation="swiglu" \ - model.encoder.masked_softmax_fusion=False \ - model.encoder.bias_activation_fusion=False \ - model.encoder.activations_checkpoint_method="block" \ - model.encoder.activations_checkpoint_num_layers=1 \ - model.decoder.num_layers=2 \ - model.decoder.hidden_size=64 \ - model.decoder.num_attention_heads=8 \ - model.decoder.activation="swiglu" \ - model.decoder.masked_softmax_fusion=False \ - model.decoder.bias_activation_fusion=False \ - model.decoder.activations_checkpoint_method="block" \ - model.decoder.activations_checkpoint_num_layers=1 \ - model.micro_batch_size=2 \ - model.global_batch_size=4 \ - model.data.data_impl=text_mmap \ - model.data.data_prefix=[1.0,/home/TestData/nlp/nmt/toy_data/wmt14-de-en.src] \ - model.data.splits_string="\"800,100,100\"" \ - model.data.whole_word_masking=False \ - model.tokenizer.library=sentencepiece \ - model.tokenizer.model=/home/TestData/nlp/nmt/toy_data/spm_64k_all_langs_plus_en.model \ - ++model.hiddens.enc_output_name=z \ - ++model.hiddens.transform.q_z_given_x.cls_name=cond_gaussian \ - ++model.hiddens.transform.q_z_given_x.hidden_size=64 \ - ++model.hiddens.loss.mim.cls_name=a_mim \ - ++model.hiddens.loss.mim.loss_weight=0.5 - # Change val_check_interval to 1 for resume as the len(dataloder) is 1 due to max_steps being the same as that of training and Lightning 2.0 raises an error - # if val_check_interval > len(dataloder: https://github.com/Lightning-AI/lightning/blob/2.0.6/src/lightning/pytorch/loops/fit_loop.py#L259 at the beginning of fit_loop.run() - python examples/nlp/language_modeling/megatron_bart_pretraining.py \ - trainer.devices=2 \ - trainer.accelerator=gpu \ - trainer.log_every_n_steps=1 \ - trainer.val_check_interval=1 \ - trainer.limit_val_batches=2 \ - trainer.accumulate_grad_batches=1 \ - trainer.max_steps=10 \ - trainer.precision=16 \ - trainer.gradient_clip_val=1.0 \ - exp_manager.exp_dir=examples/nlp/language_modeling/megatron_mim_results \ - model.tensor_model_parallel_size=2 \ - model.seq_length=128 \ - model.encoder.num_layers=4 \ - model.encoder.hidden_size=64 \ - model.encoder.arch=perceiver \ - model.encoder.num_attention_heads=8 \ - model.encoder.activation="swiglu" \ - model.encoder.masked_softmax_fusion=False \ - model.encoder.bias_activation_fusion=False \ - model.encoder.activations_checkpoint_method="block" \ - model.encoder.activations_checkpoint_num_layers=1 \ - model.decoder.num_layers=2 \ - model.decoder.hidden_size=64 \ - model.decoder.num_attention_heads=8 \ - model.decoder.activation="swiglu" \ - model.decoder.masked_softmax_fusion=False \ - model.decoder.bias_activation_fusion=False \ - model.decoder.activations_checkpoint_method="block" \ - model.decoder.activations_checkpoint_num_layers=1 \ - model.micro_batch_size=2 \ - model.global_batch_size=4 \ - model.data.data_impl=text_mmap \ - model.data.data_prefix=[1.0,/home/TestData/nlp/nmt/toy_data/wmt14-de-en.src] \ - model.data.splits_string="\"800,100,100\"" \ - model.data.whole_word_masking=False \ - model.tokenizer.library=sentencepiece \ - 
model.tokenizer.model=/home/TestData/nlp/nmt/toy_data/spm_64k_all_langs_plus_en.model \ - ++model.hiddens.enc_output_name=z \ - ++model.hiddens.transform.q_z_given_x.cls_name=cond_gaussian \ - ++model.hiddens.transform.q_z_given_x.hidden_size=64 \ - ++model.hiddens.loss.mim.cls_name=a_mim \ - ++model.hiddens.loss.mim.loss_weight=0.5 - AFTER_SCRIPT: | - rm -rf examples/nlp/language_modeling/megatron_mim_results - - # stage("L2: NMT Bottleneck Fallback") { - # when { - # anyOf { - # branch "main" - # changeRequest target: "main" - # } - # } - # failFast true - # parallel { - # stage("L2: seq2seq (no bottleneck)") { - # steps { - # cd examples/nlp/machine_translation && \ - # enc_dec_nmt-bottleneck.py \ - # --config-path=conf \ - # --config-name=aayn_bottleneck \ - # do_testing=true \ - # model.model_type=nll \ - # model.encoder.arch=seq2seq \ - # model.encoder.hidden_steps=1 \ - # model.encoder.hidden_blocks=1 \ - # model.encoder.hidden_init_method=params \ - # model.encoder.hidden_size=64 \ - # model.encoder.inner_size=128 \ - # model.encoder.num_attention_heads=2 \ - # model.encoder.num_layers=2 \ - # model.decoder.hidden_size=64 \ - # model.decoder.inner_size=128 \ - # model.decoder.num_attention_heads=2 \ - # model.decoder.num_layers=2 \ - # model.train_ds.src_file_name=/home/TestData/nlp/nmt/toy_data/wmt14-en-de.src \ - # model.train_ds.tgt_file_name=/home/TestData/nlp/nmt/toy_data/wmt14-en-de.ref \ - # model.validation_ds.src_file_name=[/home/TestData/nlp/nmt/toy_data/wmt13-en-de.src,/home/TestData/nlp/nmt/toy_data/wmt14-en-de.src] \ - # model.validation_ds.tgt_file_name=[/home/TestData/nlp/nmt/toy_data/wmt13-en-de.ref,/home/TestData/nlp/nmt/toy_data/wmt14-en-de.ref] \ - # model.test_ds.src_file_name=/home/TestData/nlp/nmt/toy_data/wmt13-en-de.src \ - # model.test_ds.tgt_file_name=/home/TestData/nlp/nmt/toy_data/wmt13-en-de.ref \ - # model.encoder_tokenizer.tokenizer_model=/home/TestData/nlp/nmt/toy_data/tt_tokenizer.BPE.4096.model \ - # model.decoder_tokenizer.tokenizer_model=/home/TestData/nlp/nmt/toy_data/tt_tokenizer.BPE.4096.model \ - # trainer.devices=1 \ - # trainer.accelerator="gpu" \ - # +trainer.fast_dev_run=true \ - # +trainer.limit_test_batches=2 \ - # exp_manager=null \ - # } - # } - # } - # } - # stage("L2: NMT Bottleneck Architecture") { - # when { - # anyOf { - # branch "main" - # changeRequest target: "main" - # } - # } - # failFast true - # parallel { - # stage("Bridge Encoder (identity)") { - # steps { - # cd examples/nlp/machine_translation && \ - # enc_dec_nmt-bottleneck.py \ - # --config-path=conf \ - # --config-name=aayn_bottleneck \ - # do_testing=true \ - # model.model_type=nll \ - # model.encoder.arch=bridge \ - # model.encoder.hidden_steps=1 \ - # model.encoder.hidden_blocks=1 \ - # model.encoder.hidden_init_method=identity \ - # model.encoder.hidden_size=64 \ - # model.encoder.inner_size=128 \ - # model.encoder.num_attention_heads=2 \ - # model.encoder.num_layers=2 \ - # model.decoder.hidden_size=64 \ - # model.decoder.inner_size=128 \ - # model.decoder.num_attention_heads=2 \ - # model.decoder.num_layers=2 \ - # model.train_ds.src_file_name=/home/TestData/nlp/nmt/toy_data/wmt14-de-en.src \ - # model.train_ds.tgt_file_name=/home/TestData/nlp/nmt/toy_data/wmt14-de-en.ref \ - # model.validation_ds.src_file_name=/home/TestData/nlp/nmt/toy_data/wmt14-de-en.src \ - # model.validation_ds.tgt_file_name=/home/TestData/nlp/nmt/toy_data/wmt14-de-en.src \ - # model.test_ds.src_file_name=/home/TestData/nlp/nmt/toy_data/wmt14-de-en.src \ - # 
model.test_ds.tgt_file_name=/home/TestData/nlp/nmt/toy_data/wmt14-de-en.src \ - # model.encoder_tokenizer.tokenizer_model=/home/TestData/nlp/nmt/toy_data/tt_tokenizer.BPE.4096.model \ - # model.decoder_tokenizer.tokenizer_model=/home/TestData/nlp/nmt/toy_data/tt_tokenizer.BPE.4096.model \ - # trainer.devices=1 \ - # trainer.accelerator="gpu" \ - # +trainer.fast_dev_run=true \ - # +trainer.limit_test_batches=2 \ - # exp_manager=null - # } - # } - # stage("Perceiver Encoder (params)") { - # steps { - # cd examples/nlp/machine_translation && \ - # enc_dec_nmt-bottleneck.py \ - # --config-path=conf \ - # --config-name=aayn_bottleneck \ - # do_testing=true \ - # model.model_type=nll \ - # model.encoder.arch=perceiver \ - # model.encoder.hidden_steps=1 \ - # model.encoder.hidden_blocks=1 \ - # model.encoder.hidden_init_method=params \ - # model.encoder.hidden_size=64 \ - # model.encoder.inner_size=128 \ - # model.encoder.num_attention_heads=2 \ - # model.encoder.num_layers=2 \ - # model.decoder.hidden_size=64 \ - # model.decoder.inner_size=128 \ - # model.decoder.num_attention_heads=2 \ - # model.decoder.num_layers=2 \ - # model.train_ds.src_file_name=/home/TestData/nlp/nmt/toy_data/wmt14-de-en.src \ - # model.train_ds.tgt_file_name=/home/TestData/nlp/nmt/toy_data/wmt14-de-en.ref \ - # model.validation_ds.src_file_name=/home/TestData/nlp/nmt/toy_data/wmt14-de-en.src \ - # model.validation_ds.tgt_file_name=/home/TestData/nlp/nmt/toy_data/wmt14-de-en.src \ - # model.test_ds.src_file_name=/home/TestData/nlp/nmt/toy_data/wmt14-de-en.src \ - # model.test_ds.tgt_file_name=/home/TestData/nlp/nmt/toy_data/wmt14-de-en.src \ - # model.encoder_tokenizer.tokenizer_model=/home/TestData/nlp/nmt/toy_data/tt_tokenizer.BPE.4096.model \ - # model.decoder_tokenizer.tokenizer_model=/home/TestData/nlp/nmt/toy_data/tt_tokenizer.BPE.4096.model \ - # trainer.devices=1 \ - # trainer.accelerator="gpu" \ - # +trainer.fast_dev_run=true \ - # +trainer.limit_test_batches=2 \ - # exp_manager=null - # } - # } - # } - # } - # stage("L2: NMT Bottleneck LVM") { - # when { - # anyOf { - # branch "main" - # changeRequest target: "main" - # } - # } - # failFast true - # parallel { - # stage("VAE") { - # steps { - # cd examples/nlp/machine_translation && \ - # enc_dec_nmt-bottleneck.py \ - # --config-path=conf \ - # --config-name=aayn_bottleneck \ - # do_testing=true \ - # model.model_type=vae \ - # model.encoder.arch=perceiver \ - # model.encoder.hidden_steps=1 \ - # model.encoder.hidden_blocks=1 \ - # model.encoder.hidden_init_method=params \ - # model.encoder.hidden_size=64 \ - # model.encoder.inner_size=128 \ - # model.encoder.num_attention_heads=2 \ - # model.encoder.num_layers=2 \ - # model.decoder.hidden_size=64 \ - # model.decoder.inner_size=128 \ - # model.decoder.num_attention_heads=2 \ - # model.decoder.num_layers=2 \ - # model.train_ds.src_file_name=/home/TestData/nlp/nmt/toy_data/wmt14-de-en.src \ - # model.train_ds.tgt_file_name=/home/TestData/nlp/nmt/toy_data/wmt14-de-en.ref \ - # model.validation_ds.src_file_name=/home/TestData/nlp/nmt/toy_data/wmt14-de-en.src \ - # model.validation_ds.tgt_file_name=/home/TestData/nlp/nmt/toy_data/wmt14-de-en.src \ - # model.test_ds.src_file_name=/home/TestData/nlp/nmt/toy_data/wmt14-de-en.src \ - # model.test_ds.tgt_file_name=/home/TestData/nlp/nmt/toy_data/wmt14-de-en.src \ - # model.encoder_tokenizer.tokenizer_model=/home/TestData/nlp/nmt/toy_data/tt_tokenizer.BPE.4096.model \ - # model.decoder_tokenizer.tokenizer_model=/home/TestData/nlp/nmt/toy_data/tt_tokenizer.BPE.4096.model \ 
- # trainer.devices=1 \ - # trainer.accelerator="gpu" \ - # +trainer.fast_dev_run=true \ - # +trainer.limit_test_batches=2 \ - # exp_manager=null - # } - # } - # stage("MIM") { - # steps { - # cd examples/nlp/machine_translation && \ - # enc_dec_nmt-bottleneck.py \ - # --config-path=conf \ - # --config-name=aayn_bottleneck \ - # do_testing=true \ - # model.model_type=mim \ - # model.encoder.arch=perceiver \ - # model.encoder.hidden_steps=1 \ - # model.encoder.hidden_blocks=1 \ - # model.encoder.hidden_init_method=params \ - # model.encoder.hidden_size=64 \ - # model.encoder.inner_size=128 \ - # model.encoder.num_attention_heads=2 \ - # model.encoder.num_layers=2 \ - # model.decoder.hidden_size=64 \ - # model.decoder.inner_size=128 \ - # model.decoder.num_attention_heads=2 \ - # model.decoder.num_layers=2 \ - # model.train_ds.src_file_name=/home/TestData/nlp/nmt/toy_data/wmt14-de-en.src \ - # model.train_ds.tgt_file_name=/home/TestData/nlp/nmt/toy_data/wmt14-de-en.ref \ - # model.validation_ds.src_file_name=/home/TestData/nlp/nmt/toy_data/wmt14-de-en.src \ - # model.validation_ds.tgt_file_name=/home/TestData/nlp/nmt/toy_data/wmt14-de-en.src \ - # model.test_ds.src_file_name=/home/TestData/nlp/nmt/toy_data/wmt14-de-en.src \ - # model.test_ds.tgt_file_name=/home/TestData/nlp/nmt/toy_data/wmt14-de-en.src \ - # model.encoder_tokenizer.tokenizer_model=/home/TestData/nlp/nmt/toy_data/tt_tokenizer.BPE.4096.model \ - # model.decoder_tokenizer.tokenizer_model=/home/TestData/nlp/nmt/toy_data/tt_tokenizer.BPE.4096.model \ - # trainer.devices=1 \ - # trainer.accelerator="gpu" \ - # +trainer.fast_dev_run=true \ - # +trainer.limit_test_batches=2 \ - # exp_manager=null - # } - # } - # } - # } L2_Megatron_Bert_Pretraining_and_Resume_Training_with_Pipeline_Parallelism: needs: [cicd-test-container-setup] @@ -2366,82 +1778,10 @@ jobs: model.data.data_prefix=[.5,/home/TestData/nlp/megatron_bert/data/bert/simple_wiki_bert_preproc_text_sentence,.5,/home/TestData/nlp/megatron_bert/data/bert/simple_wiki_bert_preproc_text_sentence] \ model.data.index_mapping_dir=examples/nlp/language_modeling/bert_index_mappings - L2_Megatron_Bert_Pretraining_and_Resume_Training: + L2_Megatron_Core_Bert_Pretraining_and_Resume_Training: needs: [cicd-test-container-setup] uses: ./.github/workflows/_test_template.yml - if: contains(fromJSON(needs.cicd-test-container-setup.outputs.test_to_run), 'L2_Megatron_Bert_Pretraining_and_Resume_Training') || needs.cicd-test-container-setup.outputs.all == 'true' - with: - RUNNER: self-hosted-azure - SCRIPT: | - python examples/nlp/language_modeling/megatron_bert_pretraining.py \ - trainer.devices=2 \ - trainer.accelerator=gpu \ - trainer.log_every_n_steps=1 \ - trainer.val_check_interval=10 \ - trainer.limit_val_batches=2 \ - trainer.accumulate_grad_batches=1 \ - trainer.max_steps=10 \ - trainer.precision=bf16 \ - model.megatron_amp_O2=True \ - trainer.gradient_clip_val=1.0 \ - exp_manager.exp_dir=examples/nlp/language_modeling/bert_pretrain_results \ - model.tensor_model_parallel_size=2 \ - model.optim.name=fused_adam \ - model.optim.lr=2e-4 \ - model.sequence_parallel=True \ - model.optim.sched.warmup_steps=2 \ - model.optim.sched.constant_steps=2 \ - model.optim.sched.min_lr=8e-5 \ - model.max_position_embeddings=128 \ - model.encoder_seq_length=128 \ - model.data.seq_length=128 \ - model.tokenizer.vocab_file=/home/TestData/nlp/megatron_bert/data/bert/vocab.txt \ - model.num_layers=8 \ - model.hidden_size=256 \ - model.num_attention_heads=8 \ - model.activations_checkpoint_method=block \ - 
model.activations_checkpoint_num_layers=1 \ - model.data.data_prefix=[.5,/home/TestData/nlp/megatron_bert/data/bert/simple_wiki_bert_preproc_text_sentence,.5,/home/TestData/nlp/megatron_bert/data/bert/simple_wiki_bert_preproc_text_sentence] \ - model.data.index_mapping_dir=examples/nlp/language_modeling/bert_index_mappings - - python examples/nlp/language_modeling/megatron_bert_pretraining.py \ - trainer.devices=2 \ - trainer.accelerator=gpu \ - trainer.log_every_n_steps=1 \ - trainer.val_check_interval=10 \ - trainer.limit_val_batches=2 \ - trainer.accumulate_grad_batches=1 \ - trainer.max_steps=20 \ - trainer.precision=bf16 \ - model.megatron_amp_O2=True \ - trainer.gradient_clip_val=1.0 \ - exp_manager.exp_dir=examples/nlp/language_modeling/bert_pretrain_results \ - exp_manager.resume_if_exists=True \ - model.tensor_model_parallel_size=2 \ - model.optim.name=fused_adam \ - model.optim.lr=2e-4 \ - model.optim.sched.warmup_steps=2 \ - model.optim.sched.constant_steps=2 \ - model.optim.sched.min_lr=8e-5 \ - model.max_position_embeddings=128 \ - model.encoder_seq_length=128 \ - model.data.seq_length=128 \ - model.tokenizer.vocab_file=/home/TestData/nlp/megatron_bert/data/bert/vocab.txt \ - model.num_layers=8 \ - model.hidden_size=256 \ - model.num_attention_heads=8 \ - model.activations_checkpoint_method=block \ - model.activations_checkpoint_num_layers=1 \ - model.data.data_prefix=[.5,/home/TestData/nlp/megatron_bert/data/bert/simple_wiki_bert_preproc_text_sentence,.5,/home/TestData/nlp/megatron_bert/data/bert/simple_wiki_bert_preproc_text_sentence] \ - model.data.index_mapping_dir=examples/nlp/language_modeling/bert_index_mappings - AFTER_SCRIPT: | - rm -rf examples/nlp/language_modeling/bert_pretrain_results - rm -rf examples/nlp/language_modeling/bert_index_mappings - - L2_Megatron_Core_Bert_Pretraining_and_Resume_Training: - needs: [cicd-test-container-setup] - uses: ./.github/workflows/_test_template.yml - if: contains(fromJSON(needs.cicd-test-container-setup.outputs.test_to_run), 'L2_Megatron_Core_Bert_Pretraining_and_Resume_Training') || needs.cicd-test-container-setup.outputs.all == 'true' + if: contains(fromJSON(needs.cicd-test-container-setup.outputs.test_to_run), 'L2_Megatron_Core_Bert_Pretraining_and_Resume_Training') || needs.cicd-test-container-setup.outputs.all == 'true' with: RUNNER: self-hosted-azure SCRIPT: | @@ -2508,228 +1848,6 @@ jobs: rm -rf examples/nlp/language_modeling/bert_pretrain_results rm -rf examples/nlp/language_modeling/bert_index_mappings - L2_Megatron_RETRO_Pretraining_and_Resume_Training: - needs: [cicd-test-container-setup] - uses: ./.github/workflows/_test_template.yml - if: contains(fromJSON(needs.cicd-test-container-setup.outputs.test_to_run), 'L2_Megatron_RETRO_Pretraining_and_Resume_Training') || needs.cicd-test-container-setup.outputs.all == 'true' - with: - RUNNER: self-hosted-azure - SCRIPT: | - python examples/nlp/language_modeling/megatron_retro_pretraining.py \ - trainer.num_nodes=1 \ - trainer.devices=2 \ - trainer.precision=bf16 \ - trainer.accelerator=gpu \ - model.data.data_prefix=["none"] \ - exp_manager.exp_dir=examples/nlp/language_modeling/mcore_retro_results \ - model.mcore_gpt=True \ - model.tensor_model_parallel_size=1 \ - model.pipeline_model_parallel_size=1 \ - model.optim.name=distributed_fused_adam \ - model.retro.retro_project_dir=/home/TestData/nlp/megatron_retro/mcore_retro/micro-wiki-core \ - model.data.num_workers=4 \ - model.micro_batch_size=1 \ - model.data.shuffle_documents=False \ - trainer.val_check_interval=30 \ - 
+trainer.num_sanity_val_steps=0 \ - model.init_method_std=0.023 \ - model.optim.lr=6.0e-4 \ - model.megatron_amp_O2=True \ - model.data.splits_string="\"98,2,0\"" \ - model.data.dataloader_type=cyclic \ - trainer.max_steps=10 - - python examples/nlp/language_modeling/megatron_retro_pretraining.py \ - trainer.num_nodes=1 \ - trainer.devices=2 \ - trainer.precision=bf16 \ - trainer.accelerator=gpu \ - model.data.data_prefix=["none"] \ - exp_manager.exp_dir=examples/nlp/language_modeling/mcore_retro_results \ - model.mcore_gpt=True \ - model.tensor_model_parallel_size=1 \ - model.pipeline_model_parallel_size=1 \ - model.optim.name=distributed_fused_adam \ - model.retro.retro_project_dir=/home/TestData/nlp/megatron_retro/mcore_retro/micro-wiki-core \ - model.data.num_workers=4 \ - model.micro_batch_size=1 \ - model.data.shuffle_documents=False \ - trainer.val_check_interval=30 \ - +trainer.num_sanity_val_steps=0 \ - model.init_method_std=0.023 \ - model.optim.lr=6.0e-4 \ - model.megatron_amp_O2=True \ - model.data.splits_string="\"98,2,0\"" \ - model.data.dataloader_type=cyclic \ - trainer.max_steps=20 - AFTER_SCRIPT: | - rm -rf examples/nlp/language_modeling/mcore_retro_results - - L2_Legacy_Megatron_RETRO_Pretraining_and_Resume_Training: - needs: [cicd-test-container-setup] - uses: ./.github/workflows/_test_template.yml - if: contains(fromJSON(needs.cicd-test-container-setup.outputs.test_to_run), 'L2_Legacy_Megatron_RETRO_Pretraining_and_Resume_Training') || needs.cicd-test-container-setup.outputs.all == 'true' - with: - RUNNER: self-hosted-azure - SCRIPT: | - python examples/nlp/language_modeling/megatron_retro_pretraining_legacy.py \ - trainer.devices=2 \ - trainer.num_nodes=1 \ - trainer.accelerator=gpu \ - trainer.accumulate_grad_batches=1 \ - trainer.limit_val_batches=2 \ - exp_manager.resume_if_exists=True \ - trainer.max_steps=10 \ - trainer.precision=16 \ - trainer.gradient_clip_val=1.0 \ - trainer.val_check_interval=10 \ - exp_manager.exp_dir=examples/nlp/language_modeling/retro_legacy_results \ - model.data.data_prefix= \ - model.data.knn_index= \ - model.data.retrieval_prefix= \ - model.tensor_model_parallel_size=2 \ - model.micro_batch_size=4 \ - model.optim.name=fused_adam \ - model.optim.lr=2e-4 \ - model.optim.sched.warmup_steps=2 \ - model.optim.sched.constant_steps=2 \ - model.optim.sched.min_lr=8e-5 \ - model.max_position_embeddings=128 \ - model.encoder_seq_length=128 \ - model.chunk_size=32 \ - model.enc_num_layers=2 \ - model.dec_num_layers=2 \ - model.enc_cross_attention=[1] \ - model.dec_cross_attention=[1] \ - +model.data.mock=True - - python examples/nlp/language_modeling/megatron_retro_pretraining_legacy.py \ - trainer.devices=2 \ - trainer.num_nodes=1 \ - trainer.accelerator=gpu \ - trainer.accumulate_grad_batches=1 \ - trainer.limit_val_batches=2 \ - exp_manager.resume_if_exists=True \ - trainer.max_steps=20 \ - trainer.precision=16 \ - trainer.gradient_clip_val=1.0 \ - trainer.val_check_interval=10 \ - exp_manager.exp_dir=examples/nlp/language_modeling/retro_legacy_results \ - model.data.data_prefix= \ - model.data.knn_index= \ - model.data.retrieval_prefix= \ - model.tensor_model_parallel_size=2 \ - model.micro_batch_size=4 \ - model.optim.name=fused_adam \ - model.optim.lr=2e-4 \ - model.optim.sched.warmup_steps=2 \ - model.optim.sched.constant_steps=2 \ - model.optim.sched.min_lr=8e-5 \ - model.max_position_embeddings=128 \ - model.encoder_seq_length=128 \ - model.chunk_size=32 \ - model.enc_num_layers=2 \ - model.dec_num_layers=2 \ - 
model.enc_cross_attention=[1] \ - model.dec_cross_attention=[1] \ - +model.data.mock=True - AFTER_SCRIPT: | - rm -rf examples/nlp/language_modeling/retro_legacy_results - - # L2_Megatron_RETRO_muTransfer_Pretraining_Performance: - # needs: [cicd-test-container-setup] - # runs-on: self-hosted-azure - # container: - # image: nemoci.azurecr.io/nemo_container_${{ github.run_id }} - # options: - # # --user 0:128 - # --device=/dev/nvidia0 - # --gpus all - # --shm-size=8g - # --env TRANSFORMERS_OFFLINE=0 - # --env HYDRA_FULL_ERROR=1 - # --volume /mnt/datadrive/TestData:/home/TestData - # steps: - # - name: Checkout repository - # uses: actions/checkout@v4 - # - run: | - # python examples/nlp/language_modeling/megatron_retro_mutransfer_pretrain.py \ - # trainer.devices=2 \ - # trainer.num_nodes=1 \ - # trainer.accelerator=gpu \ - # trainer.accumulate_grad_batches=1 \ - # trainer.max_steps=100 \ - # trainer.log_every_n_steps=1 \ - # trainer.precision=16 \ - # trainer.val_check_interval=100 \ - # trainer.limit_val_batches=0 \ - # trainer.gradient_clip_val=1.0 \ - # +trainer.num_sanity_val_steps=0 \ - # exp_manager.exp_dir=examples/nlp/language_modeling/retro_results/ \ - # +exp_manager.version=smalltest \ - # model.data.neighbors=2 \ - # model.megatron_amp_O2=False \ - # model.apply_query_key_layer_scaling=False \ - # model.tensor_model_parallel_size=1 \ - # model.optim.name=muadamw \ - # model.optim.weight_decay=0.1 \ - # model.optim.betas=[0.9,0.95] \ - # model.optim.lr=6e-4 \ - # model.optim.sched.warmup_steps=1000 \ - # model.optim.sched.constant_steps=0 \ - # model.optim.sched.min_lr=6e-5 \ - # model.add_position_embedding=False \ - # model.enc_num_layers=2 \ - # model.dec_num_layers=6 \ - # model.enc_cross_attention=[0] \ - # model.dec_cross_attention=[3,5] \ - # model.hidden_size=96 \ - # model.ffn_hidden_size=384 \ - # model.init_method_std=0.023 \ - # model.num_attention_heads=12 \ - # model.max_position_embeddings=1024 \ - # model.encoder_seq_length=1024 \ - # model.tokenizer.library=megatron \ - # model.tokenizer.type=GPT2BPETokenizer \ - # model.tokenizer.merge_file=/home/TestData/nlp/megatron_retro/gpt2-merges.txt \ - # model.tokenizer.vocab_file=/home/TestData/nlp/megatron_retro/gpt2-vocab.json \ - # model.data.data_prefix=[/home/TestData/nlp/megatron_retro/retro_wiki_test_text_document] \ - # model.data.knn_index=[/home/TestData/nlp/megatron_retro/knn2_map_wiki_test.idx] \ - # model.data.retrieval_prefix=/home/TestData/nlp/megatron_retro/retro_wiki_test_text_document \ - # model.data.index_mapping_dir=/home/TestData/nlp/megatron_retro \ - # model.data.num_workers=8 \ - # model.micro_batch_size=8 \ - # model.normalization=rmsnorm \ - # model.transformer_block_type=pre_ln \ - # model.bias_activation_fusion=True \ - # model.bias_dropout_add_fusion=False \ - # model.masked_softmax_fusion=True \ - # model.hidden_dropout=0 \ - # model.attention_dropout=0 \ - # model.fp32_residual_connection=True \ - # model.shape_file=/home/TestData/nlp/megatron_retro/o1_rel_shape_info_tiny.yaml - - # python -c "import pandas as pd - # import pathlib - # from pandas.testing import assert_frame_equal - # from tensorboard.backend.event_processing.event_accumulator import EventAccumulator - # import torch - # if not (torch.cuda.is_available() and "A100" in torch.cuda.get_device_name()): - # import sys - # sys.exit(0) - # event_file = list(pathlib.Path("examples/nlp/language_modeling/retro_results/megatron_retro/smalltest").glob("events.out.tfevents*"))[0] - # ea = EventAccumulator(str(event_file)).Reload() - # 
vals = [] - # for i in ea.Scalars("reduced_train_loss"): - # vals.append(i.value) - # training_curve = pd.DataFrame({"loss": vals}) - # gt_curve = pd.read_csv("/home/TestData/nlp/megatron_retro/expected_learning_curve.csv") - # assert_frame_equal(training_curve, gt_curve, rtol=1e-3, atol=1e-3)" - - # rm -rf examples/nlp/language_modeling/retro_results - # - uses: "NVIDIA/NeMo/.github/actions/cancel-workflow@main" - # if: "failure()" - L2_RAG_Pipeline_Indexing: needs: [cicd-test-container-setup] uses: ./.github/workflows/_test_template.yml @@ -2766,22 +1884,6 @@ jobs: generating.inference.temperature=1.0 \ generating.query="Which art schools did I applied to?" - L2_BioMegatron_Bert_NER_Task: - needs: [cicd-test-container-setup] - uses: ./.github/workflows/_test_template.yml - if: contains(fromJSON(needs.cicd-test-container-setup.outputs.test_to_run), 'L2_BioMegatron_Bert_NER_Task') || needs.cicd-test-container-setup.outputs.all == 'true' - with: - RUNNER: self-hosted-azure - SCRIPT: | - python examples/nlp/token_classification/token_classification_train.py \ - exp_manager.exp_dir=examples/nlp/language_modeling/token_classification_results \ - trainer.max_epochs=1 \ - model.dataset.data_dir=/home/TestData/nlp/ner \ - model.language_model.pretrained_model_name=biomegatron345m_biovocab_30k_cased \ - model.tokenizer.tokenizer_name=null - AFTER_SCRIPT: | - rm -rf examples/nlp/language_modeling/token_classification_results - L2_Megatron_GPT_Pretraining_and_Resume_Training_TP2: needs: [cicd-test-container-setup] uses: ./.github/workflows/_test_template.yml @@ -3936,103 +3038,6 @@ jobs: AFTER_SCRIPT: | rm /home/TestData/nlp/megatron_gpt/TP2-Temp/test-increase.nemo - L2_Megatron_T5_Pretraining_and_Resume_Training_TP2: - needs: [cicd-test-container-setup] - uses: ./.github/workflows/_test_template.yml - if: contains(fromJSON(needs.cicd-test-container-setup.outputs.test_to_run), 'L2_Megatron_T5_Pretraining_and_Resume_Training_TP2') || needs.cicd-test-container-setup.outputs.all == 'true' - with: - RUNNER: self-hosted-azure - SCRIPT: | - python examples/nlp/language_modeling/megatron_t5_pretraining.py \ - trainer.devices=2 \ - trainer.accelerator=gpu \ - trainer.log_every_n_steps=1 \ - trainer.val_check_interval=10 \ - trainer.limit_val_batches=2 \ - trainer.accumulate_grad_batches=1 \ - trainer.max_steps=10 \ - trainer.precision=bf16 \ - model.megatron_amp_O2=True \ - trainer.gradient_clip_val=1.0 \ - exp_manager.exp_dir=examples/nlp/language_modeling/t5_pretrain_results \ - model.tensor_model_parallel_size=2 \ - model.seq_length=128 \ - model.encoder.num_layers=4 \ - model.encoder.hidden_size=64 \ - model.encoder.num_attention_heads=8 \ - model.encoder.activation=swiglu \ - model.encoder.masked_softmax_fusion=False \ - model.encoder.bias_activation_fusion=False \ - model.encoder.activations_checkpoint_method=block \ - model.encoder.activations_checkpoint_num_layers=1 \ - model.encoder.position_embedding_type=relative \ - model.decoder.num_layers=2 \ - model.decoder.hidden_size=64 \ - model.decoder.num_attention_heads=8 \ - model.decoder.activation=fast-swiglu \ - model.decoder.masked_softmax_fusion=False \ - model.decoder.bias_activation_fusion=False \ - model.decoder.activations_checkpoint_method=block \ - model.decoder.activations_checkpoint_num_layers=1 \ - model.encoder.transformer_block_type=pre_ln \ - model.decoder.transformer_block_type=pre_ln \ - model.data.data_prefix=[.5,/home/TestData/nlp/nmt/toy_data/wmt14-de-en.src,.5,/home/TestData/nlp/nmt/toy_data/wmt14-de-en.ref] \ - 
model.data.index_mapping_dir=examples/nlp/language_modeling/t5_index_mappings \ - model.data.data_impl=text_mmap \ - +model.data.data_impl_kwargs.newline_int=10 \ - +model.data.data_impl_kwargs.header_lines=0 \ - +model.data.data_impl_kwargs.workers=null \ - +model.data.data_impl_kwargs.sort_dataset_paths=False \ - model.share_token_embeddings=False \ - model.share_decoder_tokens_head_embeddings=False - - python examples/nlp/language_modeling/megatron_t5_pretraining.py \ - trainer.devices=2 \ - trainer.accelerator=gpu \ - trainer.log_every_n_steps=1 \ - trainer.val_check_interval=1 \ - trainer.limit_val_batches=2 \ - trainer.accumulate_grad_batches=1 \ - trainer.max_steps=10 \ - trainer.precision=bf16 \ - model.megatron_amp_O2=True \ - trainer.gradient_clip_val=1.0 \ - exp_manager.exp_dir=examples/nlp/language_modeling/t5_pretrain_results \ - exp_manager.resume_if_exists=True \ - model.tensor_model_parallel_size=2 \ - model.seq_length=128 \ - model.encoder.num_layers=4 \ - model.encoder.hidden_size=64 \ - model.encoder.num_attention_heads=8 \ - model.encoder.activation=swiglu \ - model.encoder.masked_softmax_fusion=False \ - model.encoder.bias_activation_fusion=False \ - model.encoder.activations_checkpoint_method=block \ - model.encoder.activations_checkpoint_num_layers=1 \ - model.encoder.position_embedding_type=relative \ - model.decoder.num_layers=2 \ - model.decoder.hidden_size=64 \ - model.decoder.num_attention_heads=8 \ - model.decoder.activation=fast-swiglu \ - model.decoder.masked_softmax_fusion=False \ - model.decoder.bias_activation_fusion=False \ - model.decoder.activations_checkpoint_method=block \ - model.decoder.activations_checkpoint_num_layers=1 \ - model.encoder.transformer_block_type=pre_ln \ - model.decoder.transformer_block_type=pre_ln \ - model.data.data_prefix=[.5,/home/TestData/nlp/nmt/toy_data/wmt14-de-en.src,.5,/home/TestData/nlp/nmt/toy_data/wmt14-de-en.ref] \ - model.data.index_mapping_dir=examples/nlp/language_modeling/t5_index_mappings \ - model.data.data_impl=text_mmap \ - +model.data.data_impl_kwargs.newline_int=10 \ - +model.data.data_impl_kwargs.header_lines=0 \ - +model.data.data_impl_kwargs.workers=null \ - +model.data.data_impl_kwargs.sort_dataset_paths=False \ - model.share_token_embeddings=False \ - model.share_decoder_tokens_head_embeddings=False - AFTER_SCRIPT: | - rm -rf examples/nlp/language_modeling/t5_pretrain_results - rm -rf examples/nlp/language_modeling/t5_index_mappings - L2_Megatron_Core_T5_Pretraining_and_Resume_Training_TP2: needs: [cicd-test-container-setup] uses: ./.github/workflows/_test_template.yml @@ -4488,18 +3493,6 @@ jobs: rm -rf examples/nlp/language_modeling/t5_pretrain_results rm -rf examples/nlp/language_modeling/t5_index_mappings - L2_Megatron_T5_Eval: - needs: [cicd-test-container-setup] - uses: ./.github/workflows/_test_template.yml - if: contains(fromJSON(needs.cicd-test-container-setup.outputs.test_to_run), 'L2_Megatron_T5_Eval') || needs.cicd-test-container-setup.outputs.all == 'true' - with: - RUNNER: self-hosted-azure - SCRIPT: | - python examples/nlp/language_modeling/megatron_t5_eval.py \ - --model_file /home/TestData/nlp/megatron_t5/8m/megatron_t5_8m-refactor.nemo \ - --prompt "How do I fix my GPU memory issue? I am seeing out of memory." \ - --tensor_model_parallel_size 1 - L2_Megatron_Core_T5_Eval: needs: [cicd-test-container-setup] uses: ./.github/workflows/_test_template.yml @@ -4512,196 +3505,6 @@ jobs: --prompt "How do I fix my GPU memory issue? I am seeing out of memory." 
\ --tensor_model_parallel_size 1 - L2_Megatron_BART_Pretraining_and_Resume_Training_TP2: - needs: [cicd-test-container-setup] - uses: ./.github/workflows/_test_template.yml - if: contains(fromJSON(needs.cicd-test-container-setup.outputs.test_to_run), 'L2_Megatron_BART_Pretraining_and_Resume_Training_TP2') || needs.cicd-test-container-setup.outputs.all == 'true' - with: - RUNNER: self-hosted-azure - SCRIPT: | - python examples/nlp/language_modeling/megatron_bart_pretraining.py \ - trainer.devices=2 \ - trainer.accelerator=gpu \ - trainer.log_every_n_steps=1 \ - trainer.val_check_interval=2 \ - trainer.limit_val_batches=2 \ - trainer.accumulate_grad_batches=1 \ - trainer.max_steps=3 \ - trainer.precision=16 \ - trainer.gradient_clip_val=1.0 \ - exp_manager.exp_dir=examples/nlp/language_modeling/bart_pretrain_results \ - model.tensor_model_parallel_size=2 \ - model.seq_length=128 \ - model.encoder.num_layers=4 \ - model.encoder.hidden_size=64 \ - model.encoder.num_attention_heads=8 \ - model.encoder.activation="reglu" \ - model.encoder.bias_activation_fusion=False \ - model.encoder.activations_checkpoint_method="block" \ - model.encoder.activations_checkpoint_num_layers=1 \ - model.decoder.num_layers=4 \ - model.decoder.hidden_size=64 \ - model.decoder.num_attention_heads=8 \ - model.decoder.activation="reglu" \ - model.decoder.bias_activation_fusion=False \ - model.decoder.activations_checkpoint_method="block" \ - model.decoder.activations_checkpoint_num_layers=1 \ - model.data.data_prefix="{train:[1.0,/home/TestData/nlp/megatron_t5/data/pile_val_small_bert_tokenizer_text_document],test:[/home/TestData/nlp/megatron_t5/data/pile_val_small_bert_tokenizer_text_document], validation:[/home/TestData/nlp/megatron_t5/data/pile_val_small_bert_tokenizer_text_document]}" - - python examples/nlp/language_modeling/megatron_bart_pretraining.py \ - trainer.devices=2 \ - trainer.accelerator=gpu \ - trainer.log_every_n_steps=1 \ - trainer.val_check_interval=2 \ - trainer.limit_val_batches=5 \ - trainer.accumulate_grad_batches=1 \ - trainer.max_steps=6 \ - trainer.precision=16 \ - trainer.gradient_clip_val=1.0 \ - exp_manager.exp_dir=examples/nlp/language_modeling/bart_pretrain_results \ - exp_manager.resume_if_exists=True \ - model.tensor_model_parallel_size=2 \ - model.seq_length=128 \ - model.encoder.num_layers=4 \ - model.encoder.hidden_size=64 \ - model.encoder.num_attention_heads=8 \ - model.encoder.activation="reglu" \ - model.encoder.bias_activation_fusion=False \ - model.encoder.activations_checkpoint_method="block" \ - model.encoder.activations_checkpoint_num_layers=1 \ - model.decoder.num_layers=4 \ - model.decoder.hidden_size=64 \ - model.decoder.num_attention_heads=8 \ - model.decoder.activation="reglu" \ - model.decoder.bias_activation_fusion=False \ - model.decoder.activations_checkpoint_method="block" \ - model.decoder.activations_checkpoint_num_layers=1 \ - model.data.data_prefix="{train:[1.0,/home/TestData/nlp/megatron_t5/data/pile_val_small_bert_tokenizer_text_document],test:[/home/TestData/nlp/megatron_t5/data/pile_val_small_bert_tokenizer_text_document], validation:[/home/TestData/nlp/megatron_t5/data/pile_val_small_bert_tokenizer_text_document]}" - AFTER_SCRIPT: | - rm -rf examples/nlp/language_modeling/bart_pretrain_results - - L2_Megatron_BART_Pretraining_and_Resume_Training_PP2: - needs: [cicd-test-container-setup] - uses: ./.github/workflows/_test_template.yml - if: contains(fromJSON(needs.cicd-test-container-setup.outputs.test_to_run), 
'L2_Megatron_BART_Pretraining_and_Resume_Training_PP2') || needs.cicd-test-container-setup.outputs.all == 'true' - with: - RUNNER: self-hosted-azure - SCRIPT: | - python examples/nlp/language_modeling/megatron_bart_pretraining.py \ - trainer.devices=2 \ - trainer.accelerator=gpu \ - trainer.log_every_n_steps=1 \ - trainer.val_check_interval=10 \ - trainer.limit_val_batches=2 \ - trainer.accumulate_grad_batches=1 \ - trainer.max_steps=10 \ - trainer.precision=16 \ - trainer.gradient_clip_val=1.0 \ - exp_manager.exp_dir=examples/nlp/language_modeling/bart_pretrain_results \ - model.pipeline_model_parallel_size=2 \ - model.pipeline_model_parallel_split_rank=1 \ - model.seq_length=256 \ - model.encoder.num_layers=4 \ - model.encoder.hidden_size=64 \ - model.encoder.num_attention_heads=8 \ - model.encoder.activation=geglu \ - model.encoder.bias_activation_fusion=False \ - model.encoder.activations_checkpoint_method=block \ - model.encoder.activations_checkpoint_num_layers=1 \ - model.decoder.num_layers=4 \ - model.decoder.hidden_size=64 \ - model.decoder.num_attention_heads=8 \ - model.decoder.activation=geglu \ - model.decoder.bias_activation_fusion=False \ - model.decoder.activations_checkpoint_method=block \ - model.decoder.activations_checkpoint_num_layers=1 \ - model.data.respect_document_boundaries=False \ - model.data.data_prefix=[.5,/home/TestData/nlp/megatron_t5/data/pile_val_small_bert_tokenizer_text_document,.5,/home/TestData/nlp/megatron_t5/data/pile_val_small_bert_tokenizer_text_document] - - python examples/nlp/language_modeling/megatron_bart_pretraining.py \ - trainer.devices=2 \ - trainer.accelerator=gpu \ - trainer.log_every_n_steps=1 \ - trainer.val_check_interval=1 \ - trainer.limit_val_batches=2 \ - trainer.accumulate_grad_batches=1 \ - trainer.max_steps=10 \ - trainer.precision=16 \ - trainer.gradient_clip_val=1.0 \ - exp_manager.exp_dir=examples/nlp/language_modeling/bart_pretrain_results \ - exp_manager.resume_if_exists=True \ - model.pipeline_model_parallel_size=2 \ - model.pipeline_model_parallel_split_rank=1 \ - model.seq_length=256 \ - model.encoder.num_layers=4 \ - model.encoder.hidden_size=64 \ - model.encoder.num_attention_heads=8 \ - model.encoder.activation=geglu \ - model.encoder.bias_activation_fusion=False \ - model.encoder.activations_checkpoint_method=block \ - model.encoder.activations_checkpoint_num_layers=1 \ - model.decoder.num_layers=4 \ - model.decoder.hidden_size=64 \ - model.decoder.num_attention_heads=8 \ - model.decoder.activation=geglu \ - model.decoder.bias_activation_fusion=False \ - model.decoder.activations_checkpoint_method=block \ - model.decoder.activations_checkpoint_num_layers=1 \ - model.data.respect_document_boundaries=False \ - model.data.data_prefix=[.5,/home/TestData/nlp/megatron_t5/data/pile_val_small_bert_tokenizer_text_document,.5,/home/TestData/nlp/megatron_t5/data/pile_val_small_bert_tokenizer_text_document] - AFTER_SCRIPT: | - rm -rf examples/nlp/language_modeling/bart_pretrain_results - - - L2_Megatron_T5_PEFT_Lora_TP2: - needs: [cicd-test-container-setup] - uses: ./.github/workflows/_test_template.yml - if: contains(fromJSON(needs.cicd-test-container-setup.outputs.test_to_run), 'L2_Megatron_T5_PEFT_Lora_TP2') || needs.cicd-test-container-setup.outputs.all == 'true' - with: - RUNNER: self-hosted-azure - SCRIPT: | - - python examples/nlp/language_modeling/tuning/megatron_t5_finetuning.py \ - trainer.devices=2 \ - trainer.log_every_n_steps=1 \ - trainer.max_epochs=9999 \ - trainer.max_steps=3 \ - trainer.val_check_interval=3 \ - 
++trainer.limit_val_batches=2 \ - trainer.precision=16 \ - exp_manager.exp_dir=/tmp/nlp_t5_lora_tuning_tp2 \ - model.pipeline_model_parallel_size=1 \ - model.tensor_model_parallel_size=2 \ - model.restore_from_path=/home/TestData/nlp/megatron_t5/8m/megatron_t5_8m_tp2.nemo \ - model.peft.peft_scheme=lora \ - model.answer_only_loss=True \ - model.micro_batch_size=1 \ - model.global_batch_size=1 \ - model.data.train_ds.file_names=[/home/TestData/nlp/megatron_sft/quarel.jsonl] \ - model.data.train_ds.concat_sampling_probabilities=[1.0] \ - model.data.train_ds.num_workers=0 \ - model.data.validation_ds.num_workers=0 \ - model.data.validation_ds.file_names=[/home/TestData/nlp/megatron_sft/quarel.jsonl] \ - model.data.validation_ds.names=[quarel] - - python examples/nlp/language_modeling/tuning/megatron_t5_generate.py \ - model.restore_from_path=/home/TestData/nlp/megatron_t5/8m/megatron_t5_8m_tp2.nemo \ - model.peft.restore_from_path=/tmp/nlp_t5_lora_tuning_tp2/megatron_t5_peft_lora_tuning/checkpoints/megatron_t5_peft_lora_tuning.nemo \ - model.peft.restore_from_ckpt_name=null \ - model.peft.restore_from_hparams_path=null \ - model.tensor_model_parallel_size=2 \ - trainer.devices=2 \ - model.data.test_ds.file_names=[/home/TestData/nlp/megatron_sft/quarel_4.jsonl] \ - model.data.test_ds.names=[quarel4] \ - model.global_batch_size=2 \ - model.micro_batch_size=1 \ - model.data.test_ds.tokens_to_generate=10 \ - model.data.test_ds.write_predictions_to_file=True \ - model.data.test_ds.output_file_path_prefix=/tmp/nlp_t5_lora_tuning_tp2/out \ - inference.greedy=True \ - inference.repetition_penalty=1.0 \ - inference.outfile_path=/tmp/nlp_t5_lora_tuning_tp2/out.jsonl - L2_Megatron_Core_T5_PEFT_Lora_TP2: needs: [cicd-test-container-setup] uses: ./.github/workflows/_test_template.yml @@ -5084,7 +3887,7 @@ jobs: rm -rf tests/collections/llm/gpt_pretrain_results rm -rf tests/collections/llm/gpt_index_mappings - OPTIONAL_L2_NeMo_2_GPT_DDP_Param_Parity_check: + L2_NeMo_2_GPT_DDP_Param_Parity_check: needs: [cicd-test-container-setup] uses: ./.github/workflows/_test_template.yml if: contains(fromJSON(needs.cicd-test-container-setup.outputs.test_to_run), 'OPTIONAL_L2_NeMo_2_GPT_DDP_Param_Parity_check') || needs.cicd-test-container-setup.outputs.all == 'true' @@ -5092,7 +3895,7 @@ jobs: RUNNER: self-hosted-azure SCRIPT: | - python tests/lightning/test_ddp_parity_checker.py \ + TORCHDYNAMO_DISABLE=1 python tests/lightning/test_ddp_parity_checker.py \ --vocab-path=/home/TestData/nlp/megatron_gpt/data/gpt/vocab.json \ --merges-path=/home/TestData/nlp/megatron_gpt/data/gpt/merges.txt \ --data-path=/home/TestData/nlp/megatron_gpt/data/gpt/simple_wiki_gpt_preproc_text_document @@ -5100,8 +3903,7 @@ jobs: AFTER_SCRIPT: | rm -rf tests/collections/llm/gpt_pretrain_results rm -rf tests/collections/llm/gpt_index_mappings - IS_OPTIONAL: true - + L2_NeMo_2_SSM_Pretraining: needs: [cicd-test-container-setup] uses: ./.github/workflows/_test_template.yml @@ -5182,6 +3984,22 @@ jobs: AFTER_SCRIPT: | rm -rf tests/collections/llm/t5_finetune_results/${{ github.run_id }} + L2_NeMo_2_T5_LoRA: + needs: [cicd-test-container-setup] + uses: ./.github/workflows/_test_template.yml + if: contains(fromJSON(needs.cicd-test-container-setup.outputs.test_to_run), 'L2_NeMo_2_T5_LoRA') || needs.cicd-test-container-setup.outputs.all == 'true' + with: + RUNNER: self-hosted-azure + SCRIPT: | + NVTE_FUSED_ATTN=0 NVTE_FLASH_ATTN=0 python tests/collections/llm/megatron_t5_finetuning.py \ + --devices=2 \ + --max-steps=250 \ + --peft=lora \ + 
--experiment-dir=tests/collections/llm/t5_peft_results/${{ github.run_id }} \ + --checkpoint-path=/home/TestData/nlp/megatron_t5/220m/nemo2.0_t5_220m_150steps + AFTER_SCRIPT: | + rm -rf tests/collections/llm/t5_peft_results/${{ github.run_id }} + L2_NeMo_2_Mixtral_Pretraining: needs: [cicd-test-container-setup] uses: ./.github/workflows/_test_template.yml @@ -5205,7 +4023,7 @@ jobs: --restore_path /home/TestData/nemo2_ckpt/llama_68M \ --devices 2 \ --max_steps 3 \ - --experiment_dir tests/collections/llm/${{ github.run_id }} \ + --experiment_dir /tmp/nemo2_gpt_finetune/${{ github.run_id }} \ --peft none \ --tp_size 1 \ --pp_size 1 \ @@ -5215,14 +4033,12 @@ jobs: --restore_path /home/TestData/nemo2_ckpt/llama_68M \ --devices 2 \ --max_steps 6 \ - --experiment_dir tests/collections/llm/${{ github.run_id }} \ + --experiment_dir /tmp/nemo2_gpt_finetune/${{ github.run_id }} \ --peft none \ --tp_size 1 \ --pp_size 1 \ --mbs 1 - AFTER_SCRIPT: | - rm -rf tests/collections/llm/${{ github.run_id }} L2_NeMo_2_GPT_SFT_TP1PP1_MBS2: needs: [cicd-test-container-setup] @@ -5236,7 +4052,7 @@ jobs: --restore_path /home/TestData/nemo2_ckpt/llama_68M \ --devices 2 \ --max_steps 3 \ - --experiment_dir tests/collections/llm/${{ github.run_id }} \ + --experiment_dir /tmp/nemo2_gpt_finetune/${{ github.run_id }} \ --peft none \ --tp_size 1 \ --pp_size 1 \ @@ -5246,14 +4062,12 @@ jobs: --restore_path /home/TestData/nemo2_ckpt/llama_68M \ --devices 2 \ --max_steps 6 \ - --experiment_dir tests/collections/llm/${{ github.run_id }} \ + --experiment_dir /tmp/nemo2_gpt_finetune/${{ github.run_id }} \ --peft none \ --tp_size 1 \ --pp_size 1 \ --mbs 2 - AFTER_SCRIPT: | - rm -rf tests/collections/llm/${{ github.run_id }} L2_NeMo_2_GPT_SFT_TP1PP2_MBS2: needs: [cicd-test-container-setup] @@ -5267,7 +4081,7 @@ jobs: --restore_path /home/TestData/nemo2_ckpt/llama_68M \ --devices 2 \ --max_steps 3 \ - --experiment_dir tests/collections/llm/${{ github.run_id }} \ + --experiment_dir /tmp/nemo2_gpt_finetune/${{ github.run_id }} \ --peft none \ --tp_size 1 \ --pp_size 2 \ @@ -5277,14 +4091,12 @@ jobs: --restore_path /home/TestData/nemo2_ckpt/llama_68M \ --devices 2 \ --max_steps 6 \ - --experiment_dir tests/collections/llm/${{ github.run_id }} \ + --experiment_dir /tmp/nemo2_gpt_finetune/${{ github.run_id }} \ --peft none \ --tp_size 1 \ --pp_size 2 \ --mbs 2 - AFTER_SCRIPT: | - rm -rf tests/collections/llm/${{ github.run_id }} L2_NeMo_2_GPT_SFT_TP2PP1_MBS2: needs: [cicd-test-container-setup] @@ -5298,7 +4110,7 @@ jobs: --restore_path /home/TestData/nemo2_ckpt/llama_68M \ --devices 2 \ --max_steps 3 \ - --experiment_dir tests/collections/llm/${{ github.run_id }} \ + --experiment_dir /tmp/nemo2_gpt_finetune/${{ github.run_id }} \ --peft none \ --tp_size 2 \ --pp_size 1 \ @@ -5308,14 +4120,41 @@ jobs: --restore_path /home/TestData/nemo2_ckpt/llama_68M \ --devices 2 \ --max_steps 6 \ - --experiment_dir tests/collections/llm/${{ github.run_id }} \ + --experiment_dir /tmp/nemo2_gpt_finetune/${{ github.run_id }} \ --peft none \ --tp_size 2 \ --pp_size 1 \ --mbs 2 - AFTER_SCRIPT: | - rm -rf tests/collections/llm/${{ github.run_id }} + + L2_NeMo_2_GPT_SFT_TP1PP1_MBS1_PACKED: + needs: [cicd-test-container-setup] + uses: ./.github/workflows/_test_template.yml + if: contains(fromJSON(needs.cicd-test-container-setup.outputs.test_to_run), 'L2_NeMo_2_GPT_SFT_TP1PP1_MBS1_PACKED') || needs.cicd-test-container-setup.outputs.all == 'true' + with: + RUNNER: self-hosted-azure + SCRIPT: | + + python tests/collections/llm/gpt_finetuning.py \ + 
--restore_path /home/TestData/nemo2_ckpt/llama_68M \ + --devices 2 \ + --max_steps 3 \ + --experiment_dir /tmp/nemo2_gpt_finetune/${{ github.run_id }} \ + --peft none \ + --tp_size 1 \ + --pp_size 1 \ + --mbs 1 --packed + + python tests/collections/llm/gpt_finetuning.py \ + --restore_path /home/TestData/nemo2_ckpt/llama_68M \ + --devices 2 \ + --max_steps 6 \ + --experiment_dir /tmp/nemo2_gpt_finetune/${{ github.run_id }} \ + --peft none \ + --tp_size 1 \ + --pp_size 1 \ + --mbs 1 --packed + L2_NeMo_2_GPT_LoRA_TP1PP1_MBS1: needs: [cicd-test-container-setup] @@ -5329,7 +4168,7 @@ jobs: --restore_path /home/TestData/nemo2_ckpt/llama_68M \ --devices 2 \ --max_steps 3 \ - --experiment_dir tests/collections/llm/${{ github.run_id }} \ + --experiment_dir /tmp/nemo2_gpt_finetune/${{ github.run_id }} \ --peft lora \ --tp_size 1 \ --pp_size 1 \ @@ -5339,14 +4178,12 @@ jobs: --restore_path /home/TestData/nemo2_ckpt/llama_68M \ --devices 2 \ --max_steps 6 \ - --experiment_dir tests/collections/llm/${{ github.run_id }} \ + --experiment_dir /tmp/nemo2_gpt_finetune/${{ github.run_id }} \ --peft lora \ --tp_size 1 \ --pp_size 1 \ --mbs 1 - AFTER_SCRIPT: | - rm -rf tests/collections/llm/${{ github.run_id }} L2_NeMo_2_GPT_LoRA_TP1PP1_MBS2: needs: [cicd-test-container-setup] @@ -5360,7 +4197,7 @@ jobs: --restore_path /home/TestData/nemo2_ckpt/llama_68M \ --devices 2 \ --max_steps 3 \ - --experiment_dir tests/collections/llm/${{ github.run_id }} \ + --experiment_dir /tmp/nemo2_gpt_finetune/${{ github.run_id }} \ --peft lora \ --tp_size 1 \ --pp_size 1 \ @@ -5370,14 +4207,12 @@ jobs: --restore_path /home/TestData/nemo2_ckpt/llama_68M \ --devices 2 \ --max_steps 6 \ - --experiment_dir tests/collections/llm/${{ github.run_id }} \ + --experiment_dir /tmp/nemo2_gpt_finetune/${{ github.run_id }} \ --peft lora \ --tp_size 1 \ --pp_size 1 \ --mbs 2 - AFTER_SCRIPT: | - rm -rf tests/collections/llm/${{ github.run_id }} L2_NeMo_2_GPT_LoRA_TP1PP2_MBS2: needs: [cicd-test-container-setup] @@ -5391,7 +4226,7 @@ jobs: --restore_path /home/TestData/nemo2_ckpt/llama_68M \ --devices 2 \ --max_steps 3 \ - --experiment_dir tests/collections/llm/${{ github.run_id }} \ + --experiment_dir /tmp/nemo2_gpt_finetune/${{ github.run_id }} \ --peft lora \ --tp_size 1 \ --pp_size 2 \ @@ -5401,14 +4236,12 @@ jobs: --restore_path /home/TestData/nemo2_ckpt/llama_68M \ --devices 2 \ --max_steps 6 \ - --experiment_dir tests/collections/llm/${{ github.run_id }} \ + --experiment_dir /tmp/nemo2_gpt_finetune/${{ github.run_id }} \ --peft lora \ --tp_size 1 \ --pp_size 2 \ --mbs 2 - AFTER_SCRIPT: | - rm -rf tests/collections/llm/${{ github.run_id }} L2_NeMo_2_GPT_LoRA_TP2PP1_MBS2: needs: [cicd-test-container-setup] @@ -5422,7 +4255,7 @@ jobs: --restore_path /home/TestData/nemo2_ckpt/llama_68M \ --devices 2 \ --max_steps 3 \ - --experiment_dir tests/collections/llm/${{ github.run_id }} \ + --experiment_dir /tmp/nemo2_gpt_finetune/${{ github.run_id }} \ --peft lora \ --tp_size 2 \ --pp_size 1 \ @@ -5432,14 +4265,39 @@ jobs: --restore_path /home/TestData/nemo2_ckpt/llama_68M \ --devices 2 \ --max_steps 6 \ - --experiment_dir tests/collections/llm/${{ github.run_id }} \ + --experiment_dir /tmp/nemo2_gpt_finetune/${{ github.run_id }} \ --peft lora \ --tp_size 2 \ --pp_size 1 \ --mbs 2 - AFTER_SCRIPT: | - rm -rf tests/collections/llm/${{ github.run_id }} + L2_NeMo_2_GPT_LoRA_TP1PP1_MBS1_PACKED: + needs: [cicd-test-container-setup] + uses: ./.github/workflows/_test_template.yml + if: contains(fromJSON(needs.cicd-test-container-setup.outputs.test_to_run), 
'L2_NeMo_2_GPT_LoRA_TP1PP1_MBS1_PACKED') || needs.cicd-test-container-setup.outputs.all == 'true' + with: + RUNNER: self-hosted-azure + SCRIPT: | + + python tests/collections/llm/gpt_finetuning.py \ + --restore_path /home/TestData/nemo2_ckpt/llama_68M \ + --devices 2 \ + --max_steps 3 \ + --experiment_dir /tmp/nemo2_gpt_finetune/${{ github.run_id }} \ + --peft lora \ + --tp_size 1 \ + --pp_size 1 \ + --mbs 1 --packed + + python tests/collections/llm/gpt_finetuning.py \ + --restore_path /home/TestData/nemo2_ckpt/llama_68M \ + --devices 2 \ + --max_steps 6 \ + --experiment_dir /tmp/nemo2_gpt_finetune/${{ github.run_id }} \ + --peft lora \ + --tp_size 1 \ + --pp_size 1 \ + --mbs 1 --packed L2_NeMo_2_NeMo_Mcore_Mixtral_bitexact: needs: [cicd-test-container-setup] @@ -5456,7 +4314,7 @@ jobs: - gpu-test - cicd-test-container-setup - #- OPTIONAL_L0_Unit_Tests_GPU_ASR + - L0_Unit_Tests_GPU_ASR - L0_Unit_Tests_GPU_Audio - L0_Unit_Tests_GPU_Common - L0_Unit_Tests_GPU_LLM @@ -5507,19 +4365,10 @@ jobs: - L2_ASR_Adapters_Linear_Adapters - L2_ASR_Adapters_RelPos_MHA_Adapters - L2_Speech_Transcription_Speech_to_Text_Transcribe - #- OPTIONAL_L2_Transducer_alignment_Running_pytest - L2_Segmentation_Tool_Parallel_ctc_segmentation_test_L2_Eng_CitriNet_with_wav - L2_Segmentation_Tool_Parallel_ctc_segmentation_test_L2_Ru_QN_with_mp3 - L2_G2P_Models_G2P_Conformer_training_evaluation_and_inference - L2_G2P_Models_HeteronymClassificationModel_training_evaluation_and_inference - - L2_Duplex_Text_Normalization_with_Tarred_dataset - - L2_Intent_and_Slot_Classification_Tasks_Intent_and_Slot_Classification - - L2_Intent_and_Slot_Classification_Tasks_Multi-Label_Intent_and_Slot_Classification - - L2_Parallel_NLP_Examples2_NER_finetuning_from_pretrained_Test - - L2_Parallel_NLP_Examples2_Punctuation_and_capitalization_finetuning_from_pretrained_test - - L2_Parallel_NLP_Examples2_NER_with_TurkuNLP__bert-base-finnish-cased-v1 - - L2_Parallel_NLP_Examples2_Evaluation_script_for_Token_Classification - - L2_Parallel_NLP_Examples2_Evaluation_script_for_Punctuation - L2_Pretraining_BERT_pretraining_from_Text - L2_Pretraining_BERT_from_Preprocessed - L2_NMT_Attention_is_All_You_Need_Training_NMT_Training_Post-LN @@ -5530,15 +4379,10 @@ jobs: - L2_NMT_Tarred_Dataset_Creation_Auto_Tarred_Dataset_Creation - L2_NMT_Tarred_Dataset_Creation_Script_Tarred_Dataset_Creation - L2_Megatron_NMT_Training_TP2 - - L2_Megatron_BART_Perceiver_MIM_Training_TP2 - L2_Megatron_Bert_Pretraining_and_Resume_Training_with_Pipeline_Parallelism - - L2_Megatron_Bert_Pretraining_and_Resume_Training - L2_Megatron_Core_Bert_Pretraining_and_Resume_Training - - L2_Legacy_Megatron_RETRO_Pretraining_and_Resume_Training - - L2_Megatron_RETRO_Pretraining_and_Resume_Training - L2_RAG_Pipeline_Indexing - L2_RAG_Pipeline_Generating - - L2_BioMegatron_Bert_NER_Task - L2_Megatron_GPT_Pretraining_and_Resume_Training_TP2 - L2_Megatron_GPT_Skip_Train - L2_Megatron_GPT_with_Rope_Pretraining_and_Resume_Training_TP2 @@ -5559,18 +4403,13 @@ jobs: - L2_Megatron_GPT_SFT_Eval_inference_seq_len_greaterThan_training_seq_len - L2_Megatron_Change_Partitions_Reduce_TP_Num_Partitions_-2_to_1-_and_PP_Num_Partitions_-1_to_2 - L2_Megatron_Change_Partitions_Increase_TP_Num_Partitions_-2_to_4-_and_PP_Num_Partitions_-1_to_2 - - L2_Megatron_T5_Pretraining_and_Resume_Training_TP2 - L2_Megatron_Core_T5_Pretraining_and_Resume_Training_TP2 - L2_Megatron_T5_with_ALiBi_Pretraining_and_Resume_Training_TP2 - L2_Megatron_T5_with_KERPLE_Pretraining_and_Resume_Training_TP2 - 
L2_Megatron_T5_Pretraining_and_Resume_Training_PP2 - L2_Megatron_T5_w_Mixture_of_Expert_Pretraining - L2_Megatron_UL2_Pretraining_and_Resume_Training_TP2 - - L2_Megatron_T5_Eval - L2_Megatron_Core_T5_Eval - - L2_Megatron_BART_Pretraining_and_Resume_Training_TP2 - - L2_Megatron_BART_Pretraining_and_Resume_Training_PP2 - - L2_Megatron_T5_PEFT_Lora_TP2 - L2_Megatron_Core_T5_PEFT_Lora_TP2 - L2_Megatron_Mock_Data_Generation_MockGPTDataset - L2_Megatron_Mock_Data_Generation_MockT5Dataset @@ -5583,20 +4422,23 @@ jobs: - Speech_Checkpoints_tests - L2_Stable_Diffusion_Training - L2_NeMo_2_GPT_Pretraining_no_transformer_engine - #- OPTIONAL_L2_NeMo_2_GPT_DDP_Param_Parity_check + - L2_NeMo_2_GPT_DDP_Param_Parity_check - L2_NeMo_2_HF_MODEL_IMPORT - L2_NeMo_2_SSM_Pretraining - L2_NeMo_2_SSM_Finetuning - L2_NeMo_2_T5_Pretraining - L2_NeMo_2_T5_Finetuning + - L2_NeMo_2_T5_LoRA - L2_NeMo_2_GPT_SFT_TP1PP1_MBS1 - L2_NeMo_2_GPT_SFT_TP1PP1_MBS2 - L2_NeMo_2_GPT_SFT_TP1PP2_MBS2 - L2_NeMo_2_GPT_SFT_TP2PP1_MBS2 + - L2_NeMo_2_GPT_SFT_TP1PP1_MBS1_PACKED - L2_NeMo_2_GPT_LoRA_TP1PP1_MBS1 - L2_NeMo_2_GPT_LoRA_TP1PP1_MBS2 - L2_NeMo_2_GPT_LoRA_TP1PP2_MBS2 - L2_NeMo_2_GPT_LoRA_TP2PP1_MBS2 + - L2_NeMo_2_GPT_LoRA_TP1PP1_MBS1_PACKED - L2_NeMo_2_Mixtral_Pretraining - L2_PTQ_Llama2_INT8_SQ - L2_PTQ_Llama2_FP8 @@ -5744,4 +4586,4 @@ jobs: - name: "Pipeline not successful, set exit code to 1" if: ${{ always() && steps.pipeline-conclusion.outputs.SUCCESS == 'false' }} - run: exit 1 \ No newline at end of file + run: exit 1 diff --git a/Dockerfile.ci b/Dockerfile.ci index c4bc96e50c3d..f01025873628 100644 --- a/Dockerfile.ci +++ b/Dockerfile.ci @@ -53,7 +53,7 @@ RUN pip install nemo_run@git+https://github.com/NVIDIA/NeMo-Run.git@${NEMO_RUN_T # Install NeMo requirements ARG TE_TAG=7d576ed25266a17a7b651f2c12e8498f67e0baea ARG MODELOPT_VERSION=0.17.0 -ARG MCORE_TAG=228dc2045b9a985e051f4fba4bbb1f579cc76e6f +ARG MCORE_TAG=db7d37b54ef96e35f7afc56e29fffb60f5c957b9 ARG APEX_TAG=810ffae374a2b9cb4b5c5e28eaeca7d7998fca0c RUN \ diff --git a/docs/source/asr/asr_language_modeling_and_customization.rst b/docs/source/asr/asr_language_modeling_and_customization.rst index 02fed8b89760..0b4f7a7e730f 100644 --- a/docs/source/asr/asr_language_modeling_and_customization.rst +++ b/docs/source/asr/asr_language_modeling_and_customization.rst @@ -19,8 +19,7 @@ N-gram Language Modeling In this approach, an N-gram LM is trained on text data, then it is used in fusion with beam search decoding to find the best candidates. The beam search decoders in NeMo support language models trained with KenLM library ( `https://github.com/kpu/kenlm `__). -The beam search decoders and KenLM library are not installed by default in NeMo, and you need to install them to be -able to use beam search decoding and N-gram LM. +The beam search decoders and KenLM library are not installed by default in NeMo. You need to install them to be able to use beam search decoding and N-gram LM. Please refer to `scripts/asr_language_modeling/ngram_lm/install_beamsearch_decoders.sh `__ on how to install them. Alternatively, you can build Docker image `scripts/installers/Dockerfile.ngramtools `__ with all the necessary dependencies. @@ -34,25 +33,20 @@ the scores produced by the N-gram LM into its score calculations as the followin final_score = acoustic_score + beam_alpha*lm_score + beam_beta*seq_length where acoustic_score is the score predicted by the acoustic encoder and lm_score is the one estimated by the LM. 
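As a rough illustration of how this fusion behaves, the sketch below applies the formula to a couple of hypothetical candidates; the scores and the ``beam_alpha``/``beam_beta`` values are made up for demonstration and are not NeMo defaults.

.. code-block:: python

    # Illustrative fusion of acoustic and N-gram LM scores (hypothetical numbers).
    candidates = [
        # (text, acoustic_score, lm_score)
        ("the cat sat", -12.3, -8.1),
        ("the cat sat down", -12.9, -7.4),
    ]

    beam_alpha = 0.7   # weight on the N-gram LM score
    beam_beta = -1.0   # negative values penalize longer sequences

    def fused_score(acoustic_score, lm_score, seq_length):
        return acoustic_score + beam_alpha * lm_score + beam_beta * seq_length

    best = max(candidates, key=lambda c: fused_score(c[1], c[2], len(c[0].split())))
    print(best[0])

Increasing ``beam_alpha`` shifts the ranking toward the candidate the LM prefers, while a more negative ``beam_beta`` favors the shorter candidate.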
-Parameter 'beam_alpha' specifies amount of importance to place on the N-gram language model, and 'beam_beta' is a -penalty term to consider the sequence length in the scores. Larger alpha means more importance on the LM and less -importance on the acoustic model. Negative values for beta will give penalty to longer sequences and make the decoder -to prefer shorter predictions, while positive values would result in longer candidates. +The parameter 'beam_alpha' determines the weight given to the N-gram language model, while 'beam_beta' is a penalty term that accounts for sequence length in the scores. A larger 'beam_alpha' places more emphasis on the language model and less on the acoustic model. Negative values for 'beam_beta' penalize longer sequences, encouraging the decoder to prefer shorter predictions. Conversely, positive values for 'beam_beta' favor longer candidates. .. _train-ngram-lm: Train N-gram LM =============== -The script to train an N-gram language model with KenLM can be found at +The script to train an N-gram language model with KenLM can be found at: `scripts/asr_language_modeling/ngram_lm/train_kenlm.py `__. -This script would train an N-gram language model with KenLM library which can be used with the beam search decoders -on top of the ASR models. This script supports both character level and BPE level encodings and models which are -detected automatically from the type of the model. +This script trains an N-gram language model with the KenLM library which can then be used with the beam search decoders on top of the ASR models. This script also supports both character-level and BPE-level encodings and models which are detected automatically from the model type. -You may train the N-gram model as the following: +You can train the N-gram model using the following: .. code-block:: @@ -65,14 +59,13 @@ You may train the N-gram model as the following: The `train_paths` parameter allows for various input types, such as a list of text files, JSON manifests, or directories, to be used as the training data. If the file's extension is anything other than `.json`, it assumes that data format is plain text. For plain text format, each line should contain one -sample. For JSON manifest file, the file need to contain json formatted samples per each line like this: +sample. For the JSON manifests, the file must contain JSON-formatted samples per each line like this: .. code-block:: {"audio_filepath": "/data_path/file1.wav", "text": "The transcript of the audio file."} -It just extracts the `text` field from each line to create the training text file. After the N-gram model is trained, -it is stored at the path specified by `kenlm_model_file`. +This code extracts the `text` field from each line to create the training text file. After the N-gram model is trained, it is stored at the path specified by `kenlm_model_file`. The following is the list of the arguments for the training script: @@ -98,7 +91,8 @@ The following is the list of the arguments for the training script: | verbose | int | 1 | Verbose level. | +------------------+-----------+-------------+--------------------------------------------------------------------------------------------------------------------------------+ -** Note: Recommend to use 6 as the order of the N-gram model for BPE-based models. Higher orders may need the re-compilation of KenLM to support it. +..note:: +It is recommended that you use 6 as the order of the N-gram model for BPE-based models. 
Higher orders may require re-compiling KenLM to support them. Evaluate by Beam Search Decoding and N-gram LM ============================================== @@ -107,9 +101,9 @@ NeMo's beam search decoders are capable of using the KenLM's N-gram models to fi The script to evaluate an ASR model with beam search decoding and N-gram models can be found at `scripts/asr_language_modeling/ngram_lm/eval_beamsearch_ngram.py `__. -This script has a large number of possible argument overrides, therefore it is advised to use ``python eval_beamsearch_ngram.py --help`` to see the full list of arguments. +This script has a large number of possible argument overrides; therefore, it is recommended that you use ``python eval_beamsearch_ngram.py --help`` to see the full list of arguments. -You may evaluate an ASR model as the following: +You can evaluate an ASR model using the following: .. code-block:: @@ -124,25 +118,18 @@ You may evaluate an ASR model as the following: decoding_mode=beamsearch_ngram \ decoding_strategy="" -It can evaluate a model in the three following modes by setting the argument `--decoding_mode`: +It can evaluate a model in the following three modes by setting the argument `--decoding_mode`: -* greedy: Just greedy decoding is done, and no beam search decoding is performed. -* beamsearch: The beam search decoding is done but without using the N-gram language model, final results would be equivalent to setting the weight of LM (beam_beta) to zero. +* greedy: Just greedy decoding is done and no beam search decoding is performed. +* beamsearch: The beam search decoding is done, but without using the N-gram language model. Final results are equivalent to setting the weight of LM (beam_beta) to zero. * beamsearch_ngram: The beam search decoding is done with N-gram LM. -The `beamsearch` mode would evaluate by beam search decoding without any language model. -It would report the performances in terms of Word Error Rate (WER) and Character Error Rate (CER). Moreover, -the WER/CER of the model when the best candidate is selected among the candidates is also reported as the best WER/CER. -It can be an indicator of how good the predicted candidates are. +In `beamsearch` mode, the evaluation is performed using beam search decoding without any language model. The performance is reported in terms of Word Error Rate (WER) and Character Error Rate (CER). Moreover, when the best candidate is selected among the candidates, it is also reported as the best WER/CER. This can serve as an indicator of the quality of the predicted candidates. + -The script would initially load the ASR model and predict the outputs of the model's encoder as log probabilities. -This part would be computed in batches on a device selected by `--device`, which can be CPU (`--device=cpu`) or a -single GPU (`--device=cuda:0`). The batch size of this part can get specified by `--acoustic_batch_size`. You may use -the largest batch size feasible to speed up the step of calculating the log probabilities. You may also use `--use_amp` -to speed up the calculation of log probabilities and make it possible to use larger sizes for `--acoustic_batch_size`. -Currently multi-GPU is not supported for calculating the log probabilities, but using `--probs_cache_file` can help. -It stores the log probabilities produced from the model's encoder into a pickle file so that next time the first step -can get skipped. +The script initially loads the ASR model and predicts the outputs of the model's encoder as log probabilities. 
This part is computed in batches on a device specified by --device, which can be either a CPU (`--device=cpu`) or a single GPU (`--device=cuda:0`). +The batch size for this part is specified by `--acoustic_batch_size`. Using the largest feasible batch size can speed up the calculation of log probabilities. Additionally, you can use `--use_amp` to accelerate the calculation and allow for larger --acoustic_batch_size values. +Currently, multi-GPU support is not available for calculating log probabilities. However, using `--probs_cache_file` can help. This option stores the log probabilities produced by the model’s encoder in a pickle file, allowing you to skip the first step in future runs. The following is the list of the important arguments for the evaluation script: @@ -191,8 +178,7 @@ The following is the list of the important arguments for the evaluation script: | text_processing.separate_punctuation | bool | ``True`` | Whether to separate punctuation with the previous word by space. | +--------------------------------------+----------+------------------+-------------------------------------------------------------------------+ -Width of the beam search (`--beam_width`) specifies the number of top candidates/predictions the beam search decoder -would search for. Larger beams result in more accurate but slower predictions. +The width of the beam search (`--beam_width`) specifies the number of top candidates or predictions the beam search decoder will consider. Larger beam widths result in more accurate but slower predictions. .. note:: @@ -200,14 +186,13 @@ would search for. Larger beams result in more accurate but slower predictions. Therefore it is possible to forward arguments for various beam search libraries such as ``flashlight`` and ``pyctcdecode`` via the ``decoding`` subconfig. -There is also a tutorial to learn more about evaluating the ASR models with N-gram LM here: +To learn more about evaluating the ASR models with N-gram LM, refer to the tutorial here: Offline ASR Inference with Beam Search and External Language Model Rescoring `Offline ASR Inference with Beam Search and External Language Model Rescoring `_ Beam Search Engines ------------------- -NeMo ASR CTC supports multiple beam search engines for decoding. The default engine is ``beam`` which is the OpenSeq2Seq -decoding library. +NeMo ASR CTC supports multiple beam search engines for decoding. The default engine is beam, which is the OpenSeq2Seq decoding library. OpenSeq2Seq (``beam``) ~~~~~~~~~~~~~~~~~~~~~~ @@ -220,10 +205,9 @@ The config for this decoding library is described above. Flashlight (``flashlight``) ~~~~~~~~~~~~~~~~~~~~~~~~~~~ -Flashlight is a C++ library for ASR decoding provided at `https://github.com/flashlight/flashlight `_. It is a CPU and CUDA-based beam search engine that is quite efficient and supports -char and subword models. It requires an ARPA KenLM file. +Flashlight is a C++ library for ASR decoding provided at `https://github.com/flashlight/flashlight `_. It is a CPU- and CUDA-based beam search engine that is quite efficient and supports char and subword models. It requires an ARPA KenLM file. -It supports several advanced features such as lexicon based / lexicon free decoding, beam pruning threshold, and more. +It supports several advanced features, such as lexicon-based decoding, lexicon-free decoding, beam pruning threshold, and more. .. 
code-block:: python @@ -254,9 +238,10 @@ It supports several advanced features such as lexicon based / lexicon free decod PyCTCDecode (``pyctcdecode``) ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ -PyCTCDecode is a Python library for ASR decoding provided at `https://github.com/kensho-technologies/pyctcdecode `_. It is a CPU-based beam search engine that is somewhat efficient for a pure python library, and supports char and subword models. It requires a character/subword KenLM ARPA / BINARY model to be provided. +PyCTCDecode is a Python library for ASR decoding provided at `https://github.com/kensho-technologies/pyctcdecode `_. It is a CPU-based beam search engine that is somewhat efficient for a pure Python library, and supports char and subword models. It requires a character/subword KenLM ARPA / BINARY model to be provided. + -It has advanced features such as word boosting which can be useful for transcript customization. +It has advanced features, such as word boosting, which can be useful for transcript customization. .. code-block:: python @@ -283,10 +268,8 @@ Hyperparameter Grid Search -------------------------- Beam search decoding with N-gram LM has three main hyperparameters: `beam_width`, `beam_alpha`, and `beam_beta`. -The accuracy of the model is dependent to the values of these parameters, specially beam_alpha and beam_beta. -You may specify a single or list of values for each of these parameters to perform grid search. It would perform the -beam search decoding on all the combinations of the these three hyperparameters. -For instance, the following set of parameters would results in 2*1*2=4 beam search decodings: +The accuracy of the model is dependent on the values of these parameters, specifically, beam_alpha and beam_beta. To perform grid search, you can specify a single value or a list of values for each of these parameters. In this case, it would perform the beam search decoding on all combinations of the three hyperparameters. +For example, the following set of parameters would result in 212=4 beam search decodings: .. code-block:: @@ -296,10 +279,10 @@ For instance, the following set of parameters would results in 2*1*2=4 beam sear beam_beta=[1.0,0.5] -Beam search ngram decoding for Transducer models (RNNT and HAT) +Beam Search ngram Decoding for Transducer Models (RNNT and HAT) =============================================================== -The similar script to evaluate an RNNT/HAT model with beam search decoding and N-gram models can be found at +You can also find a similar script to evaluate an RNNT/HAT model with beam search decoding and N-gram models at: `scripts/asr_language_modeling/ngram_lm/eval_beamsearch_ngram_transducer.py `_ .. code-block:: @@ -325,10 +308,7 @@ The similar script to evaluate an RNNT/HAT model with beam search decoding and N Neural Rescoring **************** -In this approach a neural network is used which can gives scores to a candidate. A candidate is the text transcript predicted by the decoder of the ASR model. -The top K candidates produced by the beam search decoding (beam width of K) are given to a neural language model to rank them. -Ranking can be done by a language model which gives a score to each candidate. -This score is usually combined with the scores from the beam search decoding to produce the final scores and rankings. +When using the neural rescoring approach, a neural network is used to score candidates. A candidate is the text transcript predicted by the ASR model’s decoder. 
The top K candidates produced by beam search decoding (with a beam width of K) are given to a neural language model for ranking. The language model assigns a score to each candidate, which is usually combined with the scores from beam search decoding to produce the final scores and rankings. Train Neural Rescorer ===================== @@ -338,8 +318,8 @@ It trains a ``TransformerLMModel`` which can be used as a neural rescorer for an :doc:`../nlp/language_modeling` -You may also use a pretrained language model from HuggingFace library like Transformer-XL and GPT instead of training your model. -Models like BERT and RoBERTa are not supported by this script as they are trained as a Masked Language Model and are not efficient and effective to score sentences out of the box. +You can also use a pretrained language model from the Hugging Face library, such as Transformer-XL and GPT, instead of training your model. +Models like BERT and RoBERTa are not supported by this script because they are trained as Masked Language Models. As a result, they are not efficient or effective for scoring sentences out of the box. Evaluation @@ -349,20 +329,18 @@ Given a trained TransformerLMModel `.nemo` file or a pretrained HF model, the sc `scripts/asr_language_modeling/neural_rescorer/eval_neural_rescorer.py `__ can be used to re-score beams obtained with ASR model. You need the `.tsv` file containing the candidates produced by the acoustic model and the beam search decoding to use this script. The candidates can be the result of just the beam -search decoding or the result of fusion with an N-gram LM. You may generate this file by specifying `--preds_output_folder` for +search decoding or the result of fusion with an N-gram LM. You can generate this file by specifying `--preds_output_folder` for `scripts/asr_language_modeling/ngram_lm/eval_beamsearch_ngram.py `__. -The neural rescorer would rescore the beams/candidates by using two parameters of `rescorer_alpha` and `rescorer_beta` as the following: +The neural rescorer would rescore the beams/candidates by using two parameters of `rescorer_alpha` and `rescorer_beta`, as follows: .. code-block:: final_score = beam_search_score + rescorer_alpha*neural_rescorer_score + rescorer_beta*seq_length -Parameter `rescorer_alpha` specifies amount of importance to place on the neural rescorer model, and `rescorer_beta` is -a penalty term to consider the sequence length in the scores. They have similar effects like the parameters -`beam_alpha` and `beam_beta` of beam search decoder and N-gram LM. +The parameter `rescorer_alpha` specifies the importance placed on the neural rescorer model, while `rescorer_beta` is a penalty term that accounts for sequence length in the scores. These parameters have similar effects to `beam_alpha` and `beam_beta` in the beam search decoder and N-gram language model. -You may follow the following steps to evaluate a neural LM: +Use the following steps to evaluate a neural LM: #. Obtain `.tsv` file with beams and their corresponding scores. Scores can be from a regular beam search decoder or in fusion with an N-gram LM scores. For a given beam size `beam_size` and a number of examples @@ -383,7 +361,7 @@ You may follow the following steps to evaluate a neural LM: --beta=[the value for the parameter rescorer_beta] --scores_output_file=[the optional path to store the rescored candidates] -The candidates along with their new scores would be stored at the file specified by `--scores_output_file`. 
+The candidates, along with their new scores, are stored at the file specified by `--scores_output_file`. The following is the list of the arguments for the evaluation script: @@ -391,27 +369,27 @@ The following is the list of the arguments for the evaluation script: | **Argument** |**Type**| **Default** | **Description** | +---------------------+--------+------------------+-------------------------------------------------------------------------+ | lm_model | str | Required | The path of the '.nemo' file of an ASR model, or the name of a | -| | | | HuggingFace pretrained model like 'transfo-xl-wt103' or 'gpt2' | +| | | | Hugging Face pretrained model like 'transfo-xl-wt103' or 'gpt2'. | +---------------------+--------+------------------+-------------------------------------------------------------------------+ -| eval_manifest | str | Required | Path to the evaluation manifest file (.json manifest file) | +| eval_manifest | str | Required | Path to the evaluation manifest file (.json manifest file). | +---------------------+--------+------------------+-------------------------------------------------------------------------+ -| beams_file | str | Required | path to beams file (.tsv) containing the candidates and their scores | +| beams_file | str | Required | Path to beams file (.tsv) containing the candidates and their scores. | +---------------------+--------+------------------+-------------------------------------------------------------------------+ -| beam_size | int | Required | The width of the beams (number of candidates) generated by the decoder | +| beam_size | int | Required | The width of the beams (number of candidates) generated by the decoder. | +---------------------+--------+------------------+-------------------------------------------------------------------------+ | alpha | float | None | The value for parameter rescorer_alpha | -| | | | Not passing value would enable linear search for rescorer_alpha | +| | | | Not passing value would enable linear search for rescorer_alpha. | +---------------------+--------+------------------+-------------------------------------------------------------------------+ | beta | float | None | The value for parameter rescorer_beta | -| | | | Not passing value would enable linear search for rescorer_beta | +| | | | Not passing value would enable linear search for rescorer_beta. | +---------------------+--------+------------------+-------------------------------------------------------------------------+ -| batch_size | int | 16 | The batch size used to calculate the scores | +| batch_size | int | 16 | The batch size used to calculate the scores. | +---------------------+--------+------------------+-------------------------------------------------------------------------+ -| max_seq_length | int | 512 | Maximum sequence length (in tokens) for the input | +| max_seq_length | int | 512 | Maximum sequence length (in tokens) for the input. | +---------------------+--------+------------------+-------------------------------------------------------------------------+ -| scores_output_file | str | None | The optional file to store the rescored beams | +| scores_output_file | str | None | The optional file to store the rescored beams. | +---------------------+--------+------------------+-------------------------------------------------------------------------+ -| use_amp | bool | ``False`` | Whether to use AMP if available calculate the scores | +| use_amp | bool | ``False`` | Whether to use AMP if available calculate the scores. 
| +---------------------+--------+------------------+-------------------------------------------------------------------------+ | device | str | cuda | The device to load LM model onto to calculate the scores | | | | | It can be 'cpu', 'cuda', 'cuda:0', 'cuda:1', ... | @@ -421,7 +399,7 @@ The following is the list of the arguments for the evaluation script: Hyperparameter Linear Search ---------------------------- -This script also supports linear search for parameters `alpha` and `beta`. If any of the two is not +The hyperparameter linear search script also supports linear search for parameters `alpha` and `beta`. If any of the two is not provided, a linear search is performed to find the best value for that parameter. When linear search is used, initially `beta` is set to zero and the best value for `alpha` is found, then `alpha` is fixed with that value and another linear search is done to find the best value for `beta`. @@ -435,15 +413,11 @@ Then check the WER curves and decide on the best values for each parameter. Fina Word Boosting ============= -The Flashlight decoder supports word boosting during CTC decoding using a KenLM binary and corresponding lexicon. Word boosting only -works in lexicon decoding mode, it does not work in lexicon-free mode. Word boosting allows one to bias the decoder for certain words, -such that you can manually increase or decrease the probability of emitting certain words. This can be very helpful if you have certain -uncommon or industry-specific words which you want to ensure transcribe correctly. +The Flashlight decoder supports word boosting during CTC decoding using a KenLM binary and corresponding lexicon. Word boosting only works in lexicon-decoding mode and does not function in lexicon-free mode. It allows you to bias the decoder for certain words by manually increasing or decreasing the probability of emitting specific words. This can be very helpful if you have uncommon or industry-specific terms that you want to ensure are transcribed correctly. -For more information on word boosting, `here `__ +For more information, go to `word boosting `__ -In order to use word boosting in Nemo, you need to create a simple tab-separated text file which contains each word to be boosted, followed by -tab, and then the boosted score for that word. +To use word boosting in NeMo, create a simple tab-separated text file. Each line should contain a word to be boosted, followed by a tab, and then the boosted score for that word. For example: @@ -460,7 +434,7 @@ squelch words so they show up less frequently. The recommended range for the boo The boost file handles both in-vocabulary words and OOV words just fine, so you can specify both IV and OOV words with corresponding scores. -You can then pass this file to your flashlight config object during decoding: +You can then pass this file to your Flashlight config object during decoding: .. code-block:: @@ -476,13 +450,13 @@ You can then pass this file to your flashlight config object during decoding: Combine N-gram Language Models ============================== -Before combining N-gram LMs install required OpenGrm NGram library using `scripts/installers/install_opengrm.sh `__. +Before combining N-gram LMs, install the required OpenGrm NGram library using `scripts/installers/install_opengrm.sh `__. Alternatively, you can use Docker image `scripts/installers/Dockerfile.ngramtools `__ with all the necessary dependencies. 
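Conceptually, merging two N-gram models as described below amounts to a weighted interpolation of their probability estimates. The following is a minimal sketch with made-up unigram probabilities; the weights, words, and values are purely illustrative and not the OpenGrm implementation.

.. code-block:: python

    # Weighted interpolation of two hypothetical unigram LMs.
    lm_a = {"hello": 0.02, "world": 0.01}
    lm_b = {"hello": 0.05, "world": 0.002}

    alpha, beta = 0.5, 0.5  # interpolation weights for model A and model B

    def interpolated_prob(word):
        return alpha * lm_a.get(word, 0.0) + beta * lm_b.get(word, 0.0)

    print(interpolated_prob("hello"))  # 0.5*0.02 + 0.5*0.05 = 0.035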
-To combine two N-gram language models, you can use the script ngram_merge.py located at
-`scripts/asr_language_modeling/ngram_lm/ngram_merge.py `__.
+To combine two N-gram language models, you can use the script ngram_merge.py located at:
+`scripts/asr_language_modeling/ngram_lm/ngram_merge.py `__.
-This script interpolate two ARPA N-gram language models and creates a KenLM binary file that can be used with the beam search decoders on top of ASR models.
+This script interpolates two ARPA N-gram language models and creates a KenLM binary file that can be used with the beam search decoders on top of ASR models.
You can specify weights (`--alpha` and `--beta`) for each of the models (`--ngram_a` and `--ngram_b`) correspondingly: `alpha` * `ngram_a` + `beta` * `ngram_b`.
This script supports both character level and BPE level encodings and models which are detected automatically from the type of the model.
@@ -500,7 +474,7 @@ To combine two N-gram models, you can use the following command:
-If you provide `--test_file` and `--nemo_model_file`, the script will calculate the perplexity of the resulting N-gram model on the test set.
+If you provide `--test_file` and `--nemo_model_file`, the script calculates the perplexity of the resulting N-gram model on the test set.
Note, the result of each step during the process is cached in the temporary file in the `--out_path`, to speed up further run.
You can use the `--force` flag to discard the cache and recalculate everything from scratch.
@@ -528,13 +502,13 @@ The following is the list of the arguments for the opengrm script:
+----------------------+--------+------------------+-----------------------------------------------------------------------------------------------------------------+
| ngram_bin_path | str | Required | The path to the bin folder of OpenGrm Ngram. It is a folder named `bin` under where OpenGrm Ngram is installed. |
+----------------------+--------+------------------+-----------------------------------------------------------------------------------------------------------------+
-| arpa_a | str | Required | Path to the ARPA N-gram model file A |
+| arpa_a | str | Required | Path to the ARPA N-gram model file A. |
+----------------------+--------+------------------+-----------------------------------------------------------------------------------------------------------------+
-| alpha | float | Required | Weight of N-gram model A |
+| alpha | float | Required | Weight of N-gram model A. |
+----------------------+--------+------------------+-----------------------------------------------------------------------------------------------------------------+
-| arpa_b | int | Required | Path to the ARPA N-gram model file B |
+| arpa_b | str | Required | Path to the ARPA N-gram model file B. |
+----------------------+--------+------------------+-----------------------------------------------------------------------------------------------------------------+
-| beta | float | Required | Weight of N-gram model B |
+| beta | float | Required | Weight of N-gram model B. |
+----------------------+--------+------------------+-----------------------------------------------------------------------------------------------------------------+
| out_path | str | Required | Path for writing temporary and resulting files.
| +----------------------+--------+------------------+-----------------------------------------------------------------------------------------------------------------+ @@ -544,7 +518,7 @@ The following is the list of the arguments for the opengrm script: +----------------------+--------+------------------+-----------------------------------------------------------------------------------------------------------------+ | nemo_model_file | str | None | The path to '.nemo' file of the ASR model, or name of a pretrained NeMo model. | +----------------------+--------+------------------+-----------------------------------------------------------------------------------------------------------------+ -| force | bool | ``False`` | Whether to recompile and rewrite all files | +| force | bool | ``False`` | Whether to recompile and rewrite all files. | +----------------------+--------+------------------+-----------------------------------------------------------------------------------------------------------------+ .. _wfst-ctc-decoding: @@ -612,7 +586,7 @@ Quick start example *************************************************** -Context-biasing (word boosting) without external LM +Context-biasing (Word Boosting) without External LM *************************************************** NeMo toolkit supports a fast context-biasing method for CTC and Transducer (RNN-T) ASR models with CTC-based Word Spotter. @@ -627,14 +601,14 @@ Scheme of the CTC-WS method: .. image:: https://github.com/NVIDIA/NeMo/releases/download/v1.22.0/asset-post-v1.22.0-ctcws_scheme_1.png :align: center :alt: CTC-WS scheme - :scale: 40% + :width: 80% High-level overview of the context-biasing words replacement with CTC-WS method: .. image:: https://github.com/NVIDIA/NeMo/releases/download/v1.22.0/asset-post-v1.22.0-ctcws_scheme_2.png :align: center :alt: CTC-WS high level overview - :scale: 40% + :width: 80% More details about CTC-WS context-biasing can be found in the `tutorial `__. @@ -662,9 +636,9 @@ The main script for CTC-WS context-biasing in NeMo is: Context-biasing is managed by ``apply_context_biasing`` parameter [true or false]. Other important context-biasing parameters are: -* ``beam_threshold`` - threshold for CTC-WS beam pruning -* ``context_score`` - per token weight for context biasing -* ``ctc_ali_token_weight`` - per token weight for CTC alignment (prevents false acceptances of context-biasing words) +* ``beam_threshold`` - threshold for CTC-WS beam pruning. +* ``context_score`` - per token weight for context biasing. +* ``ctc_ali_token_weight`` - per token weight for CTC alignment (prevents false acceptances of context-biasing words). All the context-biasing parameters are selected according to the default values in the script. 
You can tune them according to your data and ASR model (list all the values in the [] separated by commas) diff --git a/docs/source/conf.py b/docs/source/conf.py index c599f630d7f7..6d26def7369f 100644 --- a/docs/source/conf.py +++ b/docs/source/conf.py @@ -66,6 +66,7 @@ 'taming', 'cytoolz', # for adapters 'megatron', # for nlp + "open_clip", ] _skipped_autodoc_mock_imports = ['wrapt', 'numpy'] diff --git a/docs/source/features/optimizations/activation_recomputation.rst b/docs/source/features/optimizations/activation_recomputation.rst index 67de4401a4bc..3792e17c4e57 100644 --- a/docs/source/features/optimizations/activation_recomputation.rst +++ b/docs/source/features/optimizations/activation_recomputation.rst @@ -1,52 +1,42 @@ Activation Recomputation ======================== -The input activations of network layers are stored in the device memory to compute the gradients in back-propagation. -The input activation stores easily saturate the device memory when training a LLM with a large sequence length or a large micro-batch size. -Check-pointing a few activations and recomputing the rest of activations is a common technique to reduce the need of device memory. +The input activations of network layers are stored in device memory and are used to compute gradients during back-propagation. When training a LLM with a long sequence length or a large micro-batch size, these input activations can quickly saturate device memory. Checkpointing a few activations and recomputing the rest is a common technique to reduce device memory usage. Transformer Layer Recomputation ------------------------------- -NeMo supports Transformer layer recomputation that checkpoints the input of each Transformer layer and recomputes the activations on the rest of the layers. -Transformer layer recomputation significantly reduces the activation memory usage. -However, this approach increases per-Transformer layer computation cost by 30%, which comes from re-executing the entire layer forwarding computation. -NeMo also supports partial Transformer layer recomputation, which is beneficial when recomputing a few Transformer layers would fit the training workload on GPU memory. -This would avoid recomputing the rest of layers. +NeMo supports transformer layer recomputation, which checkpoints the input of each transformer layer and recomputes the activations for the remaining layers. This technique significantly reduces activation memory usage. However, it increases the per-transformer layer computation cost by 30% due to re-executing the entire layer’s forward computation. +NeMo also supports partial transformer layer recomputation, which is beneficial when recomputing a few transformer layers help to reduce enough GPU memory for model to fit. This approach avoids the need to recompute the rest of the layers. Transformer layer recomputation is enabled by setting ``activations_checkpoint_granularity=full``. -The number of Transformer layers to recompute can be set using ``activations_checkpoint_num_layers`` along with ``activations_checkpoint_method=block``. -If one sets ``activations_checkpoint_num_layers`` as the total number of layers, the inputs of all Transformer layers are check-pointed and recomputed. +The number of transformer layers to recompute can be set using ``activations_checkpoint_num_layers`` along with ``activations_checkpoint_method=block``. +If you set ``activations_checkpoint_num_layers`` as the total number of layers, the inputs of all transformer layers are checkpointed and recomputed. 
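To make the recomputation mechanism concrete, the following sketch shows activation checkpointing in plain PyTorch; it illustrates the general technique, not NeMo's internal implementation. Only each layer's input is kept, and the layer's forward pass is re-executed during the backward pass.

.. code-block:: python

    import torch
    from torch.utils.checkpoint import checkpoint

    # Toy stand-ins for transformer layers.
    layers = torch.nn.ModuleList(
        [torch.nn.Sequential(torch.nn.Linear(64, 64), torch.nn.GELU()) for _ in range(4)]
    )

    x = torch.randn(8, 64, requires_grad=True)
    for layer in layers:
        # Only the layer input is saved; intermediate activations are
        # recomputed by re-running the layer's forward during backward.
        x = checkpoint(layer, x, use_reentrant=False)

    x.sum().backward()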
When training with the pipeline parallelism, ``activations_checkpoint_num_layers`` indicates the layers per pipeline stage. -If the virtual pipelining is used, ``activations_checkpoint_num_layers`` means the layers per virtual pipeline stage. +When using virtual pipelining, ``activations_checkpoint_num_layers`` specifies the number of layers per virtual pipeline stage. -NeMo also supports checkpointing the input to a block of multiple consecutive Transformer layers meaning that a block of Transformer layers becomes the recomputation granularity. -This can further save activation memory at the cost of increasing the recomputation buffer memory. -Thus, it is only beneficial for memory savings when the model has many Transformer layers or the intermediate layers of a Transformer layer hold relatively small activation stores. -This recomputation mode can be enabled by setting ``activations_checkpoint_method=uniform``, and the number of Transformer layers per recomputation block is set using ``activations_checkpoint_num_layers``. +NeMo also supports checkpointing the input to a block of multiple consecutive transformer layers, meaning that a block of transformer layers becomes the recomputation granularity. This approach can save activation memory but increases the recomputation buffer memory. Thus, it is only beneficial for memory savings when the model has many transformer layers or when the intermediate layers of a transformer layer hold relatively small activation stores. +This recomputation mode can be enabled by setting ``activations_checkpoint_method=uniform``, with the number of transformer layers per recomputation block set using ``activations_checkpoint_num_layers``. Self-attention Recomputation ---------------------------- NeMo supports the self-attention recomputation that checkpoints the inputs of each self-attention block and recomputes the intermediate input activations. -This is a cost-efficient recomputation method; achieves high memory saving with lost recomputation cost. -The intermediate layers of the self-attention block accounts for the majority portion the activation memory. +This cost-efficient method achieves high memory savings with minimal recomputation cost. +The intermediate layers of the self-attention block accounts for the majority of the activation memory. This is because the input sizes of softmax, dropout, and qkv dot-product attention layers have the memory complexity of the sequence length square. However, their recomputation cost is relatively smaller than the other linear projection layers that are linear with the hidden size square. Self-attention recomputation is hard-enabled when using FlashAttention, which is supported in Transformer Engine. -Also, a user can use the self-attention recomputation without FlashAttention by setting ``activations_checkpoint_granularity=selective``. - +Also, you can use the self-attention recomputation without FlashAttention by setting ``activations_checkpoint_granularity=selective``. Scheme of full and selective checkpointing granularity: .. image:: https://github.com/NVIDIA/NeMo/releases/download/v2.0.0rc0/asset-post-activation-recomputation-exampe-2.jpg :align: center :alt: activation-recomputation-example-2 - :scale: 50% Scheme of uniform and block checkpointing method (full checkpointing granularity): .. 
image:: https://github.com/NVIDIA/NeMo/releases/download/v2.0.0rc0/asset-post-activation-recomputation-exampe-1.jpg :align: center :alt: activation-recomputation-example-1 - :scale: 50% \ No newline at end of file diff --git a/docs/source/features/optimizations/sequence_packing.rst b/docs/source/features/optimizations/sequence_packing.rst index 69e45f1e6a12..40c04ce65350 100644 --- a/docs/source/features/optimizations/sequence_packing.rst +++ b/docs/source/features/optimizations/sequence_packing.rst @@ -1,25 +1,21 @@ Sequence Packing ================ +This section explains how to use the sequence packing training technique with Supervised Fine-Tuning (SFT) and Parameter-Efficient Fine-Tuning (PEFT). + Sequence Packing for SFT/PEFT ----------------------------- Overview -^^^^^^^^ +######## -When finetuning a large language model with either full-parameter or parameter-efficient finetuning, GPU -underutilization is a common problem due to an inefficient data pipeline. This is because most finetuning datasets have -a skewed distribution of sequence lengths, with many short sequences and a few long sequences, following Zipf’s Law. -Transformer models can only take in fixed length inputs, so the input has to be padded with many unused pad tokens, -which is inefficient in two ways: +When fine-tuning a large language model, whether using SFT or PEFT methods, GPU underutilization often occurs due to an inefficient data pipeline. This inefficiency arises because most fine-tuning datasets have a skewed distribution of sequence lengths, with many short sequences and a few long ones, following Zipf’s Law. Since transformer models require fixed-length inputs, shorter sequences must be padded with unused tokens, leading to two main inefficiencies: - Computation performed on the pad values is eventually ignored for model output, resulting in wasted FLOPs. - Micro batch size is often limited by the batch which contains longer sequences, so that most other micro batches have underutilized GPU memory. -Sequence packing is a training technique where multiple training sequences (examples) are concatenated together into -one long sequence (pack). This eliminates the need for padding and allows more tokens to be processed in each -micro batch, maximizing both GPU compute and GPU memory. +Sequence packing is a training technique where multiple training sequences (examples) are concatenated into one long sequence (pack). This method eliminates the need for padding, allowing more tokens to be processed in each micro batch. As a result, it maximizes both GPU compute and GPU memory utilization. While sequences for pretraining can be concatenated naively, this is not the case for SFT and instruction fine-tuning where each input sequence should be treated individually. The conventional solution is to build an extended attention @@ -27,7 +23,7 @@ mask to mark the sequence id each token belongs to, and mask out attention value increases the complexity of attention from :math:`\sum_i {s_i}^2` to :math:`\Big({\sum_i {s_i}}\Big)^2`, where :math:`s_i` is the length of the ith subsequence. In practice, the conventional solution puts a limit on the length of packing. Instead, NeMo provides a highly optimized version of sequence packing which makes use of variable-length attention -kernels in FlashAttention and TransformerEngine. With this, attention values between sequences are never calculated, +kernels in FlashAttention and TransformerEngine. 
With this approach, attention values between sequences are never calculated, so the complexity of attention remains at :math:`\sum_i {s_i}^2`. This allows packing sequences to arbitrary lengths so that GPU memory can be fully utilized. @@ -40,31 +36,30 @@ All things considered, NeMo’s implementation of sequence packing provides [#f1 How to run SFT/PEFT with packed sequence -^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +######################################## Prepare Dataset -""""""""""""""" +^^^^^^^^^^^^^^^ We provide a convenient script to pack your SFT or PEFT dataset. This script assumes that you already have a prepared dataset file for SFT/PEFT training in NeMo. If you do not, please follow `this `_ to -download and prepare the dolly dataset as an example. +download and prepare the Dolly dataset as an example. You will get a file named training.jsonl. The rest of this tutorial also assumes you already have a recipe for training with the unpacked dataset. Two main steps are run in this script: -1. The online processing code in GPTSFTDataset is run (including prompt template manipulation, sequence length - truncation, tokenization, etc) and the result is an array of tokenized sequences, represented by indices). -2. The sequences are grouped by length, and a packing algorithm is run. +1. The online processing code in GPTSFTDataset is run. This includes tasks such as prompt template manipulation, sequence length truncation, and tokenization. The result is an array of tokenized sequences, represented by indices. +2. The tokenized sequences are grouped by length and a packing algorithm is run. You can read more about packing algorithms `here `_. -Currently, two variants of *first fit* are supported. -- *first_fit_decreasing* sorts the sequences in decreasing order before applying the first-fit algorithm. It generates a +Currently, two variants of ``first_fit`` are supported. +- ``first_fit_decreasing`` sorts the sequences in decreasing order before applying the first-fit algorithm. It generates a more optimal packing, but it tends to keep all short sequences together, which may have an impact for convergence. -- *first_fit_shuffle* runs first-fit in a random order. Packing is less optimal but it keeps the dataset order random. -The recommendation is to run *first_fit_shuffle* and check the packed sequence lengths. If they are similar to the -target length (i.e. efficient packing), then use shuffle. Otherwise try *first_fit_decreasing*. +- ``first_fit_shuffle`` runs first-fit in a random order. Packing is less optimal but it keeps the dataset order random. +The recommendation is to run ``first_fit_shuffle`` and check the packed sequence lengths. If they are similar to the +target length (i.e. efficient packing), then use shuffle. Otherwise try ``first_fit_decreasing``. .. code-block:: bash @@ -79,32 +74,28 @@ target length (i.e. efficient packing), then use shuffle. Otherwise try *first_f .. note:: - Note 1. If your model or dataset requires non-default configs for conventional SFT/PEFT training in NeMo, you will - need to pass in the same configs to ``model.data.train_ds`` as you would for training with unpacked dataset. + 1. If your model or dataset requires non-default configs for conventional SFT/PEFT training in NeMo, you will need to pass in the same configs to ``model.data.train_ds`` as you would for training with an unpacked dataset. - Note 2. 
``model.data.train_ds.max_seq_length`` is the length to truncate each sequence before packing multiple sequences - to the size of packed sequence (``pack_size``). ``max_seq_length`` should be set to the same value as unpacked data, - and can be determined by examining the distribution of sequence lengths in the dataset. + 2. ``model.data.train_ds.max_seq_length`` is the length to which each sequence is truncated before packing multiple sequences to the size of packed sequence (``pack_size``). ``max_seq_length`` should be set to the same value as unpacked data and can be determined by examining the distribution of sequence lengths in the dataset. - Note 3. ``pack_sizes`` is a list of packed sequence lengths. In this example, there will be three output files, one for - each pack size. The output files are named ``/packed_{pack_size}_seed{seed}.npy``. + 3. ``pack_sizes`` is a list of packed sequence lengths. In this example, there will be three output files, one for each pack size. The output files are named ``/packed_{pack_size}_seed{seed}.npy``. This argument is a list because you will likely want to experiment with a few ``pack_sizes`` to find out which length can fill the GPU memory without exceeding it. Adjusting ``pack_size`` is analogous to adjusting the micro batch size in the unpacked case. Adjust Training Config -"""""""""""""""""""""" +^^^^^^^^^^^^^^^^^^^^^^ -To train with packed sequences, you need to change four items in the SFT/PEFT config file +To train with packed sequences, you need to change four items in the SFT/PEFT config file. -1. Turn on the packed_sequence flag +1. Turn on the packed_sequence flag: .. code-block:: bash ++model.data.train_ds.packed_sequence=True -2. Use the new dataset file instead of the original jsonl file +2. Use the new dataset file instead of the original jsonl file: .. code-block:: bash @@ -130,15 +121,14 @@ To train with packed sequences, you need to change four items in the SFT/PEFT co model.micro_batch_size=1 model.global_batch_size= -Now you are all set to finetune your model with a much improved throughput! +Now, you are all set to fine-tune your model with a much improved throughput! Sequence Packing for NeVA ------------------------- -Sequence packing in NeVA (Multimodal LLMs) differs slightly from the LLM SFT/PEFT approach. For details, -please refer to the documentation below +Sequence packing with NeVA for multimodal large language models differs from the LLM SFT/PEFT approach. For details, please refer to the documentation below. -:doc:`../multimodal/mllm/sequence_packing` +:doc:`../../multimodal/mllm/sequence_packing` .. rubric:: Footnotes diff --git a/docs/source/multimodal/api.rst b/docs/source/multimodal/api.rst index 2ba9978b7640..41f67a989f84 100644 --- a/docs/source/multimodal/api.rst +++ b/docs/source/multimodal/api.rst @@ -20,7 +20,7 @@ Model Classes .. autoclass:: nemo.collections.multimodal.models.text_to_image.dreambooth.dreambooth.MegatronDreamBooth :show-inheritance: :no-members: - :members: __init__, training_step, validation_step, setup, build_train_valid_test_datasets + :members: __init__, training_step, validation_step, setup .. 
autoclass:: nemo.collections.multimodal.models.text_to_image.controlnet.controlnet.MegatronControlNet diff --git a/docs/source/multimodal/mm_all.bib b/docs/source/multimodal/mm_all.bib index 3930484d71e5..e50895840c68 100644 --- a/docs/source/multimodal/mm_all.bib +++ b/docs/source/multimodal/mm_all.bib @@ -17,6 +17,17 @@ @misc{ho2020denoising primaryClass={cs.LG} } +# Continual DDPM +@misc{nichol2021improved, + title={Improved Denoising Diffusion Probabilistic Models}, + author={Alex Nichol and Prafulla Dhariwal}, + year={2021}, + eprint={2102.09672}, + archivePrefix={arXiv}, + primaryClass={cs.LG}, + url={https://arxiv.org/abs/2102.09672}, +} + # EDM @misc{karras2022elucidating, title={Elucidating the Design Space of Diffusion-Based Generative Models}, @@ -68,7 +79,7 @@ @misc{chang2023muse } # Ins P2P -@misc{insp2p, +@misc{insp2p2022, Author = {Tim Brooks and Aleksander Holynski and Alexei A. Efros}, Title = {InstructPix2Pix: Learning to Follow Image Editing Instructions}, Year = {2022}, diff --git a/docs/source/multimodal/nerf/dreamfusion.rst b/docs/source/multimodal/nerf/dreamfusion.rst index d6c926392556..33c2ab37ff44 100644 --- a/docs/source/multimodal/nerf/dreamfusion.rst +++ b/docs/source/multimodal/nerf/dreamfusion.rst @@ -21,7 +21,7 @@ Dreamfusion models can be instantiated using the :class:`~nemo.collections.multi :alt: DreamFsuion, overview of the model -Image guidance +Image Guidance ^^^^^^^^^^^^^^ This section of DreamFusion pertains to the initial phase where the model interprets and translates text inputs into visual concepts. Utilizing a diffusion based text-to-image model, DreamFusion processes the text input, extracts key visual elements, and translates these into initial 2D images. @@ -29,14 +29,14 @@ The process ensures that the generated 3D models are not only accurate in terms the 2D image based on the view angle. -NeRF (foreground) network +NeRF (Foreground) Network ^^^^^^^^^^^^^^^^^^^^^^^^^ -The Neural Radiance Fields (NeRF) network is at the heart of DreamFusion's 3D rendering capabilities. +The Neural Radiance Fields network is at the heart of DreamFusion's 3D rendering capabilities. In DreamFusion, the NeRF network takes the 2D images generated from the textual description and constructs a 3D model. This model is represented as a continuous volumetric scene function, which encodes the color and density of points in space, allowing for highly detailed and photorealistic renderings. -Background layer +Background Layer ^^^^^^^^^^^^^^^^ DreamFusion can leverage a background layer dedicated to background modeling. @@ -48,14 +48,14 @@ Alternatively, DreamFusion allows for the integration of a static background col Implementing a static color background involves setting a uniform chromatic value that encompasses the periphery of the 3D model. This approach simplifies the rendering process and can be beneficial in reducing computational load while maintaining focus on the primary object. -Materials network +Materials Network ^^^^^^^^^^^^^^^^^ -The material network in DreamFusion is responsible for adding realism to the 3D models by accurately simulating the physical properties of different materials. +The materials network in DreamFusion is responsible for adding realism to the 3D models by accurately simulating the physical properties of different materials. This network takes into account various aspects like texture, reflectivity, and transparency. 
By doing so, it adds another layer of detail, making the objects generated by DreamFusion not just structurally accurate but also visually and tactilely realistic. -Renderer layer +Renderer Layer ^^^^^^^^^^^^^^ The renderer layer functions as the culminating stage in DreamFusion's processing pipeline. It translates the synthesized volumetric data from the NeRF and material networks into perceptible imagery. @@ -114,7 +114,7 @@ The model configuration file is organized into the following sections: height: 800 - ``defaults``: Defines default modules for different components like nerf, background, material, etc. -- ``resume_from_checkpoint``: Path to a checkpoint file to initialize the model with. +- ``resume_from_checkpoint``: Path to a checkpoint file for initializing the model. - ``prompt``: Main textual input for the model describing the object to generate. - ``negative_prompt``: Textual input describing what to avoid in the generated object. - ``front_prompt``, ``side_prompt``, ``back_prompt``: Textual inputs that are appended to the prompts for more detailed orientation guidance. @@ -127,7 +127,7 @@ The model configuration file is organized into the following sections: The behavior of the pipeline can be precisely adjusted by fine-tuning the parameters of various components in the default section. Some components support different backends and implementations, the full components catalog can be viewed in the config directory ``{NEMO_ROOT/examples/multimodal/generative/nerf/conf/model}``. -Image guidance +Image Guidance ^^^^^^^^^^^^^^ .. code-block:: yaml @@ -142,7 +142,7 @@ Image guidance - ``t_range``: Range of threshold values for guidance stability. -NeRF (foreground) network +NeRF (Foreground) Network ^^^^^^^^^^^^^^^^^^^^^^^^^ .. code-block:: yaml @@ -176,7 +176,7 @@ NeRF (foreground) network Describes the NeRF network's architecture, including the density activation function, network configuration, and the specification of the sigma and features networks. -Background layer +Background Layer ^^^^^^^^^^^^^^^^ .. code-block:: yaml @@ -203,7 +203,7 @@ Static background, where the background key is the RGB color. Dynamic background, where the background is generated by a NeRF network. -Materials network +Materials Network ^^^^^^^^^^^^^^^^^ .. code-block:: yaml @@ -235,7 +235,7 @@ NeRF models integrate geometry and appearance through volume rendering. As a res using NeRF for 3D modeling can be less effective when it comes to capturing both the intricate details of a surface as well as its material and texture. -DMTet finetunning disentangles the learning of geometry and appearance models, such that both a fine surface and a rich +DMTet fine-tuning disentangles the learning of geometry and appearance models, such that both a fine surface and a rich material/texture can be generated. To enable such a disentangled learning, a hybrid scene representation of [DMTet](https://nv-tlabs.github.io/DMTet/) is used. @@ -252,52 +252,52 @@ However, the following changes to the training pipeline are necessary: .. 
code-block:: yaml - _target_: nemo.collections.multimodal.models.nerf.dreamfusion.DreamFusion - defaults: - - nerf: torchngp - - background: torchngp - - material: basic_shading - - renderer: nvdiffrast # (1) - - guidance: sd_huggingface - - optim: adan - - loss: dmtet # (2) - - data: data - - _self_ - - ### model options - resume_from_checkpoint: "/results/DreamFusion/checkpoints/DreamFusion-step\=10000-last.ckpt" # (3) - prompt: 'a hamburger' - negative_prompt: '' - front_prompt: ', front view' - side_prompt: ', side view' - back_prompt: ', back view' - update_extra_interval: 16 - guidance_scale: 100 - export_video: False - - iters: ${trainer.max_steps} - latent_iter_ratio: 0.0 - albedo_iter_ratio: 0 - min_ambient_ratio: 0.1 - textureless_ratio: 0.2 - - data: - train_dataset: - width: 512 # (4) - height: 512 # (4) - val_dataset: - width: 800 - height: 800 - test_dataset: - width: 800 - height: 800 + _target_: nemo.collections.multimodal.models.nerf.dreamfusion.DreamFusion + defaults: + - nerf: torchngp + - background: torchngp + - material: basic_shading + - renderer: nvdiffrast # (1) + - guidance: sd_huggingface + - optim: adan + - loss: dmtet # (2) + - data: data + - _self_ + + ### model options + resume_from_checkpoint: "/results/DreamFusion/checkpoints/DreamFusion-step\\=10000-last.ckpt" # (3) + prompt: 'a hamburger' + negative_prompt: '' + front_prompt: ', front view' + side_prompt: ', side view' + back_prompt: ', back view' + update_extra_interval: 16 + guidance_scale: 100 + export_video: False + + iters: ${trainer.max_steps} + latent_iter_ratio: 0.0 + albedo_iter_ratio: 0 + min_ambient_ratio: 0.1 + textureless_ratio: 0.2 + + data: + train_dataset: + width: 512 # (4) + height: 512 # (4) + val_dataset: + width: 800 + height: 800 + test_dataset: + width: 800 + height: 800 We note the following changes: -1. The rendering module was changed from a volumetric based one to a rasterization based one (nvdiffrast). +1. The rendering module was updated from a volumetric-based approach to a rasterization-based one using nvdiffrast. 2. The model loss is changed to account for the changes in the geometry representation. -3. DreamFusion-DMTet finetunes a pretrained DreamFusion model, the pretrained checkpoint is provided using ``resume_from_checkpoint``. -4. The training shape is incrased to 512x512 +3. DreamFusion-DMTet fine-tunes a pretrained DreamFusion model, the pretrained checkpoint is provided using ``resume_from_checkpoint``. +4. The training shape is increased to 512x512. References diff --git a/docs/source/multimodal/speech_llm/intro.rst b/docs/source/multimodal/speech_llm/intro.rst index 1f73ed9ed249..cc70f75b72b5 100644 --- a/docs/source/multimodal/speech_llm/intro.rst +++ b/docs/source/multimodal/speech_llm/intro.rst @@ -27,7 +27,7 @@ One way to incorporate speech into an LLM is to concatenate speech features with .. image:: https://github.com/NVIDIA/NeMo/releases/download/v1.23.0/salm.png :align: center :alt: SALM model - :scale: 50% + :width: 80% @@ -36,7 +36,7 @@ Another approach is to use a cross-attention mechanism, where text embeddings at .. image:: https://github.com/NVIDIA/NeMo/releases/download/v1.23.0/bestow.png :align: center :alt: BESTOW model - :scale: 50% + :width: 80% NeMo Framework contains `example scripts `__ for training and evaluating for both SALM and BESTOW models. A pre-trained `checkpoint `__ for SALM is also available. 
diff --git a/docs/source/multimodal/text2img/insp2p.rst b/docs/source/multimodal/text2img/insp2p.rst index 282874444738..456d434d7173 100644 --- a/docs/source/multimodal/text2img/insp2p.rst +++ b/docs/source/multimodal/text2img/insp2p.rst @@ -4,7 +4,7 @@ InstructPix2Pix Model Introduction -------------------- -InstructPix2Pix [InstructPix2Pix]_ :cite:`mm-models-insp2p-insp2p` offers a unique approach to image editing using human-written instructions. Given an input image and a textual directive, the model adjusts the image according to the provided instructions. NeMo Multimodal presents a training pipeline for this conditional diffusion model, utilizing a dataset generated by harnessing the strengths of two prominent pretrained models: a language model (GPT-3) and a text-to-image model (Stable Diffusion). The InstructPix2Pix model operates swiftly, editing images within seconds, eliminating the need for per-example fine-tuning or inversion. It has demonstrated remarkable results across a wide variety of input images and written instructions. +InstructPix2Pix :cite:`mm-models-insp2p-insp2p2022` offers a unique approach to image editing using human-written instructions. Given an input image and a textual directive, the model adjusts the image according to the provided instructions. NeMo Multimodal presents a training pipeline for this conditional diffusion model, utilizing a dataset generated by harnessing the strengths of two prominent pretrained models: a language model (GPT-3) and a text-to-image model (Stable Diffusion). The InstructPix2Pix model operates swiftly, editing images within seconds, eliminating the need for per-example fine-tuning or inversion. It has demonstrated remarkable results across a wide variety of input images and written instructions. Built upon the Stable Diffusion framework, NeMo's InstructPix2Pix shares a similar architecture with Stable Diffusion (refer to :doc:`Stable Diffusion <./sd>`). What sets it apart is its unique training dataset and the combined guidance from both image and text prompts. Specifically, InstructPix2pix ::class::``nemo.collections.multimodal.models.instruct_pix2pix.ldm.ddpm_edit.MegatronLatentDiffusionEdit`` is derived directly from Stable Diffusion's ::class::``nemo.collections.multimodal.models.text_to_image.stable_diffusion.ldm.ddpm.MegatronLatentDiffusion``, with alterations to accommodate the dataset and provide support for dual guidance. @@ -13,7 +13,7 @@ Training Dataset The dataset for NeMo's InstructPix2Pix model stands out among NeMo multimodal models, as it doesn't mandate data storage in the webdataset format. Users are advised to verify the dataset's content, assess the relevant licenses, and ensure its appropriateness for their use. Before downloading, it's essential to review any links associated with the dataset. -For instructions on downloading and preparing the custom dataset for training InstructPix2Pix, refer to the official InstructPix2Pix repository. `Instruct-Pix2Pix Repository `_ +For instructions on downloading and preparing the custom dataset for training InstructPix2Pix, refer to the official `Instruct-Pix2Pix Repository `_ Model Configuration ------------------- @@ -47,6 +47,7 @@ Essential Model Configuration - ``first_stage_key``: Key for the model's initial processing stage. Set to `edited` for InstructPix2Pix. - ``cond_stage_key``: Key for the model's conditional stage. Set to `edit` for InstructPix2Pix. - ``unet_config``: Configuration parameters for the UNet model within the NeMo collection. 
+ - ``_target_``: Designates the target module for the UNet model in the NeMo collection. - ``from_pretrained``: (Value not provided) Generally indicates the path or identifier of a pretrained model. - ``in_channels``: Specifies the number of input channels for the UNet model. Here, the value is set to 8, with the initial 4 channels dedicated to image guidance. diff --git a/docs/source/multimodal/text2img/sdxl_quantization.rst b/docs/source/multimodal/text2img/sdxl_quantization.rst index bcc3031b9bd8..277ded96dc4e 100644 --- a/docs/source/multimodal/text2img/sdxl_quantization.rst +++ b/docs/source/multimodal/text2img/sdxl_quantization.rst @@ -7,13 +7,13 @@ This example shows how to use ModelOpt to calibrate and quantize the UNet part o We also provide instructions on deploying and running E2E SDXL pipeline with ModelOpt quantized int8 UNet to generate images and measure latency on target GPUs. -To get started, it is required to have a pretrained SDXL checkpoint in ``nemo`` format. The example training configs are provided in NeMo, -which is located in ``NeMo/examples/multimodal/text2img/stable_diffusion``. +To get started, you need a pretrained SDXL checkpoint in NeMo format. Example training configurations are available here: Stable Diffusion Examples ``_. + Calibration --------------- -The first step is to run quantization script with default config, and finally the script will export the quantized unet to onnx file. -Here is the default config for ``NeMo/examples/multimodal/text2img/stable_diffusion/sd_xl_quantize.py``. +The first step is to run the quantization script with the default config. The script will export the quantized unet to the onnx file. +Here is the default config for SDXL Quantize Script: ``_. .. code-block:: yaml @@ -32,10 +32,10 @@ Here is the default config for ``NeMo/examples/multimodal/text2img/stable_diffus Important Parameters ^^^^^^^^^^^^^^^^^^^^ -- percentile: Control quantization scaling factors (amax) collecting range, meaning that we will collect the minimum amax in the range of (n_steps * percentile) steps. Recommendation: 1.0 -- alpha: A parameter in SmoothQuant, used for linear layers only. Recommendation: 0.8 for SDXL, 1.0 for SD 1.5 +- percentile: Controls the range for collecting quantization scaling factors (amax). This means we will collect the minimum amax over a range of (n_steps * percentile) steps. We recommend 1.0. +- alpha: A parameter in SmoothQuant, used exclusively for linear layers only. We recommend 0.8 for SDXL and 1.0 for SD 1.5. - quant-level: Which layers to be quantized, 1: CNNs, 2: CNN + FFN, 2.5: CNN + FFN + QKV, 3: CNN + Linear. Recommendation: 2, 2.5 and 3, depending on the requirements for image quality & speedup. -- calib-size: For SDXL, we recommend 32, 64 or 128, for SD 1.5, set to 512 or 1024. +- calib-size: For SDXL, we recommend 32, 64 or 128, for SD 1.5 and set to 512 or 1024. Build the TRT engine for the Quantized ONNX UNet @@ -48,15 +48,16 @@ Build the TRT engine for the Quantized ONNX UNet Important Parameters ^^^^^^^^^^^^^^^^^^^^ -Input shape has to be provided here when building TRT engine. 
+When building the TRT engine, you must provide the input shape as follows: + - x: Input image latent shape (B * C * H * W) -- context: Input text conditioning (B * S * hidden_dimention) +- context: Input text conditioning (B * S * hidden_dimension) - y: Additional embedding (B * adm_in_channels) Build End-to-end Stable Diffusion XL Pipeline with NeMo ----------------------------------------------------------- -We provide a script to build end to end TRT inference pipeline with NeMo backend, which is located at `NeMo/examples/multimodal/text2img/stable_diffusion/sd_xl_export.py`. +We provide a script to build an end-to-end TRT inference pipeline with the NeMo backend here: SDXL Export Script ``_. .. code-block:: yaml @@ -73,16 +74,16 @@ We provide a script to build end to end TRT inference pipeline with NeMo backend Important Parameters ^^^^^^^^^^^^^^^^^^^^ -- out_path: Directory to save onnx file and TRT engine files -- width and height: Image resolution of inference output -- batch_size: Only used for dummy input generation and onnx sanity check -- {min,max}_batch_size: The input batch size of TRT engine along its dynamic axis +- out_path: Directory to save onnx file and TRT engine files. +- width and height: Image resolution of inference output. +- batch_size: Only used for dummy input generation and onnx sanity check. +- {min,max}_batch_size: The input batch size of TRT engine along its dynamic axis. -Run End-to-end Stable Diffusion XL TRT Pipeline +Run End-to-End Stable Diffusion XL TRT Pipeline ----------------------------------------------------------- -The inference script can be found at `NeMo/examples/multimodal/text2img/stable_diffusion/sd_xl_trt_inference.py`. +The inference script can be found here: SDXL TRT Inference Script ``_. .. code-block:: yaml @@ -138,23 +139,20 @@ FP16 inference vs Int8 inference :width: 50% .. image:: https://github.com/NVIDIA/NeMo/releases/download/v1.23.0/asset-githubio-home-sdxl_trt_int8_1.png :width: 50% -Prompt: A photo of a Shiba Inu dog with a backpack riding a bike. It is wearing sunglasses and a beach hat. (FP16 upper vs Int8 lower) - - +Prompt: A photo of a Shiba Inu dog with a backpack riding a bike. It is wearing sunglasses and a beach hat. (FP16 upper vs Int8 lower) .. image:: https://github.com/NVIDIA/NeMo/releases/download/v1.23.0/asset-githubio-home-sdxl_trt_fp16_2.png :width: 50% .. image:: https://github.com/NVIDIA/NeMo/releases/download/v1.23.0/asset-githubio-home-sdxl_trt_int8_2.png :width: 50% -Prompt: A cute corgi lives in a house made out of sushi. (FP16 upper vs Int8 lower) - - +Prompt: A cute corgi lives in a house made out of sushi. (FP16 upper vs Int8 lower) .. image:: https://github.com/NVIDIA/NeMo/releases/download/v1.23.0/asset-githubio-home-sdxl_trt_fp16_3.png :width: 50% .. image:: https://github.com/NVIDIA/NeMo/releases/download/v1.23.0/asset-githubio-home-sdxl_trt_int8_3.png :width: 50% + Prompt: A high contrast portrait of a very happy fuzzy panda dressed as a chef in a high end kitchen making dough. There is a painting of flowers on the wall behind him. 
(FP16 upper vs Int8 lower) diff --git a/docs/source/nlp/models.rst b/docs/source/nlp/models.rst index 2654cfca26d8..98a5d96d5a24 100755 --- a/docs/source/nlp/models.rst +++ b/docs/source/nlp/models.rst @@ -15,7 +15,6 @@ NeMo's NLP collection supports provides the following task-specific models: text_classification bert_pretraining language_modeling - nemo_megatron/prompt_learning question_answering dialogue glue_benchmark diff --git a/docs/source/nlp/nemo_megatron/gpt/gpt_training.rst b/docs/source/nlp/nemo_megatron/gpt/gpt_training.rst index efc2ac3f8439..a5914882da76 100644 --- a/docs/source/nlp/nemo_megatron/gpt/gpt_training.rst +++ b/docs/source/nlp/nemo_megatron/gpt/gpt_training.rst @@ -1,31 +1,31 @@ -GPT model training +GPT Model Training ------------------ -GPT is a decoder-only Transformer model. +The Generative Pre-trained Transformer (GPT) is a decoder-only Transformer model. This section demonstrates how to train a GPT-style model with NeMo. + + + -Quick start -^^^^^^^^^^^ -The steps below demonstrate training of a GPT-style model with NeMo .. note:: - This example is best completed using the latest NeMo Framework NGC Container + This example is best completed using the latest NeMo Framework Training container ``_. -Data download & pre-processing -~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ +Download and Pre-process Data +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ .. note:: - Data download, pre-processing and tokenizer training in the example below will take ~3 hours. + The example below will take approximately 3 hours to download data, pre-process it, and train the tokenizer. -**Step 1: Download data** +1. Download data. -The step below will download Wikipedia data (around 20GB) and can take several hours. +The following step will download approximately 20GB of Wikipedia data, which can take several hours to complete. .. code-block:: bash wget https://dumps.wikimedia.org/enwiki/latest/enwiki-latest-pages-articles.xml.bz2 -**Step 2: Extract raw data** +2. Extract raw data. .. code-block:: bash @@ -33,16 +33,16 @@ The step below will download Wikipedia data (around 20GB) and can take several h python -m wikiextractor.WikiExtractor enwiki-latest-pages-articles.xml.bz2 --json find text -name 'wiki_*' -exec cat {} \; > train_data.jsonl -Now, ``train_data.jsonl`` will contain our training data in the json line format. We are interested in the data under "text" field. +Now, train_data.jsonl will contain our training data in JSON line format. We are particularly interested in the data within the "text" field. -**Step 3: Train tokenizer** +3. Train tokenizer. -Below we will consider 2 options for training data tokenizers: Using pre-built HuggingFace BPE and training and using your own Google Sentencepiece tokenizer. +Below, we will consider two options for training data tokenizers: using the pre-built Hugging Face BPE or training and using your own Google Sentencepiece tokenizer. Note that only the second option allows you to experiment with vocabulary size. -*Option 1:* Using HuggingFace GPT2 tokenizer files. +*Option 1:* Use Hugging Face GPT2 tokenizer files. With this option, we will download a pre-built vocabulary and merge the files for the BPE tokenizer. @@ -52,10 +52,10 @@ With this option, we will download a pre-built vocabulary and merge the files fo wget https://s3.amazonaws.com/models.huggingface.co/bert/gpt2-merges.txt -*Option 2:* Using `Google Sentencepiece `_ tokenizer library. +*Option 2:* Use `Google Sentencepiece `_ tokenizer library. 
-It comes as a dependency with NeMo, so if you have installed NeMo it should already be installed. -Note that training tokenizer model will also take some time. +Google Sentencepiece is included as a dependency with NeMo, so if you have installed NeMo, it should already be installed. +Please note that training the tokenizer model will also take some time. .. code-block:: bash @@ -70,13 +70,13 @@ Note that training tokenizer model will also take some time. --pad_id=0 --unk_id=1 --bos_id=2 --eos_id=3 \ --split_digits true -After this is done (will take a while), you'll have two files: ``spm_32k_wiki.model`` and ``spm_32k_wiki.vocab`` corresponding to the model and vocabulary. +Completing this step can take some time. After it is done, you'll have two files: ``spm_32k_wiki.model`` and ``spm_32k_wiki.vocab`` corresponding to the model and vocabulary. -**Step 4: Convert training data into memory map format** +4. Convert training data into memory map format. -This format makes training more efficient, especially with many nodes and GPUs. This step will also tokenize data using the tokenizer model from Step 3. +The memory map format makes training more efficient, especially with many nodes and GPUs. This step will also tokenize data using the tokenizer model from Step 3. -*Option 1:* Using HuggingFace GPT2 tokenizer files. +*Option 1:* Use Hugging Face GPT2 tokenizer files. .. code-block:: bash @@ -92,7 +92,7 @@ This format makes training more efficient, especially with many nodes and GPUs. --append-eod \ --workers=32 -*Option 2:* Using `Google Sentencepiece `_ tokenizer library. +*Option 2:* Use `Google Sentencepiece `_ tokenizer library. .. code-block:: bash @@ -106,14 +106,14 @@ This format makes training more efficient, especially with many nodes and GPUs. --workers=32 -Train GPT-style Model -~~~~~~~~~~~~~~~~~~~~~ +Train a GPT-Style Model +~~~~~~~~~~~~~~~~~~~~~~~ Once you have prepared training data and tokenizer, you are ready to train the model. The configuration we present below has about 124M parameters and should fit on a single 16GB GPU using float16. Let's go! -*Option 1:* Using HuggingFace GPT2 tokenizer files. +*Option 1:* Use Hugging Face GPT2 tokenizer files. .. code-block:: bash @@ -166,7 +166,7 @@ Let's go! exp_manager.checkpoint_callback_params.always_save_nemo=False -*Option 2:* Using `Google Sentencepiece `_ tokenizer library. +*Option 2:* Use `Google Sentencepiece `_ tokenizer library. .. code-block:: bash @@ -219,18 +219,17 @@ Let's go! exp_manager.checkpoint_callback_params.always_save_nemo=False -Next, you can launch Tensorboard to monitor training like so: +Next, you can launch Tensorboard to monitor training, as follows: .. 
code-block:: bash tensorboard --logdir nemo_experiments --bind_all -Next steps +Next Steps ~~~~~~~~~~ -Please refer to: +For more information, please refer to: * :ref:`batching` section for batch size adjustments * :ref:`parallelisms` section for understanding various types of parallelisms -* :ref:`promptlearning` section for details on prompt-tuning and p-tuning diff --git a/docs/source/nlp/nemo_megatron/intro.rst b/docs/source/nlp/nemo_megatron/intro.rst index 94978ffa6ffd..f376e6590255 100644 --- a/docs/source/nlp/nemo_megatron/intro.rst +++ b/docs/source/nlp/nemo_megatron/intro.rst @@ -14,7 +14,6 @@ To learn more about using NeMo to train Large Language Models at scale, please r gpt/gpt_training batching - prompt_learning retro/retro_model hiddens/hiddens_module peft/landing_page diff --git a/docs/source/nlp/token_classification.rst b/docs/source/nlp/token_classification.rst index 44deed4eeac6..848a07ed6b13 100755 --- a/docs/source/nlp/token_classification.rst +++ b/docs/source/nlp/token_classification.rst @@ -1,18 +1,19 @@ .. _token_classification: -Token Classification (Named Entity Recognition) Model -===================================================== +Token Classification Model with Named Entity Recognition (NER) +============================================================== -Token Classification model supports named entity recognition (NER) and other token level classification tasks, as long as the data +The token classification model supports NER and other token-level classification tasks, as long as the data follows the format specified below. -We're going to use NER task throughout this section. NER, also referred to as entity chunking, identification or extraction, is the -task of detecting and classifying key information (entities) in text. In other words, a NER model takes a piece of text as input and -for each word in the text, the model identifies a category the word belongs to. For example, in a sentence: ``Mary lives in Santa Clara -and works at NVIDIA``, the model should detect that ``Mary`` is a person, ``Santa Clara`` is a location and ``NVIDIA`` is a company. +We're going to use NER task throughout this section. NER, also referred to as entity chunking, identification, or extraction, is +the task of detecting and classifying key information (entities) in text. In other words, a NER model takes a piece of text as +input and then determines the category for each word within it. For example, in the sentence “Mary lives in Santa Clara and +works at NVIDIA,” the model should detect that “Mary” is a person, “Santa Clara” is a location, and “NVIDIA” is a company. -Quick Start Guide ------------------ +Quick Start +----------- +1. To run token-level classification, use the following Python script: .. code-block:: python @@ -27,60 +28,62 @@ Quick Start Guide # try the model on a few examples model.add_predictions(['we bought four shirts from the nvidia gear store in santa clara.', 'NVIDIA is a company.']) -.. note:: - We recommend you try this model in a Jupyter notebook (run on `Google's Colab `_.): - `NeMo/tutorials/nlp/Token_Classification_Named_Entity_Recognition.ipynb `__. +2. Try this model in a Jupyter notebook, which you can run on `Google's Colab `_. You can find this script in the + `NeMo tutorial `__. + +3. Connect to an instance with a GPU (**Runtime** -> **Change runtime type** -> select **GPU** for the hardware accelerator). - Connect to an instance with a GPU (**Runtime** -> **Change runtime type** -> select **GPU** for the hardware accelerator). 
+You can find example scripts and configuration files for the token classification model at the following locations: - An example script on how to train the model can be found here: `NeMo/examples/nlp/token_classification/token_classification_train.py `__. +- An example script on how to train the model can be found here: `NeMo training script `_. - An example script on how to run evaluation and inference can be found here: `NeMo/examples/nlp/token_classification/token_classification_evaluate.py `__. +- An example script on how to run evaluation and inference can be found at `NeMo evaluation script `_. - The default configuration file for the model can be found here: `NeMo/examples/nlp/token_classification/conf/token_classification_config.yaml `__. +- The default configuration file for the model can be found at `NeMo configuration file `_. .. _dataset_token_classification: -Data Input for Token Classification Model ------------------------------------------ +Provide Data Input for the Token Classification Model +----------------------------------------------------- -For pre-training or fine-tuning of the model, the data should be split into 2 files: +To pre-train or fine-tune the model, split the data into the following two files: - ``text.txt`` - ``labels.txt`` Each line of the ``text.txt`` file contains text sequences, where words are separated with spaces, i.e.: ``[WORD] [SPACE] [WORD] [SPACE] [WORD]``. The ``labels.txt`` file contains corresponding labels for each word in ``text.txt``, the labels are separated with spaces, i.e.: ``[LABEL] [SPACE] [LABEL] [SPACE] [LABEL]``. -Example of a ``text.txt`` file: +The following is an example of a ``text.txt`` file: Jennifer is from New York City . She likes ... ... -Corresponding ``labels.txt`` file: +The following is an example of the corresponding ``labels.txt`` file: B-PER O O B-LOC I-LOC I-LOC O O O ... ... -Dataset Conversion ------------------- +Convert the Dataset +------------------- -To convert an `IOB format `__ (short for inside, outside, beginning) data to the format required for training, use -`examples/nlp/token_classification/data/import_from_iob_format.py `_. +To convert the IOB tagging format +``__ (short for inside, outside, +beginning) into the format required for training, use the `NeMo import script `_. .. code:: # For conversion from IOB format, for example, for CoNLL-2003 dataset: python import_from_iob_format.py --data_file= -Convert Dataset Required Arguments -^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +Required Arguments for Dataset Conversion +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ - :code:`--data_file`: path to the file to convert from IOB to NeMo format -After running the above command, the data directory, where the :code:`--data_file` is stored, should contain :code:`text_*.txt` and :code:`labels_*.txt` files. +After running the above command, the data directory containing the :code:`--data_file` should include the :code:`text_*.txt` and :code:`labels_*.txt` files. The default names for the training and evaluation in the :code:`conf/token_classification_config.yaml` are the following: .. 
code:: @@ -93,15 +96,15 @@ The default names for the training and evaluation in the :code:`conf/token_class |-- text_train.txt -Training The Token Classification model ---------------------------------------- +Train the Token Classification Model +------------------------------------ -In the Token Classification model, we are jointly training a classifier on top of a pre-trained language model, such as -`BERT: Pre-training of Deep Bidirectional Transformers for Language Understanding `__ :cite:`nlp-ner-devlin2018bert`. +In the token classification model, we are jointly training a classifier on top of a pre-trained language model, such as +`BERT: Pre-training of Deep Bidirectional Transformers for Language Understanding `__ :cite:`nlp-ner2-devlin2018bert`. Unless the user provides a pre-trained checkpoint for the language model, the language model is initialized with the pre-trained model -from `HuggingFace Transformers `__. +from `Hugging Face Transformers `__. -Example of model configuration file for training the model can be found at: `NeMo/examples/nlp/token_classification/conf/token_classification_config.yaml `__. +An example of model configuration file for training the model can be found at `NeMo configuration file `_. The specification can be roughly grouped into three categories: @@ -109,7 +112,7 @@ The specification can be roughly grouped into three categories: - Parameters that describe the datasets: **model.dataset**, **model.train_ds**, **model.validation_ds** - Parameters that describe the model: **model** -More details about parameters in the spec file can be found below: +You can find more details about the spec file parameters in table below. +-------------------------------------------+-----------------+--------------------------------------------------------------------------------------------------------------+ | **Parameter** | **Data Type** | **Description** | @@ -139,7 +142,7 @@ More details about parameters in the spec file can be found below: For more information, see :ref:`nlp_model`. -Example of the command for training the model: +Here is an example command for training the model: .. code:: @@ -152,21 +155,22 @@ Example of the command for training the model: Required Arguments for Training ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +The following argument is required for training: - :code:`model.dataset.data_dir`: path to the directory with pre-processed data. .. note:: - While the arguments are defined in the spec file, if you want to override these parameter definitions in the spec file and - experiment with them, use the command-line to define the parameter. For example, the sample spec file mentioned above has + While the arguments are defined in the spec file, you can override these parameter definitions and experiment with them + using the command line. For example, the sample spec file mentioned above has :code:`validation_ds.batch_size` set to ``64``. However, if the GPU utilization can be optimized further by - using a larger batch size, override it to the desired value by adding the field :code:`validation_ds.batch_size=128` from - the command-line. You can repeat this with any of the parameters defined in the sample spec file. + using a larger batch size, you can override it to the desired value by adding the field :code:`validation_ds.batch_size=128` from + the command-line. You can repeat this process with any of the parameters defined in the sample spec file. 
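For illustration, an override of the validation batch size from the command line might look like the following. This is a minimal sketch: the data directory is a placeholder, and the remaining arguments simply follow the sample spec file.

.. code::

    python examples/nlp/token_classification/token_classification_train.py \
           model.dataset.data_dir=<PATH_TO_DATA_DIR> \
           model.validation_ds.batch_size=128 \
           trainer.max_epochs=5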
Inference --------- -An example script on how to run inference can be found at `examples/nlp/token_classification/token_classification_evaluate.py `_. +An example script on how to run inference can be found at `NeMo evaluation script `_. To run inference with the pre-trained model, run: @@ -177,15 +181,16 @@ To run inference with the pre-trained model, run: Required Arguments for Inference ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +The following argument is required for inference: - :code:`pretrained_model`: pretrained Token Classification model from ``list_available_models()`` or path to a ``.nemo`` file. For example, ``ner_en_bert`` or ``your_model.nemo`` -Model Evaluation ----------------- +Evaluate the Token Classification Model +--------------------------------------- -An example script on how to evaluate the pre-trained model can be found at `examples/nlp/token_classification/token_classification_evaluate.py `_. +An example script on how to evaluate the pre-trained model can be found at `NeMo evaluation script `_. -To start evaluation of the pre-trained model, run: +To start the evaluation of the pre-trained mode, run: .. code:: @@ -197,8 +202,9 @@ To start evaluation of the pre-trained model, run: model.dataset.max_seq_length=512 -Required Arguments -^^^^^^^^^^^^^^^^^^ +Required Arguments for Evaluation +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +The following arguments are required for evaluation: - :code:`pretrained_model`: pretrained Token Classification model from ``list_available_models()`` or path to a ``.nemo`` file. For example, ``ner_en_bert`` or ``your_model.nemo`` - :code:`model.dataset.data_dir`: path to the directory that containes :code:`model.test_ds.text_file` and :code:`model.test_ds.labels_file` @@ -209,12 +215,12 @@ During evaluation of the :code:`test_ds`, the script generates a classification - :code:`Recall` - :code:`F1` -For more information, see `here `__. +For more information, see `Wikipedia `__. References ---------- .. bibliography:: nlp_all.bib :style: plain - :labelprefix: NLP-NER - :keyprefix: nlp-ner- + :labelprefix: NLP-NER2 + :keyprefix: nlp-ner2- diff --git a/docs/source/performance/performance_long_sequence.md b/docs/source/performance/performance_long_sequence.md new file mode 100644 index 000000000000..d9f26dcf0d61 --- /dev/null +++ b/docs/source/performance/performance_long_sequence.md @@ -0,0 +1,134 @@ +# Long Sequence Performance + +## LLAMA2-7B (FP8) + +- The table below shows the pre-training performance of the LLAMA2-7B with CP (context parallelism) and compares it against the results without CP at various input sequence lengths. The detailed model-parallel configurations and the achieved performance are shown in the training results with CP. In non-CP training runs, we use the most performant model- and data-parallel configurations without CP given the memory capacity constraint of the H100 GPU system. + + - Container: [NeMo24.03.01.framework](https://catalog.ngc.nvidia.com/orgs/nvidia/containers/nemo/tags) + - System: DGX-H100 + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
+| SeqLen (K) | # of GPUs | Without CP: TFLOPS / GPU | TP | PP | DP | CP | With CP: TFLOPS / GPU | Speedup with CP/without CP |
+|------------|-----------|--------------------------|----|----|----|----|------------------------|----------------------------|
+| 4          | 4         | 768                      | 1  | 1  | 4  | 1  | 768                    | 1.00                       |
+| 8          | 8         | 730                      | 1  | 2  | 4  | 1  | 730                    | 1.00                       |
+| 16         | 16        | 660                      | 2  | 1  | 8  | 1  | 660                    | 1.00                       |
+| 32         | 32        | 595                      | 2  | 1  | 8  | 2  | 610                    | 1.03                       |
+| 64         | 64        | 534                      | 4  | 1  | 8  | 2  | 574                    | 1.07                       |
+| 128        | 128       | 424                      | 4  | 1  | 8  | 4  | 555                    | 1.31                       |
+| 256        | 256       | 392                      | 4  | 1  | 8  | 8  | 549                    | 1.40                       |
+| 512        | 512       | 104                      | 8  | 1  | 4  | 16 | 549                    | 5.28                       |
+| 1024       | 1024      | 26.5                     | 8  | 1  | 4  | 32 | 536                    | 20.23                      |
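The TP, PP, and CP columns above correspond to the usual NeMo Megatron parallelism overrides. As a rough sketch only (the script path and the remaining hyperparameters are assumptions, and the data-parallel size is implied by the total GPU count rather than set directly), the 32K-sequence row could be expressed as overrides along these lines:

```bash
# Sketch: 32K sequence length on 32 GPUs with TP=2, PP=1, CP=2 (DP=8 implied)
python examples/nlp/language_modeling/megatron_gpt_pretraining.py \
  trainer.num_nodes=4 \
  trainer.devices=8 \
  model.tensor_model_parallel_size=2 \
  model.pipeline_model_parallel_size=1 \
  model.context_parallel_size=2 \
  model.encoder_seq_length=32768
```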
+ + +### Speedup of LLAMA2 7B training with CP over without CP +![cp_speedup_figure](https://github.com/NVIDIA/NeMo/releases/download/r2.0.0rc1/tutorial_cp_speedup_figure.png) \ No newline at end of file diff --git a/examples/asr/_temp/config.yaml b/examples/asr/_temp/config.yaml new file mode 100644 index 000000000000..b0504594a17a --- /dev/null +++ b/examples/asr/_temp/config.yaml @@ -0,0 +1,15 @@ +# temporary config + +model: + type: transformer + num_layers: 6 + num_heads: 8 + hidden_size: 512 + dropout: 0.1 + max_position_embeddings: 4096 + +trainer: + devices: 1 + accelerator: 'cpu' + max_epochs: 50 + precision: 32 \ No newline at end of file diff --git a/examples/asr/conf/fastconformer/fast-conformer_ctc_bpe.yaml b/examples/asr/conf/fastconformer/fast-conformer_ctc_bpe.yaml index d8808b83069c..ea6094380856 100644 --- a/examples/asr/conf/fastconformer/fast-conformer_ctc_bpe.yaml +++ b/examples/asr/conf/fastconformer/fast-conformer_ctc_bpe.yaml @@ -136,6 +136,8 @@ model: xscaling: true # scales up the input embeddings by sqrt(d_model) untie_biases: true # unties the biases of the TransformerXL layers pos_emb_max_len: 5000 + use_pytorch_sdpa: false # use torch sdpa instead of manual attention + use_pytorch_sdpa_backends: [] # empty list means all backends https://pytorch.org/docs/stable/generated/torch.nn.attention.SDPBackend.html e.g. [MATH] # Convolution module's params conv_kernel_size: 9 diff --git a/examples/asr/conf/fastconformer/fast-conformer_transducer_bpe.yaml b/examples/asr/conf/fastconformer/fast-conformer_transducer_bpe.yaml index 90a77dee2913..9e2c1a876864 100644 --- a/examples/asr/conf/fastconformer/fast-conformer_transducer_bpe.yaml +++ b/examples/asr/conf/fastconformer/fast-conformer_transducer_bpe.yaml @@ -145,6 +145,8 @@ model: xscaling: true # scales up the input embeddings by sqrt(d_model) untie_biases: true # unties the biases of the TransformerXL layers pos_emb_max_len: 5000 + use_pytorch_sdpa: false # use torch sdpa instead of manual attention + use_pytorch_sdpa_backends: [] # empty list means all backends https://pytorch.org/docs/stable/generated/torch.nn.attention.SDPBackend.html e.g. [MATH] # Convolution module's params conv_kernel_size: 9 diff --git a/examples/asr/conf/run_local.yaml b/examples/asr/conf/run_local.yaml new file mode 100644 index 000000000000..e9a44335e9a6 --- /dev/null +++ b/examples/asr/conf/run_local.yaml @@ -0,0 +1,16 @@ +# The script to be run. +script: ??? +script_config: ??? + +exp_name: null +results_dir: ??? # Where to store the results of the run + +num_tasks_per_node: 1 + +executor: local + +containers: + asr: gitlab-master.nvidia.com/smajumdar/nemo_containers/asr-run:0.0.1 + +mounts: + - "~/.cache/torch/NeMo:/cache/torch/NeMo" \ No newline at end of file diff --git a/examples/asr/run_helper.py b/examples/asr/run_helper.py new file mode 100644 index 000000000000..6e82f1f35ab1 --- /dev/null +++ b/examples/asr/run_helper.py @@ -0,0 +1,186 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+# See the License for the specific language governing permissions and +# limitations under the License. + +import os +from pathlib import Path + +import nemo_run as run +from omegaconf import OmegaConf, open_dict + +from nemo.collections.common.parts import run_utils +from nemo.core.config import hydra_runner +from nemo.utils import logging + + +NEMO_ROOT = Path(__file__).absolute().parents[2] + + +def gather_mounts(cluster_cfg): + # Gather all mounts from the cluster config including ones which are disjoint from the cluster_cfg.mounts list. + mounts = cluster_cfg.get('mounts', []) + + # Resolve any mounts in th cluster config that need user expansion + mounts = [os.path.expanduser(m) for m in mounts] + + keys = list(cluster_cfg.keys()) + with open_dict(cluster_cfg): + for k in keys: + if k.startswith("mount_"): + logging.info(f"Found additional mount flag in the cluster config `{k}`. Adding it to the mounts list.") + mounts.append(cluster_cfg[k]) + del cluster_cfg[k] + + cluster_cfg['mounts'] = mounts + logging.info(f"Final Mounts: {mounts}") + + +def check_root_path(path, nemo_root): + path = str(path) + nemo_root = str(nemo_root) + + if not os.path.exists(path): + raise FileNotFoundError(f"Path {path} does not exist.") + + if not path.startswith(nemo_root): + raise ValueError(f"Path {path} is not in the NeMo root directory.") + + new_path = path.replace(nemo_root, '/nemo_run/code/') + return new_path + + +def merge_configs(script_config, run_config): + script_config = OmegaConf.load(script_config) + original_script_keys = set(script_config.keys()) + result = OmegaConf.merge(script_config, run_config) + + # delete cluster config keys from the merged config + with open_dict(result): + for k in run_config.keys(): + if k in result and k not in original_script_keys: + del result[k] + + # Check for any ??? 
missing values in result recursively and raise an error if found + def check_missing_values(cfg): + if hasattr(cfg, 'items'): + for k, v in cfg.items(): + if hasattr(v, 'items'): + check_missing_values(v) + elif v == '???': + raise ValueError(f"Missing value for key {k} in the config file") + + check_missing_values(result) + return result + +def check_config_mount_paths(script_config, cluster_config): + # recursively walk all values of the script_config, checking if its a path-like string and if so, check if the path is a mounted path + # if it is not, raise an error + + def check_mounted_path(cfg, cluster_cfg): + if hasattr(cfg, 'items'): + for k, v in cfg.items(): + if hasattr(v, 'items'): + check_mounted_path(v, cluster_cfg) + elif isinstance(v, str): + if v.startswith(os.path.sep): + run_utils.check_if_mounted(cluster_cfg, v) + + check_mounted_path(script_config, cluster_config) + + +def get_execution_script(cluster_script_path, config_name): + # Create the command to run the script + cmd = """ +nvidia-smi && \ +export PYTHONPATH=$PYTHONPATH:/nemo_run/code && \ +export HF_TOKEN={HF_TOKEN} && \ +export WANDB_API_KEY={WANDB} && \ +cd {cluster_script_dir} && \ +python {cluster_script_path} --config-path "/results" --config-name "{config_name}" && \ +cd /results && \ +ls -l; + """ + wandb_key = os.environ.get("WANDB", os.environ.get("WANDB_API_KEY", os.environ.get("WANDB_KEY", ""))) + format_dict = dict( + cluster_script_dir=os.path.dirname(cluster_script_path), + cluster_script_path=os.path.basename(cluster_script_path), + config_name=config_name, + HF_TOKEN=os.getenv('HF_TOKEN', ''), + WANDB=wandb_key, + ) + + cmd = cmd.format(**format_dict) + return cmd + + +@hydra_runner(config_path='conf', config_name='run_local') +def main(cluster_cfg): + script_path = cluster_cfg.script + script_config = cluster_cfg.script_config + results_dir = cluster_cfg.results_dir + + script_path = Path(script_path).absolute() + script_config = Path(script_config).absolute() + + gather_mounts(cluster_cfg) + + # Add the results directory to the cluster config as a mount path + run_utils.add_mount_path(results_dir, '/results', cluster_cfg) + + cluster_script_path = check_root_path(script_path, NEMO_ROOT) + + # Create results and logdir + log_dir = cluster_cfg.get('log_dir', os.path.join(results_dir, 'logs')) + run_utils.create_remote_directory([results_dir, log_dir], cluster_cfg) + + merged_config = merge_configs(script_config, cluster_cfg) + run_utils.create_remote_config(merged_config, "config.yaml", results_dir, cluster_cfg) + + check_config_mount_paths(merged_config, cluster_cfg) + + # Resolve experiment name + exp_name = cluster_cfg.exp_name + if exp_name is None: + if 'exp_manager' in merged_config and 'name' in merged_config['exp_manager']: + exp_name = merged_config['exp_manager']['name'] + else: + raise ValueError("Experiment name not provided in the run config file (`exp_name`)) or the cluster config (inside exp_manager.name)") + + with run.Experiment(exp_name) as exp: + cmd = get_execution_script(cluster_script_path, "config.yaml") + + job_name = f"{exp_name}_job" + num_gpus = cluster_cfg.get('num_gpus', merged_config['trainer']['devices']) + if isinstance(num_gpus, list): + num_gpus = len(num_gpus) + num_nodes = cluster_cfg.get('num_nodes', merged_config['trainer'].get('num_nodes', 1)) + cluster_cfg = OmegaConf.to_object(cluster_cfg) + + run_utils.add_task(exp, + cmd=cmd, + task_name=job_name, + cluster_config=cluster_cfg, + container=cluster_cfg['containers']['asr'], + 
num_tasks=cluster_cfg.get('num_tasks', cluster_cfg.get('num_tasks_per_node', 1)), + num_gpus=num_gpus, + num_nodes=num_nodes, + log_dir=run_utils.get_mounted_filepath(cluster_cfg, log_dir), + partition=cluster_cfg.get('partition', None), + run_after=cluster_cfg.get('run_after', None), + ) + + run_utils.run_exp(exp, cluster_cfg) + + +if __name__ == '__main__': + main() diff --git a/examples/nlp/duplex_text_normalization/README.md b/examples/nlp/duplex_text_normalization/README.md new file mode 100644 index 000000000000..808ed2856fb2 --- /dev/null +++ b/examples/nlp/duplex_text_normalization/README.md @@ -0,0 +1,2 @@ +> [!IMPORTANT] +> This section is no longer supported in NeMo and is scheduled for removal in the 23.11 release. diff --git a/examples/nlp/language_modeling/megatron_ckpt_to_nemo.py b/examples/nlp/language_modeling/megatron_ckpt_to_nemo.py index 40ba35f819ef..c81119489582 100644 --- a/examples/nlp/language_modeling/megatron_ckpt_to_nemo.py +++ b/examples/nlp/language_modeling/megatron_ckpt_to_nemo.py @@ -21,7 +21,9 @@ --checkpoint_name \ --nemo_file_path \ --tensor_model_parallel_size \ - --pipeline_model_parallel_size + --pipeline_model_parallel_size \ + --gpus_per_node \ + --model_type """ import dis @@ -100,7 +102,7 @@ def get_args(): default="gpt", choices=["gpt", "sft", "t5", "bert", "nmt", "bart", "retro"], ) - parser.add_argument("--local_rank", type=int, required=False, default=os.getenv('LOCAL_RANK', -1)) + parser.add_argument("--local-rank", type=int, required=False, default=os.getenv('LOCAL_RANK', -1)) parser.add_argument("--bcp", action="store_true", help="Whether on BCP platform") parser.add_argument( "--precision", @@ -134,7 +136,7 @@ def convert(local_rank, rank, world_size, args): 'accelerator': 'gpu', 'precision': args.precision, }, - 'model': {'native_amp_init_scale': 2 ** 32, 'native_amp_growth_interval': 1000, 'hysteresis': 2}, + 'model': {'native_amp_init_scale': 2**32, 'native_amp_growth_interval': 1000, 'hysteresis': 2}, } cfg = OmegaConf.create(cfg) @@ -142,7 +144,7 @@ def convert(local_rank, rank, world_size, args): # If FP16 create a GradScaler as the build_model_parallel_config of MegatronBaseModel expects it if cfg.trainer.precision == '16-mixed': scaler = GradScaler( - init_scale=cfg.model.get('native_amp_init_scale', 2 ** 32), + init_scale=cfg.model.get('native_amp_init_scale', 2**32), growth_interval=cfg.model.get('native_amp_growth_interval', 1000), hysteresis=cfg.model.get('hysteresis', 2), ) diff --git a/examples/nlp/language_modeling/tuning/conf/megatron_gpt_finetuning_config.yaml b/examples/nlp/language_modeling/tuning/conf/megatron_gpt_finetuning_config.yaml index 06551f46486c..79a07ce4e2c0 100644 --- a/examples/nlp/language_modeling/tuning/conf/megatron_gpt_finetuning_config.yaml +++ b/examples/nlp/language_modeling/tuning/conf/megatron_gpt_finetuning_config.yaml @@ -126,6 +126,13 @@ model: tunable_base_param_names: ["self_attention", "word_embeddings"] # TODO: regex support @adithyre data: + chat: False # whether use chatbot data or not + chat_prompt_tokens: # special tokens for the chat prompts, a dictionary of {token_type: token}. note that some tokenizer may combine the characters at the junction between {end_of_turn}{turn_start}. e.g. '', the '><' sometimes is merged to be a single token. 
This is not supported, try to avoid + system_turn_start: "\x00" + turn_start: "\x11" + label_start: "\x12" + end_of_turn: "\x0A" # \0x0A is '\n' + end_of_name: "\x0A" # \0x0A is '\n' train_ds: # Example of how to specify paths to multiple datasets # file_names: diff --git a/examples/nlp/token_classification/README.md b/examples/nlp/token_classification/README.md new file mode 100644 index 000000000000..808ed2856fb2 --- /dev/null +++ b/examples/nlp/token_classification/README.md @@ -0,0 +1,2 @@ +> [!IMPORTANT] +> This section is no longer supported in NeMo and is scheduled for removal in the 23.11 release. diff --git a/nemo/collections/asr/data/audio_to_text_lhotse.py b/nemo/collections/asr/data/audio_to_text_lhotse.py index 576ea8234c87..f916ae1de56b 100644 --- a/nemo/collections/asr/data/audio_to_text_lhotse.py +++ b/nemo/collections/asr/data/audio_to_text_lhotse.py @@ -51,15 +51,12 @@ def __init__(self, tokenizer): def __getitem__(self, cuts) -> Tuple[torch.Tensor, ...]: audio, audio_lens, cuts = self.load_audio(cuts) tokens = [ - torch.as_tensor( - sum( - ( - # Supervisions may come pre-tokenized from the dataloader. - s.tokens if hasattr(s, "tokens") else self.tokenizer(s.text, s.language) - for s in c.supervisions - ), - start=[], - ) + torch.cat( + [ + torch.as_tensor(s.tokens if hasattr(s, "tokens") else self.tokenizer(s.text, s.language)) + for s in c.supervisions + ], + dim=0, ) for c in cuts ] diff --git a/nemo/collections/asr/metrics/wer.py b/nemo/collections/asr/metrics/wer.py index a135e5c51e84..7bda3a77b278 100644 --- a/nemo/collections/asr/metrics/wer.py +++ b/nemo/collections/asr/metrics/wer.py @@ -254,8 +254,9 @@ def __init__( fold_consecutive=True, batch_dim_index=0, dist_sync_on_step=False, + sync_on_compute=True, ): - super().__init__(dist_sync_on_step=dist_sync_on_step) + super().__init__(dist_sync_on_step=dist_sync_on_step, sync_on_compute=sync_on_compute) self.decoding = decoding self.use_cer = use_cer diff --git a/nemo/collections/asr/modules/conformer_encoder.py b/nemo/collections/asr/modules/conformer_encoder.py index 245404a7601c..27d0cde33f8c 100644 --- a/nemo/collections/asr/modules/conformer_encoder.py +++ b/nemo/collections/asr/modules/conformer_encoder.py @@ -147,6 +147,10 @@ class ConformerEncoder(NeuralModule, StreamingEncoder, Exportable, AccessMixin): Defaults to 1. global_attn_separate (bool): whether the q, k, v layers used for global tokens should be separate. Defaults to False. + use_pytorch_sdpa (bool): use torch sdpa instead of manual attention. + Defaults to False. + use_pytorch_sdpa_backends (list[str]): list of backend names to use in sdpa. None or empty list means all backends. e.g. 
["MATH"] + Defaults to None """ @@ -295,6 +299,8 @@ def __init__( global_tokens: int = 0, global_tokens_spacing: int = 1, global_attn_separate: bool = False, + use_pytorch_sdpa: bool = False, + use_pytorch_sdpa_backends=None, ): super().__init__() d_ff = d_model * ff_expansion_factor @@ -309,6 +315,10 @@ def __init__( self.global_tokens = global_tokens self.global_attn_separate = global_attn_separate self.global_tokens_spacing = global_tokens_spacing + self.use_pytorch_sdpa = use_pytorch_sdpa + if use_pytorch_sdpa_backends is None: + use_pytorch_sdpa_backends = [] + self.use_pytorch_sdpa_backends = use_pytorch_sdpa_backends # Setting up the att_context_size ( @@ -430,6 +440,8 @@ def __init__( pos_bias_v=pos_bias_v, att_context_size=self.att_context_size, use_bias=use_bias, + use_pytorch_sdpa=self.use_pytorch_sdpa, + use_pytorch_sdpa_backends=self.use_pytorch_sdpa_backends, ) self.layers.append(layer) @@ -1028,6 +1040,8 @@ def change_attention_model( max_cache_len=att_context_size[0], pos_bias_u=None, pos_bias_v=None, + use_pytorch_sdpa=self.use_pytorch_sdpa, + use_pytorch_sdpa_backends=self.use_pytorch_sdpa_backends, ) elif self_attention_model == 'rel_pos_local_attn': new_attn = RelPositionMultiHeadAttentionLongformer( @@ -1038,6 +1052,8 @@ def change_attention_model( att_context_size=att_context_size, pos_bias_u=None, pos_bias_v=None, + use_pytorch_sdpa=self.use_pytorch_sdpa, + use_pytorch_sdpa_backends=self.use_pytorch_sdpa_backends, ) elif self_attention_model == 'abs_pos': new_attn = MultiHeadAttention( @@ -1045,6 +1061,8 @@ def change_attention_model( n_feat=self._cfg.d_model, dropout_rate=self._cfg.dropout_att, max_cache_len=att_context_size[0], + use_pytorch_sdpa=self.use_pytorch_sdpa, + use_pytorch_sdpa_backends=self.use_pytorch_sdpa_backends, ) else: raise ValueError( diff --git a/nemo/collections/asr/modules/rnnt.py b/nemo/collections/asr/modules/rnnt.py index 2355cfb7005b..3ab6a432b947 100644 --- a/nemo/collections/asr/modules/rnnt.py +++ b/nemo/collections/asr/modules/rnnt.py @@ -74,8 +74,7 @@ class StatelessTransducerDecoder(rnnt_abstract.AbstractRNNTDecoder, Exportable): @property def input_types(self): - """Returns definitions of module input ports. - """ + """Returns definitions of module input ports.""" return { "targets": NeuralType(('B', 'T'), LabelsType()), "target_length": NeuralType(tuple('B'), LengthsType()), @@ -84,8 +83,7 @@ def input_types(self): @property def output_types(self): - """Returns definitions of module output ports. - """ + """Returns definitions of module output ports.""" return { "outputs": NeuralType(('B', 'D', 'T'), EmbeddedTextType()), "prednet_lengths": NeuralType(tuple('B'), LengthsType()), @@ -317,20 +315,18 @@ def initialize_state(self, y: torch.Tensor) -> List[torch.Tensor]: ] return state - def batch_initialize_states(self, batch_states: List[torch.Tensor], decoder_states: List[List[torch.Tensor]]): + def batch_initialize_states(self, decoder_states: List[List[torch.Tensor]]): """ - Create batch of decoder states. + Creates a stacked decoder states to be passed to prediction network. Args: - batch_states (list): batch of decoder states - ([(B, H)]) - - decoder_states (list of list): list of decoder states - [B x ([(1, C)]] + decoder_states (list of list of torch.Tensor): list of decoder states + [B, 1, C] + - B: Batch size. + - C: Dimensionality of the hidden state. 
Returns: - batch_states (tuple): batch of decoder states - ([(B, C)]) + batch_states (list of torch.Tensor): batch of decoder states [[B x C]] """ new_state = torch.stack([s[0] for s in decoder_states]) @@ -382,7 +378,10 @@ def batch_concat_states(self, batch_states: List[List[torch.Tensor]]) -> List[to @classmethod def batch_replace_states_mask( - cls, src_states: list[torch.Tensor], dst_states: list[torch.Tensor], mask: torch.Tensor, + cls, + src_states: list[torch.Tensor], + dst_states: list[torch.Tensor], + mask: torch.Tensor, ): """Replace states in dst_states with states from src_states using the mask""" # same as `dst_states[0][mask] = src_states[0][mask]`, but non-blocking @@ -390,7 +389,9 @@ def batch_replace_states_mask( @classmethod def batch_replace_states_all( - cls, src_states: list[torch.Tensor], dst_states: list[torch.Tensor], + cls, + src_states: list[torch.Tensor], + dst_states: list[torch.Tensor], ): """Replace states in dst_states with states from src_states""" dst_states[0].copy_(src_states[0]) @@ -449,86 +450,69 @@ def mask_select_states( return [states[0][mask]] def batch_score_hypothesis( - self, hypotheses: List[rnnt_utils.Hypothesis], cache: Dict[Tuple[int], Any], batch_states: List[torch.Tensor] - ) -> Tuple[torch.Tensor, List[torch.Tensor], torch.Tensor]: + self, + hypotheses: List[rnnt_utils.Hypothesis], + cache: Dict[Tuple[int], Any], + ) -> Tuple[List[torch.Tensor], List[List[torch.Tensor]]]: """ Used for batched beam search algorithms. Similar to score_hypothesis method. Args: hypothesis: List of Hypotheses. Refer to rnnt_utils.Hypothesis. cache: Dict which contains a cache to avoid duplicate computations. - batch_states: List of torch.Tensor which represent the states of the RNN for this batch. - Each state is of shape [L, B, H] Returns: - Returns a tuple (b_y, b_states, lm_tokens) such that: - b_y is a torch.Tensor of shape [B, 1, H] representing the scores of the last tokens in the Hypotheses. - b_state is a list of list of RNN states, each of shape [L, B, H]. - Represented as B x List[states]. - lm_token is a list of the final integer tokens of the hypotheses in the batch. + Returns a tuple (batch_dec_out, batch_dec_states) such that: + batch_dec_out: a list of torch.Tensor [1, H] representing the prediction network outputs for the last tokens in the Hypotheses. + batch_dec_states: a list of list of RNN states, each of shape [L, B, H]. Represented as B x List[states]. """ final_batch = len(hypotheses) + if final_batch == 0: raise ValueError("No hypotheses was provided for the batch!") _p = next(self.parameters()) device = _p.device - dtype = _p.dtype tokens = [] - process = [] - done = [None for _ in range(final_batch)] + to_process = [] + final = [None for _ in range(final_batch)] # For each hypothesis, cache the last token of the sequence and the current states - for i, hyp in enumerate(hypotheses): + for final_idx, hyp in enumerate(hypotheses): sequence = tuple(hyp.y_sequence) if sequence in cache: - done[i] = cache[sequence] + final[final_idx] = cache[sequence] else: tokens.append(hyp.y_sequence[-1]) - process.append((sequence, hyp.dec_state)) + to_process.append((sequence, hyp.dec_state)) - if process: - batch = len(process) + if to_process: + batch = len(to_process) # convert list of tokens to torch.Tensor, then reshape. 
tokens = torch.tensor(tokens, device=device, dtype=torch.long).view(batch, -1) - dec_states = self.initialize_state(tokens) # [B, C] - dec_states = self.batch_initialize_states(dec_states, [d_state for seq, d_state in process]) + dec_states = self.batch_initialize_states([d_state for _, d_state in to_process]) - y, dec_states = self.predict( + dec_outputs, dec_states = self.predict( tokens, state=dec_states, add_sos=False, batch_size=batch - ) # [B, 1, H], List([L, 1, H]) + ) # [B, 1, H], B x List([L, 1, H]) - dec_states = tuple(state.to(dtype=dtype) for state in dec_states) + # Update final states and cache shared by entire batch. + processed_idx = 0 + for final_idx in range(final_batch): + if to_process and final[final_idx] is None: + # Select sample's state from the batch state list + new_state = self.batch_select_state(dec_states, processed_idx) - # Update done states and cache shared by entire batch. - j = 0 - for i in range(final_batch): - if done[i] is None: - # Select sample's state from the batch state list - new_state = self.batch_select_state(dec_states, j) + # Cache [1, H] scores of the current y_j, and its corresponding state + final[final_idx] = (dec_outputs[processed_idx], new_state) + cache[to_process[processed_idx][0]] = (dec_outputs[processed_idx], new_state) - # Cache [1, H] scores of the current y_j, and its corresponding state - done[i] = (y[j], new_state) - cache[process[j][0]] = (y[j], new_state) + processed_idx += 1 - j += 1 - - # Set the incoming batch states with the new states obtained from `done`. - batch_states = self.batch_initialize_states(batch_states, [d_state for y_j, d_state in done]) - - # Create batch of all output scores - # List[1, 1, H] -> [B, 1, H] - batch_y = torch.stack([y_j for y_j, d_state in done]) - - # Extract the last tokens from all hypotheses and convert to a tensor - lm_tokens = torch.tensor([h.y_sequence[-1] for h in hypotheses], device=device, dtype=torch.long).view( - final_batch - ) - - return batch_y, batch_states, lm_tokens + return [dec_out for dec_out, _ in final], [dec_states for _, dec_states in final] class RNNTDecoder(rnnt_abstract.AbstractRNNTDecoder, Exportable, AdapterModuleMixin): @@ -591,8 +575,7 @@ class RNNTDecoder(rnnt_abstract.AbstractRNNTDecoder, Exportable, AdapterModuleMi @property def input_types(self): - """Returns definitions of module input ports. - """ + """Returns definitions of module input ports.""" return { "targets": NeuralType(('B', 'T'), LabelsType()), "target_length": NeuralType(tuple('B'), LengthsType()), @@ -601,8 +584,7 @@ def input_types(self): @property def output_types(self): - """Returns definitions of module output ports. - """ + """Returns definitions of module output ports.""" return { "outputs": NeuralType(('B', 'D', 'T'), EmbeddedTextType()), "prednet_lengths": NeuralType(tuple('B'), LengthsType()), @@ -934,23 +916,21 @@ def score_hypothesis( return y, new_state, lm_token def batch_score_hypothesis( - self, hypotheses: List[rnnt_utils.Hypothesis], cache: Dict[Tuple[int], Any], batch_states: List[torch.Tensor] - ) -> Tuple[torch.Tensor, List[torch.Tensor], torch.Tensor]: + self, + hypotheses: List[rnnt_utils.Hypothesis], + cache: Dict[Tuple[int], Any], + ) -> Tuple[List[torch.Tensor], List[List[torch.Tensor]]]: """ Used for batched beam search algorithms. Similar to score_hypothesis method. Args: hypothesis: List of Hypotheses. Refer to rnnt_utils.Hypothesis. cache: Dict which contains a cache to avoid duplicate computations. 
- batch_states: List of torch.Tensor which represent the states of the RNN for this batch. - Each state is of shape [L, B, H] Returns: - Returns a tuple (b_y, b_states, lm_tokens) such that: - b_y is a torch.Tensor of shape [B, 1, H] representing the scores of the last tokens in the Hypotheses. - b_state is a list of list of RNN states, each of shape [L, B, H]. - Represented as B x List[states]. - lm_token is a list of the final integer tokens of the hypotheses in the batch. + Returns a tuple (batch_dec_out, batch_dec_states) such that: + batch_dec_out: a list of torch.Tensor [1, H] representing the prediction network outputs for the last tokens in the Hypotheses. + batch_dec_states: a list of list of RNN states, each of shape [L, B, H]. Represented as B x List[states]. """ final_batch = len(hypotheses) @@ -959,90 +939,69 @@ def batch_score_hypothesis( _p = next(self.parameters()) device = _p.device - dtype = _p.dtype tokens = [] - process = [] - done = [None for _ in range(final_batch)] + to_process = [] + final = [None for _ in range(final_batch)] # For each hypothesis, cache the last token of the sequence and the current states - for i, hyp in enumerate(hypotheses): + for final_idx, hyp in enumerate(hypotheses): sequence = tuple(hyp.y_sequence) if sequence in cache: - done[i] = cache[sequence] + final[final_idx] = cache[sequence] else: tokens.append(hyp.y_sequence[-1]) - process.append((sequence, hyp.dec_state)) + to_process.append((sequence, hyp.dec_state)) - if process: - batch = len(process) + if to_process: + batch = len(to_process) # convert list of tokens to torch.Tensor, then reshape. tokens = torch.tensor(tokens, device=device, dtype=torch.long).view(batch, -1) - dec_states = self.initialize_state(tokens.to(dtype=dtype)) # [L, B, H] - dec_states = self.batch_initialize_states(dec_states, [d_state for seq, d_state in process]) + dec_states = self.batch_initialize_states([d_state for _, d_state in to_process]) - y, dec_states = self.predict( + dec_out, dec_states = self.predict( tokens, state=dec_states, add_sos=False, batch_size=batch - ) # [B, 1, H], List([L, 1, H]) - - dec_states = tuple(state.to(dtype=dtype) for state in dec_states) + ) # [B, 1, H], B x List([L, 1, H]) - # Update done states and cache shared by entire batch. - j = 0 - for i in range(final_batch): - if done[i] is None: - # Select sample's state from the batch state list - new_state = self.batch_select_state(dec_states, j) + # Update final states and cache shared by entire batch. + processed_idx = 0 + for final_idx in range(final_batch): + if final[final_idx] is None: + # Select sample's state from the batch state list + new_state = self.batch_select_state(dec_states, processed_idx) - # Cache [1, H] scores of the current y_j, and its corresponding state - done[i] = (y[j], new_state) - cache[process[j][0]] = (y[j], new_state) + # Cache [1, H] scores of the current y_j, and its corresponding state + final[final_idx] = (dec_out[processed_idx], new_state) + cache[to_process[processed_idx][0]] = (dec_out[processed_idx], new_state) - j += 1 + processed_idx += 1 - # Set the incoming batch states with the new states obtained from `done`. 
- batch_states = self.batch_initialize_states(batch_states, [d_state for y_j, d_state in done]) + return [dec_out for dec_out, _ in final], [dec_states for _, dec_states in final] - # Create batch of all output scores - # List[1, 1, H] -> [B, 1, H] - batch_y = torch.stack([y_j for y_j, d_state in done]) - - # Extract the last tokens from all hypotheses and convert to a tensor - lm_tokens = torch.tensor([h.y_sequence[-1] for h in hypotheses], device=device, dtype=torch.long).view( - final_batch - ) - - return batch_y, batch_states, lm_tokens - - def batch_initialize_states(self, batch_states: List[torch.Tensor], decoder_states: List[List[torch.Tensor]]): + def batch_initialize_states(self, decoder_states: List[List[torch.Tensor]]) -> List[torch.Tensor]: """ - Create batch of decoder states. - - Args: - batch_states (list): batch of decoder states - ([L x (B, H)], [L x (B, H)]) - - decoder_states (list of list): list of decoder states - [B x ([L x (1, H)], [L x (1, H)])] + Creates a stacked decoder states to be passed to prediction network - Returns: - batch_states (tuple): batch of decoder states - ([L x (B, H)], [L x (B, H)]) - """ - # LSTM has 2 states - new_states = [[] for _ in range(len(decoder_states[0]))] - for layer in range(self.pred_rnn_layers): - for state_id in range(len(decoder_states[0])): - # batch_states[state_id][layer] = torch.stack([s[state_id][layer] for s in decoder_states]) - new_state_for_layer = torch.stack([s[state_id][layer] for s in decoder_states]) - new_states[state_id].append(new_state_for_layer) + Args: + decoder_states (list of list of list of torch.Tensor): list of decoder states + [B, C, L, H] + - B: Batch size. + - C: e.g., for LSTM, this is 2: hidden and cell states + - L: Number of layers in prediction RNN. + - H: Dimensionality of the hidden state. - for state_id in range(len(decoder_states[0])): - new_states[state_id] = torch.stack([state for state in new_states[state_id]]) + Returns: + batch_states (list of torch.Tensor): batch of decoder states + [C x torch.Tensor[L x B x H] + """ + # stack decoder states into tensor of shape [B x layers x L x H] + # permute to the target shape [layers x L x B x H] + stacked_states = torch.stack([torch.stack(decoder_state) for decoder_state in decoder_states]) + permuted_states = stacked_states.permute(1, 2, 0, 3) - return new_states + return list(permuted_states.contiguous()) def batch_select_state(self, batch_states: List[torch.Tensor], idx: int) -> List[List[torch.Tensor]]: """Get decoder state from batch of states, for given id. @@ -1058,14 +1017,9 @@ def batch_select_state(self, batch_states: List[torch.Tensor], idx: int) -> List ([L x (1, H)], [L x (1, H)]) """ if batch_states is not None: - state_list = [] - for state_id in range(len(batch_states)): - states = [batch_states[state_id][layer][idx] for layer in range(self.pred_rnn_layers)] - state_list.append(states) + return [state[:, idx] for state in batch_states] - return state_list - else: - return None + return None def batch_concat_states(self, batch_states: List[List[torch.Tensor]]) -> List[torch.Tensor]: """Concatenate a batch of decoder state to a packed state. 
@@ -1083,7 +1037,11 @@ def batch_concat_states(self, batch_states: List[List[torch.Tensor]]) -> List[to for state_id in range(len(batch_states[0])): batch_list = [] for sample_id in range(len(batch_states)): - tensor = torch.stack(batch_states[sample_id][state_id]) # [L, H] + tensor = ( + torch.stack(batch_states[sample_id][state_id]) + if not isinstance(batch_states[sample_id][state_id], torch.Tensor) + else batch_states[sample_id][state_id] + ) # [L, H] tensor = tensor.unsqueeze(0) # [1, L, H] batch_list.append(tensor) @@ -1109,7 +1067,9 @@ def batch_replace_states_mask( @classmethod def batch_replace_states_all( - cls, src_states: Tuple[torch.Tensor, torch.Tensor], dst_states: Tuple[torch.Tensor, torch.Tensor], + cls, + src_states: Tuple[torch.Tensor, torch.Tensor], + dst_states: Tuple[torch.Tensor, torch.Tensor], ): """Replace states in dst_states with states from src_states""" dst_states[0].copy_(src_states[0]) @@ -1249,12 +1209,15 @@ class RNNTJoint(rnnt_abstract.AbstractRNNTJoint, Exportable, AdapterModuleMixin) fused_batch_size: Optional int, required if `fuse_loss_wer` flag is set. Determines the size of the sub-batches. Should be any value below the actual batch size per GPU. + masking_prob: Optional float, indicating the probability of masking out decoder output in HAINAN + (Hybrid Autoregressive Inference Transducer) model, described in https://arxiv.org/pdf/2410.02597 + Default to -1.0, which runs standard Joint network computation; if > 0, then masking out decoder output + with the specified probability. """ @property def input_types(self): - """Returns definitions of module input ports. - """ + """Returns definitions of module input ports.""" return { "encoder_outputs": NeuralType(('B', 'D', 'T'), AcousticEncodedRepresentation()), "decoder_outputs": NeuralType(('B', 'D', 'T'), EmbeddedTextType()), @@ -1266,8 +1229,7 @@ def input_types(self): @property def output_types(self): - """Returns definitions of module output ports. 
- """ + """Returns definitions of module output ports.""" if not self._fuse_loss_wer: return { "outputs": NeuralType(('B', 'T', 'T', 'D'), LogprobsType()), @@ -1313,6 +1275,7 @@ def __init__( fuse_loss_wer: bool = False, fused_batch_size: Optional[int] = None, experimental_fuse_loss_wer: Any = None, + masking_prob: float = -1.0, ): super().__init__() @@ -1322,6 +1285,10 @@ def __init__( self._num_extra_outputs = num_extra_outputs self._num_classes = num_classes + 1 + num_extra_outputs # 1 is for blank + self.masking_prob = masking_prob + if self.masking_prob > 0.0: + assert self.masking_prob < 1.0, "masking_prob must be between 0 and 1" + if experimental_fuse_loss_wer is not None: # Override fuse_loss_wer from deprecated argument fuse_loss_wer = experimental_fuse_loss_wer @@ -1490,6 +1457,10 @@ def forward( sub_transcripts = sub_transcripts.detach() # Update WER on each process without syncing + if self.training: + original_sync = self.wer._to_sync + self.wer._to_sync = False + self.wer.update( predictions=sub_enc, predictions_lengths=sub_enc_lens, @@ -1500,6 +1471,9 @@ def forward( wer, wer_num, wer_denom = self.wer.compute() self.wer.reset() + if self.training: + self.wer._to_sync = original_sync + wers.append(wer) wer_nums.append(wer_num) wer_denoms.append(wer_denom) @@ -1578,6 +1552,13 @@ def joint_after_projection(self, f: torch.Tensor, g: torch.Tensor) -> torch.Tens """ f = f.unsqueeze(dim=2) # (B, T, 1, H) g = g.unsqueeze(dim=1) # (B, 1, U, H) + + if self.training and self.masking_prob > 0: + [B, _, U, _] = g.shape + rand = torch.rand([B, 1, U, 1]).to(g.device) + rand = torch.gt(rand, self.masking_prob) + g = g * rand + inp = f + g # [B, T, U, H] del f, g @@ -2047,7 +2028,11 @@ def forward( return losses, wer, wer_num, wer_denom def sampled_joint( - self, f: torch.Tensor, g: torch.Tensor, transcript: torch.Tensor, transcript_lengths: torch.Tensor, + self, + f: torch.Tensor, + g: torch.Tensor, + transcript: torch.Tensor, + transcript_lengths: torch.Tensor, ) -> torch.Tensor: """ Compute the sampled joint step of the network. diff --git a/nemo/collections/asr/modules/rnnt_abstract.py b/nemo/collections/asr/modules/rnnt_abstract.py index d3d9b7cb52d6..c895fc6deaf1 100644 --- a/nemo/collections/asr/modules/rnnt_abstract.py +++ b/nemo/collections/asr/modules/rnnt_abstract.py @@ -226,7 +226,7 @@ def score_hypothesis( raise NotImplementedError() def batch_score_hypothesis( - self, hypotheses: List[Hypothesis], cache: Dict[Tuple[int], Any], batch_states: List[torch.Tensor] + self, hypotheses: List[Hypothesis], cache: Dict[Tuple[int], Any] ) -> Tuple[torch.Tensor, List[torch.Tensor], torch.Tensor]: """ Used for batched beam search algorithms. Similar to score_hypothesis method. @@ -234,33 +234,30 @@ def batch_score_hypothesis( Args: hypothesis: List of Hypotheses. Refer to rnnt_utils.Hypothesis. cache: Dict which contains a cache to avoid duplicate computations. - batch_states: List of torch.Tensor which represent the states of the RNN for this batch. - Each state is of shape [L, B, H] Returns: - Returns a tuple (b_y, b_states, lm_tokens) such that: - b_y is a torch.Tensor of shape [B, 1, H] representing the scores of the last tokens in the Hypotheses. - b_state is a list of list of RNN states, each of shape [L, B, H]. - Represented as B x List[states]. - lm_token is a list of the final integer tokens of the hypotheses in the batch. 
+ Returns a tuple (batch_dec_out, batch_dec_states) such that: + batch_dec_out: a list of torch.Tensor [1, H] representing the prediction network outputs for the last tokens in the Hypotheses. + batch_dec_states: a list of list of RNN states, each of shape [L, B, H]. Represented as B x List[states]. """ raise NotImplementedError() - def batch_initialize_states(self, batch_states: List[torch.Tensor], decoder_states: List[List[torch.Tensor]]): + def batch_initialize_states(self, decoder_states: List[List[torch.Tensor]]): """ - Create batch of decoder states. + Creates a stacked decoder states to be passed to prediction network - Args: - batch_states (list): batch of decoder states - ([L x (B, H)], [L x (B, H)]) - - decoder_states (list of list): list of decoder states - [B x ([L x (1, H)], [L x (1, H)])] + Args: + decoder_states (list of list of list of torch.Tensor): list of decoder states + [B, C, L, H] + - B: Batch size. + - C: e.g., for LSTM, this is 2: hidden and cell states + - L: Number of layers in prediction RNN. + - H: Dimensionality of the hidden state. - Returns: - batch_states (tuple): batch of decoder states - ([L x (B, H)], [L x (B, H)]) - """ + Returns: + batch_states (list of torch.Tensor): batch of decoder states + [C x torch.Tensor[L x B x H] + """ raise NotImplementedError() def batch_select_state(self, batch_states: List[torch.Tensor], idx: int) -> List[List[torch.Tensor]]: @@ -280,14 +277,19 @@ def batch_select_state(self, batch_states: List[torch.Tensor], idx: int) -> List @classmethod def batch_replace_states_mask( - cls, src_states: list[torch.Tensor], dst_states: list[torch.Tensor], mask: torch.Tensor, + cls, + src_states: list[torch.Tensor], + dst_states: list[torch.Tensor], + mask: torch.Tensor, ): """Replace states in dst_states with states from src_states using the mask, in a way that does not synchronize with the CPU""" raise NotImplementedError() @classmethod def batch_replace_states_all( - cls, src_states: list[torch.Tensor], dst_states: list[torch.Tensor], + cls, + src_states: list[torch.Tensor], + dst_states: list[torch.Tensor], ): """Replace states in dst_states with states from src_states""" raise NotImplementedError() @@ -320,7 +322,7 @@ def batch_copy_states( value: Optional[float] = None, ) -> List[torch.Tensor]: """Copy states from new state to old state at certain indices. 
- + Args: old_states(list): packed decoder states (L x B x H, L x B x H) diff --git a/nemo/collections/asr/parts/submodules/adapters/multi_head_attention_adapter_module.py b/nemo/collections/asr/parts/submodules/adapters/multi_head_attention_adapter_module.py index 2617ed6f575b..4f5f7364171e 100644 --- a/nemo/collections/asr/parts/submodules/adapters/multi_head_attention_adapter_module.py +++ b/nemo/collections/asr/parts/submodules/adapters/multi_head_attention_adapter_module.py @@ -134,8 +134,17 @@ def __init__( dropout_rate: float, proj_dim: Optional[int] = None, adapter_strategy: MHAResidualAddAdapterStrategy = None, + use_pytorch_sdpa: bool = False, + use_pytorch_sdpa_backends: Optional[list] = None, ): - super().__init__(n_head=n_head, n_feat=n_feat, dropout_rate=dropout_rate, max_cache_len=0) + super().__init__( + n_head=n_head, + n_feat=n_feat, + dropout_rate=dropout_rate, + max_cache_len=0, + use_pytorch_sdpa=use_pytorch_sdpa, + use_pytorch_sdpa_backends=use_pytorch_sdpa_backends, + ) self.pre_norm = nn.LayerNorm(n_feat) @@ -200,6 +209,8 @@ class MultiHeadAttentionAdapterConfig: dropout_rate: float = 0.0 proj_dim: Optional[int] = None adapter_strategy: Optional[Any] = field(default_factory=lambda: MHAResidualAddAdapterStrategyConfig()) + use_pytorch_sdpa: bool = False + use_pytorch_sdpa_backends: Optional[list] = None _target_: str = "{0}.{1}".format(MultiHeadAttentionAdapter.__module__, MultiHeadAttentionAdapter.__name__) @@ -225,9 +236,18 @@ def __init__( dropout_rate: float, proj_dim: Optional[int] = None, adapter_strategy: MHAResidualAddAdapterStrategyConfig = None, + use_pytorch_sdpa: bool = False, + use_pytorch_sdpa_backends: Optional[list] = None, ): super().__init__( - n_head=n_head, n_feat=n_feat, dropout_rate=dropout_rate, pos_bias_u=None, pos_bias_v=None, max_cache_len=0 + n_head=n_head, + n_feat=n_feat, + dropout_rate=dropout_rate, + pos_bias_u=None, + pos_bias_v=None, + max_cache_len=0, + use_pytorch_sdpa=use_pytorch_sdpa, + use_pytorch_sdpa_backends=use_pytorch_sdpa_backends, ) self.pre_norm = nn.LayerNorm(n_feat) @@ -305,6 +325,8 @@ class RelPositionMultiHeadAttentionAdapterConfig: dropout_rate: float = 0.0 proj_dim: Optional[int] = None adapter_strategy: Optional[Any] = field(default_factory=lambda: MHAResidualAddAdapterStrategyConfig()) + use_pytorch_sdpa: bool = False + use_pytorch_sdpa_backends: Optional[list] = None _target_: str = "{0}.{1}".format( RelPositionMultiHeadAttentionAdapter.__module__, RelPositionMultiHeadAttentionAdapter.__name__ ) diff --git a/nemo/collections/asr/parts/submodules/conformer_modules.py b/nemo/collections/asr/parts/submodules/conformer_modules.py index c2d897d63225..b3098ad89ffe 100644 --- a/nemo/collections/asr/parts/submodules/conformer_modules.py +++ b/nemo/collections/asr/parts/submodules/conformer_modules.py @@ -77,9 +77,15 @@ def __init__( pos_bias_v=None, att_context_size=[-1, -1], use_bias=True, + use_pytorch_sdpa=False, + use_pytorch_sdpa_backends=None, ): super(ConformerLayer, self).__init__() + self.use_pytorch_sdpa = use_pytorch_sdpa + if use_pytorch_sdpa_backends is None: + use_pytorch_sdpa_backends = [] + self.use_pytorch_sdpa_backends = use_pytorch_sdpa_backends self.self_attention_model = self_attention_model self.n_heads = n_heads self.fc_factor = 0.5 @@ -111,6 +117,8 @@ def __init__( pos_bias_v=pos_bias_v, max_cache_len=MHA_max_cache_len, use_bias=use_bias, + use_pytorch_sdpa=self.use_pytorch_sdpa, + use_pytorch_sdpa_backends=self.use_pytorch_sdpa_backends, ) elif self_attention_model == 'rel_pos_local_attn': 
self.self_attn = RelPositionMultiHeadAttentionLongformer( @@ -133,6 +141,8 @@ def __init__( dropout_rate=dropout_att, max_cache_len=MHA_max_cache_len, use_bias=use_bias, + use_pytorch_sdpa=self.use_pytorch_sdpa, + use_pytorch_sdpa_backends=self.use_pytorch_sdpa_backends, ) else: raise ValueError( diff --git a/nemo/collections/asr/parts/submodules/cuda_graph_rnnt_greedy_decoding.py b/nemo/collections/asr/parts/submodules/cuda_graph_rnnt_greedy_decoding.py index aa49435ded16..fc501b3d00de 100644 --- a/nemo/collections/asr/parts/submodules/cuda_graph_rnnt_greedy_decoding.py +++ b/nemo/collections/asr/parts/submodules/cuda_graph_rnnt_greedy_decoding.py @@ -293,6 +293,13 @@ def __call__( device: torch.device, partial_hypotheses: Optional[List[rnnt_utils.Hypothesis]] = None, ): + if x.device.type != "cuda": + # If CUDA graphs are enabled and "frame-looping" algorithm is requested, current class + # is not suitable to handle non-CUDA inputs; thus we are passing them to original caller + return self.caller._greedy_decode_blank_as_pad_loop_frames( + x=x, out_len=out_len, device=device, partial_hypotheses=partial_hypotheses + ) + if partial_hypotheses is not None: raise NotImplementedError( "`partial_hypotheses` support is not available " diff --git a/nemo/collections/asr/parts/submodules/multi_head_attention.py b/nemo/collections/asr/parts/submodules/multi_head_attention.py index de86132a721b..3e6c056bd7b5 100644 --- a/nemo/collections/asr/parts/submodules/multi_head_attention.py +++ b/nemo/collections/asr/parts/submodules/multi_head_attention.py @@ -38,6 +38,7 @@ import torch import torch.nn as nn +import torch.nn.attention import torch.nn.functional as F from nemo.utils import avoid_float16_autocast_context @@ -48,6 +49,8 @@ 'PositionalEncoding', ] +INF_VAL = 10000.0 + class MultiHeadAttention(nn.Module): """Multi-Head Attention layer of Transformer. @@ -56,13 +59,35 @@ class MultiHeadAttention(nn.Module): n_feat (int): size of the features dropout_rate (float): dropout rate use_bias (bool): whether to remove bias in linear and conv layers + use_pytorch_sdpa (bool): use torch sdpa instead of manual attention + use_pytorch_sdpa_backends list[str]: list of backend names to use in sdpa. None or empty list means all backends. e.g. 
["MATH"] """ - def __init__(self, n_head, n_feat, dropout_rate, max_cache_len=0, use_bias=True): + def __init__( + self, + n_head, + n_feat, + dropout_rate, + max_cache_len=0, + use_bias=True, + use_pytorch_sdpa=False, + use_pytorch_sdpa_backends=None, + ): """Construct an MultiHeadedAttention object.""" super(MultiHeadAttention, self).__init__() + self.use_pytorch_sdpa = use_pytorch_sdpa + if self.use_pytorch_sdpa and use_pytorch_sdpa_backends: + use_pytorch_sdpa_backends = list( + map( + lambda backend_name: getattr(torch.nn.attention.SDPBackend, backend_name), + use_pytorch_sdpa_backends, + ) + ) + self.use_pytorch_sdpa_backends = use_pytorch_sdpa_backends + self.cache_drop_size = None self.use_bias = use_bias + self.dropout_rate = dropout_rate assert n_feat % n_head == 0 # We assume d_v always equals d_k self.d_k = n_feat // n_head @@ -109,7 +134,7 @@ def forward_attention(self, value, scores, mask): n_batch = value.size(0) if mask is not None: mask = mask.unsqueeze(1) # (batch, 1, time1, time2) - scores = scores.masked_fill(mask, -10000.0) + scores = scores.masked_fill(mask, -INF_VAL) attn = torch.softmax(scores, dim=-1).masked_fill(mask, 0.0) # (batch, head, time1, time2) else: attn = torch.softmax(scores, dim=-1) # (batch, head, time1, time2) @@ -141,8 +166,36 @@ def forward(self, query, key, value, mask, pos_emb=None, cache=None): # temporary until we solve this more gracefully with avoid_float16_autocast_context(): q, k, v = self.forward_qkv(query, key, value) - scores = torch.matmul(q, k.transpose(-2, -1)) / self.s_d_k - out = self.forward_attention(v, scores, mask) + + if self.use_pytorch_sdpa: + n_batch = value.size(0) + + if mask is not None: + mask = ~mask.unsqueeze(1) + + dropout_rate = self.dropout_rate if self.training else 0 + if self.use_pytorch_sdpa_backends: + with torch.nn.attention.sdpa_kernel(self.use_pytorch_sdpa_backends): + out = torch.nn.functional.scaled_dot_product_attention( + q, k, v, attn_mask=mask, dropout_p=dropout_rate + ) + else: + out = torch.nn.functional.scaled_dot_product_attention( + q, k, v, attn_mask=mask, dropout_p=dropout_rate + ) + + # this IF block can be deleted when https://github.com/pytorch/pytorch/pull/131863 is in the stable version + if mask is not None: + all_masked_rows = torch.all(~mask, dim=-1) + all_masked_rows.unsqueeze_(-1) + out = out.masked_fill(all_masked_rows, 0.0) + + out = out.transpose(1, 2).reshape(n_batch, -1, self.h * self.d_k) # (batch, time1, d_model) + out = self.linear_out(out) # (batch, time1, d_model) + else: + scores = torch.matmul(q, k.transpose(-2, -1)) / self.s_d_k + out = self.forward_attention(v, scores, mask) + if cache is None: return out else: @@ -166,7 +219,18 @@ class RelPositionMultiHeadAttention(MultiHeadAttention): use_bias (bool): whether to apply bias in linear and conv layers of MultiHeadAttention """ - def __init__(self, n_head, n_feat, dropout_rate, pos_bias_u, pos_bias_v, max_cache_len=0, use_bias=True): + def __init__( + self, + n_head, + n_feat, + dropout_rate, + pos_bias_u, + pos_bias_v, + max_cache_len=0, + use_bias=True, + use_pytorch_sdpa=False, + use_pytorch_sdpa_backends=None, + ): """Construct an RelPositionMultiHeadedAttention object.""" super().__init__( n_head=n_head, @@ -174,6 +238,8 @@ def __init__(self, n_head, n_feat, dropout_rate, pos_bias_u, pos_bias_v, max_cac dropout_rate=dropout_rate, max_cache_len=max_cache_len, use_bias=use_bias, + use_pytorch_sdpa=use_pytorch_sdpa, + use_pytorch_sdpa_backends=use_pytorch_sdpa_backends, ) # linear transformation for positional encoding 
self.linear_pos = nn.Linear(n_feat, n_feat, bias=False) @@ -228,6 +294,7 @@ def forward(self, query, key, value, mask, pos_emb, cache=None): q = q.transpose(1, 2) # (batch, time1, head, d_k) n_batch_pos = pos_emb.size(0) + n_batch = value.size(0) p = self.linear_pos(pos_emb).view(n_batch_pos, -1, self.h, self.d_k) p = p.transpose(1, 2) # (batch, head, time1, d_k) @@ -240,18 +307,46 @@ def forward(self, query, key, value, mask, pos_emb, cache=None): # first compute matrix a and matrix c # as described in https://arxiv.org/abs/1901.02860 Section 3.3 # (batch, head, time1, time2) - matrix_ac = torch.matmul(q_with_bias_u, k.transpose(-2, -1)) # compute matrix b and matrix d # (batch, head, time1, time2) matrix_bd = torch.matmul(q_with_bias_v, p.transpose(-2, -1)) matrix_bd = self.rel_shift(matrix_bd) - # drops extra elements in the matrix_bd to match the matrix_ac's size - matrix_bd = matrix_bd[:, :, :, : matrix_ac.size(-1)] - scores = (matrix_ac + matrix_bd) / self.s_d_k # (batch, head, time1, time2) + if self.use_pytorch_sdpa: + scale_factor = 1 / math.sqrt(q_with_bias_u.size(-1)) + matrix_bd = matrix_bd[:, :, :, : k.size(-2)] * scale_factor + + if mask is not None: + mask = mask.unsqueeze(1) + matrix_bd.masked_fill_(mask, -INF_VAL) - out = self.forward_attention(v, scores, mask) + dropout_rate = self.dropout_rate if self.training else 0 + if self.use_pytorch_sdpa_backends: + with torch.nn.attention.sdpa_kernel(self.use_pytorch_sdpa_backends): + out = torch.nn.functional.scaled_dot_product_attention( + q_with_bias_u, k, v, attn_mask=matrix_bd, dropout_p=dropout_rate + ) + else: + out = torch.nn.functional.scaled_dot_product_attention( + q_with_bias_u, k, v, attn_mask=matrix_bd, dropout_p=dropout_rate + ) + + # this IF block can be deleted when https://github.com/pytorch/pytorch/pull/131863 is in the stable version + if mask is not None: + all_masked_rows = torch.all(mask, dim=-1) + all_masked_rows.unsqueeze_(-1) + all_masked_rows = all_masked_rows.expand(-1, out.size(1), -1, out.size(-1)) + out = out.masked_fill(all_masked_rows, 0.0) + + out = out.transpose(1, 2).reshape(n_batch, -1, self.h * self.d_k) # (batch, time1, d_model) + out = self.linear_out(out) # (batch, time1, d_model) + else: + # drops extra elements in the matrix_bd to match the matrix_ac's size + matrix_ac = torch.matmul(q_with_bias_u, k.transpose(-2, -1)) + matrix_bd = matrix_bd[:, :, :, : matrix_ac.size(-1)] + scores = (matrix_ac + matrix_bd) / self.s_d_k # (batch, head, time1, time2) + out = self.forward_attention(v, scores, mask) if cache is None: return out @@ -292,6 +387,8 @@ def __init__( global_tokens_spacing=1, global_attn_separate=False, use_bias=True, + use_pytorch_sdpa=False, + use_pytorch_sdpa_backends=None, ): """Construct an RelPositionMultiHeadAttentionLongformer object.""" super().__init__( @@ -302,7 +399,13 @@ def __init__( pos_bias_v=pos_bias_v, max_cache_len=max_cache_len, use_bias=use_bias, + use_pytorch_sdpa=use_pytorch_sdpa, + use_pytorch_sdpa_backends=use_pytorch_sdpa_backends, ) + + if use_pytorch_sdpa: + raise NotImplementedError("Not implemented for Longformer yet") + self.att_context_size = att_context_size self.global_tokens = global_tokens self.global_tokens_spacing = global_tokens_spacing @@ -374,14 +477,14 @@ def forward(self, query, key, value, pad_mask, pos_emb, cache=None): # (batch, head, time, 2w + 1) # mask invalid positions - scores[:, :, :, :start_pos] = -10000.0 - scores[:, :, :, end_pos + 1 :] = -10000.0 + scores[:, :, :, :start_pos] = -INF_VAL + scores[:, :, :, end_pos + 1 :] = 
-INF_VAL # This implementation is fast and takes very little memory because num_heads x hidden_size = 1 # from (bsz x seq_len) to (bsz x num_heads x seqlen x hidden_size) mask = mask.unsqueeze(dim=1).unsqueeze(dim=-1) # cast to float/half then replace 1's with -inf - float_mask = mask.type_as(scores).masked_fill(mask, -10000.0) + float_mask = mask.type_as(scores).masked_fill(mask, -INF_VAL) ones = float_mask.new_ones(size=float_mask.size()) # tensor of ones # diagonal mask with zeros everywhere and -inf inplace of padding d_mask = self.sliding_chunks_matmul_qk(ones, float_mask, w, padding_value=0.0) @@ -914,7 +1017,7 @@ def create_pe(self, positions, dtype): pe = torch.zeros(pos_length, self.d_model, device=positions.device) div_term = torch.exp( torch.arange(0, self.d_model, 2, dtype=torch.float32, device=positions.device) - * -(math.log(10000.0) / self.d_model) + * -(math.log(INF_VAL) / self.d_model) ) pe[:, 0::2] = torch.sin(positions * div_term) pe[:, 1::2] = torch.cos(positions * div_term) diff --git a/nemo/collections/asr/parts/submodules/rnnt_beam_decoding.py b/nemo/collections/asr/parts/submodules/rnnt_beam_decoding.py index 25becda6fa75..c01f2363db75 100644 --- a/nemo/collections/asr/parts/submodules/rnnt_beam_decoding.py +++ b/nemo/collections/asr/parts/submodules/rnnt_beam_decoding.py @@ -725,11 +725,11 @@ def time_sync_decoding( D = [] # Decode a batch of beam states and scores - beam_y, beam_state, beam_lm_tokens = self.decoder.batch_score_hypothesis(C, cache, beam_state) + beam_y, beam_state = self.decoder.batch_score_hypothesis(C, cache) # Extract the log probabilities and the predicted tokens beam_logp = torch.log_softmax( - self.joint.joint(h_enc, beam_y) / self.softmax_temperature, dim=-1 + self.joint.joint(h_enc, torch.stack(beam_y)) / self.softmax_temperature, dim=-1 ) # [B, 1, 1, V + 1] beam_logp = beam_logp[:, 0, 0, :] # [B, V + 1] beam_topk = beam_logp[:, ids].topk(beam, dim=-1) @@ -776,7 +776,7 @@ def time_sync_decoding( new_hyp = Hypothesis( score=(hyp.score + float(logp)), y_sequence=(hyp.y_sequence + [int(k)]), - dec_state=self.decoder.batch_select_state(beam_state, j), + dec_state=beam_state[j], lm_state=hyp.lm_state, timestep=hyp.timestep[:] + [i], length=encoded_lengths, @@ -859,6 +859,7 @@ def align_length_sync_decoding( beam_state = self.decoder.initialize_state( torch.zeros(beam, device=h.device, dtype=h.dtype) ) # [L, B, H], [L, B, H] for LSTMS + beam_state = [self.decoder.batch_select_state(beam_state, 0)] # compute u_max as either a specific static limit, # or a multiple of current `h_length` dynamically. @@ -872,7 +873,7 @@ def align_length_sync_decoding( Hypothesis( y_sequence=[self.blank], score=0.0, - dec_state=self.decoder.batch_select_state(beam_state, 0), + dec_state=beam_state[0], timestep=[-1], length=0, ) @@ -919,14 +920,8 @@ def align_length_sync_decoding( sub_batch_ids.remove(id) # extract the states of the sub batch only. 
- if isinstance(self.decoder, RNNTDecoder): - # LSTM decoder, state is [layer x batch x hidden] - beam_state_ = [ - beam_state[state_id][:, sub_batch_ids, :] for state_id in range(len(beam_state)) - ] - elif isinstance(self.decoder, StatelessTransducerDecoder): - # stateless decoder, state is [batch x hidden] - beam_state_ = [beam_state[state_id][sub_batch_ids, :] for state_id in range(len(beam_state))] + if isinstance(self.decoder, RNNTDecoder) or isinstance(self.decoder, StatelessTransducerDecoder): + beam_state_ = (beam_state[sub_batch_id] for sub_batch_id in sub_batch_ids) else: raise NotImplementedError("Unknown decoder type.") @@ -935,22 +930,21 @@ def align_length_sync_decoding( beam_state_ = beam_state # Decode a batch/sub-batch of beam states and scores - beam_y, beam_state_, beam_lm_tokens = self.decoder.batch_score_hypothesis(B_, cache, beam_state_) + beam_y, beam_state_ = self.decoder.batch_score_hypothesis(B_, cache) # If only a subset of batch ids were updated (some were removed) if sub_batch_ids is not None: # For each state in the RNN (2 for LSTM) - for state_id in range(len(beam_state)): - # Update the current batch states with the sub-batch states (in the correct indices) - # These indices are specified by sub_batch_ids, the ids of samples which were updated. - if isinstance(self.decoder, RNNTDecoder): - # LSTM decoder, state is [layer x batch x hidden] - beam_state[state_id][:, sub_batch_ids, :] = beam_state_[state_id][...] - elif isinstance(self.decoder, StatelessTransducerDecoder): - # stateless decoder, state is [batch x hidden] - beam_state[state_id][sub_batch_ids, :] = beam_state_[state_id][...] - else: - raise NotImplementedError("Unknown decoder type.") + # Update the current batch states with the sub-batch states (in the correct indices) + # These indices are specified by sub_batch_ids, the ids of samples which were updated. 
+ if isinstance(self.decoder, RNNTDecoder) or isinstance(self.decoder, StatelessTransducerDecoder): + # LSTM decoder, state is [layer x batch x hidden] + index = 0 + for sub_batch_id in sub_batch_ids: + beam_state[sub_batch_id] = beam_state_[index] + index += 1 + else: + raise NotImplementedError("Unknown decoder type.") else: # If entire batch was updated, simply update all the states beam_state = beam_state_ @@ -963,7 +957,7 @@ def align_length_sync_decoding( # Extract the log probabilities and the predicted tokens beam_logp = torch.log_softmax( - self.joint.joint(h_enc, beam_y) / self.softmax_temperature, dim=-1 + self.joint.joint(h_enc, torch.stack(beam_y)) / self.softmax_temperature, dim=-1 ) # [B=beam, 1, 1, V + 1] beam_logp = beam_logp[:, 0, 0, :] # [B=beam, V + 1] beam_topk = beam_logp[:, ids].topk(beam, dim=-1) @@ -1011,7 +1005,7 @@ def align_length_sync_decoding( new_hyp = Hypothesis( score=(hyp.score + float(logp)), y_sequence=(hyp.y_sequence[:] + [int(k)]), - dec_state=self.decoder.batch_select_state(beam_state, h_states_idx), + dec_state=beam_state[h_states_idx], lm_state=hyp.lm_state, timestep=hyp.timestep[:] + [i], length=i, @@ -1084,7 +1078,7 @@ def modified_adaptive_expansion_search( # prepare the batched beam states beam = min(self.beam_size, self.vocab_size) beam_state = self.decoder.initialize_state( - torch.zeros(beam, device=h.device, dtype=h.dtype) + torch.zeros(1, device=h.device, dtype=h.dtype) ) # [L, B, H], [L, B, H] for LSTMS # Initialize first hypothesis for the beam (blank) @@ -1106,8 +1100,8 @@ def modified_adaptive_expansion_search( hyp.alignments = [[]] # Decode a batch of beam states and scores - beam_dec_out, beam_state, beam_lm_tokens = self.decoder.batch_score_hypothesis(init_tokens, cache, beam_state) - state = self.decoder.batch_select_state(beam_state, 0) + beam_dec_out, beam_state = self.decoder.batch_score_hypothesis(init_tokens, cache) + state = beam_state[0] # Setup ngram LM: if self.ngram_lm: @@ -1267,18 +1261,10 @@ def modified_adaptive_expansion_search( break else: - # Initialize the beam states for the hypotheses in the expannsion list - beam_state = self.decoder.batch_initialize_states( - beam_state, - [hyp.dec_state for hyp in list_exp], - # [hyp.y_sequence for hyp in list_exp], # - ) - # Decode a batch of beam states and scores - beam_dec_out, beam_state, beam_lm_tokens = self.decoder.batch_score_hypothesis( + beam_dec_out, beam_state = self.decoder.batch_score_hypothesis( list_exp, cache, - beam_state, # self.language_model is not None, ) @@ -1300,7 +1286,7 @@ def modified_adaptive_expansion_search( for i, hyp in enumerate(list_exp): # Preserve the decoder logits for the current beam hyp.dec_out.append(beam_dec_out[i]) - hyp.dec_state = self.decoder.batch_select_state(beam_state, i) + hyp.dec_state = beam_state[i] # TODO: Setup LM if self.language_model is not None: @@ -1325,7 +1311,7 @@ def modified_adaptive_expansion_search( else: # Extract the log probabilities - beam_logp, _ = self.resolve_joint_output(beam_enc_out, beam_dec_out) + beam_logp, _ = self.resolve_joint_output(beam_enc_out, torch.stack(beam_dec_out)) beam_logp = beam_logp[:, 0, 0, :] # For all expansions, add the score for the blank label @@ -1334,7 +1320,7 @@ def modified_adaptive_expansion_search( # Preserve the decoder's output and state hyp.dec_out.append(beam_dec_out[i]) - hyp.dec_state = self.decoder.batch_select_state(beam_state, i) + hyp.dec_state = beam_state[i] # TODO: Setup LM if self.language_model is not None: @@ -1387,7 +1373,7 @@ def 
recombine_hypotheses(self, hypotheses: List[Hypothesis]) -> List[Hypothesis] else: final.append(hyp) - return hypotheses + return final def resolve_joint_output(self, enc_out: torch.Tensor, dec_out: torch.Tensor) -> Tuple[torch.Tensor, torch.Tensor]: """ diff --git a/nemo/collections/common/data/lhotse/cutset.py b/nemo/collections/common/data/lhotse/cutset.py index 8e16688a1b32..6d52b91f1b2c 100644 --- a/nemo/collections/common/data/lhotse/cutset.py +++ b/nemo/collections/common/data/lhotse/cutset.py @@ -130,6 +130,7 @@ def read_dataset_config(config) -> tuple[CutSet, bool]: "metadata_only": config.metadata_only, "force_finite": config.force_finite, "max_open_streams": config.max_open_streams, + "tarred_random_access": config.tarred_random_access, } input_cfg = config.input_cfg if isinstance(input_cfg, (str, Path)): @@ -403,10 +404,11 @@ def read_nemo_manifest(config, is_tarred: bool) -> CutSet: LazyNeMoTarredIterator( config.manifest_filepath, tar_paths=config.tarred_audio_filepaths, + tarred_random_access=config.tarred_random_access, **common_kwargs, ) ) - if not force_finite: + if not config.tarred_random_access and not force_finite: cuts = cuts.repeat() else: cuts = CutSet(LazyNeMoIterator(config.manifest_filepath, **notar_kwargs, **common_kwargs)) @@ -444,6 +446,7 @@ def read_nemo_manifest(config, is_tarred: bool) -> CutSet: nemo_iter = LazyNeMoTarredIterator( manifest_path=manifest_path, tar_paths=tar_path, + tarred_random_access=config.tarred_random_access, **common_kwargs, ) else: diff --git a/nemo/collections/common/data/lhotse/dataloader.py b/nemo/collections/common/data/lhotse/dataloader.py index 15c55a88c232..98b63a07fa9d 100644 --- a/nemo/collections/common/data/lhotse/dataloader.py +++ b/nemo/collections/common/data/lhotse/dataloader.py @@ -61,7 +61,8 @@ class LhotseDataLoadingConfig: # b. Lhotse CutSet manifest / Lhotse Shar tar dir paths. cuts_path: str | None = None shar_path: Any = None # str | list[str | tuple[str, float | int]] | None = None - + # Enable this to support dataloading from JSON manifests that reference subsets of audio tar files. + tarred_random_access: bool = False # 2. Batch size. # a. Existing NeMo options. batch_size: int | None = None @@ -316,7 +317,6 @@ def get_lhotse_dataloader_from_config( duration_bins=determine_bucket_duration_bins(config), num_cuts_for_bins_estimate=config.num_cuts_for_bins_estimate, buffer_size=config.bucket_buffer_size, - concurrent=config.concurrent_bucketing, rank=0 if is_tarred else global_rank, world_size=1 if is_tarred else world_size, ) @@ -368,7 +368,7 @@ def get_lhotse_dataloader_from_config( ) # 4. Creating dataloader. - if is_tarred: + if is_tarred and not config.tarred_random_access: # Wrapper here is necessary when using NeMo tarred data or Lhotse Shar data, # because then I/O happens upon sampler iteration. 
Normally, the sampler resides # in the training loop process, but when we use iterable dataset, we can move it to diff --git a/nemo/collections/common/data/lhotse/nemo_adapters.py b/nemo/collections/common/data/lhotse/nemo_adapters.py index 3c5ced5d4018..ee623f617e26 100644 --- a/nemo/collections/common/data/lhotse/nemo_adapters.py +++ b/nemo/collections/common/data/lhotse/nemo_adapters.py @@ -265,8 +265,10 @@ def __init__( shard_seed: int | Literal["trng", "randomized"] = "trng", text_field: str = "text", lang_field: str = "lang", + tarred_random_access: bool = False, extra_fields: list[dict[str, str]] | None = None, ) -> None: + self.tarred_random_access = tarred_random_access self.shard_id_to_manifest: dict[int, Iterable[dict]] self.paths = expand_sharded_filepaths(manifest_path) if len(self.paths) == 1: @@ -345,6 +347,30 @@ def _validate(self) -> None: def shard_ids(self) -> List[int]: return sorted(self.shard_id_to_manifest.keys()) + def _iter_random_read(self, tar_path, shard_manifest, manifest_path) -> Generator[tuple[dict, bytes], None, None]: + with tarfile.open(fileobj=BytesIO(open_best(tar_path, mode="rb").read()), mode="r") as tar: + for data in shard_manifest: + try: + tar_info = tar.getmember(data) + raw_audio = tar.extractfile(tar_info).read() + yield data, raw_audio, tar_info + except KeyError as e: + raise RuntimeError( + f"Mismatched entry between JSON manifest ('{manifest_path}') and tar file ('{tar_path}'). " + f"The following audio_filepath='{data['audio_filepath']}' was not found in the tar file." + ) from e + + def _iter_sequential(self, tar_path, shard_manifest, manifest_path) -> Generator[tuple[dict, bytes], None, None]: + with tarfile.open(fileobj=open_best(tar_path, mode="rb"), mode="r|*") as tar: + for tar_info in tar: + assert tar_info.name in shard_manifest, ( + f"Mismatched entry between JSON manifest ('{manifest_path}') and tar file ('{tar_path}'). " + f"Cannot locate JSON entry for tar file '{tar_info.name}'" + ) + data = shard_manifest[tar_info.name] + raw_audio = tar.extractfile(tar_info).read() + yield data, raw_audio, tar_info + def __iter__(self) -> Generator[Cut, None, None]: shard_ids = self.shard_ids @@ -359,6 +385,7 @@ def __iter__(self) -> Generator[Cut, None, None]: # They have multiple JSONL entries where audio paths end with '-sub1', '-sub2', etc. for each offset. offset_pattern = re.compile(r'^(?P.+)(?P-sub\d+)(?P\.\w+)?$') + iter_fn = self._iter_random_read if self.tarred_random_access else self._iter_sequential for sid in shard_ids: manifest_path = self.paths[sid] if len(self.paths) > 1 else self.paths[0] @@ -371,50 +398,40 @@ def basename(d: dict) -> str: shard_manifest: dict[str, list[dict]] = groupby(basename, self.shard_id_to_manifest[sid]) tar_path = self.shard_id_to_tar_path[sid] - with tarfile.open(fileobj=open_best(tar_path, mode="rb"), mode="r|*") as tar: - for tar_info in tar: - assert tar_info.name in shard_manifest, ( - f"Mismatched entry between JSON manifest ('{manifest_path}') and tar file ('{tar_path}'). " - f"Cannot locate JSON entry for tar file '{tar_info.name}'" - ) - raw_audio = tar.extractfile(tar_info).read() - # Note: Lhotse has a Recording.from_bytes() utility that we won't use here because - # the profiling indicated significant overhead in torchaudio ffmpeg integration - # that parses full audio instead of just reading the header for WAV files. 
- # recording = lhotse.Recording.from_bytes(raw_audio, recording_id=tar_info.path) - meta = soundfile.info(BytesIO(raw_audio)) - recording = Recording( - id=tar_info.path, - sources=[AudioSource(type="memory", channels=list(range(meta.channels)), source=raw_audio)], - sampling_rate=int(meta.samplerate), - num_samples=meta.frames, - duration=meta.duration, + for data, raw_audio, tar_info in iter_fn(tar_path, shard_manifest, manifest_path): + meta = soundfile.info(BytesIO(raw_audio)) + recording = Recording( + id=tar_info.path, + sources=[AudioSource(type="memory", channels=list(range(meta.channels)), source=raw_audio)], + sampling_rate=int(meta.samplerate), + num_samples=meta.frames, + duration=meta.duration, + ) + cuts_for_recording = [] + for data in sorted(shard_manifest[tar_info.name], key=lambda d: d["audio_filepath"]): + # Cut the recording into corresponding segment and discard audio data outside the segment. + cut = make_cut_with_subset_inmemory_recording( + recording, offset=data.get("offset", 0.0), duration=data.get("duration") ) - cuts_for_recording = [] - for data in sorted(shard_manifest[tar_info.name], key=lambda d: d["audio_filepath"]): - # Cut the recording into corresponding segment and discard audio data outside the segment. - cut = make_cut_with_subset_inmemory_recording( - recording, offset=data.get("offset", 0.0), duration=data.get("duration") + cut.supervisions.append( + SupervisionSegment( + id=cut.id, + recording_id=cut.recording_id, + start=0, + duration=cut.duration, + text=data.get(self.text_field), + language=data.get(self.lang_field), ) - cut.supervisions.append( - SupervisionSegment( - id=cut.id, - recording_id=cut.recording_id, - start=0, - duration=cut.duration, - text=data.get(self.text_field), - language=data.get(self.lang_field), - ) - ) - cut.custom = _to_custom_attr_dict(data) - cut.manifest_origin = manifest_path - cut.tar_origin = tar_path - for extra_field in extra_fields: - extra_field.attach_to(cut) - cuts_for_recording.append(cut) - del recording # free the memory - helps with very large audio files - del raw_audio - yield from cuts_for_recording + ) + cut.custom = _to_custom_attr_dict(data) + cut.manifest_origin = manifest_path + cut.tar_origin = tar_path + for extra_field in extra_fields: + extra_field.attach_to(cut) + cuts_for_recording.append(cut) + del recording # free the memory - helps with very large audio files + del raw_audio + yield from cuts_for_recording def __len__(self) -> int: return len(self.source) diff --git a/nemo/collections/common/parts/run_utils.py b/nemo/collections/common/parts/run_utils.py new file mode 100644 index 000000000000..51c16b54c4f3 --- /dev/null +++ b/nemo/collections/common/parts/run_utils.py @@ -0,0 +1,577 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
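The new `run_utils.py` helpers that follow centre on cluster configs whose `mounts` entries may reference environment variables in the form `{ENV_VAR}:target`, which `get_mounts_from_config` resolves against the user's environment. A small self-contained sketch of that resolution rule is shown here; the config keys, paths and variable names are made up, and the loop mirrors (in simplified form) rather than calls the helper:

import os

# Hypothetical cluster config; `mounts` uses the "{ENV_VAR}:target" form.
os.environ.setdefault("DATA_DIR", "/lustre/fsw/datasets")
cluster_config = {
    "executor": "slurm",
    "mounts": ["{DATA_DIR}:/data", "/home/user/results:/results"],
}

resolved = []
for mount in cluster_config["mounts"]:
    source, target = mount.split(":")
    if source.startswith("{") and source.endswith("}"):
        # Substitute the placeholder with the environment variable's value.
        source = os.environ[source[1:-1]]
    resolved.append(f"{source}:{target}")

print(resolved)  # e.g. ['/lustre/fsw/datasets:/data', '/home/user/results:/results']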
+ +import json +import os +import shlex +import subprocess +from dataclasses import dataclass +from functools import lru_cache +from pathlib import Path + +import nemo_run as run +from nemo_run.config import NEMORUN_HOME +from nemo_run.core.execution.docker import DockerExecutor +from nemo_run.core.execution.slurm import SlurmJobDetails +from nemo_run.core.serialization.zlib_json import ZlibJSONSerializer +from nemo_run.core.tunnel import LocalTunnel, SSHTunnel +from omegaconf import DictConfig, OmegaConf + +from nemo.utils import logging + + +@lru_cache(maxsize=2) +def get_tunnel(**ssh_tunnel): + return SSHTunnel(**ssh_tunnel) + + +def get_mounts_from_config(cluster_config: dict, env_vars: dict = None): + """ + Determines if there are mount paths that are being passed via environment variables. + Selects the key in the cluster config called `mounts` which is a list of strings. + Each string is in the format of `:` where `env_var` + is the name of the environment variable. + + Args: + cluster_config (dict): cluster config dictionary + env_vars (dict): dictionary of environment variables + + Returns: + list: updated list of mounts + """ + mounts = cluster_config.get('mounts', []) + + # if there are env_mounts, we will add the mounts from the env_mounts + for mount_id in range(len(mounts)): + mount = mounts[mount_id] + + if ":" not in mount: + raise ValueError(f"Invalid mount format: {mount}. The mount path must be separated by a colon.") + + mount_source, mount_target = mount.split(":") + + if mount_source[0] == "{" and mount_source[-1] == "}": + # Resolve the environment variable for the mount source + mount_source = mount_source[1:-1] + + if mount_source not in os.environ: + raise ValueError( + f"Required environment variable {mount_source} not found in env variables passed in cluster configs." + ) + + mount_source = os.environ[mount_source] + + if mount_target[0] == "{" and mount_target[-1] == "}": + # Resolve the environment variable for the mount target + mount_target = mount_target[1:-1] + + if mount_target not in os.environ: + raise ValueError( + f"Required environment variable {mount_target} not found in env variables passed in cluster configs." + ) + + mount_target = os.environ[mount_target] + + # add the mount to the list of mounts + resolved_mount = f"{mount_source}:{mount_target}" + mounts[mount_id] = resolved_mount + + return mounts + + +def check_if_mounted(cluster_config, path_to_check): + """Will check that path_to_check is referenced inside one of the mounts.""" + for mount in get_mounts_from_config(cluster_config) + ['/nemo_run/code:/nemo_run/code']: + if path_to_check.startswith(mount.split(":")[1]): + return + raise ValueError(f"The path '{path_to_check}' is not mounted. 
Check cluster config and add appropriate mounts.") + + +def add_mount_path(mount_source: str, mount_dest: str, cluster_config): + """Add a mount path to the cluster configuration.""" + + if cluster_config is None: + raise ValueError("Cluster config is not provided.") + + if 'mounts' in cluster_config: + original_mounts = get_mounts_from_config(cluster_config) + added_mount = False + for mount_path in original_mounts: + source, destination = mount_path.split(':') + + if source == mount_source and destination == mount_dest: + return + + if not added_mount: + cluster_config['mounts'].append(f"{mount_source}:{mount_dest}") + logging.info(f"Added mount path: `{mount_source}:{mount_dest}`") + + else: + raise ValueError("No mounts found in cluster config, can only add to existing mount list.") + + +def create_remote_directory(directory: str | list, cluster_config: dict): + """Create a remote directory on the cluster.""" + + if cluster_config is None: + raise ValueError("Cluster config is not provided.") + + if isinstance(directory, str): + directory = [directory] + + if cluster_config.get('executor') == 'local': + tunnel = LocalTunnel(job_dir=directory[0]) + for dir_path in directory: + tunnel.run(f'mkdir -p {dir_path}', hide=False, warn=True) + logging.info(f"Created directory: {dir_path} in local filesystem.") + + # tunnel.cleanup() + + elif cluster_config.get('executor') == 'slurm': + ssh_tunnel_config = cluster_config.get('ssh_tunnel', None) + if ssh_tunnel_config is None: + raise ValueError("`ssh_tunnel` sub-config is not provided in cluster_config.") + + # Check for pre-existing job_dir in the ssh_tunnel_config + if 'job_dir' not in ssh_tunnel_config: + ssh_tunnel_config['job_dir'] = directory[0] + + tunnel = get_tunnel(**cluster_config['ssh_tunnel']) + for dir_path in directory: + tunnel.run(f'mkdir -p {dir_path}', hide=False, warn=True) + logging.info(f"Created directory: {dir_path} on remote cluster.") + # tunnel.cleanup() + + else: + raise ValueError(f"Unsupported executor: {cluster_config.get('executor')}") + + +def create_remote_config(config: dict | DictConfig, config_name: str, config_directory: str, cluster_config: dict): + """Create a remote yaml config at the result directory on the cluster.""" + + if cluster_config is None: + raise ValueError("Cluster config is not provided.") + + if not config_name.endswith('.yaml'): + config_name = f"{config_name}.yaml" + + if isinstance(config_directory, str): + config_directory = [config_directory] + + if cluster_config.get('executor') == 'local': + tunnel = LocalTunnel(job_dir=config_directory[0]) + for dir_path in config_directory: + config_filepath = os.path.join(dir_path, config_name) + tunnel.run(f'mkdir -p {dir_path}', hide=False, warn=True) + tunnel.run(f"touch {config_filepath}", hide=False, warn=True) + tunnel.run(f"echo '{OmegaConf.to_yaml(config)}' > {config_filepath}", hide=False, warn=True) + logging.info(f"Created config file: {dir_path} in local filesystem.") + + # tunnel.cleanup() + + elif cluster_config.get('executor') == 'slurm': + ssh_tunnel_config = cluster_config.get('ssh_tunnel', None) + if ssh_tunnel_config is None: + raise ValueError("`ssh_tunnel` sub-config is not provided in cluster_config.") + + # Check for pre-existing job_dir in the ssh_tunnel_config + if 'job_dir' not in ssh_tunnel_config: + ssh_tunnel_config['job_dir'] = config_directory[0] + + tunnel = get_tunnel(**cluster_config['ssh_tunnel']) + for dir_path in config_directory: + config_filepath = os.path.join(dir_path, config_name) + tunnel.run(f'mkdir -p 
{dir_path}', hide=False, warn=True) + tunnel.run(f"touch {config_filepath}", hide=False, warn=True) + tunnel.run(f"echo '{OmegaConf.to_yaml(config)}' > {config_filepath}", hide=False, warn=True) + logging.info(f"Created config file: {dir_path} on remote cluster.") + # tunnel.cleanup() + + else: + raise ValueError(f"Unsupported executor: {cluster_config.get('executor')}") + + +def check_remote_mount_directories(directories: list, cluster_config: dict, exit_on_failure: bool = True): + """Create a remote directory on the cluster.""" + + if cluster_config is None: + raise ValueError("Cluster config is not provided.") + + if isinstance(directories, str): + directories = [directories] + + if cluster_config.get('executor') == 'local': + tunnel = LocalTunnel(job_dir=None) + + all_dirs_exist = True + missing_source_locations = [] + for directory in directories: + result = tunnel.run(f'test -e {directory} && echo "Directory Exists"', hide=True, warn=True) + + if "Directory Exists" not in result.stdout: + missing_source_locations.append(directory) + + # tunnel.cleanup() + + if len(missing_source_locations) > 0 and exit_on_failure: + missing_source_locations = [ + f"{loc} DOES NOT exist at source destination" for loc in missing_source_locations + ] + missing_source_locations = "\n".join(missing_source_locations) + raise FileNotFoundError( + f"Some files or directories do not exist at the source location for mounting !!\n\n" + f"{missing_source_locations}" + ) + + elif cluster_config.get('executor') == 'slurm': + ssh_tunnel_config = cluster_config.get('ssh_tunnel', None) + if ssh_tunnel_config is None: + raise ValueError("`ssh_tunnel` sub-config is not provided in cluster_config.") + + # Check for pre-existing job_dir in the ssh_tunnel_config + if 'job_dir' not in ssh_tunnel_config: + ssh_tunnel_config['job_dir'] = os.getcwd() + + tunnel = get_tunnel(**cluster_config['ssh_tunnel']) + missing_source_locations = [] + + for directory in directories: + result = tunnel.run(f'test -e {directory} && echo "Directory Exists"', hide=True, warn=True) + + if "Directory Exists" not in result.stdout: + missing_source_locations.append(directory) + + # tunnel.cleanup() + + if len(missing_source_locations) > 0 and exit_on_failure: + missing_source_locations = [ + f"{loc} DOES NOT exist at source destination" for loc in missing_source_locations + ] + missing_source_locations = "\n".join(missing_source_locations) + raise FileNotFoundError( + f"Some files or directories do not exist at the source location for mounting !!\n\n" + f"{missing_source_locations}" + ) + + else: + raise ValueError(f"Unsupported executor: {cluster_config.get('executor')}") + + +def get_unmounted_filepath(cluster_config: dict, filepath: str): + """ + Resolve the mounted filepath using the cluster config to merge the mount source path to the filepath. + Args: + filepath: + cluster_config: + + Returns: + + """ + # Find which mount path matches the filepaths prefix + mount_path = None + for mount in cluster_config['mounts']: + mount_source, mount_dest = mount.split(':') + if filepath.startswith(mount_dest): + mount_path = mount + break + + if mount_path is None: + raise ValueError( + f"Could not find a mount path for the file path `{filepath}`. 
Below paths are mounted: \n" + f"{cluster_config['mounts']}" + ) + + # replace the mount destination inside the filepath with the mount source + mount_source, mount_dest = mount_path.split(':') + filepath = mount_source + filepath[len(mount_dest) :] # replace the mount destination with the mount source + + return filepath + + +def get_mounted_filepath(cluster_config: dict, filepath: str): + """ + Resolve the mounted filepath using the cluster config to merge the mount destination path to the filepath. + + Args: + cluster_config: + filepath: + + Returns: + + """ + # Find which mount path matches the filepaths prefix + mount_path = None + for mount in cluster_config['mounts']: + mount_source, mount_dest = mount.split(':') + if filepath.startswith(mount_source): + mount_path = mount + break + + if mount_path is None: + raise ValueError( + f"Could not find a mount path for the file path `{filepath}`. Below paths are mounted: \n" + f"{cluster_config['mounts']}" + ) + + # replace the mount destination inside the filepath with the mount source + mount_source, mount_dest = mount_path.split(':') + filepath = mount_dest + filepath[len(mount_source) :] # replace the mount destination with the mount source + + return filepath + + +def get_env_variables(cluster_config): + """ + Will get the environment variables from the cluster config and the user environment. + + The following items in the cluster config are supported: + - `required_env_vars` - list of required environment variables + - `env_vars` - list of optional environment variables + + Args: + cluster_config: cluster config dictionary + + Returns: + dict: dictionary of environment + """ + env_vars = {} + # Check for user requested env variables + required_env_vars = cluster_config.get("required_env_vars", []) + for env_var in required_env_vars: + if env_var not in os.environ: + raise ValueError(f"Required environment variable {env_var} not found.") + + env_vars[env_var] = os.environ[env_var] + logging.info(f"Adding required environment variable {env_var} (value={os.environ[env_var]})") + + # Add optional env variables + optional_env_vars = cluster_config.get("env_vars", []) + for env_var in optional_env_vars: + if env_var in os.environ: + logging.info(f"Adding optional environment variable {env_var} (value={os.environ[env_var]})") + env_vars[env_var] = os.environ[env_var] + else: + logging.info(f"Optional environment variable {env_var} not found in user environment; skipping.") + + return env_vars + + +def _get_latest_dir(path, expname, job_id) -> str: + if job_id is not None: + return os.path.join(path, f"{expname}_{job_id}") + + dirs = [d for d in os.listdir(path) if os.path.isdir(os.path.join(path, d))] + latest_dir = max(dirs, key=lambda d: os.path.getctime(os.path.join(path, d))) + return os.path.join(path, latest_dir) + + +def get_exp_handles(expname): + # TODO: remove this after we can properly use .from_title api + if "_" in expname: + try: + job_id = int(expname.split("_")[-1]) + expname = expname[: expname.rfind("_")] + except: + job_id = None + + parent_dir = os.path.join(NEMORUN_HOME, "experiments", expname) + exp_dir = _get_latest_dir(parent_dir, expname, job_id) + + with open(os.path.join(exp_dir, '_TASKS')) as f: + serialized_jobs = json.load(f) + + serializer = ZlibJSONSerializer() + handles = [] + for job in serialized_jobs: + obj = serializer.deserialize(job[0]) + if hasattr(obj, 'handle'): + handles.append(obj.handle) + elif hasattr(obj, 'handles'): + handles.extend(obj.handles) + else: + raise ValueError(f"Object {obj} does 
not have a handle or handles attribute.") + return handles + + +@dataclass(kw_only=True) +class CustomJobDetails(SlurmJobDetails): + log_prefix: str = "main" + + @property + def stdout(self) -> Path: + return Path(self.folder) / f"{self.log_prefix}_sbatch.log" + + @property + def srun_stdout(self) -> Path: + return Path(self.folder) / f"{self.log_prefix}_srun.log" + + @property + def stderr(self) -> Path: + return Path(self.folder) / f"{self.log_prefix}_sbatch.log" + + @property + def srun_stderr(self) -> Path: + return Path(self.folder) / f"{self.log_prefix}_srun.log" + + @property + def ls_term(self) -> str: + """This term will be used to fetch the logs. + + The command used to list the files is ls -1 {ls_term} 2> /dev/null + """ + assert self.folder + return os.path.join(self.folder, "*_srun.log") + + +def get_packager(): + """Will check if we are running from a git repo and use git packager or default packager otherwise.""" + return run.GitArchivePackager( + check_uncommitted_changes=True, + ) + + +def get_executor( + cluster_config, + container, + num_nodes, + tasks_per_node, + gpus_per_node, + job_name, + log_dir, + log_prefix: str = "main", + mounts=None, + partition=None, + dependencies=None, +): + env_vars = get_env_variables(cluster_config) + config_mounts = get_mounts_from_config(cluster_config, env_vars) + + mounts = mounts or config_mounts + packager = get_packager() + if cluster_config["executor"] == "local": + if num_nodes > 1: + raise ValueError("Local executor does not support multi-node execution") + + env_vars["PYTHONUNBUFFERED"] = "1" # this makes sure logs are streamed right away + + return DockerExecutor( + container_image=container, + packager=packager, + ipc_mode="host", + volumes=mounts, + ntasks_per_node=1, + num_gpus=gpus_per_node, + network="host", + env_vars=env_vars, + ) + + partition = partition or cluster_config.get("partition") + if 'timeouts' not in cluster_config: + timeout = "10000:00:00:00" + else: + timeout = cluster_config["timeouts"][partition] + + return run.SlurmExecutor( + account=cluster_config["account"], + partition=partition, + nodes=num_nodes, + ntasks_per_node=tasks_per_node, + tunnel=get_tunnel(**cluster_config["ssh_tunnel"]), + container_image=container, + container_mounts=mounts, + time=timeout, + packager=packager, + gpus_per_node=gpus_per_node if not cluster_config.get("disable_gpus_per_node", False) else None, + srun_args=[ + "--no-container-mount-home", + "--overlap", + "--mpi=pmix", + '--wait=10', + # we need to be explicit about this in srun as commands might need to run in parallel + f"--ntasks={tasks_per_node * num_nodes}", + f"--nodes={num_nodes}", + ], + # TODO: can we relax this to allow partial node allocation? + exclusive=True, + mem=0, + job_details=CustomJobDetails( + job_name=cluster_config.get("job_name_prefix", "") + job_name, + folder=get_unmounted_filepath(cluster_config, log_dir), + log_prefix=log_prefix + '_' + job_name, + ), + wait_time_for_group_job=0.01, + monitor_group_job_wait_time=20, + dependencies=dependencies, + dependency_type="afterany", + env_vars=env_vars, + ) + + +def add_task( + exp, + cmd, + task_name, + cluster_config, + container, + # TODO: these are good defaults for generation jobs, but probably not the best overall? 
+ num_tasks=1, + num_gpus=1, + num_nodes=1, + log_dir=None, + partition=None, + run_after=None, +): + if run_after is not None and cluster_config["executor"] == "slurm": + dependencies = tuple(get_exp_handles(run_after)) + else: + dependencies = None + commands = [] + executors = [] + # then goes the main task unless it's empty + if cmd: + if cluster_config["executor"] == "local" and num_tasks > 1: + cmd = f"mpirun --allow-run-as-root -np {num_tasks} bash -c {shlex.quote(cmd)}" + commands.append(cmd) + executors.append( + get_executor( + cluster_config=cluster_config, + container=container, + num_nodes=num_nodes, + tasks_per_node=num_tasks, + gpus_per_node=num_gpus, + partition=partition, + dependencies=dependencies, + job_name=task_name, + log_dir=log_dir, + log_prefix="main", + ) + ) + + if len(commands) == 1: + # to keep sbatch script simpler, we don't wrap in a list in this case + exp.add(run.Script(inline=commands[0]), executor=executors[0], name="nemo-run") + else: + exp.add( + [run.Script(inline=command) for command in commands], + executor=executors, + name="nemo-run", + ) + + +def run_exp(exp, cluster_config, sequential=False): + if cluster_config['executor'] == 'local': + # locally we are always running sequentially - does that need to be changed? + exp.run(detach=False, tail_logs=True, sequential=True) + else: + exp.run(detach=True, sequential=sequential) diff --git a/nemo/collections/common/tokenizers/huggingface/auto_tokenizer.py b/nemo/collections/common/tokenizers/huggingface/auto_tokenizer.py index 76dca1268c3b..439322b8e810 100644 --- a/nemo/collections/common/tokenizers/huggingface/auto_tokenizer.py +++ b/nemo/collections/common/tokenizers/huggingface/auto_tokenizer.py @@ -13,7 +13,7 @@ # limitations under the License. from collections import OrderedDict -from typing import Optional +from typing import List, Optional from transformers import AutoTokenizer as AUTOTOKENIZER @@ -43,6 +43,7 @@ def __init__( sep_token: Optional[str] = None, cls_token: Optional[str] = None, unk_token: Optional[str] = None, + additional_special_tokens: Optional[List] = [], use_fast: Optional[bool] = False, trust_remote_code: Optional[bool] = False, ): @@ -60,6 +61,7 @@ def __init__( sep_token: token used for separating sequences cls_token: class token. Usually equal to bos_token unk_token: token to use for unknown tokens + additional_special_tokens: list of other tokens beside standard special tokens (bos, eos, pad, etc.). For example, sentinel tokens for T5 (, , etc.) 
use_fast: whether to use fast HuggingFace tokenizer """ try: @@ -124,10 +126,17 @@ def __init__( elif self.tokenizer.cls_token is None and self.tokenizer.bos_token: special_tokens_dict["cls_token"] = self.tokenizer.bos_token + # add additional special tokens (not standard special tokens such as bos, eod, sep) + if additional_special_tokens is not None: + special_tokens_dict["additional_special_tokens"] = additional_special_tokens + new_tokens_in_vocab = [] for token in [mask_token, bos_token, eos_token, pad_token, sep_token, cls_token, unk_token]: if token is not None and token not in self.tokenizer.get_vocab(): new_tokens_in_vocab.append(token) + for token in additional_special_tokens: + if token is not None and token not in self.tokenizer.get_vocab(): + new_tokens_in_vocab.append(token) if len(new_tokens_in_vocab) > 0: """ diff --git a/nemo/collections/diffusion/data/diffusion_energon_datamodule.py b/nemo/collections/diffusion/data/diffusion_energon_datamodule.py index fe17b4eecb5f..f18c828d9d45 100644 --- a/nemo/collections/diffusion/data/diffusion_energon_datamodule.py +++ b/nemo/collections/diffusion/data/diffusion_energon_datamodule.py @@ -11,7 +11,9 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. -from typing import Literal + +import logging +from typing import Any, Dict, Literal from megatron.energon import DefaultTaskEncoder, get_train_dataset from pytorch_lightning.utilities.types import EVAL_DATALOADERS @@ -127,3 +129,18 @@ def val_dataloader(self) -> EVAL_DATALOADERS: if self.use_train_split_for_val: return self.train_dataloader() return super().val_dataloader() + + def load_state_dict(self, state_dict: Dict[str, Any]) -> None: + """ + Load the state of the data module from a checkpoint. + + This method is called when loading a checkpoint. It restores the state of the data module, + including the state of the dataloader and the number of consumed samples. + + Parameters: + state_dict (Dict[str, Any]): The state dictionary containing the saved state of the data module. + """ + try: + super().load_state_dict(state_dict) + except Exception as e: + logging.warning(f"datamodule.load_state_dict failed {e}") diff --git a/nemo/collections/diffusion/data/diffusion_taskencoder.py b/nemo/collections/diffusion/data/diffusion_taskencoder.py index 3285c63b2461..57e4e4ec8673 100644 --- a/nemo/collections/diffusion/data/diffusion_taskencoder.py +++ b/nemo/collections/diffusion/data/diffusion_taskencoder.py @@ -11,8 +11,11 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. 
+ +import warnings import torch import torch.nn.functional as F +from einops import rearrange from megatron.core import parallel_state from megatron.energon import DefaultTaskEncoder, SkipSample from megatron.energon.task_encoder.cooking import Cooker, basic_sample_keys @@ -66,10 +69,22 @@ class BasicDiffusionTaskEncoder(DefaultTaskEncoder, IOMixin): Cooker(cook), ] - def __init__(self, *args, max_frames: int = None, text_embedding_padding_size: int = 512, **kwargs): + def __init__( + self, + *args, + max_frames: int = None, + text_embedding_padding_size: int = 512, + seq_length: int = None, + patch_spatial: int = 2, + patch_temporal: int = 1, + **kwargs, + ): super().__init__(*args, **kwargs) self.max_frames = max_frames self.text_embedding_padding_size = text_embedding_padding_size + self.seq_length = seq_length + self.patch_spatial = patch_spatial + self.patch_temporal = patch_temporal def encode_sample(self, sample: dict) -> dict: video_latent = sample['pth'] @@ -80,9 +95,19 @@ def encode_sample(self, sample: dict) -> dict: raise SkipSample() info = sample['json'] - _, T, H, W = video_latent.shape + C, T, H, W = video_latent.shape + seq_len = ( + video_latent.shape[-1] + * video_latent.shape[-2] + * video_latent.shape[-3] + // self.patch_spatial**2 + // self.patch_temporal + ) is_image = T == 1 + if seq_len > self.seq_length: + raise SkipSample() + if self.max_frames is not None: video_latent = video_latent[:, : self.max_frames, :, :] @@ -90,11 +115,16 @@ def encode_sample(self, sample: dict) -> dict: if parallel_state.get_context_parallel_world_size() > 1: tpcp_size *= parallel_state.get_context_parallel_world_size() * 2 if (T * H * W) % tpcp_size != 0: - print(f'skipping {video_latent.shape=} not divisible by {tpcp_size=}') + warnings.warn(f'skipping {video_latent.shape=} not divisible by {tpcp_size=}') raise SkipSample() - seq_len = video_latent.shape[-1] * video_latent.shape[-2] * video_latent.shape[-3] - loss_mask = torch.ones(seq_len, dtype=torch.bfloat16) + video_latent = rearrange( + video_latent, + 'C (T pt) (H ph) (W pw) -> (T H W) (ph pw pt C)', + ph=self.patch_spatial, + pw=self.patch_spatial, + pt=self.patch_temporal, + ) if is_image: t5_text_embeddings = torch.from_numpy(sample['pickle']).to(torch.bfloat16) @@ -102,20 +132,82 @@ def encode_sample(self, sample: dict) -> dict: t5_text_embeddings = torch.from_numpy(sample['pickle'][0]).to(torch.bfloat16) t5_text_embeddings_seq_length = t5_text_embeddings.shape[0] - t5_text_embeddings = F.pad( - t5_text_embeddings, - ( - 0, - 0, - 0, - self.text_embedding_padding_size - t5_text_embeddings_seq_length % self.text_embedding_padding_size, - ), - ) + if t5_text_embeddings_seq_length > self.text_embedding_padding_size: + t5_text_embeddings = t5_text_embeddings[: self.text_embedding_padding_size] + else: + t5_text_embeddings = F.pad( + t5_text_embeddings, + ( + 0, + 0, + 0, + self.text_embedding_padding_size - t5_text_embeddings_seq_length, + ), + ) t5_text_mask = torch.ones(t5_text_embeddings_seq_length, dtype=torch.bfloat16) + if is_image: + h, w = info['image_height'], info['image_width'] + fps = torch.tensor([30] * 1, dtype=torch.bfloat16) + num_frames = torch.tensor([1] * 1, dtype=torch.bfloat16) + else: + h, w = info['height'], info['width'] + fps = torch.tensor([info['framerate']] * 1, dtype=torch.bfloat16) + num_frames = torch.tensor([info['num_frames']] * 1, dtype=torch.bfloat16) + image_size = torch.tensor([[h, w, h, w]] * 1, dtype=torch.bfloat16) + + pos_ids = rearrange( + pos_id_3d.get_pos_id_3d(t=T // 
self.patch_temporal, h=H // self.patch_spatial, w=W // self.patch_spatial), + 'T H W d -> (T H W) d', + ) + + if self.seq_length is not None: + pos_ids = F.pad(pos_ids, (0, 0, 0, self.seq_length - seq_len)) + loss_mask = torch.zeros(self.seq_length, dtype=torch.bfloat16) + loss_mask[:seq_len] = 1 + video_latent = F.pad(video_latent, (0, 0, 0, self.seq_length - seq_len)) + else: + loss_mask = torch.ones(seq_len, dtype=torch.bfloat16) + return dict( video=video_latent, t5_text_embeddings=t5_text_embeddings, t5_text_mask=t5_text_mask, + image_size=image_size, + fps=fps, + num_frames=num_frames, loss_mask=loss_mask, + seq_len_q=torch.tensor(seq_len, dtype=torch.int32), + seq_len_kv=torch.tensor(t5_text_embeddings_seq_length, dtype=torch.int32), + pos_ids=pos_ids, + latent_shape=torch.tensor([C, T, H, W], dtype=torch.int32), ) + + +class PosID3D: + def __init__(self, *, max_t=32, max_h=128, max_w=128): + self.max_t = max_t + self.max_h = max_h + self.max_w = max_w + self.generate_pos_id() + + def generate_pos_id(self): + self.grid = torch.stack( + torch.meshgrid( + torch.arange(self.max_t, device='cpu'), + torch.arange(self.max_h, device='cpu'), + torch.arange(self.max_w, device='cpu'), + ), + dim=-1, + ) + + def get_pos_id_3d(self, *, t, h, w): + if t > self.max_t or h > self.max_h or w > self.max_w: + self.max_t = max(self.max_t, t) + self.max_h = max(self.max_h, h) + self.max_w = max(self.max_w, w) + self.generate_pos_id() + return self.grid[:t, :h, :w] + + +pos_id_3d = PosID3D() diff --git a/nemo/collections/diffusion/models/__init__.py b/nemo/collections/diffusion/models/__init__.py new file mode 100644 index 000000000000..d9155f923f18 --- /dev/null +++ b/nemo/collections/diffusion/models/__init__.py @@ -0,0 +1,13 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. diff --git a/nemo/collections/diffusion/models/dit/__init__.py b/nemo/collections/diffusion/models/dit/__init__.py new file mode 100644 index 000000000000..d9155f923f18 --- /dev/null +++ b/nemo/collections/diffusion/models/dit/__init__.py @@ -0,0 +1,13 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
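For orientation, the patchification added to diffusion_taskencoder.py above can be checked with a small standalone sketch. Everything below is illustrative only and not part of the diff: the latent shape, patch sizes, and the inline meshgrid (which stands in for the new PosID3D helper) are assumed toy values; only torch and einops, which the diff already imports, are used.

    # Illustrative sketch: how the task encoder's patchify rearrange and the
    # 3D position ids line up. patch_spatial=2 / patch_temporal=1 mirror the
    # new BasicDiffusionTaskEncoder defaults; the latent shape is a toy value.
    import torch
    from einops import rearrange

    C, T, H, W = 16, 4, 8, 8
    patch_spatial, patch_temporal = 2, 1

    video_latent = torch.randn(C, T, H, W, dtype=torch.bfloat16)

    # Flatten (T, H, W) into a token sequence; each token packs one spatio-temporal patch.
    tokens = rearrange(
        video_latent,
        'C (T pt) (H ph) (W pw) -> (T H W) (ph pw pt C)',
        ph=patch_spatial, pw=patch_spatial, pt=patch_temporal,
    )

    seq_len = T * H * W // patch_spatial**2 // patch_temporal
    assert tokens.shape == (seq_len, patch_spatial**2 * patch_temporal * C)

    # Matching (t, h, w) index per token, analogous to pos_id_3d.get_pos_id_3d(...).
    grid = torch.stack(
        torch.meshgrid(
            torch.arange(T // patch_temporal),
            torch.arange(H // patch_spatial),
            torch.arange(W // patch_spatial),
            indexing='ij',
        ),
        dim=-1,
    )
    pos_ids = rearrange(grid, 'T H W d -> (T H W) d')
    assert pos_ids.shape == (seq_len, 3)

If seq_len exceeds the configured seq_length the sample is skipped; otherwise tokens, pos_ids, and the loss mask are right-padded up to seq_length, which is what the padding branch in the encoder above does.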
diff --git a/nemo/collections/diffusion/models/dit/dit_embeddings.py b/nemo/collections/diffusion/models/dit/dit_embeddings.py new file mode 100644 index 000000000000..ec8d095cbbd4 --- /dev/null +++ b/nemo/collections/diffusion/models/dit/dit_embeddings.py @@ -0,0 +1,161 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + + +import math +from typing import Dict, Literal, Optional + +import numpy as np +import torch +import torch.nn.functional as F +from diffusers.models.embeddings import TimestepEmbedding, get_3d_sincos_pos_embed +from einops import rearrange +from einops.layers.torch import Rearrange +from megatron.core import parallel_state +from megatron.core.models.common.embeddings.rotary_pos_embedding import get_pos_emb_on_this_cp_rank +from megatron.core.transformer.module import MegatronModule +from torch import nn + + +class ParallelTimestepEmbedding(TimestepEmbedding): + """ + ParallelTimestepEmbedding is a subclass of TimestepEmbedding that initializes + the embedding layers with an optional random seed for syncronization. + + Args: + in_channels (int): Number of input channels. + time_embed_dim (int): Dimension of the time embedding. + seed (int, optional): Random seed for initializing the embedding layers. + If None, no specific seed is set. + + Attributes: + linear_1 (nn.Module): First linear layer for the embedding. + linear_2 (nn.Module): Second linear layer for the embedding. + + Methods: + __init__(in_channels, time_embed_dim, seed=None): Initializes the embedding layers. + """ + + def __init__(self, in_channels: int, time_embed_dim: int, seed=None): + super().__init__(in_channels=in_channels, time_embed_dim=time_embed_dim) + if seed is not None: + with torch.random.fork_rng(): + torch.manual_seed(seed) + self.linear_1.reset_parameters() + self.linear_2.reset_parameters() + + def forward(self, x: torch.Tensor) -> torch.Tensor: + """ + Computes the positional embeddings for the input tensor. + + Args: + x (torch.Tensor): Input tensor of shape (B, T, H, W, C). + + Returns: + torch.Tensor: Positional embeddings of shape (B, T, H, W, C). + """ + return super().forward(x.to(torch.bfloat16, non_blocking=True)) + + +def get_pos_emb_on_this_cp_rank(pos_emb, seq_dim): + """ + Adjusts the positional embeddings tensor to the current context parallel rank. + + Args: + pos_emb (torch.Tensor): The positional embeddings tensor. + seq_dim (int): The sequence dimension index in the positional embeddings tensor. + + Returns: + torch.Tensor: The adjusted positional embeddings tensor for the current context parallel rank. 
+ """ + cp_size = parallel_state.get_context_parallel_world_size() + cp_rank = parallel_state.get_context_parallel_rank() + cp_idx = torch.tensor([cp_rank], device="cpu", pin_memory=True).cuda(non_blocking=True) + pos_emb = pos_emb.view(*pos_emb.shape[:seq_dim], cp_size, -1, *pos_emb.shape[(seq_dim + 1) :]) + pos_emb = pos_emb.index_select(seq_dim, cp_idx) + pos_emb = pos_emb.view(*pos_emb.shape[:seq_dim], -1, *pos_emb.shape[(seq_dim + 2) :]) + return pos_emb + + +class SinCosPosEmb3D(MegatronModule): + """ + SinCosPosEmb3D is a 3D sine-cosine positional embedding module. + + Args: + model_channels (int): Number of channels in the model. + h (int): Length of the height dimension. + w (int): Length of the width dimension. + t (int): Length of the temporal dimension. + spatial_interpolation_scale (float, optional): Scale factor for spatial interpolation. Default is 1.0. + temporal_interpolation_scale (float, optional): Scale factor for temporal interpolation. Default is 1.0. + + Methods: + forward(pos_ids: torch.Tensor) -> torch.Tensor: + Computes the positional embeddings for the input tensor. + + Args: + pos_ids (torch.Tensor): Input tensor of shape (B S 3). + + Returns: + torch.Tensor: Positional embeddings of shape (B S D). + """ + + def __init__( + self, + config, + h: int, + w: int, + t: int, + spatial_interpolation_scale=1.0, + temporal_interpolation_scale=1.0, + ): + super().__init__(config=config) + self.h = h + self.w = w + self.t = t + # h w t + param = get_3d_sincos_pos_embed( + config.hidden_size, [h, w], t, spatial_interpolation_scale, temporal_interpolation_scale + ) + param = rearrange(param, "t hw c -> (t hw) c") + self.pos_embedding = torch.nn.Embedding(param.shape[0], config.hidden_size) + self.pos_embedding.weight = torch.nn.Parameter(torch.tensor(param), requires_grad=False) + + def forward(self, pos_ids: torch.Tensor): + # pos_ids: t h w + pos_id = pos_ids[..., 0] * self.h * self.w + pos_ids[..., 1] * self.w + pos_ids[..., 2] + return self.pos_embedding(pos_id) + + +class FactorizedLearnable3DEmbedding(MegatronModule): + def __init__( + self, + config, + t: int, + h: int, + w: int, + **kwargs, + ): + super().__init__(config=config) + self.emb_t = torch.nn.Embedding(t, config.hidden_size) + self.emb_h = torch.nn.Embedding(h, config.hidden_size) + self.emb_w = torch.nn.Embedding(w, config.hidden_size) + + if config.perform_initialization: + config.init_method(self.emb_t.weight) + config.init_method(self.emb_h.weight) + config.init_method(self.emb_w.weight) + + def forward(self, pos_ids: torch.Tensor): + return self.emb_t(pos_ids[..., 0]) + self.emb_h(pos_ids[..., 1]) + self.emb_w(pos_ids[..., 2]) diff --git a/nemo/collections/diffusion/models/dit/dit_layer_spec.py b/nemo/collections/diffusion/models/dit/dit_layer_spec.py new file mode 100644 index 000000000000..672dcff3ba00 --- /dev/null +++ b/nemo/collections/diffusion/models/dit/dit_layer_spec.py @@ -0,0 +1,532 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +import copy +from dataclasses import dataclass +from typing import Literal, Union + +import torch +import torch.nn as nn +from einops import rearrange +from megatron.core.jit import jit_fuser +from megatron.core.transformer.attention import ( + CrossAttention, + CrossAttentionSubmodules, + SelfAttention, + SelfAttentionSubmodules, +) +from megatron.core.transformer.custom_layers.transformer_engine import ( + TEColumnParallelLinear, + TEDotProductAttention, + TENorm, + TERowParallelLinear, +) +from megatron.core.transformer.enums import AttnMaskType +from megatron.core.transformer.identity_op import IdentityOp +from megatron.core.transformer.mlp import MLP, MLPSubmodules +from megatron.core.transformer.module import MegatronModule +from megatron.core.transformer.spec_utils import ModuleSpec, build_module +from megatron.core.transformer.transformer_block import TransformerConfig +from megatron.core.transformer.transformer_config import TransformerConfig +from megatron.core.transformer.transformer_layer import TransformerLayer, TransformerLayerSubmodules +from megatron.core.utils import make_viewless_tensor + + +@dataclass +class DiTWithAdaLNSubmodules(TransformerLayerSubmodules): + temporal_self_attention: Union[ModuleSpec, type] = IdentityOp + full_self_attention: Union[ModuleSpec, type] = IdentityOp + + +@dataclass +class STDiTWithAdaLNSubmodules(TransformerLayerSubmodules): + spatial_self_attention: Union[ModuleSpec, type] = IdentityOp + temporal_self_attention: Union[ModuleSpec, type] = IdentityOp + full_self_attention: Union[ModuleSpec, type] = IdentityOp + + +class RMSNorm(nn.Module): + def __init__(self, hidden_size: int, config, eps: float = 1e-6): + super().__init__() + self.eps = eps + self.weight = nn.Parameter(torch.ones(hidden_size)) + + def _norm(self, x): + return x * torch.rsqrt(x.pow(2).mean(-1, keepdim=True) + self.eps) + + def forward(self, x): + output = self._norm(x.float()).type_as(x) + return output * self.weight + + +class AdaLN(MegatronModule): + """ + Adaptive Layer Normalization Module for DiT. 
+ """ + + def __init__(self, config: TransformerConfig, n_adaln_chunks=9, norm=nn.LayerNorm): + super().__init__(config) + if norm == TENorm: + self.ln = norm(config, config.hidden_size, config.layernorm_epsilon) + else: + self.ln = norm(config.hidden_size, elementwise_affine=False, eps=self.config.layernorm_epsilon) + self.n_adaln_chunks = n_adaln_chunks + self.adaLN_modulation = nn.Sequential( + nn.SiLU(), nn.Linear(config.hidden_size, self.n_adaln_chunks * config.hidden_size, bias=False) + ) + nn.init.constant_(self.adaLN_modulation[-1].weight, 0) + + setattr(self.adaLN_modulation[-1].weight, "sequence_parallel", config.sequence_parallel) + + def forward(self, timestep_emb): + return self.adaLN_modulation(timestep_emb).chunk(self.n_adaln_chunks, dim=-1) + + @jit_fuser + def modulate(self, x, shift, scale): + return x * (1 + scale) + shift + + @jit_fuser + def scale_add(self, residual, x, gate): + return residual + gate * x + + @jit_fuser + def modulated_layernorm(self, x, shift, scale): + # Optional Input Layer norm + input_layernorm_output = self.ln(x).type_as(x) + + # DiT block specific + return self.modulate(input_layernorm_output, shift, scale) + + # @jit_fuser + def scaled_modulated_layernorm(self, residual, x, gate, shift, scale): + hidden_states = self.scale_add(residual, x, gate) + shifted_pre_mlp_layernorm_output = self.modulated_layernorm(hidden_states, shift, scale) + return hidden_states, shifted_pre_mlp_layernorm_output + + +class STDiTLayerWithAdaLN(TransformerLayer): + """A single transformer layer. + + Transformer layer takes input with size [s, b, h] and returns an + output of the same size. + + Spatial-Temporal DiT with Adapative Layer Normalization. + """ + + def __init__( + self, + config: TransformerConfig, + submodules: TransformerLayerSubmodules, + layer_number: int = 1, + hidden_dropout: float = None, + position_embedding_type: Literal["learned_absolute", "rope"] = "learned_absolute", + ): + def _replace_no_cp_submodules(submodules): + modified_submods = copy.deepcopy(submodules) + modified_submods.cross_attention = IdentityOp + modified_submods.spatial_self_attention = IdentityOp + return modified_submods + + # Replace any submodules that will have CP disabled and build them manually later after TransformerLayer init. + modified_submods = _replace_no_cp_submodules(submodules) + super().__init__( + config=config, submodules=modified_submods, layer_number=layer_number, hidden_dropout=hidden_dropout + ) + + # Override Spatial Self Attention and Cross Attention to disable CP. + # Disable TP Comm overlap as well. Not disabling will attempt re-use of buffer size same as Q and lead to incorrect tensor shapes. 
+ sa_cp_override_config = copy.deepcopy(config) + sa_cp_override_config.context_parallel_size = 1 + sa_cp_override_config.tp_comm_overlap = False + self.spatial_self_attention = build_module( + submodules.spatial_self_attention, config=sa_cp_override_config, layer_number=layer_number + ) + self.cross_attention = build_module( + submodules.cross_attention, + config=sa_cp_override_config, + layer_number=layer_number, + ) + + self.temporal_self_attention = build_module( + submodules.temporal_self_attention, + config=self.config, + layer_number=layer_number, + ) + + self.full_self_attention = build_module( + submodules.full_self_attention, + config=self.config, + layer_number=layer_number, + ) + + self.adaLN = AdaLN(config=self.config, n_adaln_chunks=3) + + def forward( + self, + hidden_states, + attention_mask, + context=None, + context_mask=None, + rotary_pos_emb=None, + inference_params=None, + packed_seq_params=None, + ): + # timestep embedding + timestep_emb = attention_mask + + # ******************************************** spatial self attention ****************************************************** + + shift_sa, scale_sa, gate_sa = self.adaLN(timestep_emb) + + # adaLN with scale + shift + pre_spatial_attn_layernorm_output_ada = self.adaLN.modulated_layernorm( + hidden_states, shift=shift_sa, scale=scale_sa + ) + + attention_output, _ = self.spatial_self_attention( + pre_spatial_attn_layernorm_output_ada, + attention_mask=None, + # packed_seq_params=packed_seq_params['self_attention'], + ) + + # ******************************************** full self attention ************************************************* + + shift_full, scale_full, gate_full = self.adaLN(timestep_emb) + + # adaLN with scale + shift + hidden_states, pre_full_attn_layernorm_output_ada = self.adaLN.scaled_modulated_layernorm( + residual=hidden_states, + x=attention_output, + gate=gate_sa, + shift=shift_full, + scale=scale_full, + ) + + attention_output, _ = self.full_self_attention( + pre_full_attn_layernorm_output_ada, + attention_mask=None, + # packed_seq_params=packed_seq_params['self_attention'], + ) + + # ******************************************** cross attention ***************************************************** + + shift_ca, scale_ca, gate_ca = self.adaLN(timestep_emb) + + # adaLN with scale + shift + hidden_states, pre_cross_attn_layernorm_output_ada = self.adaLN.scaled_modulated_layernorm( + residual=hidden_states, + x=attention_output, + gate=gate_full, + shift=shift_ca, + scale=scale_ca, + ) + + attention_output, _ = self.cross_attention( + pre_cross_attn_layernorm_output_ada, + attention_mask=context_mask, + key_value_states=context, + # packed_seq_params=packed_seq_params['cross_attention'], + ) + + # ******************************************** temporal self attention ********************************************* + + shift_ta, scale_ta, gate_ta = self.adaLN(timestep_emb) + + hidden_states, pre_temporal_attn_layernorm_output_ada = self.adaLN.scaled_modulated_layernorm( + residual=hidden_states, + x=attention_output, + gate=gate_ca, + shift=shift_ta, + scale=scale_ta, + ) + + attention_output, _ = self.temporal_self_attention( + pre_temporal_attn_layernorm_output_ada, + attention_mask=None, + # packed_seq_params=packed_seq_params['self_attention'], + ) + + # ******************************************** mlp ***************************************************************** + + shift_mlp, scale_mlp, gate_mlp = self.adaLN(timestep_emb) + + hidden_states, pre_mlp_layernorm_output_ada = 
self.adaLN.scaled_modulated_layernorm( + residual=hidden_states, + x=attention_output, + gate=gate_ta, + shift=shift_mlp, + scale=scale_mlp, + ) + + mlp_output, _ = self.mlp(pre_mlp_layernorm_output_ada) + hidden_states = self.adaLN.scale_add(residual=hidden_states, x=mlp_output, gate=gate_mlp) + + # Jit compiled function creates 'view' tensor. This tensor + # potentially gets saved in the MPU checkpoint function context, + # which rejects view tensors. While making a viewless tensor here + # won't result in memory savings (like the data loader, or + # p2p_communication), it serves to document the origin of this + # 'view' tensor. + output = make_viewless_tensor(inp=hidden_states, requires_grad=hidden_states.requires_grad, keep_graph=True) + + return output, context + + +class DiTLayerWithAdaLN(TransformerLayer): + """A single transformer layer. + + Transformer layer takes input with size [s, b, h] and returns an + output of the same size. + + DiT with Adapative Layer Normalization. + """ + + def __init__( + self, + config: TransformerConfig, + submodules: TransformerLayerSubmodules, + layer_number: int = 1, + hidden_dropout: float = None, + position_embedding_type: Literal["learned_absolute", "rope"] = "learned_absolute", + ): + def _replace_no_cp_submodules(submodules): + modified_submods = copy.deepcopy(submodules) + modified_submods.cross_attention = IdentityOp + # modified_submods.temporal_self_attention = IdentityOp + return modified_submods + + # Replace any submodules that will have CP disabled and build them manually later after TransformerLayer init. + modified_submods = _replace_no_cp_submodules(submodules) + super().__init__( + config=config, submodules=modified_submods, layer_number=layer_number, hidden_dropout=hidden_dropout + ) + + # Override Cross Attention to disable CP. + # Disable TP Comm overlap as well. Not disabling will attempt re-use of buffer size same as Q and lead to incorrect tensor shapes. 
+ if submodules.cross_attention != IdentityOp: + cp_override_config = copy.deepcopy(config) + cp_override_config.context_parallel_size = 1 + cp_override_config.tp_comm_overlap = False + self.cross_attention = build_module( + submodules.cross_attention, + config=cp_override_config, + layer_number=layer_number, + ) + else: + self.cross_attention = None + + self.full_self_attention = build_module( + submodules.full_self_attention, + config=self.config, + layer_number=layer_number, + ) + + self.adaLN = AdaLN(config=self.config, n_adaln_chunks=9 if self.cross_attention else 6) + + def forward( + self, + hidden_states, + attention_mask, + context=None, + context_mask=None, + rotary_pos_emb=None, + inference_params=None, + packed_seq_params=None, + ): + # timestep embedding + timestep_emb = attention_mask + + # ******************************************** full self attention ****************************************************** + if self.cross_attention: + shift_full, scale_full, gate_full, shift_ca, scale_ca, gate_ca, shift_mlp, scale_mlp, gate_mlp = ( + self.adaLN(timestep_emb) + ) + else: + shift_full, scale_full, gate_full, shift_mlp, scale_mlp, gate_mlp = self.adaLN(timestep_emb) + + # adaLN with scale + shift + pre_full_attn_layernorm_output_ada = self.adaLN.modulated_layernorm( + hidden_states, shift=shift_full, scale=scale_full + ) + + attention_output, _ = self.full_self_attention( + pre_full_attn_layernorm_output_ada, + attention_mask=None, + packed_seq_params=None if packed_seq_params is None else packed_seq_params['self_attention'], + ) + + if self.cross_attention: + # ******************************************** cross attention ****************************************************** + # adaLN with scale + shift + hidden_states, pre_cross_attn_layernorm_output_ada = self.adaLN.scaled_modulated_layernorm( + residual=hidden_states, + x=attention_output, + gate=gate_full, + shift=shift_ca, + scale=scale_ca, + ) + + attention_output, _ = self.cross_attention( + pre_cross_attn_layernorm_output_ada, + attention_mask=context_mask, + key_value_states=context, + packed_seq_params=None if packed_seq_params is None else packed_seq_params['cross_attention'], + ) + + # ******************************************** mlp ****************************************************** + hidden_states, pre_mlp_layernorm_output_ada = self.adaLN.scaled_modulated_layernorm( + residual=hidden_states, + x=attention_output, + gate=gate_ca if self.cross_attention else gate_full, + shift=shift_mlp, + scale=scale_mlp, + ) + + mlp_output, _ = self.mlp(pre_mlp_layernorm_output_ada) + hidden_states = self.adaLN.scale_add(residual=hidden_states, x=mlp_output, gate=gate_mlp) + + # Jit compiled function creates 'view' tensor. This tensor + # potentially gets saved in the MPU checkpoint function context, + # which rejects view tensors. While making a viewless tensor here + # won't result in memory savings (like the data loader, or + # p2p_communication), it serves to document the origin of this + # 'view' tensor. 
+ output = make_viewless_tensor(inp=hidden_states, requires_grad=hidden_states.requires_grad, keep_graph=True) + + return output, context + + +def get_stdit_adaln_block_with_transformer_engine_spec() -> ModuleSpec: + params = {"attn_mask_type": AttnMaskType.padding} + return ModuleSpec( + module=STDiTLayerWithAdaLN, + submodules=STDiTWithAdaLNSubmodules( + spatial_self_attention=ModuleSpec( + module=SelfAttention, + params=params, + submodules=SelfAttentionSubmodules( + linear_qkv=TEColumnParallelLinear, + core_attention=TEDotProductAttention, + linear_proj=TERowParallelLinear, + q_layernorm=TENorm, + k_layernorm=TENorm, + ), + ), + temporal_self_attention=ModuleSpec( + module=SelfAttention, + params=params, + submodules=SelfAttentionSubmodules( + linear_qkv=TEColumnParallelLinear, + core_attention=TEDotProductAttention, + linear_proj=TERowParallelLinear, + q_layernorm=TENorm, + k_layernorm=TENorm, + ), + ), + full_self_attention=ModuleSpec( + module=SelfAttention, + params=params, + submodules=SelfAttentionSubmodules( + linear_qkv=TEColumnParallelLinear, + core_attention=TEDotProductAttention, + linear_proj=TERowParallelLinear, + q_layernorm=TENorm, + k_layernorm=TENorm, + ), + ), + cross_attention=ModuleSpec( + module=CrossAttention, + params=params, + submodules=CrossAttentionSubmodules( + linear_q=TEColumnParallelLinear, + linear_kv=TEColumnParallelLinear, + core_attention=TEDotProductAttention, + linear_proj=TERowParallelLinear, + q_layernorm=TENorm, + k_layernorm=TENorm, + ), + ), + mlp=ModuleSpec( + module=MLP, + submodules=MLPSubmodules( + linear_fc1=TEColumnParallelLinear, + linear_fc2=TERowParallelLinear, + ), + ), + ), + ) + + +def get_dit_adaln_block_with_transformer_engine_spec() -> ModuleSpec: + params = {"attn_mask_type": AttnMaskType.padding} + return ModuleSpec( + module=DiTLayerWithAdaLN, + submodules=DiTWithAdaLNSubmodules( + full_self_attention=ModuleSpec( + module=SelfAttention, + params=params, + submodules=SelfAttentionSubmodules( + linear_qkv=TEColumnParallelLinear, + core_attention=TEDotProductAttention, + linear_proj=TERowParallelLinear, + q_layernorm=RMSNorm, + k_layernorm=RMSNorm, + ), + ), + cross_attention=ModuleSpec( + module=CrossAttention, + params=params, + submodules=CrossAttentionSubmodules( + linear_q=TEColumnParallelLinear, + linear_kv=TEColumnParallelLinear, + core_attention=TEDotProductAttention, + linear_proj=TERowParallelLinear, + q_layernorm=RMSNorm, + k_layernorm=RMSNorm, + ), + ), + mlp=ModuleSpec( + module=MLP, + submodules=MLPSubmodules( + linear_fc1=TEColumnParallelLinear, + linear_fc2=TERowParallelLinear, + ), + ), + ), + ) + + +def get_official_dit_adaln_block_with_transformer_engine_spec() -> ModuleSpec: + params = {"attn_mask_type": AttnMaskType.no_mask} + return ModuleSpec( + module=DiTLayerWithAdaLN, + submodules=DiTWithAdaLNSubmodules( + full_self_attention=ModuleSpec( + module=SelfAttention, + params=params, + submodules=SelfAttentionSubmodules( + linear_qkv=TEColumnParallelLinear, + core_attention=TEDotProductAttention, + linear_proj=TERowParallelLinear, + ), + ), + mlp=ModuleSpec( + module=MLP, + submodules=MLPSubmodules( + linear_fc1=TEColumnParallelLinear, + linear_fc2=TERowParallelLinear, + ), + ), + ), + ) diff --git a/nemo/collections/diffusion/models/dit/dit_model.py b/nemo/collections/diffusion/models/dit/dit_model.py new file mode 100644 index 000000000000..0c1c1abc82f2 --- /dev/null +++ b/nemo/collections/diffusion/models/dit/dit_model.py @@ -0,0 +1,359 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved. 
+# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + + +from typing import Dict, Literal, Optional + +import torch +import torch.nn as nn +import torch.nn.functional as F +from diffusers.models.embeddings import Timesteps +from einops import rearrange, repeat +from megatron.core import parallel_state, tensor_parallel +from megatron.core.dist_checkpointing.mapping import ShardedStateDict +from megatron.core.models.common.vision_module.vision_module import VisionModule +from megatron.core.packed_seq_params import PackedSeqParams +from megatron.core.transformer.enums import ModelType +from megatron.core.transformer.transformer_block import TransformerBlock +from megatron.core.transformer.transformer_config import TransformerConfig +from megatron.core.utils import make_sharded_tensor_for_checkpoint +from torch import Tensor + +from nemo.collections.diffusion.models.dit import dit_embeddings +from nemo.collections.diffusion.models.dit.dit_embeddings import ParallelTimestepEmbedding +from nemo.collections.diffusion.models.dit.dit_layer_spec import ( + get_dit_adaln_block_with_transformer_engine_spec as DiTLayerWithAdaLNspec, +) + + +def modulate(x, shift, scale): + return x * (1 + scale.unsqueeze(1)) + shift.unsqueeze(1) + + +class RMSNorm(nn.Module): + def __init__(self, channel: int, eps: float = 1e-6): + super().__init__() + self.eps = eps + self.weight = nn.Parameter(torch.ones(channel)) + + def _norm(self, x): + return x * torch.rsqrt(x.pow(2).mean(-1, keepdim=True) + self.eps) + + def forward(self, x): + output = self._norm(x.float()).type_as(x) + return output * self.weight + + +class FinalLayer(nn.Module): + """ + The final layer of DiT. + """ + + def __init__(self, hidden_size, spatial_patch_size, temporal_patch_size, out_channels): + super().__init__() + self.norm_final = nn.LayerNorm(hidden_size, elementwise_affine=False, eps=1e-6) + self.linear = nn.Linear( + hidden_size, spatial_patch_size * spatial_patch_size * temporal_patch_size * out_channels, bias=False + ) + self.adaLN_modulation = nn.Sequential(nn.SiLU(), nn.Linear(hidden_size, 2 * hidden_size, bias=False)) + + def forward(self, x_BT_HW_D, emb_B_D): + shift_B_D, scale_B_D = self.adaLN_modulation(emb_B_D).chunk(2, dim=1) + T = x_BT_HW_D.shape[0] // emb_B_D.shape[0] + shift_BT_D, scale_BT_D = repeat(shift_B_D, "b d -> (b t) d", t=T), repeat(scale_B_D, "b d -> (b t) d", t=T) + x_BT_HW_D = modulate(self.norm_final(x_BT_HW_D), shift_BT_D, scale_BT_D) + x_BT_HW_D = self.linear(x_BT_HW_D) + return x_BT_HW_D + + +class DiTCrossAttentionModel(VisionModule): + """ + DiTCrossAttentionModel is a VisionModule that implements a DiT model with a cross-attention block. + Attributes: + config (TransformerConfig): Configuration for the transformer. + pre_process (bool): Whether to apply pre-processing steps. + post_process (bool): Whether to apply post-processing steps. + fp16_lm_cross_entropy (bool): Whether to use fp16 for cross-entropy loss. + parallel_output (bool): Whether to use parallel output. 
+ position_embedding_type (Literal["learned_absolute", "rope"]): Type of position embedding. + max_img_h (int): Maximum image height. + max_img_w (int): Maximum image width. + max_frames (int): Maximum number of frames. + patch_spatial (int): Spatial patch size. + patch_temporal (int): Temporal patch size. + in_channels (int): Number of input channels. + out_channels (int): Number of output channels. + transformer_decoder_layer_spec (DiTLayerWithAdaLNspec): Specification for the transformer decoder layer. + add_encoder (bool): Whether to add an encoder. + add_decoder (bool): Whether to add a decoder. + share_embeddings_and_output_weights (bool): Whether to share embeddings and output weights. + concat_padding_mask (bool): Whether to concatenate padding mask. + pos_emb_cls (str): Class of position embedding. + model_type (ModelType): Type of the model. + decoder (TransformerBlock): Transformer decoder block. + t_embedder (torch.nn.Sequential): Time embedding layer. + x_embedder (nn.Conv3d): Convolutional layer for input embedding. + pos_embedder (dit_embeddings.SinCosPosEmb3D): Position embedding layer. + final_layer_linear (torch.nn.Linear): Final linear layer. + affline_norm (RMSNorm): Affine normalization layer. + Methods: + forward(x: Tensor, timesteps: Tensor, crossattn_emb: Tensor, packed_seq_params: PackedSeqParams = None, pos_ids: Tensor = None, **kwargs) -> Tensor: + Forward pass of the model. + set_input_tensor(input_tensor: Tensor) -> None: + Sets input tensor to the model. + sharded_state_dict(prefix: str = 'module.', sharded_offsets: tuple = (), metadata: Optional[Dict] = None) -> ShardedStateDict: + Sharded state dict implementation for backward-compatibility. + tie_embeddings_weights_state_dict(tensor, sharded_state_dict: ShardedStateDict, output_layer_weight_key: str, first_stage_word_emb_key: str) -> None: + Ties the embedding and output weights in a given sharded state dict. + """ + + def __init__( + self, + config: TransformerConfig, + pre_process: bool = True, + post_process: bool = True, + fp16_lm_cross_entropy: bool = False, + parallel_output: bool = True, + position_embedding_type: Literal["learned_absolute", "rope"] = "rope", + max_img_h: int = 80, + max_img_w: int = 80, + max_frames: int = 34, + patch_spatial: int = 1, + patch_temporal: int = 1, + in_channels: int = 16, + out_channels: int = 16, + transformer_decoder_layer_spec=DiTLayerWithAdaLNspec, + pos_embedder=dit_embeddings.SinCosPosEmb3D, + **kwargs, + ): + super(DiTCrossAttentionModel, self).__init__(config=config) + + self.config: TransformerConfig = config + + self.transformer_decoder_layer_spec = transformer_decoder_layer_spec() + self.pre_process = pre_process + self.post_process = post_process + self.add_encoder = True + self.add_decoder = True + self.fp16_lm_cross_entropy = fp16_lm_cross_entropy + self.parallel_output = parallel_output + self.position_embedding_type = position_embedding_type + self.share_embeddings_and_output_weights = False + self.concat_padding_mask = True + self.pos_emb_cls = 'sincos' + self.patch_spatial = patch_spatial + self.patch_temporal = patch_temporal + + # megatron core pipelining currently depends on model type + # TODO: remove this dependency ? 
+ self.model_type = ModelType.encoder_or_decoder + + # Transformer decoder + self.decoder = TransformerBlock( + config=self.config, + spec=self.transformer_decoder_layer_spec, + pre_process=self.pre_process, + post_process=False, + post_layer_norm=False, + ) + + self.t_embedder = torch.nn.Sequential( + Timesteps(self.config.hidden_size, flip_sin_to_cos=False, downscale_freq_shift=0), + dit_embeddings.ParallelTimestepEmbedding(self.config.hidden_size, self.config.hidden_size, seed=1234), + ) + + if self.pre_process: + self.x_embedder = torch.nn.Linear(in_channels * patch_spatial**2, self.config.hidden_size) + + self.pos_embedder = pos_embedder( + config, + t=max_frames // patch_temporal, + h=max_img_h // patch_spatial, + w=max_img_w // patch_spatial, + ) + self.fps_embedder = nn.Sequential( + Timesteps(num_channels=256, flip_sin_to_cos=False, downscale_freq_shift=1), + ParallelTimestepEmbedding(256, 256), + ) + + if self.post_process: + self.final_layer_linear = torch.nn.Linear( + self.config.hidden_size, + patch_spatial**2 * patch_temporal * out_channels, + ) + + self.affline_norm = RMSNorm(self.config.hidden_size) + + def forward( + self, + x: Tensor, + timesteps: Tensor, + crossattn_emb: Tensor, + packed_seq_params: PackedSeqParams = None, + pos_ids: Tensor = None, + **kwargs, + ) -> Tensor: + """Forward pass. + + Args: + x (Tensor): vae encoded data (b s c) + encoder_decoder_attn_mask (Tensor): cross-attention mask between encoder and decoder + inference_params (InferenceParams): relevant arguments for inferencing + + Returns: + Tensor: loss tensor + """ + B = x.shape[0] + fps = kwargs.get( + 'fps', + torch.tensor( + [ + 30, + ] + * B, + dtype=torch.bfloat16, + ), + ).view(-1) + if self.pre_process: + # transpose to match + x_B_S_D = self.x_embedder(x) + if isinstance(self.pos_embedder, dit_embeddings.SinCosPosEmb3D): + pos_emb = None + x_B_S_D += self.pos_embedder(pos_ids) + else: + pos_emb = self.pos_embedder(pos_ids) + pos_emb = rearrange(pos_emb, "B S D -> S B D") + x_S_B_D = rearrange(x_B_S_D, "B S D -> S B D") + else: + # intermediate stage of pipeline + x_S_B_D = None ### should it take encoder_hidden_states + + timesteps_B_D = self.t_embedder(timesteps.flatten()).to(torch.bfloat16) # (b d_text_embedding) + + affline_emb_B_D = timesteps_B_D + fps_B_D = self.fps_embedder(fps) + fps_B_D = nn.functional.pad(fps_B_D, (0, self.config.hidden_size - fps_B_D.shape[1])) + affline_emb_B_D += fps_B_D + + crossattn_emb = rearrange(crossattn_emb, 'B S D -> S B D') + + if self.config.sequence_parallel: + if self.pre_process: + x_S_B_D = tensor_parallel.scatter_to_sequence_parallel_region(x_S_B_D) + crossattn_emb = tensor_parallel.scatter_to_sequence_parallel_region(crossattn_emb) + # `scatter_to_sequence_parallel_region` returns a view, which prevents + # the original tensor from being garbage collected. Clone to facilitate GC. + # Has a small runtime cost (~0.5%). 
+ if self.config.clone_scatter_output_in_embedding: + if self.pre_process: + x_S_B_D = x_S_B_D.clone() + crossattn_emb = crossattn_emb.clone() + + x_S_B_D = self.decoder( + hidden_states=x_S_B_D, + attention_mask=affline_emb_B_D, + context=crossattn_emb, + context_mask=None, + rotary_pos_emb=pos_emb, + packed_seq_params=packed_seq_params, + ) + + if not self.post_process: + return x_S_B_D + + if self.config.sequence_parallel: + x_S_B_D = tensor_parallel.gather_from_sequence_parallel_region(x_S_B_D) + + x_S_B_D = self.final_layer_linear(x_S_B_D) + return rearrange(x_S_B_D, "S B D -> B S D") + + def set_input_tensor(self, input_tensor: Tensor) -> None: + """Sets input tensor to the model. + + See megatron.model.transformer.set_input_tensor() + + Args: + input_tensor (Tensor): Sets the input tensor for the model. + """ + # This is usually handled in schedules.py but some inference code still + # gives us non-lists or None + if not isinstance(input_tensor, list): + input_tensor = [input_tensor] + + assert len(input_tensor) == 1, 'input_tensor should only be length 1 for gpt/bert' + self.decoder.set_input_tensor(input_tensor[0]) + + def sharded_state_dict( + self, prefix: str = 'module.', sharded_offsets: tuple = (), metadata: Optional[Dict] = None + ) -> ShardedStateDict: + """Sharded state dict implementation for GPTModel backward-compatibility (removing extra state). + + Args: + prefix (str): Module name prefix. + sharded_offsets (tuple): PP related offsets, expected to be empty at this module level. + metadata (Optional[Dict]): metadata controlling sharded state dict creation. + + Returns: + ShardedStateDict: sharded state dict for the GPTModel + """ + sharded_state_dict = super().sharded_state_dict(prefix, sharded_offsets, metadata) + + for param_name, param in self.t_embedder.named_parameters(): + weight_key = f'{prefix}t_embedder.{param_name}' + self.tie_embeddings_weights_state_dict(param, sharded_state_dict, weight_key, weight_key) + + for param_name, param in self.affline_norm.named_parameters(): + weight_key = f'{prefix}affline_norm.{param_name}' + self.tie_embeddings_weights_state_dict(param, sharded_state_dict, weight_key, weight_key) + + return sharded_state_dict + + def tie_embeddings_weights_state_dict( + self, + tensor, + sharded_state_dict: ShardedStateDict, + output_layer_weight_key: str, + first_stage_word_emb_key: str, + ) -> None: + """Ties the embedding and output weights in a given sharded state dict. + + Args: + sharded_state_dict (ShardedStateDict): state dict with the weight to tie + output_layer_weight_key (str): key of the output layer weight in the state dict. + This entry will be replaced with a tied version + first_stage_word_emb_key (str): this must be the same as the + ShardedTensor.key of the first stage word embeddings. 
+ + Returns: None, acts in-place + """ + if self.pre_process and parallel_state.get_tensor_model_parallel_rank() == 0: + # Output layer is equivalent to the embedding already + return + + # Replace the default output layer with a one sharing the weights with the embedding + del sharded_state_dict[output_layer_weight_key] + last_stage_word_emb_replica_id = ( + 0, # copy of first stage embedding + parallel_state.get_tensor_model_parallel_rank() + + parallel_state.get_pipeline_model_parallel_rank() + * parallel_state.get_pipeline_model_parallel_world_size(), + parallel_state.get_data_parallel_rank(with_context_parallel=True), + ) + + sharded_state_dict[output_layer_weight_key] = make_sharded_tensor_for_checkpoint( + tensor=tensor, + key=first_stage_word_emb_key, + replica_id=last_stage_word_emb_replica_id, + allow_shape_mismatch=False, + ) diff --git a/nemo/collections/diffusion/models/dit_llama/__init__.py b/nemo/collections/diffusion/models/dit_llama/__init__.py new file mode 100644 index 000000000000..d9155f923f18 --- /dev/null +++ b/nemo/collections/diffusion/models/dit_llama/__init__.py @@ -0,0 +1,13 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. diff --git a/nemo/collections/diffusion/models/dit_llama/dit_llama_layer_spec.py b/nemo/collections/diffusion/models/dit_llama/dit_llama_layer_spec.py new file mode 100644 index 000000000000..80bed5878e1b --- /dev/null +++ b/nemo/collections/diffusion/models/dit_llama/dit_llama_layer_spec.py @@ -0,0 +1,173 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
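The replica_id triple built in tie_embeddings_weights_state_dict above is what tells Megatron Core distributed checkpointing which copies of the tied t_embedder and affline_norm weights are duplicates, so each tensor is written exactly once. A minimal, self-contained illustration of how that triple is composed (plain integers stand in for the parallel_state queries; the helper name is made up):

def replica_id_for_tied_weight(tp_rank: int, pp_rank: int, pp_world_size: int, dp_rank: int) -> tuple:
    # Mirrors the tuple above: (copy-of-first-stage flag, TP/PP duplicate index, DP copy index).
    return (
        0,                                  # every copy mirrors the first-stage weight
        tp_rank + pp_rank * pp_world_size,  # unique index across tensor- and pipeline-parallel duplicates
        dp_rank,                            # data-parallel copy index
    )

# e.g. TP rank 1 on PP rank 3 of a 4-stage pipeline, DP rank 0 -> (0, 13, 0)
print(replica_id_for_tied_weight(tp_rank=1, pp_rank=3, pp_world_size=4, dp_rank=0))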
+ +import copy +from typing import Literal + +from megatron.core.transformer.attention import ( + CrossAttention, + CrossAttentionSubmodules, + SelfAttention, + SelfAttentionSubmodules, +) +from megatron.core.transformer.custom_layers.transformer_engine import ( + TEColumnParallelLinear, + TEDotProductAttention, + TERowParallelLinear, +) +from megatron.core.transformer.enums import AttnMaskType +from megatron.core.transformer.identity_op import IdentityOp +from megatron.core.transformer.mlp import MLP, MLPSubmodules +from megatron.core.transformer.spec_utils import ModuleSpec, build_module +from megatron.core.transformer.transformer_block import TransformerConfig +from megatron.core.transformer.transformer_config import TransformerConfig +from megatron.core.transformer.transformer_layer import TransformerLayer, TransformerLayerSubmodules +from megatron.core.utils import make_viewless_tensor + +from nemo.collections.diffusion.models.dit.dit_layer_spec import AdaLN + + +class MoviegGenLayer(TransformerLayer): + """A single transformer layer. + + Transformer layer takes input with size [s, b, h] and returns an + output of the same size. + + DiT with Adapative Layer Normalization. + """ + + def __init__( + self, + config: TransformerConfig, + submodules: TransformerLayerSubmodules, + layer_number: int = 1, + hidden_dropout: float = None, + position_embedding_type: Literal["learned_absolute", "rope"] = "learned_absolute", + ): + def _replace_no_cp_submodules(submodules): + modified_submods = copy.deepcopy(submodules) + modified_submods.cross_attention = IdentityOp + # modified_submods.temporal_self_attention = IdentityOp + return modified_submods + + # Replace any submodules that will have CP disabled and build them manually later after TransformerLayer init. + modified_submods = _replace_no_cp_submodules(submodules) + super().__init__( + config=config, submodules=modified_submods, layer_number=layer_number, hidden_dropout=hidden_dropout + ) + + # Override Cross Attention to disable CP. + # Disable TP Comm overlap as well. Not disabling will attempt re-use of buffer size same as Q and lead to incorrect tensor shapes. 
+ cp_override_config = copy.deepcopy(config) + cp_override_config.context_parallel_size = 1 + cp_override_config.tp_comm_overlap = False + self.cross_attention = build_module( + submodules.cross_attention, + config=cp_override_config, + layer_number=layer_number, + ) + + self.adaLN = AdaLN(config=self.config, n_adaln_chunks=6) # , norm=TENorm) + + def forward( + self, + hidden_states, + attention_mask, + context=None, + context_mask=None, + rotary_pos_emb=None, + inference_params=None, + packed_seq_params=None, + ): + # timestep embedding + timestep_emb = attention_mask + factorized_pos_emb = rotary_pos_emb + hidden_states = hidden_states + factorized_pos_emb + + # ******************************************** full self attention ****************************************************** + shift_full, scale_full, gate_full, shift_mlp, scale_mlp, gate_mlp = self.adaLN(timestep_emb) + + # adaLN with scale + shift + pre_full_attn_layernorm_output_ada = self.adaLN.modulated_layernorm( + hidden_states, shift=shift_full, scale=scale_full + ) + + attention_output, _ = self.self_attention( + pre_full_attn_layernorm_output_ada, + attention_mask=None, + packed_seq_params=None if packed_seq_params is None else packed_seq_params['self_attention'], + ) + + hidden_states = self.adaLN.scale_add(residual=hidden_states, x=attention_output, gate=gate_full) + + # ******************************************** cross attention ****************************************************** + attention_output, _ = self.cross_attention( + hidden_states, + attention_mask=context_mask, + key_value_states=context, + packed_seq_params=None if packed_seq_params is None else packed_seq_params['cross_attention'], + ) + + # ******************************************** mlp ****************************************************** + pre_mlp_layernorm_output_ada = self.adaLN.modulated_layernorm( + attention_output, shift=shift_mlp, scale=scale_mlp + ) + + mlp_output, _ = self.mlp(pre_mlp_layernorm_output_ada) + hidden_states = self.adaLN.scale_add(residual=hidden_states, x=mlp_output, gate=gate_mlp) + + # Jit compiled function creates 'view' tensor. This tensor + # potentially gets saved in the MPU checkpoint function context, + # which rejects view tensors. While making a viewless tensor here + # won't result in memory savings (like the data loader, or + # p2p_communication), it serves to document the origin of this + # 'view' tensor. 
+ output = make_viewless_tensor(inp=hidden_states, requires_grad=hidden_states.requires_grad, keep_graph=True) + + return output, context + + +def get_dit_llama_spec() -> ModuleSpec: + params = {"attn_mask_type": AttnMaskType.padding} + return ModuleSpec( + module=MoviegGenLayer, + submodules=TransformerLayerSubmodules( + self_attention=ModuleSpec( + module=SelfAttention, + params=params, + submodules=SelfAttentionSubmodules( + linear_qkv=TEColumnParallelLinear, + core_attention=TEDotProductAttention, + linear_proj=TERowParallelLinear, + ), + ), + cross_attention=ModuleSpec( + module=CrossAttention, + params=params, + submodules=CrossAttentionSubmodules( + linear_q=TEColumnParallelLinear, + linear_kv=TEColumnParallelLinear, + core_attention=TEDotProductAttention, + linear_proj=TERowParallelLinear, + ), + ), + mlp=ModuleSpec( + module=MLP, + submodules=MLPSubmodules( + linear_fc1=TEColumnParallelLinear, + linear_fc2=TERowParallelLinear, + ), + ), + ), + ) diff --git a/nemo/collections/diffusion/models/dit_llama/dit_llama_model.py b/nemo/collections/diffusion/models/dit_llama/dit_llama_model.py new file mode 100644 index 000000000000..bfa79e366cac --- /dev/null +++ b/nemo/collections/diffusion/models/dit_llama/dit_llama_model.py @@ -0,0 +1,60 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
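MoviegGenLayer.forward above follows the usual adaLN-style modulation: each sublayer sees a layernormed input shifted and scaled by the timestep embedding, and its output is gated before the residual add. A rough, self-contained sketch of that pattern (the real implementation is dit_layer_spec.AdaLN; the functions below are simplified stand-ins):

import torch
import torch.nn.functional as F

def modulated_layernorm(x, shift, scale):
    # LayerNorm over the hidden dim, then modulate with the timestep-conditioned shift/scale.
    x = F.layer_norm(x, x.shape[-1:])
    return x * (1 + scale) + shift

def scale_add(residual, x, gate):
    # Gated residual connection.
    return residual + gate * x

h = torch.randn(16, 2, 384)                                   # (S, B, D), as used by the layer above
shift, scale, gate = (torch.randn(2, 384) for _ in range(3))  # one chunk each of the adaLN projection
h = scale_add(h, modulated_layernorm(h, shift, scale), gate)  # one adaLN-modulated sublayer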
+ + +from typing import Literal + +from megatron.core.transformer.transformer_config import TransformerConfig + +from nemo.collections.diffusion.models.dit import dit_embeddings +from nemo.collections.diffusion.models.dit.dit_model import DiTCrossAttentionModel +from nemo.collections.diffusion.models.dit_llama.dit_llama_layer_spec import get_dit_llama_spec + + +class DiTLlamaModel(DiTCrossAttentionModel): + def __init__( + self, + config: TransformerConfig, + pre_process: bool = True, + post_process: bool = True, + fp16_lm_cross_entropy: bool = False, + parallel_output: bool = True, + position_embedding_type: Literal["learned_absolute", "rope"] = "rope", + max_img_h: int = 80, + max_img_w: int = 80, + max_frames: int = 34, + patch_spatial: int = 1, + patch_temporal: int = 1, + in_channels: int = 16, + out_channels: int = 16, + **kwargs, + ): + super().__init__( + config=config, + pre_process=pre_process, + post_process=post_process, + fp16_lm_cross_entropy=fp16_lm_cross_entropy, + parallel_output=parallel_output, + position_embedding_type=position_embedding_type, + max_img_h=max_img_h, + max_img_w=max_img_w, + max_frames=max_frames, + patch_spatial=patch_spatial, + patch_temporal=patch_temporal, + in_channels=in_channels, + out_channels=out_channels, + transformer_decoder_layer_spec=get_dit_llama_spec, + pos_embedder=dit_embeddings.FactorizedLearnable3DEmbedding, + **kwargs, + ) diff --git a/nemo/collections/diffusion/models/model.py b/nemo/collections/diffusion/models/model.py new file mode 100644 index 000000000000..8cc6be860585 --- /dev/null +++ b/nemo/collections/diffusion/models/model.py @@ -0,0 +1,423 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +import importlib +import warnings +from dataclasses import dataclass, field +from typing import Any, Callable, Dict, Optional, Tuple + +import numpy as np +import torch +import torch.nn.functional as F +import wandb +from einops import rearrange +from megatron.core import parallel_state +from megatron.core.packed_seq_params import PackedSeqParams +from megatron.core.transformer.transformer_config import TransformerConfig +from torch import nn +from typing_extensions import override + +from nemo.collections.diffusion.models.dit_llama.dit_llama_model import DiTLlamaModel +from nemo.collections.diffusion.sampler.edm.edm_pipeline import EDMPipeline +from nemo.collections.llm.gpt.model.base import GPTModel +from nemo.lightning import io +from nemo.lightning.megatron_parallel import MaskedTokenLossReduction, MegatronLossReduction +from nemo.lightning.pytorch.optim import OptimizerModule + +from .dit.dit_model import DiTCrossAttentionModel + + +def dit_forward_step(model, batch) -> torch.Tensor: + return model(**batch) + + +def dit_data_step(module, dataloader_iter): + batch = next(dataloader_iter)[0] + batch = get_batch_on_this_cp_rank(batch) + batch = {k: v.to(device='cuda', non_blocking=True) if torch.is_tensor(v) else v for k, v in batch.items()} + + cu_seqlens = batch['seq_len_q'].cumsum(dim=0).to(torch.int32) + zero = torch.zeros(1, dtype=torch.int32, device="cuda") + cu_seqlens = torch.cat((zero, cu_seqlens)) + + cu_seqlens_kv = batch['seq_len_kv'].cumsum(dim=0).to(torch.int32) + cu_seqlens_kv = torch.cat((zero, cu_seqlens_kv)) + + batch['packed_seq_params'] = { + 'self_attention': PackedSeqParams( + cu_seqlens_q=cu_seqlens, + cu_seqlens_kv=cu_seqlens, + qkv_format='sbhd', + ), + 'cross_attention': PackedSeqParams( + cu_seqlens_q=cu_seqlens, + cu_seqlens_kv=cu_seqlens_kv, + qkv_format='sbhd', + ), + } + + return batch + + +def get_batch_on_this_cp_rank(data: Dict): + """Split the data for context parallelism.""" + from megatron.core import mpu + + cp_size = mpu.get_context_parallel_world_size() + cp_rank = mpu.get_context_parallel_rank() + + t = 16 + if cp_size > 1: + assert t % cp_size == 0, "t must divisibly by cp_size" + num_valid_tokens_in_ub = None + if 'loss_mask' in data and data['loss_mask'] is not None: + num_valid_tokens_in_ub = data['loss_mask'].sum() + + for key, value in data.items(): + if (value is not None) and (key in ['video', 'video_latent', 'noise_latent', 'pos_ids']): + if len(value.shape) > 5: + value = value.squeeze(0) + B, C, T, H, W = value.shape + # TODO: sequence packing + data[key] = value.view(B, C, cp_size, T // cp_size, H, W)[:, :, cp_rank, ...].contiguous() + loss_mask = data["loss_mask"] + data["loss_mask"] = loss_mask.view(loss_mask.shape[0], cp_size, loss_mask.shape[1] // cp_size)[ + :, cp_rank, ... 
+ ].contiguous() + data['num_valid_tokens_in_ub'] = num_valid_tokens_in_ub + return data + + +@dataclass +class DiTConfig(TransformerConfig, io.IOMixin): + """ + Config for DiT-S model + """ + + crossattn_emb_size: int = 1024 + add_bias_linear: bool = False + gated_linear_unit: bool = False + + num_layers: int = 12 + hidden_size: int = 384 + max_img_h: int = 80 + max_img_w: int = 80 + max_frames: int = 34 + patch_spatial: int = 2 + num_attention_heads: int = 6 + layernorm_epsilon = 1e-6 + normalization = "RMSNorm" + add_bias_linear = False + qk_layernorm_per_head = True + layernorm_zero_centered_gamma = False + + fp16_lm_cross_entropy: bool = False + parallel_output: bool = True + share_embeddings_and_output_weights: bool = True + + # max_position_embeddings: int = 5400 + hidden_dropout: float = 0 + attention_dropout: float = 0 + + bf16: bool = True + params_dtype: torch.dtype = torch.bfloat16 + + vae_module: str = 'nemo.collections.diffusion.vae.diffusers_vae.AutoencoderKLVAE' + vae_path: str = None + sigma_data: float = 0.5 + + in_channels: int = 16 + + data_step_fn = dit_data_step + forward_step_fn = dit_forward_step + + @override + def configure_model(self, tokenizer=None) -> DiTCrossAttentionModel: + vp_size = self.virtual_pipeline_model_parallel_size + if vp_size: + p_size = self.pipeline_model_parallel_size + assert ( + self.num_layers // p_size + ) % vp_size == 0, "Make sure the number of model chunks is the same across all pipeline stages." + + if isinstance(self, DiTLlama30BConfig): + model = DiTLlamaModel + else: + model = DiTCrossAttentionModel + return model( + self, + fp16_lm_cross_entropy=self.fp16_lm_cross_entropy, + parallel_output=self.parallel_output, + pre_process=parallel_state.is_pipeline_first_stage(), + post_process=parallel_state.is_pipeline_last_stage(), + max_img_h=self.max_img_h, + max_img_w=self.max_img_w, + max_frames=self.max_frames, + patch_spatial=self.patch_spatial, + ) + + def configure_vae(self): + return dynamic_import(self.vae_module)(self.vae_path) + + +@dataclass +class DiTBConfig(DiTConfig): + num_layers: int = 12 + hidden_size: int = 768 + num_attention_heads: int = 12 + + +@dataclass +class DiTLConfig(DiTConfig): + num_layers: int = 24 + hidden_size: int = 1024 + num_attention_heads: int = 16 + + +@dataclass +class DiTXLConfig(DiTConfig): + num_layers: int = 28 + hidden_size: int = 1152 + num_attention_heads: int = 16 + + +@dataclass +class DiT7BConfig(DiTConfig): + num_layers: int = 32 + hidden_size: int = 3072 + num_attention_heads: int = 24 + + +@dataclass +class DiTLlama30BConfig(DiTConfig): + num_layers: int = 48 + hidden_size: int = 6144 + ffn_hidden_size: int = 16384 + num_attention_heads: int = 48 + num_query_groups: int = 8 + gated_linear_unit: int = True + bias_activation_fusion: int = True + activation_func: Callable = F.silu + normalization: str = "RMSNorm" + layernorm_epsilon: float = 1e-5 + max_frames: int = 128 + max_img_h: int = 240 + max_img_w: int = 240 + patch_spatial: int = 2 + + init_method_std: float = 0.01 + add_bias_linear: bool = False + seq_length: int = 256 + + bias_activation_fusion: bool = True + masked_softmax_fusion: bool = True + persist_layer_norm: bool = True + bias_dropout_fusion: bool = True + + +@dataclass +class DiTLlama5BConfig(DiTLlama30BConfig): + num_layers: int = 32 + hidden_size: int = 3072 + ffn_hidden_size: int = 8192 + num_attention_heads: int = 24 + + +class DiTModel(GPTModel): + def __init__( + self, + config: Optional[DiTConfig] = None, + optim: Optional[OptimizerModule] = None, + model_transform: 
Optional[Callable[[nn.Module], nn.Module]] = None, + tokenizer: Optional[Any] = None, + ): + super().__init__(config or DiTConfig(), optim=optim, model_transform=model_transform) + + self.vae = None + + self._training_loss_reduction = None + self._validation_loss_reduction = None + + self.diffusion_pipeline = EDMPipeline(net=self, sigma_data=self.config.sigma_data) + + self._noise_generator = None + self.seed = 42 + + self.vae = None + + def data_step(self, dataloader_iter) -> Dict[str, Any]: + return self.config.data_step_fn(dataloader_iter) + + def forward(self, *args, **kwargs): + return self.module.forward(*args, **kwargs) + + def forward_step(self, batch) -> torch.Tensor: + if parallel_state.is_pipeline_last_stage(): + output_batch, loss = self.diffusion_pipeline.training_step(batch, 0) + loss = torch.mean(loss, dim=-1) + return loss + else: + output_tensor = self.diffusion_pipeline.training_step(batch, 0) + return output_tensor + + def training_step(self, batch, batch_idx=None) -> torch.Tensor: + # In mcore the loss-function is part of the forward-pass (when labels are provided) + return self.forward_step(batch) + + def on_validation_start(self): + if self.vae is None: + if self.config.vae_path is None: + warnings.warn('vae_path not specified skipping validation') + return None + self.vae = self.config.configure_vae() + self.vae.to('cuda') + + def on_validation_end(self): + if self.vae is not None: + self.vae.to('cpu') + + def validation_step(self, batch, batch_idx=None) -> torch.Tensor: + # In mcore the loss-function is part of the forward-pass (when labels are provided) + state_shape = batch['video'].shape + sample = self.diffusion_pipeline.generate_samples_from_batch( + batch, + guidance=7, + state_shape=state_shape, + num_steps=35, + is_negative_prompt=True if 'neg_t5_text_embeddings' in batch else False, + ) + + # TODO visualize more than 1 sample + sample = sample[0, None] + C, T, H, W = batch['latent_shape'][0] + seq_len_q = batch['seq_len_q'][0] + + sample = rearrange( + sample[:, :seq_len_q], + 'B (T H W) (ph pw pt C) -> B C (T pt) (H ph) (W pw)', + ph=self.config.patch_spatial, + pw=self.config.patch_spatial, + C=C, + T=T, + H=H // self.config.patch_spatial, + W=W // self.config.patch_spatial, + ) + + video = (1.0 + self.vae.decode(sample / self.config.sigma_data)).clamp(0, 2) / 2 # [B, 3, T, H, W] + + video = (video * 255).to(torch.uint8).cpu().numpy().astype(np.uint8) + + T = video.shape[2] + if T == 1: + image = rearrange(video, 'b c t h w -> (b t h) w c') + result = image + else: + # result = wandb.Video(video, fps=float(batch['fps'])) # (batch, time, channel, height width) + result = video + + # wandb is on the last rank for megatron, first rank for nemo + wandb_rank = 0 + + if parallel_state.get_data_parallel_src_rank() == wandb_rank: + if torch.distributed.get_rank() == wandb_rank: + gather_list = [None for _ in range(parallel_state.get_data_parallel_world_size())] + else: + gather_list = None + torch.distributed.gather_object( + result, gather_list, wandb_rank, group=parallel_state.get_data_parallel_group() + ) + if gather_list is not None: + videos = [] + for video in gather_list: + if len(video.shape) == 3: + videos.append(wandb.Image(video)) + else: + videos.append(wandb.Video(video, fps=30)) + wandb.log({'prediction': videos}, step=self.global_step) + + return None + + @property + def training_loss_reduction(self) -> MaskedTokenLossReduction: + if not self._training_loss_reduction: + self._training_loss_reduction = MaskedTokenLossReduction() + + return 
self._training_loss_reduction + + @property + def validation_loss_reduction(self) -> MaskedTokenLossReduction: + if not self._validation_loss_reduction: + self._validation_loss_reduction = DummyLossReduction() + + return self._validation_loss_reduction + + def on_validation_model_zero_grad(self) -> None: + ''' + Small hack to avoid first validation on resume. + This will NOT work if the gradient accumulation step should be performed at this point. + https://github.com/Lightning-AI/pytorch-lightning/discussions/18110 + ''' + super().on_validation_model_zero_grad() + if self.trainer.ckpt_path is not None and getattr(self, '_restarting_skip_val_flag', True): + self.trainer.sanity_checking = True + self._restarting_skip_val_flag = False + + +class DummyLossReduction(MegatronLossReduction): + def __init__(self, validation_step: bool = False, val_drop_last: bool = True) -> None: + super().__init__() + self.validation_step = validation_step + self.val_drop_last = val_drop_last + + def forward( + self, batch: Dict[str, torch.Tensor], forward_out: torch.Tensor + ) -> Tuple[torch.Tensor, Dict[str, torch.Tensor]]: + return torch.tensor(0.0, device=torch.cuda.current_device()), { + "avg": torch.tensor(0.0, device=torch.cuda.current_device()) + } + + def reduce(self, losses_reduced_per_micro_batch) -> torch.Tensor: + return torch.tensor(0.0, device=torch.cuda.current_device()) + + +def dynamic_import(full_path): + """ + Dynamically import a class or function from a given full path. + + :param full_path: The full path to the class or function (e.g., "package.module.ClassName") + :return: The imported class or function + :raises ImportError: If the module or attribute cannot be imported + :raises AttributeError: If the attribute does not exist in the module + """ + try: + # Split the full path into module path and attribute name + module_path, attribute_name = full_path.rsplit('.', 1) + except ValueError as e: + raise ImportError( + f"Invalid full path '{full_path}'. It should contain both module and attribute names." + ) from e + + # Import the module + try: + module = importlib.import_module(module_path) + except ImportError as e: + raise ImportError(f"Cannot import module '{module_path}'.") from e + + # Retrieve the attribute from the module + try: + attribute = getattr(module, attribute_name) + except AttributeError as e: + raise AttributeError(f"Module '{module_path}' does not have an attribute '{attribute_name}'.") from e + + return attribute diff --git a/nemo/collections/diffusion/readme.rst b/nemo/collections/diffusion/readme.rst new file mode 100644 index 000000000000..871527948708 --- /dev/null +++ b/nemo/collections/diffusion/readme.rst @@ -0,0 +1,190 @@ +Diffusion Training Framework +============= + +Overview +-------- + +The NeMo Diffusion Training Framework provides a scalable training platform for diffusion models with transformer backbones. Our new features streamline the training process, allowing developers to efficiently train state-of-the-art models with ease. + + +Some of the features we currently support include: + +- Energon Dataloader for Webscale Dataloading +- Model and Data Parallelism +- Model Architectures: DiT 30B parameters or even more + + +Features Status +--------------- + +We support image diffusion training. Video training incoming. 
+
+
++---------------------------+------------------+
+| Parallelism               | Status           |
++===========================+==================+
+| FSDP                      | ✅ Supported     |
++---------------------------+------------------+
+| CP+TP+SP+distopt          | ✅ Supported     |
++---------------------------+------------------+
+| CP+TP+SP+PP+distopt       | ✅ Supported     |
++---------------------------+------------------+
+| CP+TP+SP+FSDP             | 🕒 Coming Soon   |
++---------------------------+------------------+
+
+
+**Legend:**
+
+- **FSDP**: Fully Sharded Data Parallelism
+- **CP**: Context Parallelism
+- **TP**: Tensor Parallelism
+- **SP**: Sequence Parallelism
+- **PP**: Pipeline Parallelism
+- **distopt**: Megatron Core (mcore) distributed optimizer
+
++--------------+-------------------+-----------------+
+| Model Size   | Modality          | Status          |
++==============+===================+=================+
+| DiT 30B+     | 256px image       | ✅ Supported    |
++--------------+-------------------+-----------------+
+| DiT 30B+     | 256px image+video | 🕒 Coming Soon  |
++--------------+-------------------+-----------------+
+| DiT 30B+     | 768px image+video | 🕒 Coming Soon  |
++--------------+-------------------+-----------------+
+
+
+Energon Dataloader for Webscale Dataloading
+-------------------------------------------
+
+Webscale Dataloading
+^^^^^^^^^^^^^^^^^^^^
+
+Megatron-Energon is an optimized multi-modal dataloader for large-scale deep learning with Megatron. Energon supports distributed loading of large training datasets for multi-modal model training, and it can blend many datasets together while distributing the dataloading workflow across multiple cluster nodes and processes, all while ensuring reproducibility and resumability.
+
+Dataloader Checkpointing
+^^^^^^^^^^^^^^^^^^^^^^^^
+
+One of Energon's key features is its ability to save and restore its state. This is crucial for long-running training and makes the dataloader robust and recoverable after interruptions. By checkpointing the dataloader state, Energon ensures that training can resume from where it left off, saving time and compute in case of unexpected shutdowns or planned pauses. This is especially useful for large-scale training, which typically spans several consecutive training jobs end-to-end.
+
+Parallel Configuration
+^^^^^^^^^^^^^^^^^^^^^^
+
+Energon's architecture distributes data efficiently across multiple processing units, ensuring that each GPU or node receives a balanced workload. This parallelization increases the overall throughput of data processing and helps maintain high utilization of the available computational resources.
+
+
+Mixed Image-Video Training (coming soon)
+----------------------------------------
+
+Our dataloader supports mixed image-video training by using the NeMo packed sequence feature to pack images and videos of varying lengths into the same microbatch. The sequence packing mechanism uses the THD attention kernel, which increases model FLOPs utilization (MFU) and lets us process data of varying lengths efficiently (a minimal packing sketch follows the figure below).
+
+
+.. image:: assets/mixed_training.png
+   :alt: Mixed image-video dataloading strategy
+   :width: 300px
+   :align: center
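As a rough illustration of how packing is expressed at the model interface, the training step builds cumulative sequence lengths and wraps them in PackedSeqParams; this mirrors dit_data_step in nemo/collections/diffusion/models/model.py, with made-up lengths:

.. code-block:: python

    import torch
    from megatron.core.packed_seq_params import PackedSeqParams

    seq_len_q = torch.tensor([256, 1024, 512], dtype=torch.int32)  # three samples of different length
    zero = torch.zeros(1, dtype=torch.int32)
    cu_seqlens = torch.cat((zero, seq_len_q.cumsum(dim=0).to(torch.int32)))  # [0, 256, 1280, 1792]

    packed_seq_params = PackedSeqParams(
        cu_seqlens_q=cu_seqlens,
        cu_seqlens_kv=cu_seqlens,
        qkv_format='sbhd',  # the format dit_data_step passes to the attention kernels
    )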
+Model and Data Parallelism
+--------------------------
+
+NeMo provides support for training models with tensor parallelism, sequence parallelism, pipeline parallelism, and context parallelism. To support pipeline parallelism with conditional diffusion training, we duplicate the conditional embeddings across the pipeline stages and perform an all-reduce during the backward pass. This approach uses more compute, but it has a lower communication cost than sending the conditional embeddings through the pipeline stages.
+
+.. image:: assets/pipeline_conditioning.png
+   :alt: Conditioning mechanism for pipeline parallelism
+   :width: 300px
+   :align: center
+
+Model Architectures
+-------------------
+
+DiT
+^^^
+
+We implement an efficient version of the diffusion transformer (DiT) [1]_. Our DiT is slightly modified from the original paper in that we use cross attention and adaptive layernorm together in the same architecture. We also use QK-layernorm for training stability. Our framework allows customizing the DiT architecture while maintaining its scalability, enabling training of large DiT models on long sequence lengths.
+
+
+Data Preparation
+----------------
+
+We expect data to be in the WebDataset format. For more information about WebDataset and Energon datasets, please refer to https://github.com/NVIDIA/Megatron-Energon
+
+Here we demonstrate a step-by-step example of how to prepare a dummy image dataset:
+
+.. code-block:: bash
+
+    torchrun --nproc-per-node 2 nemo/collections/diffusion/data/prepare_energon_dataset.py --factory prepare_dummy_image_dataset
+
+This will generate a folder of tar files. The .pth files contain image/video latent representations encoded by the image/video tokenizer, the .json files contain metadata including the text caption, resolution, and aspect ratio, and the .pickle files contain text embeddings encoded by a language model such as T5.
+
+.. code-block:: bash
+
+    shard_000.tar
+    ├── samples/sample_0000.pth
+    ├── samples/sample_0000.pickle
+    ├── samples/sample_0000.json
+    ├── samples/sample_0001.pth
+    ├── samples/sample_0001.pickle
+    ├── samples/sample_0001.json
+    └── ...
+    shard_001.tar
+
+The following is a sample command to prepare the WebDataset into an Energon dataset:
+
+.. code-block:: bash
+
+    # energon prepare . --num-workers 192
+    Found 369057 tar files in total. The first and last ones are:
+    - 0.tar
+    - 99999.tar
+    If you want to exclude some of them, cancel with ctrl+c and specify an exclude filter in the command line.
+    Please enter a desired train/val/test split like "0.5, 0.2, 0.3" or "8,1,1": 1,0,0
+    Indexing shards [####################################] 369057/369057
+    Sample 0, keys:
+    - .json
+    - .pickle
+    - .pth
+    Sample 1, keys:
+    - .json
+    - .pickle
+    - .pth
+    Found the following part types in the dataset: .json, .pth, .pickle
+    Do you want to create a dataset.yaml interactively? [Y/n]: Y
+    The following dataset classes are available:
+    0. CaptioningWebdataset
+    1. CrudeWebdataset
+    2. ImageClassificationWebdataset
+    3. ImageWebdataset
+    4. InterleavedWebdataset
+    5. MultiChoiceVQAWebdataset
+    6. OCRWebdataset
+    7. SimilarityInterleavedWebdataset
+    8. TextWebdataset
+    9. VQAOCRWebdataset
+    10. VQAWebdataset
+    11. VidQAWebdataset
+    Please enter a number to choose a class: 1
+    The dataset you selected uses the following sample type:
+
+    class CrudeSample(dict):
+        """Generic sample type to be processed later."""
+
+    CrudeWebdataset does not need a field map. You will need to provide a `Cooker` for your dataset samples in your `TaskEncoder`.
+    Furthermore, you might want to add `subflavors` in your meta dataset specification.
+
+Training
+--------
+
+To launch training on one node:
+
+..
code-block:: bash + + torchrun --nproc-per-node 8 nemo/collections/diffusion/train.py --yes --factory pretrain_xl + +To launch training on multiple nodes using Slurm + +.. code-block:: bash + + sbatch nemo/collections/diffusion/scripts/train.sh --factory pretrain_xl + + +Citations +--------- + +.. [1] William Peebles and Saining Xie, "Scalable Diffusion Models with Transformers," *arXiv preprint arXiv:2212.09748*, 2022. \ No newline at end of file diff --git a/nemo/collections/diffusion/sampler/__init__.py b/nemo/collections/diffusion/sampler/__init__.py new file mode 100644 index 000000000000..d9155f923f18 --- /dev/null +++ b/nemo/collections/diffusion/sampler/__init__.py @@ -0,0 +1,13 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. diff --git a/nemo/collections/diffusion/sampler/batch_ops.py b/nemo/collections/diffusion/sampler/batch_ops.py new file mode 100644 index 000000000000..956dfbee36e5 --- /dev/null +++ b/nemo/collections/diffusion/sampler/batch_ops.py @@ -0,0 +1,104 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from torch import Tensor + + +def common_broadcast(x: Tensor, y: Tensor) -> tuple[Tensor, Tensor]: + """ + Broadcasts two tensors to have the same shape by adding singleton dimensions where necessary. + + Args: + x (Tensor): The first input tensor. + y (Tensor): The second input tensor. + + Returns: + tuple[Tensor, Tensor]: A tuple containing the two tensors with broadcasted shapes. + + Raises: + AssertionError: If the dimensions of the tensors do not match at any axis within their common dimensions. + """ + ndims1 = x.ndim + ndims2 = y.ndim + + common_ndims = min(ndims1, ndims2) + for axis in range(common_ndims): + assert x.shape[axis] == y.shape[axis], "Dimensions not equal at axis {}".format(axis) + + if ndims1 < ndims2: + x = x.reshape(x.shape + (1,) * (ndims2 - ndims1)) + elif ndims2 < ndims1: + y = y.reshape(y.shape + (1,) * (ndims1 - ndims2)) + + return x, y + + +def batch_add(x: Tensor, y: Tensor) -> Tensor: + """ + Adds two tensors element-wise after broadcasting them to a common shape. + + Args: + x (Tensor): The first input tensor. + y (Tensor): The second input tensor. + + Returns: + Tensor: The element-wise sum of the input tensors after broadcasting. 
+ """ + x, y = common_broadcast(x, y) + return x + y + + +def batch_mul(x: Tensor, y: Tensor) -> Tensor: + """ + Multiplies two tensors element-wise after broadcasting them to a common shape. + + Args: + x (Tensor): The first input tensor. + y (Tensor): The second input tensor. + + Returns: + Tensor: The element-wise product of the input tensors after broadcasting. + """ + x, y = common_broadcast(x, y) + return x * y + + +def batch_sub(x: Tensor, y: Tensor) -> Tensor: + """ + Subtracts two tensors element-wise after broadcasting them to a common shape. + + Args: + x (Tensor): The first input tensor. + y (Tensor): The second input tensor. + + Returns: + Tensor: The result of element-wise subtraction of the input tensors. + """ + x, y = common_broadcast(x, y) + return x - y + + +def batch_div(x: Tensor, y: Tensor) -> Tensor: + """ + Divides two tensors element-wise after broadcasting them to a common shape. + + Args: + x (Tensor): The first input tensor. + y (Tensor): The second input tensor. + + Returns: + Tensor: The result of element-wise division of `x` by `y` after broadcasting. + """ + x, y = common_broadcast(x, y) + return x / y diff --git a/nemo/collections/diffusion/sampler/context_parallel.py b/nemo/collections/diffusion/sampler/context_parallel.py new file mode 100644 index 000000000000..f389b7ba2656 --- /dev/null +++ b/nemo/collections/diffusion/sampler/context_parallel.py @@ -0,0 +1,51 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import torch +from torch import Tensor +from torch.distributed import ProcessGroup, all_gather, get_world_size + + +def cat_outputs_cp(x: Tensor, seq_dim: int, cp_group: ProcessGroup) -> Tensor: + """ + Concatenates tensors from multiple processes along a specified dimension. + + This function gathers tensors from all processes in the given process group + and concatenates them along the specified dimension. + + Args: + x (Tensor): The input tensor to be gathered and concatenated. + seq_dim (int): The dimension along which to concatenate the gathered tensors. + cp_group (ProcessGroup): The process group containing all the processes involved in the gathering. + + Returns: + Tensor: A tensor resulting from the concatenation of tensors from all processes. + + Raises: + RuntimeError: If the gathering of tensors fails. 
+ """ + # Number of processes in the group + world_size = get_world_size(cp_group) + + # List to hold tensors from each rank + gathered_tensors = [torch.zeros_like(x) for _ in range(world_size)] + + # Attempt to gather tensors from all ranks + try: + all_gather(gathered_tensors, x, group=cp_group) + except RuntimeError as e: + raise RuntimeError(f"Gathering failed: {e}") + + # Concatenate tensors along the specified dimension + return torch.cat(gathered_tensors, dim=seq_dim) diff --git a/nemo/collections/diffusion/sampler/edm/__init__.py b/nemo/collections/diffusion/sampler/edm/__init__.py new file mode 100644 index 000000000000..d9155f923f18 --- /dev/null +++ b/nemo/collections/diffusion/sampler/edm/__init__.py @@ -0,0 +1,13 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. diff --git a/nemo/collections/diffusion/sampler/edm/edm.py b/nemo/collections/diffusion/sampler/edm/edm.py new file mode 100644 index 000000000000..eb47728af40a --- /dev/null +++ b/nemo/collections/diffusion/sampler/edm/edm.py @@ -0,0 +1,135 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +from statistics import NormalDist +from typing import Callable, Tuple + +import numpy as np +import torch +from torch import nn +from tqdm import tqdm + + +class EDMScaling: + def __init__(self, sigma_data: float = 0.5): + self.sigma_data = sigma_data + + def __call__(self, sigma: torch.Tensor) -> Tuple[torch.Tensor, torch.Tensor, torch.Tensor, torch.Tensor]: + c_skip = self.sigma_data**2 / (sigma**2 + self.sigma_data**2) + c_out = sigma * self.sigma_data / (sigma**2 + self.sigma_data**2) ** 0.5 + c_in = 1 / (sigma**2 + self.sigma_data**2) ** 0.5 + c_noise = 0.25 * sigma.log() + return c_skip, c_out, c_in, c_noise + + +class EDMSDE: + def __init__( + self, + p_mean: float = -1.2, + p_std: float = 1.2, + sigma_max: float = 80.0, + sigma_min: float = 0.002, + ): + self.gaussian_dist = NormalDist(mu=p_mean, sigma=p_std) + self.sigma_max = sigma_max + self.sigma_min = sigma_min + self._generator = np.random + + def sample_t(self, batch_size: int) -> torch.Tensor: + cdf_vals = self._generator.uniform(size=(batch_size)) + samples_interval_gaussian = [self.gaussian_dist.inv_cdf(cdf_val) for cdf_val in cdf_vals] + log_sigma = torch.tensor(samples_interval_gaussian, device="cuda") + return torch.exp(log_sigma) + + def marginal_prob(self, x0: torch.Tensor, sigma: torch.Tensor) -> tuple[torch.Tensor, torch.Tensor]: + return x0, sigma + + +class EDMSampler(nn.Module): + """ + Elucidating the Design Space of Diffusion-Based Generative Models (EDM) + # https://github.com/NVlabs/edm/blob/62072d2612c7da05165d6233d13d17d71f213fee/generate.py#L25 + + Attributes: + None + + Methods: + forward(x0_fn: Callable, x_sigma_max: torch.Tensor, num_steps: int = 35, sigma_min: float = 0.002, + sigma_max: float = 80, rho: float = 7, S_churn: float = 0, S_min: float = 0, + S_max: float = float("inf"), S_noise: float = 1) -> torch.Tensor: + Performs the forward pass for the EDM sampling process. + + Parameters: + x0_fn (Callable): A function that takes in a tensor and returns a denoised tensor. + x_sigma_max (torch.Tensor): The initial noise level tensor. + num_steps (int, optional): The number of sampling steps. Default is 35. + sigma_min (float, optional): The minimum noise level. Default is 0.002. + sigma_max (float, optional): The maximum noise level. Default is 80. + rho (float, optional): The rho parameter for time step discretization. Default is 7. + S_churn (float, optional): The churn parameter for noise increase. Default is 0. + S_min (float, optional): The minimum value for the churn parameter. Default is 0. + S_max (float, optional): The maximum value for the churn parameter. Default is float("inf"). + S_noise (float, optional): The noise scale for the churn parameter. Default is 1. + + Returns: + torch.Tensor: The sampled tensor after the EDM process. + """ + + @torch.no_grad() + def forward( + self, + x0_fn: Callable, + x_sigma_max: torch.Tensor, + num_steps: int = 35, + sigma_min: float = 0.002, + sigma_max: float = 80, + rho: float = 7, + S_churn: float = 0, + S_min: float = 0, + S_max: float = float("inf"), + S_noise: float = 1, + ) -> torch.Tensor: + # Time step discretization. 
+ in_dtype = x_sigma_max.dtype + _ones = torch.ones(x_sigma_max.shape[0], dtype=in_dtype, device=x_sigma_max.device) + step_indices = torch.arange(num_steps, dtype=torch.float64, device=x_sigma_max.device) + t_steps = ( + sigma_max ** (1 / rho) + step_indices / (num_steps - 1) * (sigma_min ** (1 / rho) - sigma_max ** (1 / rho)) + ) ** rho + t_steps = torch.cat([t_steps, torch.zeros_like(t_steps[:1])]) # t_N = 0 + + # Main sampling loop. + x_next = x_sigma_max.to(torch.float64) + for i, (t_cur, t_next) in enumerate( + tqdm(zip(t_steps[:-1], t_steps[1:], strict=False), total=len(t_steps) - 1) + ): # 0, ..., N-1 + x_cur = x_next + + # Increase noise temporarily. + gamma = min(S_churn / num_steps, np.sqrt(2) - 1) if S_min <= t_cur <= S_max else 0 + t_hat = t_cur + gamma * t_cur + x_hat = x_cur + (t_hat**2 - t_cur**2).sqrt() * S_noise * torch.randn_like(x_cur) + + # Euler step. + denoised = x0_fn(x_hat.to(in_dtype), t_hat.to(in_dtype) * _ones).to(torch.float64) + d_cur = (x_hat - denoised) / t_hat + x_next = x_hat + (t_next - t_hat) * d_cur + + # Apply 2nd order correction. + if i < num_steps - 1: + denoised = x0_fn(x_hat.to(in_dtype), t_hat.to(in_dtype) * _ones).to(torch.float64) + d_prime = (x_next - denoised) / t_next + x_next = x_hat + (t_next - t_hat) * (0.5 * d_cur + 0.5 * d_prime) + + return x_next.to(in_dtype) diff --git a/nemo/collections/diffusion/sampler/edm/edm_pipeline.py b/nemo/collections/diffusion/sampler/edm/edm_pipeline.py new file mode 100644 index 000000000000..6e1be1f6f2a6 --- /dev/null +++ b/nemo/collections/diffusion/sampler/edm/edm_pipeline.py @@ -0,0 +1,434 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from typing import Any, Callable, Dict, Optional, Tuple + +import numpy as np +import torch +import torch.distributed +from einops import rearrange +from megatron.core import parallel_state +from torch import Tensor + +from nemo.collections.diffusion.sampler.batch_ops import batch_mul +from nemo.collections.diffusion.sampler.context_parallel import cat_outputs_cp +from nemo.collections.diffusion.sampler.edm.edm import EDMSDE, EDMSampler, EDMScaling + + +class EDMPipeline: + """ + EDMPipeline is a class that implements a diffusion model pipeline for video generation. It includes methods for + initializing the pipeline, encoding and decoding video data, performing training steps, denoising, and generating + samples. + Attributes: + p_mean: Mean for SDE process. + p_std: Standard deviation for SDE process. + sigma_max: Maximum noise level. + sigma_min: Minimum noise level. + _noise_generator: Generator for noise. + _noise_level_generator: Generator for noise levels. + sde: SDE process. + sampler: Sampler for the diffusion model. + scaling: Scaling for EDM. + input_data_key: Key for input video data. + input_image_key: Key for input image data. + tensor_kwargs: Tensor keyword arguments. + loss_reduce: Method for reducing loss. + loss_scale: Scale factor for loss. 
+ aesthetic_finetuning: Aesthetic finetuning parameter. + camera_sample_weight: Camera sample weight parameter. + loss_mask_enabled: Flag for enabling loss mask. + Methods: + noise_level_generator: Returns the noise level generator. + _initialize_generators: Initializes noise and noise-level generators. + encode: Encodes input tensor using the video tokenizer. + decode: Decodes latent tensor using video tokenizer. + training_step: Performs a single training step for the diffusion model. + denoise: Performs denoising on the input noise data, noise level, and condition. + compute_loss_with_epsilon_and_sigma: Computes the loss for training. + get_per_sigma_loss_weights: Returns loss weights per sigma noise level. + get_condition_uncondition: Returns conditioning and unconditioning for classifier-free guidance. + get_x0_fn_from_batch: Creates a function to generate denoised predictions with the sampler. + generate_samples_from_batch: Generates samples based on input data batch. + _normalize_video_databatch_inplace: Normalizes video data in-place on a CUDA device to [-1, 1]. + draw_training_sigma_and_epsilon: Draws training noise (epsilon) and noise levels (sigma). + random_dropout_input: Applies random dropout to the input tensor. + get_data_and_condition: Retrieves data and conditioning for model input. + """ + + def __init__( + self, + net, + vae=None, + p_mean=0.0, + p_std=1.0, + sigma_max=80, + sigma_min=0.0002, + sigma_data=0.5, + seed=1234, + ): + """ + Initializes the EDM pipeline with the given parameters. + + Args: + net: The DiT model. + vae: The Video Tokenizer (optional). + p_mean (float): Mean for the SDE. + p_std (float): Standard deviation for the SDE. + sigma_max (float): Maximum sigma value for the SDE. + sigma_min (float): Minimum sigma value for the SDE. + sigma_data (float): Sigma value for EDM scaling. + seed (int): Random seed for reproducibility. + + Attributes: + vae: The Video Tokenizer. + net: The DiT model. + p_mean (float): Mean for the SDE. + p_std (float): Standard deviation for the SDE. + sigma_max (float): Maximum sigma value for the SDE. + sigma_min (float): Minimum sigma value for the SDE. + sigma_data (float): Sigma value for EDM scaling. + seed (int): Random seed for reproducibility. + _noise_generator: Placeholder for noise generator. + _noise_level_generator: Placeholder for noise level generator. + sde: Instance of EDMSDE initialized with p_mean, p_std, sigma_max, and sigma_min. + sampler: Instance of EDMSampler. + scaling: Instance of EDMScaling initialized with sigma_data. + input_data_key (str): Key for input data. + input_image_key (str): Key for input images. + tensor_kwargs (dict): Tensor keyword arguments for device and dtype. + loss_reduce (str): Method to reduce loss ('mean' or other). + loss_scale (float): Scale factor for loss. + """ + self.vae = vae + self.net = net + + self.p_mean = p_mean + self.p_std = p_std + self.sigma_max = sigma_max + self.sigma_min = sigma_min + self.sigma_data = sigma_data + + self.seed = seed + self._noise_generator = None + self._noise_level_generator = None + + self.sde = EDMSDE(p_mean, p_std, sigma_max, sigma_min) + self.sampler = EDMSampler() + self.scaling = EDMScaling(sigma_data) + + self.input_data_key = 'video' + self.input_image_key = 'images_1024' + self.tensor_kwargs = {"device": "cuda", "dtype": torch.bfloat16} + self.loss_reduce = 'mean' + self.loss_scale = 1.0 + + @property + def noise_level_generator(self): + """ + Generates noise levels for the EDM pipeline. 
+ + Returns: + Callable: A function or generator that produces noise levels. + """ + return self._noise_level_generator + + def _initialize_generators(self): + """ + Initializes the random number generators for noise and noise level. + + This method sets up two generators: + 1. A PyTorch generator for noise, seeded with a combination of the base seed and the data parallel rank. + 2. A NumPy generator for noise levels, seeded similarly but without considering context parallel rank. + + Returns: + None + """ + noise_seed = self.seed + 100 * parallel_state.get_data_parallel_rank(with_context_parallel=True) + noise_level_seed = self.seed + 100 * parallel_state.get_data_parallel_rank(with_context_parallel=False) + self._noise_generator = torch.Generator(device='cuda') + self._noise_generator.manual_seed(noise_seed) + self._noise_level_generator = np.random.default_rng(noise_level_seed) + self.sde._generator = self._noise_level_generator + + def training_step( + self, data_batch: dict[str, torch.Tensor], iteration: int + ) -> tuple[dict[str, torch.Tensor], torch.Tensor]: + """ + Performs a single training step for the diffusion model. + + This method is responsible for executing one iteration of the model's training. It involves: + 1. Adding noise to the input data using the SDE process. + 2. Passing the noisy data through the network to generate predictions. + 3. Computing the loss based on the difference between the predictions and the original data. + + Args: + data_batch (dict): raw data batch draw from the training data loader. + iteration (int): Current iteration number. + + Returns: + A tuple with the output batch and the computed loss. + """ + # Get the input data to noise and denoise~(image, video) and the corresponding conditioner. + x0_from_data_batch, x0, condition = self.get_data_and_condition(data_batch) + + # Sample pertubation noise levels and N(0, 1) noises + sigma, epsilon = self.draw_training_sigma_and_epsilon(x0.size(), condition) + + if parallel_state.is_pipeline_last_stage(): + output_batch, pred_mse, edm_loss = self.compute_loss_with_epsilon_and_sigma( + data_batch, x0_from_data_batch, x0, condition, epsilon, sigma + ) + + return output_batch, edm_loss + else: + net_output = self.compute_loss_with_epsilon_and_sigma( + data_batch, x0_from_data_batch, x0, condition, epsilon, sigma + ) + return net_output + + def denoise(self, xt: torch.Tensor, sigma: torch.Tensor, condition: dict[str, torch.Tensor]): + """ + Performs denoising on the input noise data, noise level, and condition + + Args: + xt (torch.Tensor): The input noise data. + sigma (torch.Tensor): The noise level. + condition (dict[str, torch.Tensor]): conditional information + + Returns: + Predicted clean data (x0) and noise (eps_pred). + """ + + xt = xt.to(**self.tensor_kwargs) + sigma = sigma.to(**self.tensor_kwargs) + # get precondition for the network + c_skip, c_out, c_in, c_noise = self.scaling(sigma=sigma) + + net_output = self.net( + x=batch_mul(c_in, xt), # Eq. 7 of https://arxiv.org/pdf/2206.00364.pdf + timesteps=c_noise, # Eq. 
7 of https://arxiv.org/pdf/2206.00364.pdf + **condition, + ) + + if not parallel_state.is_pipeline_last_stage(): + return net_output + + x0_pred = batch_mul(c_skip, xt) + batch_mul(c_out, net_output) + + # get noise prediction based on sde + eps_pred = batch_mul(xt - x0_pred, 1.0 / sigma) + + return x0_pred, eps_pred + + def compute_loss_with_epsilon_and_sigma( + self, + data_batch: dict[str, torch.Tensor], + x0_from_data_batch: torch.Tensor, + x0: torch.Tensor, + condition: dict[str, torch.Tensor], + epsilon: torch.Tensor, + sigma: torch.Tensor, + ): + """ + Computes the loss for training. + + Args: + data_batch: Batch of input data. + x0_from_data_batch: Raw input tensor. + x0: Latent tensor. + condition: Conditional input data. + epsilon: Noise tensor. + sigma: Noise level tensor. + + Returns: + The computed loss. + """ + # Get the mean and stand deviation of the marginal probability distribution. + mean, std = self.sde.marginal_prob(x0, sigma) + # Generate noisy observations + xt = mean + batch_mul(std, epsilon) # corrupted data + + if parallel_state.is_pipeline_last_stage(): + # make prediction + x0_pred, eps_pred = self.denoise(xt, sigma, condition) + # loss weights for different noise levels + weights_per_sigma = self.get_per_sigma_loss_weights(sigma=sigma) + pred_mse = (x0 - x0_pred) ** 2 + edm_loss = batch_mul(pred_mse, weights_per_sigma) + + output_batch = { + "x0": x0, + "xt": xt, + "sigma": sigma, + "weights_per_sigma": weights_per_sigma, + "condition": condition, + "model_pred": {"x0_pred": x0_pred, "eps_pred": eps_pred}, + "mse_loss": pred_mse.mean(), + "edm_loss": edm_loss.mean(), + } + return output_batch, pred_mse, edm_loss + else: + # make prediction + x0_pred = self.denoise(xt, sigma, condition) + return x0_pred.contiguous() + + def get_per_sigma_loss_weights(self, sigma: torch.Tensor): + """ + Args: + sigma (tensor): noise level + + Returns: + loss weights per sigma noise level + """ + return (sigma**2 + self.sigma_data**2) / (sigma * self.sigma_data) ** 2 + + def get_condition_uncondition(self, data_batch: Dict): + """Returns conditioning and unconditioning for classifier-free guidance.""" + _, _, condition = self.get_data_and_condition(data_batch, dropout_rate=0.0) + + if 'neg_t5_text_embeddings' in data_batch: + data_batch['t5_text_embeddings'] = data_batch['neg_t5_text_embeddings'] + data_batch["t5_text_mask"] = data_batch["neg_t5_text_mask"] + _, _, uncondition = self.get_data_and_condition(data_batch, dropout_rate=1.0) + else: + _, _, uncondition = self.get_data_and_condition(data_batch, dropout_rate=1.0) + + return condition, uncondition + + def get_x0_fn_from_batch( + self, + data_batch: Dict, + guidance: float = 1.5, + is_negative_prompt: bool = False, + ) -> Callable: + """ + Creates a function to generate denoised predictions with the sampler. + + Args: + data_batch: Batch of input data. + guidance: Guidance scale factor. + is_negative_prompt: Whether to use negative prompts. + + Returns: + A callable to predict clean data (x0). 
+ """ + condition, uncondition = self.get_condition_uncondition(data_batch) + + def x0_fn(noise_x: torch.Tensor, sigma: torch.Tensor) -> torch.Tensor: + cond_x0, _ = self.denoise(noise_x, sigma, condition) + uncond_x0, _ = self.denoise(noise_x, sigma, uncondition) + return cond_x0 + guidance * (cond_x0 - uncond_x0) + + return x0_fn + + def generate_samples_from_batch( + self, + data_batch: Dict, + guidance: float = 1.5, + state_shape: Tuple | None = None, + is_negative_prompt: bool = False, + num_steps: int = 35, + ) -> Tensor: + """ + Generates samples based on input data batch. + + Args: + data_batch: Batch of input data. + guidance: Guidance scale factor. + state_shape: Shape of the state. + is_negative_prompt: Whether to use negative prompts. + num_steps: Number of steps for sampling. + solver_option: SDE Solver option. + + Returns: + Generated samples from diffusion model. + """ + cp_enabled = parallel_state.get_context_parallel_world_size() > 1 + + if self._noise_generator is None: + self._initialize_generators() + x0_fn = self.get_x0_fn_from_batch(data_batch, guidance, is_negative_prompt=is_negative_prompt) + + state_shape = list(state_shape) + state_shape[1] //= parallel_state.get_context_parallel_world_size() + x_sigma_max = ( + torch.randn(state_shape, **self.tensor_kwargs, generator=self._noise_generator) * self.sde.sigma_max + ) + + samples = self.sampler(x0_fn, x_sigma_max, num_steps=num_steps, sigma_max=self.sde.sigma_max) + + if cp_enabled: + cp_group = parallel_state.get_context_parallel_group() + samples = cat_outputs_cp(samples, seq_dim=2, cp_group=cp_group) + + return samples + + def draw_training_sigma_and_epsilon(self, x0_size: int, condition: Any) -> torch.Tensor: + """ + Draws training noise (epsilon) and noise levels (sigma). + + Args: + x0_size: Shape of the input tensor. + condition: Conditional input (unused). + + Returns: + Noise level (sigma) and noise (epsilon). + """ + del condition + batch_size = x0_size[0] + if self._noise_generator is None: + self._initialize_generators() + epsilon = torch.randn(x0_size, **self.tensor_kwargs, generator=self._noise_generator) + return self.sde.sample_t(batch_size).to(**self.tensor_kwargs), epsilon + + def random_dropout_input(self, in_tensor: torch.Tensor, dropout_rate: Optional[float] = None) -> torch.Tensor: + """ + Applies random dropout to the input tensor. + + Args: + in_tensor: Input tensor. + dropout_rate: Dropout probability (optional). + + Returns: + Conditioning with random dropout applied. + """ + dropout_rate = dropout_rate if dropout_rate is not None else self.dropout_rate + return batch_mul( + torch.bernoulli((1.0 - dropout_rate) * torch.ones(in_tensor.shape[0])).type_as(in_tensor), + in_tensor, + ) + + def get_data_and_condition(self, data_batch: dict[str, Tensor], dropout_rate=0.2) -> Tuple[Tensor]: + """ + Retrieves data and conditioning for model input. + + Args: + data_batch: Batch of input data. + dropout_rate: Dropout probability for conditioning. + + Returns: + Raw data, latent data, and conditioning information. 
+ """ + # Latent state + raw_state = data_batch["video"] * self.sigma_data + # assume data is already encoded + latent_state = raw_state + + # Condition + data_batch['crossattn_emb'] = self.random_dropout_input( + data_batch['t5_text_embeddings'], dropout_rate=dropout_rate + ) + + return raw_state, latent_state, data_batch diff --git a/nemo/collections/diffusion/scripts/train.sh b/nemo/collections/diffusion/scripts/train.sh new file mode 100644 index 000000000000..2150458e9376 --- /dev/null +++ b/nemo/collections/diffusion/scripts/train.sh @@ -0,0 +1,29 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +#!/bin/bash +# example slurm script for training diffusion + +#SBATCH -p your_partition -A your_account -t 24:00:00 --nodes=16 --exclusive --mem=0 --overcommit --gpus-per-node 8 --ntasks-per-node=8 --dependency=singleton + +export WANDB_PROJECT=xxx +export WANDB_RUN_ID=xxx +export WANDB_RESUME=allow +export NVTE_FUSED_ATTN=0 +export CUDA_DEVICE_MAX_CONNECTIONS=1 +export PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True + +DIR=`pwd` + +srun -l --container-image nvcr.io/nvidia/nemo:dev --container-mounts "/home:/home" --no-container-mount-home --mpi=pmix bash -c "cd ${DIR} ; python -u nemo/collections/diffusion/train.py --yes $*" diff --git a/nemo/collections/diffusion/train.py b/nemo/collections/diffusion/train.py new file mode 100644 index 000000000000..43a0a5dcb536 --- /dev/null +++ b/nemo/collections/diffusion/train.py @@ -0,0 +1,201 @@ +# Copyright (c) 2020, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +import os + +import nemo_run as run +import pytorch_lightning as pl +import torch +from megatron.core.distributed import DistributedDataParallelConfig +from megatron.core.optimizer import OptimizerConfig +from pytorch_lightning.loggers import WandbLogger + +from nemo import lightning as nl +from nemo.collections import llm +from nemo.collections.diffusion.data.diffusion_energon_datamodule import DiffusionDataModule +from nemo.collections.diffusion.data.diffusion_taskencoder import BasicDiffusionTaskEncoder +from nemo.collections.diffusion.models.model import ( + DiT7BConfig, + DiTConfig, + DiTLConfig, + DiTLlama5BConfig, + DiTLlama30BConfig, + DiTModel, + DiTXLConfig, +) +from nemo.lightning.pytorch.callbacks import ModelCheckpoint, PreemptionCallback +from nemo.lightning.pytorch.callbacks.model_transform import ModelTransform +from nemo.lightning.pytorch.strategies.utils import RestoreConfig + + +@run.cli.factory +@run.autoconvert +def multimodal_datamodule() -> pl.LightningDataModule: + data_module = DiffusionDataModule( + seq_length=2048, + task_encoder=run.Config(BasicDiffusionTaskEncoder, seq_length=2048), + micro_batch_size=1, + global_batch_size=32, + ) + return data_module + + +@run.cli.factory +@run.autoconvert +def peft(args) -> ModelTransform: + return llm.peft.LoRA( + target_modules=['linear_qkv', 'linear_proj'], # , 'linear_fc1', 'linear_fc2'], + dim=args.lora_dim, + ) + + +@run.cli.factory(target=llm.train) +def pretrain() -> run.Partial: + return run.Partial( + llm.train, + model=run.Config( + DiTModel, + config=run.Config(DiTConfig), + ), + data=multimodal_datamodule(), + trainer=run.Config( + nl.Trainer, + devices='auto', + num_nodes=int(os.environ.get('SLURM_NNODES', 1)), + accelerator="gpu", + strategy=run.Config( + nl.MegatronStrategy, + tensor_model_parallel_size=1, + pipeline_model_parallel_size=1, + context_parallel_size=1, + sequence_parallel=False, + pipeline_dtype=torch.bfloat16, + ddp=run.Config( + DistributedDataParallelConfig, + check_for_nan_in_grad=True, + grad_reduce_in_fp32=True, + ), + ), + plugins=nl.MegatronMixedPrecision(precision="bf16-mixed"), + num_sanity_val_steps=0, + limit_val_batches=1, + val_check_interval=1000, + max_epochs=10000, + log_every_n_steps=1, + callbacks=[ + run.Config( + ModelCheckpoint, + monitor='reduced_train_loss', + filename='{epoch}-{step}', + every_n_train_steps=1000, + save_top_k=-1, + ), + run.Config(PreemptionCallback), + ], + ), + log=nl.NeMoLogger(wandb=(WandbLogger() if "WANDB_API_KEY" in os.environ else None)), + optim=run.Config( + nl.MegatronOptimizerModule, + config=run.Config( + OptimizerConfig, + lr=1e-4, + bf16=True, + params_dtype=torch.bfloat16, + use_distributed_optimizer=True, + weight_decay=0, + ), + ), + tokenizer=None, + resume=run.Config( + nl.AutoResume, + resume_if_exists=True, + resume_ignore_no_checkpoint=True, + resume_past_end=True, + ), + model_transform=None, + ) + + +@run.cli.factory(target=llm.train) +def pretrain_xl() -> run.Partial: + recipe = pretrain() + recipe.model.config = run.Config(DiTXLConfig) + return recipe + + +@run.cli.factory(target=llm.train) +def pretrain_l() -> run.Partial: + recipe = pretrain() + recipe.model.config = run.Config(DiTLConfig) + return recipe + + +@run.cli.factory(target=llm.train) +def pretrain_7b() -> run.Partial: + recipe = pretrain() + recipe.model.config = run.Config(DiT7BConfig) + recipe.data.global_batch_size = 4608 + recipe.data.micro_batch_size = 9 + recipe.data.num_workers = 15 + recipe.data.use_train_split_for_val = True + recipe.data.seq_length = 
260 + recipe.data.task_encoder.seq_length = 260 + recipe.trainer.val_check_interval = 1000 + recipe.log.log_dir = 'nemo_experiments/dit7b' + recipe.optim.lr_scheduler = run.Config(nl.lr_scheduler.WarmupHoldPolicyScheduler, warmup_steps=100, hold_steps=1e9) + recipe.optim.config.weight_decay = 0.1 + recipe.optim.config.adam_beta1 = 0.9 + recipe.optim.config.adam_beta2 = 0.95 + + return recipe + + +@run.cli.factory(target=llm.train) +def pretrain_ditllama5b() -> run.Partial: + recipe = pretrain_7b() + recipe.data.micro_batch_size = 12 + recipe.model.config = run.Config(DiTLlama5BConfig) + recipe.log.log_dir = 'nemo_experiments/ditllama5b' + return recipe + + +@run.cli.factory(target=llm.train) +def pretrain_ditllama30b() -> run.Partial: + recipe = pretrain_ditllama5b() + recipe.model.config = run.Config(DiTLlama30BConfig) + recipe.data.global_batch_size = 9216 + recipe.data.micro_batch_size = 6 + recipe.log.log_dir = 'nemo_experiments/ditllama30b' + return recipe + + +@run.cli.factory(target=llm.train) +def dreambooth() -> run.Partial: + recipe = pretrain() + recipe.optim.config.lr = 1e-6 + recipe.data = multimodal_datamodule() + recipe.model.config = run.Config(DiTConfig) + + recipe.trainer.max_steps = 1000 + recipe.trainer.strategy.tensor_model_parallel_size = 8 + recipe.trainer.strategy.sequence_parallel = True + + recipe.resume.restore_config = run.Config(RestoreConfig) + recipe.resume.resume_if_exists = False + + return recipe + + +if __name__ == "__main__": + run.cli.main(llm.train, default_factory=dreambooth) diff --git a/nemo/collections/diffusion/vae/__init__.py b/nemo/collections/diffusion/vae/__init__.py new file mode 100644 index 000000000000..d9155f923f18 --- /dev/null +++ b/nemo/collections/diffusion/vae/__init__.py @@ -0,0 +1,13 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. diff --git a/nemo/collections/diffusion/vae/diffusers_vae.py b/nemo/collections/diffusion/vae/diffusers_vae.py new file mode 100644 index 000000000000..19a056d4a682 --- /dev/null +++ b/nemo/collections/diffusion/vae/diffusers_vae.py @@ -0,0 +1,34 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
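An illustrative aside, not part of the diff: because each factory above returns a run.Partial, a recipe can be built and adjusted in Python before launching, mirroring the print(recipe) pattern used in the recipe docstrings elsewhere in this PR. The overrides below are placeholders for a quick smoke test and assume an environment with nemo_run and Megatron-Core available (for example the NeMo dev container used by train.sh).

from nemo.collections.diffusion.train import pretrain_ditllama5b

recipe = pretrain_ditllama5b()          # run.Partial targeting llm.train
recipe.trainer.max_steps = 10           # placeholder: shrink the run for a smoke test
recipe.data.global_batch_size = 32      # placeholder: fits a single node
print(recipe)                           # inspect the resolved configuration tree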
+ +import torch +from diffusers import AutoencoderKL +from einops import rearrange + + +class AutoencoderKLVAE(torch.nn.Module): + def __init__(self, path): + super().__init__() + self.vae = AutoencoderKL.from_pretrained(path, torch_dtype=torch.bfloat16) + + @torch.no_grad() + def decode(self, x): + B, C, T, H, W = x.shape + if T == 1: + x = rearrange(x, 'b c t h w -> (b t) c h w') + x = x / self.vae.config.scaling_factor + out = self.vae.decode(x, return_dict=False)[0] + if T == 1: + return rearrange(out, '(b t) c h w -> b c t h w', t=1) + return out diff --git a/nemo/collections/llm/__init__.py b/nemo/collections/llm/__init__.py index bc6f4dd9201e..4205c401eea8 100644 --- a/nemo/collections/llm/__init__.py +++ b/nemo/collections/llm/__init__.py @@ -17,7 +17,7 @@ safe_import("transformer_engine") -from nemo.collections.llm import peft, tokenizer +from nemo.collections.llm import peft from nemo.collections.llm.gpt.data import ( DollyDataModule, FineTuningDataModule, @@ -70,6 +70,7 @@ MaskedTokenLossReduction, MistralConfig7B, MistralModel, + MistralNeMoConfig12B, MixtralConfig8x3B, MixtralConfig8x7B, MixtralConfig8x22B, @@ -115,6 +116,7 @@ "t5_forward_step", "MaskedTokenLossReduction", "MistralConfig7B", + "MistralNeMoConfig12B", "MistralModel", "MixtralConfig8x3B", "MixtralConfig8x7B", @@ -188,7 +190,7 @@ try: import nemo_run as run - from nemo.collections.llm.api import export_ckpt, finetune, import_ckpt, pretrain, train, validate + from nemo.collections.llm.api import export_ckpt, finetune, generate, import_ckpt, pretrain, train, validate from nemo.collections.llm.recipes import * # noqa __all__.extend( @@ -199,6 +201,7 @@ "pretrain", "validate", "finetune", + "generate", ] ) except ImportError as error: diff --git a/nemo/collections/llm/api.py b/nemo/collections/llm/api.py index 847b87131925..71e006472db9 100644 --- a/nemo/collections/llm/api.py +++ b/nemo/collections/llm/api.py @@ -16,16 +16,23 @@ import os from copy import deepcopy from pathlib import Path -from typing import Any, Callable, Optional, Union +from typing import TYPE_CHECKING, Any, Callable, Optional, Union import nemo_run as run import pytorch_lightning as pl +import torch from typing_extensions import Annotated +import nemo.lightning as nl from nemo.lightning import AutoResume, NeMoLogger, OptimizerModule, Trainer, io from nemo.lightning.pytorch.callbacks import PEFT, ModelTransform from nemo.utils import logging +if TYPE_CHECKING: + from megatron.core.inference.common_inference_params import CommonInferenceParams + from megatron.core.inference.inference_request import InferenceRequest + + TokenizerType = Any @@ -384,7 +391,7 @@ def deploy( try: logging.info("REST service will be started.") uvicorn.run( - 'nemo.deploy.service.rest_model_api:app', + "nemo.deploy.service.rest_model_api:app", host=rest_service_http_address, port=rest_service_port, reload=True, @@ -425,6 +432,38 @@ def export_ckpt( return io.export_ckpt(path, target, output_path, overwrite, load_connector) +@run.cli.entrypoint(name="generate", namespace="llm") +def generate( + path: Union[Path, str], + prompts: list[str], + trainer: Optional[nl.Trainer] = None, + params_dtype: torch.dtype = torch.bfloat16, + max_batch_size: int = 4, + random_seed: Optional[int] = None, + inference_batch_times_seqlen_threshold: int = 1000, + inference_params: Optional["CommonInferenceParams"] = None, + text_only: bool = False, +) -> list[Union["InferenceRequest", str]]: + from nemo.collections.llm import inference + + inference_wrapped_model, mcore_tokenizer = 
inference.setup_model_and_tokenizer( + path=path, + trainer=trainer, + params_dtype=params_dtype, + inference_batch_times_seqlen_threshold=inference_batch_times_seqlen_threshold, + ) + results = inference.generate( + model=inference_wrapped_model, + tokenizer=mcore_tokenizer, + prompts=prompts, + max_batch_size=max_batch_size, + random_seed=random_seed, + inference_params=inference_params, + ) + + return [r.generated_text if text_only else r for r in results] + + def _use_tokenizer(model: pl.LightningModule, data: pl.LightningDataModule, tokenizer: TokenizerType) -> None: if tokenizer == "data": _set_with_io(model, "tokenizer", data.tokenizer) diff --git a/nemo/collections/llm/gpt/data/dolly.py b/nemo/collections/llm/gpt/data/dolly.py index 78751d60cdb0..fb8cf9fd5da0 100644 --- a/nemo/collections/llm/gpt/data/dolly.py +++ b/nemo/collections/llm/gpt/data/dolly.py @@ -26,6 +26,7 @@ if TYPE_CHECKING: from nemo.collections.common.tokenizers import TokenizerSpec + from nemo.collections.llm.gpt.data.packed_sequence import PackedSequenceSpecs class DollyDataModule(FineTuningDataModule, IOMixin): @@ -56,7 +57,7 @@ def __init__( pin_memory: bool = True, persistent_workers: bool = False, pad_to_max_length: bool = False, - packed_sequence_size: int = -1, + packed_sequence_specs: Optional["PackedSequenceSpecs"] = None, ): self.force_redownload = force_redownload self.delete_raw = delete_raw @@ -74,7 +75,7 @@ def __init__( pin_memory=pin_memory, persistent_workers=persistent_workers, pad_to_max_length=pad_to_max_length, - packed_sequence_size=packed_sequence_size, + packed_sequence_specs=packed_sequence_specs, ) def prepare_data(self) -> None: diff --git a/nemo/collections/llm/gpt/data/fine_tuning.py b/nemo/collections/llm/gpt/data/fine_tuning.py index 3e4dba7ec89c..01cf617a094d 100644 --- a/nemo/collections/llm/gpt/data/fine_tuning.py +++ b/nemo/collections/llm/gpt/data/fine_tuning.py @@ -20,12 +20,14 @@ import pytorch_lightning as pl from torch.utils.data import DataLoader +from nemo.collections.common.tokenizers import AutoTokenizer from nemo.collections.llm.gpt.data.core import create_sft_dataset from nemo.lightning.pytorch.plugins import MegatronDataSampler from nemo.utils import logging if TYPE_CHECKING: from nemo.collections.common.tokenizers import TokenizerSpec + from nemo.collections.llm.gpt.data.packed_sequence import PackedSequenceSpecs class FineTuningDataModule(pl.LightningDataModule): @@ -50,10 +52,7 @@ class FineTuningDataModule(pl.LightningDataModule): persistent_workers (bool, optional): Whether to keep data loading workers persistent across epochs. Defaults to False. max_train_steps (int, optional): Maximum number of steps to train. Used to calculate samples mapping for the mmap dataset pad_to_max_length (bool, optional): Whether to pad the input to the max sequence length. If False, will pad to the max length of the current batch. - packed_sequence_size (int, optional): If a positive integer, this arg enables training with sequence packing and specifies the pack size - If less than or equal to 0, sequence packing is disabled. Defaults to -1. - Note: This arg is distinct from `seq_length` because `seq_length` specifies the maximum length of the original sequence - (i.e. the length to truncate long sequences in the input data). 
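An illustrative aside, not part of the diff: the new generate entry point can be driven directly from Python. The checkpoint path and prompt below are placeholders; in practice the call has to run inside a distributed launch (for example via torchrun) because it restores the model through MegatronStrategy, and it reuses the trainer stored in the checkpoint context when none is passed.

from nemo.collections import llm

results = llm.generate(
    path="/checkpoints/llama3_8b",        # placeholder checkpoint directory
    prompts=["What is machine learning?"],
    text_only=True,                        # return plain strings instead of InferenceRequest objects
)
print(results[0])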
+ packed_sequence_specs (PackedSequenceSpecs, optional): See PackedSequenceSpecs for details """ def __init__( @@ -70,7 +69,7 @@ def __init__( pin_memory: bool = True, persistent_workers: bool = False, pad_to_max_length: bool = False, - packed_sequence_size: int = -1, + packed_sequence_specs: Optional["PackedSequenceSpecs"] = None, ): super().__init__() self.seq_length = seq_length @@ -87,22 +86,21 @@ def __init__( self.data_sampler = None self.max_train_samples = None self.pad_to_max_length = pad_to_max_length - self.packed_sequence_size = packed_sequence_size - self._adjust_batch_sizes_for_packed_sequence() + self.packed_sequence_specs = packed_sequence_specs + self.packed_sequence_size = -1 if not packed_sequence_specs else packed_sequence_specs.packed_sequence_size + self.validate_batch_size_for_packed_sequence() - def _adjust_batch_sizes_for_packed_sequence(self): + def validate_batch_size_for_packed_sequence(self): if self.packed_sequence_size > 0 and self.micro_batch_size > 1: - logging.warning( + raise ValueError( "Micro batch size should be 1 when training with packed sequence, but your micro batch size " - f"is {self.micro_batch_size}. Your config will be automatically updated to the following: " - f"MBS will be set to 1 (from {self.micro_batch_size}), " - f"GBS will be set to {self.global_batch_size // self.micro_batch_size} (from {self.global_batch_size}), " - f"packed sequence length will be set to {self.packed_sequence_size*self.micro_batch_size} (from {self.packed_sequence_size}). " + f"is {self.micro_batch_size}. \nThe following config is equivalent to your current setting for " + f"a packed dataset. Please update your config to the following: \n" + f"Set micro batch size to 1 (currently {self.micro_batch_size})\n" + f"Set global batch size to {self.global_batch_size // self.micro_batch_size} (currently {self.global_batch_size}) \n" + f"Set packed sequence length to {self.packed_sequence_size*self.micro_batch_size} (currently {self.packed_sequence_size}) \n" f"For details please visit https://docs.nvidia.com/nemo-framework/user-guide/latest/nemotoolkit/features/optimizations/sequence_packing.html" ) - self.global_batch_size //= self.micro_batch_size - self.packed_sequence_size *= self.micro_batch_size - self.micro_batch_size = 1 def prepare_data(self) -> None: if self.packed_sequence_size > 0 and not self.train_path_packed.is_file(): @@ -187,7 +185,12 @@ def train_path(self) -> Path: @property def train_path_packed(self) -> Path: if self.packed_sequence_size > 0: - return self.dataset_root / f"training_packed{self.packed_sequence_size}.npy" + if self.packed_sequence_specs.packed_data_path is not None: + return self.packed_sequence_specs.packed_data_path + tokenizer_model_name = self._extract_tokenizer_model_name() + folder_name = self.dataset_root / "packed" / tokenizer_model_name + folder_name.mkdir(parents=True, exist_ok=True) + return folder_name / f"training_{self.packed_sequence_size}.npy" else: raise ValueError("`train_path_packed` invalid since packed sequence size is not specified.") @@ -198,3 +201,18 @@ def validation_path(self) -> Path: @property def test_path(self) -> Path: return self.dataset_root / "test.jsonl" + + def _extract_tokenizer_model_name(self) -> str: + if self.packed_sequence_specs.tokenizer_model_name is not None: + tokenizer_model_name = self.packed_sequence_specs.tokenizer_model_name + elif isinstance(self.tokenizer, AutoTokenizer): + name = self.tokenizer.tokenizer.name_or_path + if name.endswith("nemo_tokenizer"): + # 
NEMO_HOME/hf_org/hf_model/nemo_tokenizer => hf_org--hf_model + tokenizer_model_name = '--'.join(name.split("/")[-3:-1]) + else: + # hf_org/hf_model => hf_org--hf_model + tokenizer_model_name = name.replace("/", "--") + else: + tokenizer_model_name = f"unknown_tokenizer_{hash(self.tokenizer)}" + return tokenizer_model_name diff --git a/nemo/collections/llm/gpt/data/packed_sequence.py b/nemo/collections/llm/gpt/data/packed_sequence.py index 4675b3fbb398..372e851da7cd 100644 --- a/nemo/collections/llm/gpt/data/packed_sequence.py +++ b/nemo/collections/llm/gpt/data/packed_sequence.py @@ -11,7 +11,7 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. - +from dataclasses import dataclass from pathlib import Path from typing import Optional @@ -83,3 +83,32 @@ def prepare_packed_sequence_data( # save output data np.save(output_path, output_data) logging.info(f"Packed sequence is prepared and saved to {output_path}") + + +@dataclass +class PackedSequenceSpecs: + packed_sequence_size: int = -1 + """ + If a positive integer, this arg enables training with sequence packing and specifies the pack size + If less than or equal to 0, sequence packing is disabled. Defaults to -1. + Note: This arg is distinct from `seq_length` because `seq_length` specifies the maximum length of the original sequence + (i.e. the length to truncate long sequences in the input data). + """ + + tokenizer_model_name: str = None + """ + Keep track of tokenizer model name, since each tokenizer produces a different packed sequence dataset file. + This field is set by llm.finetune api. + """ + + packed_data_path: Path = None + """ + If specified, use the packed dataset from this file instead of the default path. 
+ """ + + def __post_init__(self): + if self.packed_data_path is not None: + assert ( + self.packed_data_path.suffix == ".npy" + ), f"packed data file must be a .npy file: {self.packed_data_path}" + assert self.packed_data_path.exists(), f"packed data file does not exist: {self.packed_data_path}" diff --git a/nemo/collections/llm/gpt/data/squad.py b/nemo/collections/llm/gpt/data/squad.py index ec0fc1aad02c..f872db94077d 100644 --- a/nemo/collections/llm/gpt/data/squad.py +++ b/nemo/collections/llm/gpt/data/squad.py @@ -24,6 +24,7 @@ if TYPE_CHECKING: from nemo.collections.common.tokenizers import TokenizerSpec + from nemo.collections.llm.gpt.data.packed_sequence import PackedSequenceSpecs class SquadDataModule(FineTuningDataModule, IOMixin): @@ -54,7 +55,7 @@ def __init__( pin_memory: bool = True, persistent_workers: bool = False, pad_to_max_length: bool = False, - packed_sequence_size: int = -1, + packed_sequence_specs: Optional["PackedSequenceSpecs"] = None, ): self.force_redownload = force_redownload self.delete_raw = delete_raw @@ -72,7 +73,7 @@ def __init__( pin_memory=pin_memory, persistent_workers=persistent_workers, pad_to_max_length=pad_to_max_length, - packed_sequence_size=packed_sequence_size, + packed_sequence_specs=packed_sequence_specs, ) def prepare_data(self) -> None: diff --git a/nemo/collections/llm/gpt/model/__init__.py b/nemo/collections/llm/gpt/model/__init__.py index aa3615b3ddfd..ebecc06140fe 100644 --- a/nemo/collections/llm/gpt/model/__init__.py +++ b/nemo/collections/llm/gpt/model/__init__.py @@ -53,7 +53,7 @@ LlamaConfig, LlamaModel, ) -from nemo.collections.llm.gpt.model.mistral import MistralConfig7B, MistralModel +from nemo.collections.llm.gpt.model.mistral import MistralConfig7B, MistralModel, MistralNeMoConfig12B from nemo.collections.llm.gpt.model.mixtral import ( MixtralConfig8x3B, MixtralConfig8x7B, diff --git a/nemo/collections/llm/gpt/model/base.py b/nemo/collections/llm/gpt/model/base.py index f48f4a15d327..c7a6e01c673e 100644 --- a/nemo/collections/llm/gpt/model/base.py +++ b/nemo/collections/llm/gpt/model/base.py @@ -204,6 +204,9 @@ class GPTConfig5B(GPTConfig): ffn_hidden_size: int = 16384 num_attention_heads: int = 32 + bias_activation_fusion: bool = True + bias_dropout_add_fusion: bool = True + @dataclass class GPTConfig7B(GPTConfig): @@ -222,6 +225,9 @@ class GPTConfig20B(GPTConfig): ffn_hidden_size: int = 24576 num_attention_heads: int = 48 + bias_activation_fusion: bool = True + bias_dropout_add_fusion: bool = True + @dataclass class GPTConfig40B(GPTConfig): @@ -240,6 +246,9 @@ class GPTConfig175B(GPTConfig): ffn_hidden_size: int = 49152 num_attention_heads: int = 96 + bias_activation_fusion: bool = True + bias_dropout_add_fusion: bool = True + class GPTModel(L.LightningModule, io.IOMixin, io.ConnectorMixin, fn.FNMixin): def __init__( diff --git a/nemo/collections/llm/gpt/model/mistral.py b/nemo/collections/llm/gpt/model/mistral.py index f353362c9cbd..b9f4b6fb8f65 100644 --- a/nemo/collections/llm/gpt/model/mistral.py +++ b/nemo/collections/llm/gpt/model/mistral.py @@ -59,7 +59,7 @@ class MistralConfig7B(GPTConfig): @dataclass -class MistralNeMo2407Config12B(MistralConfig7B): +class MistralNeMoConfig12B(MistralConfig7B): """ https://mistral.ai/news/mistral-nemo/ """ @@ -75,7 +75,7 @@ class MistralNeMo2407Config12B(MistralConfig7B): @dataclass -class MistralNeMo2407Config123B(MistralConfig7B): +class MistralNeMoConfig123B(MistralConfig7B): """ https://mistral.ai/news/mistral-large-2407/ """ diff --git a/nemo/collections/llm/gpt/model/ssm.py 
b/nemo/collections/llm/gpt/model/ssm.py index 954fa8bfe9f7..c7228951fa78 100644 --- a/nemo/collections/llm/gpt/model/ssm.py +++ b/nemo/collections/llm/gpt/model/ssm.py @@ -53,6 +53,9 @@ class SSMConfig(TransformerConfig, io.IOMixin): fp16_lm_cross_entropy: bool = False parallel_output: bool = True share_embeddings_and_output_weights: bool = False + params_dtype: torch.dtype = torch.bfloat16 + fp16: bool = False + bf16: bool = True num_layers: int = 2 mamba_ssm_ngroups: int = 8 num_attention_heads: int = 1 @@ -81,6 +84,7 @@ class SSMConfig(TransformerConfig, io.IOMixin): forward_step_fn: Callable = ssm_forward_step data_step_fn: Callable = gpt_data_step + tokenizer_model_path: str = None def configure_model(self, tokenizer) -> "MCoreMambaModel": @@ -127,9 +131,17 @@ def __init__(self, state_dict): def state_dict(self): return self._state_dict + def to(self, dtype): + for k, v in self._state_dict.items(): + if v.dtype != dtype: + logging.warning(f"Converting {k} from {v.dtype} (source model) to {dtype} (target model)") + self._state_dict[k] = v.to(dtype) + source = ModelState(source) target = self.init() - trainer = self.nemo_setup(target) + trainer = self.nemo_setup(target, ckpt_async_save=False) + source.to(self.config.params_dtype) + target.to(self.config.params_dtype) self.convert_state(source, target) self.nemo_save(output_path, trainer) diff --git a/nemo/collections/llm/inference/__init__.py b/nemo/collections/llm/inference/__init__.py new file mode 100644 index 000000000000..7cd9a8061d38 --- /dev/null +++ b/nemo/collections/llm/inference/__init__.py @@ -0,0 +1,3 @@ +from nemo.collections.llm.inference.base import MCoreTokenizerWrappper, generate, setup_model_and_tokenizer + +__all__ = ["MCoreTokenizerWrappper", "setup_model_and_tokenizer", "generate"] diff --git a/nemo/collections/llm/inference/base.py b/nemo/collections/llm/inference/base.py new file mode 100644 index 000000000000..95da536fde06 --- /dev/null +++ b/nemo/collections/llm/inference/base.py @@ -0,0 +1,107 @@ +from pathlib import Path +from typing import Optional + +import pytorch_lightning as pl +import torch +import torch.distributed +from megatron.core.inference.common_inference_params import CommonInferenceParams +from megatron.core.inference.engines.mcore_engine import MCoreEngine +from megatron.core.inference.model_inference_wrappers.gpt.gpt_inference_wrapper import GPTInferenceWrapper +from megatron.core.inference.model_inference_wrappers.inference_wrapper_config import InferenceWrapperConfig +from megatron.core.inference.text_generation_controllers.simple_text_generation_controller import ( + SimpleTextGenerationController, +) +from megatron.core.models.gpt.gpt_model import GPTModel as MCoreGPTModel +from pytorch_lightning.trainer.states import TrainerFn + +import nemo.lightning as nl +from nemo.lightning import io +from nemo.lightning.pytorch.strategies.megatron_strategy import MegatronStrategy +from nemo.lightning.pytorch.strategies.utils import RestoreConfig + + +# We need this wrapper since mcore generate uses tokenizer.detokenize, tokenizer.tokenize to encode and decode prompts +class MCoreTokenizerWrappper: + def __init__(self, tokenizer): + self.tokenizer = tokenizer + self.eod = tokenizer.eod + self.vocab_size = tokenizer.vocab_size + + def detokenize(self, tokens): + return self.tokenizer.ids_to_text(tokens) + + def tokenize(self, prompt): + return self.tokenizer.text_to_ids(prompt) + + +# TODO: Move to lightning Fabric API. 
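An illustrative aside, not part of the diff: MCoreTokenizerWrappper only adapts method names, so anything exposing text_to_ids/ids_to_text plus eod and vocab_size satisfies the contract mcore generation expects. The toy tokenizer below is a made-up stand-in, not a real NeMo class.

from nemo.collections.llm.inference import MCoreTokenizerWrappper


class ToyTokenizer:
    eod = 0
    vocab_size = 3

    def text_to_ids(self, text):
        return [1, 2]

    def ids_to_text(self, ids):
        return "hello world"


wrapped = MCoreTokenizerWrappper(ToyTokenizer())
assert wrapped.tokenize("hello world") == [1, 2]     # mcore calls tokenize()
assert wrapped.detokenize([1, 2]) == "hello world"   # and detokenize()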
+def _setup_trainer_and_restore_model(path: Path, trainer: nl.Trainer, model: pl.LightningModule): + assert isinstance(trainer.strategy, MegatronStrategy), "Only MegatronStrategy is supported for trainer.strategy." + assert trainer.strategy.context_parallel_size <= 1, "Context parallelism is not supported for inference." + restore_config = RestoreConfig( + path=path, + load_model_state=True, + load_optim_state=False, + ) + trainer.strategy.restore_config = restore_config + trainer.ckpt_path = None + trainer.strategy.connect(model) + if trainer.strategy.launcher is not None: + trainer.strategy.launcher.launch(lambda: None, trainer=trainer) + trainer.strategy.setup_environment() + + if not model.state_dict(): + model.configure_model() + + trainer.state.fn = TrainerFn.TESTING + trainer.strategy.setup_megatron_parallel(trainer=trainer) + trainer.strategy.trainer = trainer + trainer.strategy.selective_restore() + + +def setup_model_and_tokenizer( + path: Path, + trainer: Optional[nl.Trainer] = None, + params_dtype: torch.dtype = torch.bfloat16, + inference_batch_times_seqlen_threshold: int = 1000, +) -> tuple[MCoreGPTModel, MCoreTokenizerWrappper]: + model: io.TrainerContext = io.load_context(path=path, subpath="model") + trainer = trainer or io.load_context(path=path, subpath="trainer") + _setup_trainer_and_restore_model(path=path, trainer=trainer, model=model) + + # This is to get the MCore model required in GPTInferenceWrapper. + mcore_model = model.module.module.module + inference_wrapped_model = GPTInferenceWrapper( + mcore_model, + InferenceWrapperConfig( + hidden_size=mcore_model.config.hidden_size, + params_dtype=params_dtype, + inference_batch_times_seqlen_threshold=inference_batch_times_seqlen_threshold, + padded_vocab_size=model.tokenizer.vocab_size, + ), + ) + + return inference_wrapped_model, MCoreTokenizerWrappper(model.tokenizer) + + +def generate( + model: GPTInferenceWrapper, + tokenizer: MCoreTokenizerWrappper, + prompts: list[str], + max_batch_size: int = 4, + random_seed: Optional[int] = None, + inference_params: Optional[CommonInferenceParams] = None, +) -> dict: + text_generation_controller = SimpleTextGenerationController(inference_wrapped_model=model, tokenizer=tokenizer) + mcore_engine = MCoreEngine( + text_generation_controller=text_generation_controller, max_batch_size=max_batch_size, random_seed=random_seed + ) + + common_inference_params = inference_params or CommonInferenceParams(num_tokens_to_generate=512) + + results = mcore_engine.generate( + prompts=prompts, + common_inference_params=common_inference_params, + ) + + return results diff --git a/nemo/collections/llm/peft/lora.py b/nemo/collections/llm/peft/lora.py index db4861e9e987..e7a0d70d0603 100644 --- a/nemo/collections/llm/peft/lora.py +++ b/nemo/collections/llm/peft/lora.py @@ -12,10 +12,12 @@ # See the License for the specific language governing permissions and # limitations under the License. 
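An illustrative aside, not part of the diff: sampling behaviour is controlled through Megatron-LM's CommonInferenceParams rather than new NeMo-side arguments. The sketch below only constructs the params object; the field names follow the Megatron-LM dataclass and should be checked against the installed version. The commented-out calls mirror the setup_model_and_tokenizer/generate signatures above, with a placeholder checkpoint path.

from megatron.core.inference.common_inference_params import CommonInferenceParams

params = CommonInferenceParams(
    temperature=0.7,
    top_k=10,
    num_tokens_to_generate=256,    # overrides the 512-token default used above
)
# from nemo.collections.llm import inference
# model, tokenizer = inference.setup_model_and_tokenizer(path="/checkpoints/llama3_8b")
# results = inference.generate(model=model, tokenizer=tokenizer,
#                              prompts=["Hello"], inference_params=params)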
+import re from dataclasses import dataclass, field from typing import List, Literal from megatron.core import parallel_state +from megatron.core.tensor_parallel import ColumnParallelLinear, RowParallelLinear from torch import nn from nemo.lightning.pytorch.callbacks.peft import PEFT, AdapterWrapper @@ -23,15 +25,16 @@ from nemo.utils.import_utils import safe_import_from TEColumnParallelLinear, HAVE_TE_COL_LINEAR = safe_import_from( - "megatron.core.transformer.custom_layers.transformer_engine", "TEColumnParallelLinear" + "megatron.core.extensions.transformer_engine", "TEColumnParallelLinear" ) -TELayerNormColumnParallelLinear, HAVE_TE_COL_LINEAR = safe_import_from( - "megatron.core.transformer.custom_layers.transformer_engine", +TELayerNormColumnParallelLinear, HAVE_TE_LN_COL_LINEAR = safe_import_from( + "megatron.core.extensions.transformer_engine", "TELayerNormColumnParallelLinear", ) TERowParallelLinear, HAVE_TE_ROW_LINEAR = safe_import_from( - "megatron.core.transformer.custom_layers.transformer_engine", "TERowParallelLinear" + "megatron.core.extensions.transformer_engine", "TERowParallelLinear" ) +HAVE_TE = all((HAVE_TE_COL_LINEAR, HAVE_TE_LN_COL_LINEAR, HAVE_TE_ROW_LINEAR)) class AdapterParallelAdd(AdapterWrapper): @@ -82,6 +85,9 @@ class LoRA(PEFT): - 'linear_proj': Apply LoRA to the linear layer used for projecting the output of self-attention modules. - 'linear_fc1': Apply LoRA to the first fully-connected layer in MLP. - 'linear_fc2': Apply LoRA to the second fully-connected layer in MLP. + Target modules can also contain wildcards. For example, you can specify + target_modules=['*.layers.0.*.linear_qkv', '*.layers.1.*.linear_qkv'] to add LoRA to only linear_qkv + on the first two layers. dim (int): Dimension of the low-rank projection space. Defaults to 32. alpha (int): Weighting factor for the low-rank projection. Defaults to 32. dropout (float): Dropout rate for the low-rank projection. Defaults to 0.0. @@ -129,37 +135,43 @@ def transform(self, m: nn.Module, name=None, prefix=None): """ from nemo.collections.nlp.modules.common.megatron.adapters.parallel_adapters import ParallelLinearAdapter + def wildcard_match(pattern, key): + if key is None: + return None + regex_pattern = re.compile("^" + pattern.replace("*", "(.*)") + "$") + match = regex_pattern.match(key) + return match is not None + tp_size = parallel_state.get_tensor_model_parallel_world_size() - if name in self.target_modules: - if name in ['linear_qkv', 'linear_fc1']: - # Column Parallel Linear + full_name = f"{prefix}.{name}" if prefix else name + if name in self.target_modules or any(wildcard_match(pattern, full_name) for pattern in self.target_modules): + if HAVE_TE and isinstance(m, TEColumnParallelLinear) or isinstance(m, TELayerNormColumnParallelLinear): input_is_parallel = False - if HAVE_TE_COL_LINEAR and ( - isinstance(m, TEColumnParallelLinear) or isinstance(m, TELayerNormColumnParallelLinear) - ): - # m.in_features and m.out_features are divided by tp_size already, - # but in_features and out_features passed to ParallelLinearAdapter are not. - in_features = m.in_features - out_features = m.out_features * tp_size - else: - in_features = m.input_size - out_features = m.output_size + # m.in_features and m.out_features are divided by tp_size already, + # but in_features and out_features passed to ParallelLinearAdapter are not. 
+ in_features = m.in_features + out_features = m.out_features * tp_size # LoRA is applied after layernorm, so layernorm output must be returned m.return_layernorm_output = True # perf optimization for LoRA + SP if m.config.sequence_parallel and not m.ub_overlap_ag: m.return_layernorm_output_gathered = True - else: # name in ['linear_proj', 'linear_fc2'] - # Row Parallel Linear + elif HAVE_TE and isinstance(m, TERowParallelLinear): + input_is_parallel = True + in_features = m.in_features * tp_size + out_features = m.out_features + elif isinstance(m, ColumnParallelLinear): + input_is_parallel = False + in_features = m.input_size + out_features = m.output_size + elif isinstance(m, RowParallelLinear): input_is_parallel = True - if HAVE_TE_ROW_LINEAR and isinstance(m, TERowParallelLinear): - in_features = m.in_features * tp_size - out_features = m.out_features - else: - in_features = m.input_size - out_features = m.output_size - - logging.info(f"Adding lora to: {prefix}.{name}") + in_features = m.input_size + out_features = m.output_size + else: + raise NotImplementedError(f"Layer type is unrecognized for LoRA: {type(m)}") + + logging.info(f"Adding lora to: {full_name}") adapter = ParallelLinearAdapter( in_features, out_features, diff --git a/nemo/collections/llm/recipes/__init__.py b/nemo/collections/llm/recipes/__init__.py index 6bee8c882ffd..47cc4e71448d 100644 --- a/nemo/collections/llm/recipes/__init__.py +++ b/nemo/collections/llm/recipes/__init__.py @@ -21,7 +21,15 @@ llama3_70b_16k, llama3_70b_64k, llama31_405b, - mistral, + mamba2_1_3b, + mamba2_2_7b, + mamba2_8b, + mamba2_130m, + mamba2_370m, + mamba2_780m, + mamba2_hybrid_8b, + mistral_7b, + mistral_nemo_12b, mixtral_8x7b, mixtral_8x7b_16k, mixtral_8x7b_64k, @@ -48,7 +56,15 @@ "llama3_70b_16k", "llama3_70b_64k", "llama31_405b", - "mistral", + "mamba2_130m", + "mamba2_370m", + "mamba2_780m", + "mamba2_1_3b", + "mamba2_2_7b", + "mamba2_8b", + "mamba2_hybrid_8b", + "mistral_7b", + "mistral_nemo_12b", "mixtral_8x7b", "mixtral_8x7b_16k", "mixtral_8x7b_64k", diff --git a/nemo/collections/llm/recipes/gpt3_175b.py b/nemo/collections/llm/recipes/gpt3_175b.py new file mode 100644 index 000000000000..1abe8a218e82 --- /dev/null +++ b/nemo/collections/llm/recipes/gpt3_175b.py @@ -0,0 +1,237 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
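An illustrative aside, not part of the diff: the wildcard handling above compiles each pattern into a regex, so target_modules entries can pin LoRA to specific layers. The helper is repeated here verbatim so the snippet is self-contained, and the fully-qualified module name is an example of what prefix + name typically looks like, not a value taken from a real model.

import re


def wildcard_match(pattern, key):
    if key is None:
        return None
    regex_pattern = re.compile("^" + pattern.replace("*", "(.*)") + "$")
    return regex_pattern.match(key) is not None


full_name = "module.decoder.layers.0.self_attention.linear_qkv"   # example name
print(wildcard_match("*.layers.0.*.linear_qkv", full_name))       # True  -> LoRA added
print(wildcard_match("*.layers.1.*.linear_qkv", full_name))       # False -> skipped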
+ + +from typing import Callable, Optional + +import nemo_run as run +import pytorch_lightning as pl +import torch +from megatron.core.distributed import DistributedDataParallelConfig +from pytorch_lightning.callbacks.callback import Callback + +from nemo import lightning as nl +from nemo.collections.llm.api import pretrain +from nemo.collections.llm.gpt.data.mock import MockDataModule +from nemo.collections.llm.gpt.model import GPTConfig175B, GPTModel +from nemo.collections.llm.recipes.log.default import default_log, default_resume, tensorboard_logger +from nemo.collections.llm.recipes.optim.adam import distributed_fused_adam_with_cosine_annealing +from nemo.collections.llm.recipes.precision.mixed_precision import bf16_mixed +from nemo.collections.llm.recipes.tp_overlap_configs.userbuffers import ( + userbuffers_bf16_h100_h12288_tp4_mbs1_seqlen2048, +) +from nemo.lightning.pytorch.callbacks.megatron_comm_overlap import MegatronCommOverlapCallback +from nemo.utils.exp_manager import TimingCallback + +NAME = "gpt3_175b" + + +@run.cli.factory(name=NAME) +def model() -> run.Config[pl.LightningModule]: + """ + Factory function to create a GPT3 175B model configuration. + + Returns: + run.Config[pl.LightningModule]: Configuration for the GPT3 175B model. + + Examples: + CLI usage: + $ nemo llm pretrain model=gpt3_175b ... + + Python API usage: + >>> model_config = model() + >>> print(model_config) + """ + return run.Config(GPTModel, config=run.Config(GPTConfig175B)) + + +def trainer( + tensor_parallelism: int = 4, + pipeline_parallelism: int = 8, + pipeline_parallelism_type: Optional[torch.dtype] = torch.bfloat16, + virtual_pipeline_parallelism: Optional[int] = 6, + context_parallelism: int = 1, + sequence_parallelism: bool = True, + num_nodes: int = 64, + num_gpus_per_node: int = 8, + max_steps: int = 1168251, + callbacks: Optional[list[run.Config[Callback]]] = None, +) -> run.Config[nl.Trainer]: + """ + Configure the NeMo Lightning Trainer for GPT3 175B model. + + This function sets up the distributed training strategy optimized for the large 175B model. + + Args: + tensor_parallelism (int): Degree of tensor model parallelism. + pipeline_parallelism (int): Degree of pipeline model parallelism. + pipeline_parallelism_type (Optional[torch.dtype]): Data type for pipeline parallelism. + virtual_pipeline_parallelism (Optional[int]): Size of virtual pipeline parallelism. + context_parallelism (int): Degree of context parallelism. + sequence_parallelism (bool): Whether to use sequence parallelism. + num_nodes (int): Number of compute nodes to use. + num_gpus_per_node (int): Number of GPUs per node. + max_steps (int): Maximum number of training steps. + callbacks (Optional[list[run.Config[Callback]]]): List of callback configurations. + + Returns: + run.Config[nl.Trainer]: Configuration for the NeMo Lightning Trainer. + + Examples: + CLI usage: + $ nemo llm pretrain trainer=gpt3_175b ... + + Python API usage: + >>> trainer_config = trainer(num_nodes=64, num_gpus_per_node=8) + >>> print(trainer_config) + + Note: + This configuration uses extensive parallelism to handle the large model size efficiently. 
+ """ + strategy = run.Config( + nl.MegatronStrategy, + tensor_model_parallel_size=tensor_parallelism, + pipeline_model_parallel_size=pipeline_parallelism, + pipeline_dtype=pipeline_parallelism_type, + virtual_pipeline_model_parallel_size=virtual_pipeline_parallelism, + context_parallel_size=context_parallelism, + sequence_parallel=sequence_parallelism, + gradient_as_bucket_view=True, + ckpt_async_save=True, + ckpt_parallel_load=True, + ddp=run.Config( + DistributedDataParallelConfig, + check_for_nan_in_grad=True, + grad_reduce_in_fp32=True, + overlap_grad_reduce=True, + overlap_param_gather=True, + average_in_collective=True, + ), + ) + + trainer = run.Config( + nl.Trainer, + accelerator="gpu", + accumulate_grad_batches=1, + callbacks=callbacks, + devices=num_gpus_per_node, + limit_test_batches=50, + limit_val_batches=32, + log_every_n_steps=10, + max_steps=max_steps, + num_nodes=num_nodes, + plugins=bf16_mixed(), + strategy=strategy, + use_distributed_sampler=False, + val_check_interval=2000, + ) + + return trainer + + +@run.cli.factory(target=pretrain, name=NAME) +def pretrain_recipe( + dir: Optional[str] = None, + name: str = "default", + num_nodes: int = 1, + num_gpus_per_node: int = 8, + performance_mode: bool = False, + fn: Callable = pretrain, +) -> run.Partial: + """ + Create a pre-training recipe for GPT3 175B model. + + This function sets up a complete configuration for pre-training, including + model, trainer, data, logging, optimization, and resumption settings. + + Args: + dir (Optional[str]): Directory for saving logs and checkpoints. + name (str): Name of the pre-training run. + num_nodes (int): Number of compute nodes to use. + num_gpus_per_node (int): Number of GPUs per node. + performance_mode (bool): If true, enables optimizations for maximum performance. + fn (Callable): The pre-training function to use. + + Returns: + run.Partial: Partial configuration for pre-training. + + Examples: + CLI usage: + $ nemo llm pretrain --factory gpt3_175b + $ nemo llm pretrain --factory "gpt3_175b(num_nodes=64, name='my_175b_pretrain')" + + Python API usage: + >>> recipe = pretrain_recipe(name="gpt3_175b_pretrain", num_nodes=64) + >>> print(recipe) + + Note: + This recipe is optimized for the large 175B model and requires significant computational resources. + """ + recipe = run.Partial( + fn, + model=model(), + trainer=trainer( + num_nodes=num_nodes, + num_gpus_per_node=num_gpus_per_node, + callbacks=[run.Config(TimingCallback)], + ), + data=run.Config(MockDataModule, seq_length=2048, global_batch_size=2048, micro_batch_size=2), + log=default_log(dir=dir, name=name, tensorboard_logger=tensorboard_logger(name=name)), + optim=distributed_fused_adam_with_cosine_annealing(max_lr=0.9e-4), + resume=default_resume(), + ) + + if performance_mode: + recipe = pretrain_performance_optimizations(recipe) + + return recipe + + +def pretrain_performance_optimizations(recipe: run.Partial) -> run.Partial: + """ + Create a performance-optimized pre-training recipe for GPT3 175B model. + + This method enables performance optimizations that may not be suitable for all use cases. + It builds upon the standard pre-training recipe and adds additional performance enhancements. + + Args: + recipe (run.Partial): Base pre-train recipe to which performance optimizations will be added + + Returns: + run.Partial: Partial configuration for performance-optimized pre-training. + + Note: + Use this method with caution and only when you need maximum performance. 
+ It may not be suitable for all hardware configurations or use cases. + """ + + # 'overlap_param_gather_with_optimizer_step' and 'align_param_gather' params are set automatically + # by MegatronCommOverlapCallback. They are added here for user's knowledge. + # overlap_param_gather_with_optimizer_step- Overlap param all-gather of first bucket with optimizer step. + # align_param_gather- If true, all PP stages launch param all-gathers simultaneously, else + # each PP stage launches independently as needed. + + recipe.trainer.callbacks.append( + run.Config( + MegatronCommOverlapCallback, + tp_comm_overlap=True, + tp_comm_overlap_cfg=userbuffers_bf16_h100_h12288_tp4_mbs1_seqlen2048, + defer_embedding_wgrad_compute=True, + wgrad_deferral_limit=50, + overlap_param_gather_with_optimizer_step=True, + align_param_gather=True, + ) + ) + + return recipe diff --git a/nemo/collections/llm/recipes/llama31_405b.py b/nemo/collections/llm/recipes/llama31_405b.py index f36773551ea0..055e9a06fcba 100644 --- a/nemo/collections/llm/recipes/llama31_405b.py +++ b/nemo/collections/llm/recipes/llama31_405b.py @@ -13,11 +13,12 @@ # limitations under the License. -from typing import Optional +from typing import Callable, Optional import nemo_run as run import pytorch_lightning as pl import torch +from megatron.core.distributed import DistributedDataParallelConfig from pytorch_lightning.callbacks.callback import Callback from nemo import lightning as nl @@ -27,6 +28,10 @@ from nemo.collections.llm.recipes.log.default import default_log, default_resume, tensorboard_logger from nemo.collections.llm.recipes.optim.adam import distributed_fused_adam_with_cosine_annealing from nemo.collections.llm.recipes.precision.mixed_precision import bf16_mixed +from nemo.collections.llm.recipes.tp_overlap_configs.userbuffers import ( + userbuffers_bf16_h100_h16384_tp8_cp2_mbs1_seqlen8192, +) +from nemo.lightning.pytorch.callbacks.megatron_comm_overlap import MegatronCommOverlapCallback from nemo.utils.exp_manager import TimingCallback NAME = "llama31_405b" @@ -107,6 +112,14 @@ def trainer( gradient_as_bucket_view=True, ckpt_async_save=True, ckpt_parallel_load=True, + ddp=run.Config( + DistributedDataParallelConfig, + check_for_nan_in_grad=True, + grad_reduce_in_fp32=True, + overlap_grad_reduce=True, + overlap_param_gather=True, + average_in_collective=True, + ), ) trainer = run.Config( @@ -131,7 +144,12 @@ def trainer( @run.cli.factory(target=pretrain, name=NAME) def pretrain_recipe( - dir: Optional[str] = None, name: str = "default", num_nodes: int = 1, num_gpus_per_node: int = 8, fn=pretrain + dir: Optional[str] = None, + name: str = "default", + num_nodes: int = 1, + num_gpus_per_node: int = 8, + performance_mode: bool = False, + fn: Callable = pretrain, ) -> run.Partial: """ Create a pre-training recipe for Llama3.1 405B model. @@ -144,6 +162,7 @@ def pretrain_recipe( name (str): Name of the pre-training run. num_nodes (int): Number of compute nodes to use. num_gpus_per_node (int): Number of GPUs per node. + performance_mode (bool): If true, enables optimizations for maximum performance. fn (Callable): The pre-training function to use. Returns: @@ -161,7 +180,7 @@ def pretrain_recipe( Note: This recipe is optimized for the large 405B model and requires significant computational resources. 
""" - return run.Partial( + recipe = run.Partial( fn, model=model(), trainer=trainer( @@ -174,3 +193,47 @@ def pretrain_recipe( optim=distributed_fused_adam_with_cosine_annealing(max_lr=3e-4), resume=default_resume(), ) + + if performance_mode: + recipe = pretrain_performance_optimizations(recipe) + + return recipe + + +def pretrain_performance_optimizations(recipe: run.Partial) -> run.Partial: + """ + Create a performance-optimized pre-training recipe for Llama3.1 405B model. + + This method enables performance optimizations that may not be suitable for all use cases. + It builds upon the standard pre-training recipe and adds additional performance enhancements. + + Args: + recipe (run.Partial): Base pre-train recipe to which performance optimizations will be added + + Returns: + run.Partial: Partial configuration for performance-optimized pre-training. + + Note: + Use this method with caution and only when you need maximum performance. + It may not be suitable for all hardware configurations or use cases. + """ + + # 'overlap_param_gather_with_optimizer_step' and 'align_param_gather' params are set automatically + # by MegatronCommOverlapCallback. They are added here for user's knowledge. + # overlap_param_gather_with_optimizer_step- Overlap param all-gather of first bucket with optimizer step. + # align_param_gather- If true, all PP stages launch param all-gathers simultaneously, else + # each PP stage launches independently as needed. + + recipe.trainer.callbacks.append( + run.Config( + MegatronCommOverlapCallback, + tp_comm_overlap=True, + tp_comm_overlap_cfg=userbuffers_bf16_h100_h16384_tp8_cp2_mbs1_seqlen8192, + defer_embedding_wgrad_compute=True, + wgrad_deferral_limit=50, + overlap_param_gather_with_optimizer_step=True, + align_param_gather=True, + ) + ) + + return recipe diff --git a/nemo/collections/llm/recipes/llama3_70b.py b/nemo/collections/llm/recipes/llama3_70b.py index 9cfc198038f2..b283c68b222b 100644 --- a/nemo/collections/llm/recipes/llama3_70b.py +++ b/nemo/collections/llm/recipes/llama3_70b.py @@ -13,7 +13,7 @@ # limitations under the License. 
-from typing import Optional +from typing import Callable, Optional import nemo_run as run import pytorch_lightning as pl @@ -24,7 +24,6 @@ from nemo import lightning as nl from nemo.collections.llm.api import finetune, pretrain from nemo.collections.llm.gpt.data.mock import MockDataModule -from nemo.collections.llm.gpt.data.squad import SquadDataModule from nemo.collections.llm.gpt.model.llama import Llama3Config70B, LlamaModel from nemo.collections.llm.peft.lora import LoRA from nemo.collections.llm.recipes.finetune_default import default_finetune_recipe @@ -64,7 +63,7 @@ def trainer( virtual_pipeline_parallelism: Optional[int] = 5, context_parallelism: int = 2, sequence_parallelism: bool = True, - num_nodes: int = 1, + num_nodes: int = 4, num_gpus_per_node: int = 8, max_steps: int = 1168251, callbacks: Optional[list[run.Config[Callback]]] = None, @@ -117,6 +116,7 @@ def trainer( grad_reduce_in_fp32=True, overlap_grad_reduce=True, overlap_param_gather=True, + average_in_collective=True, ), ) @@ -142,7 +142,12 @@ def trainer( @run.cli.factory(target=pretrain, name=NAME) def pretrain_recipe( - dir: Optional[str] = None, name: str = "default", num_nodes: int = 1, num_gpus_per_node: int = 8, fn=pretrain + dir: Optional[str] = None, + name: str = "default", + num_nodes: int = 1, + num_gpus_per_node: int = 8, + performance_mode: bool = False, + fn: Callable = pretrain, ) -> run.Partial: """ Create a pre-training recipe for Llama3 70B model. @@ -155,6 +160,7 @@ def pretrain_recipe( name (str): Name of the pre-training run. num_nodes (int): Number of compute nodes to use. num_gpus_per_node (int): Number of GPUs per node. + performance_mode (bool): If true, enables optimizations for maximum performance. fn (Callable): The pre-training function to use. Returns: @@ -172,7 +178,8 @@ def pretrain_recipe( Note: This recipe is optimized for the large 70B model and requires significant computational resources. """ - return run.Partial( + + recipe = run.Partial( fn, model=model(), trainer=trainer( @@ -186,40 +193,35 @@ def pretrain_recipe( resume=default_resume(), ) + if performance_mode: + recipe = pretrain_performance_optimizations(recipe) -@run.cli.factory(target=pretrain, name=NAME + "_performance") -def pretrain_recipe_performance( - dir: Optional[str] = None, name: str = "default", num_nodes: int = 1, num_gpus_per_node: int = 8, fn=pretrain -) -> run.Partial: + return recipe + + +def pretrain_performance_optimizations(recipe: run.Partial) -> run.Partial: """ Create a performance-optimized pre-training recipe for Llama3 70B model. - This recipe enables performance optimizations that may not be suitable for all use cases. + This method enables performance optimizations that may not be suitable for all use cases. It builds upon the standard pre-training recipe and adds additional performance enhancements. Args: - dir (Optional[str]): Directory for saving logs and checkpoints. - name (str): Name of the pre-training run. - num_nodes (int): Number of compute nodes to use. - num_gpus_per_node (int): Number of GPUs per node. - fn (Callable): The pre-training function to use. + recipe (run.Partial): Base pre-train recipe to which performance optimizations will be added Returns: run.Partial: Partial configuration for performance-optimized pre-training. 
- Examples: - CLI usage: - $ nemo llm pretrain --factory "llama3_70b.pretrain_recipe_performance(num_nodes=4, name='perf_pretrain')" - - Python API usage: - >>> recipe = pretrain_recipe_performance(name="llama3_70b_perf", num_nodes=4) - >>> print(recipe) - Note: - Use this recipe with caution and only when you need maximum performance. + Use this method with caution and only when you need maximum performance. It may not be suitable for all hardware configurations or use cases. """ - recipe = pretrain_recipe(name=name, dir=dir, num_nodes=num_nodes, num_gpus_per_node=num_gpus_per_node, fn=fn) + + # 'overlap_param_gather_with_optimizer_step' and 'align_param_gather' params are set automatically + # by MegatronCommOverlapCallback. They are added here for user's knowledge. + # overlap_param_gather_with_optimizer_step- Overlap param all-gather of first bucket with optimizer step. + # align_param_gather- If true, all PP stages launch param all-gathers simultaneously, else + # each PP stage launches independently as needed. recipe.trainer.callbacks.append( run.Config( @@ -228,6 +230,8 @@ def pretrain_recipe_performance( tp_comm_overlap_cfg=userbuffers_bf16_h100_h8192_tp4_mbs1_seqlen8192, defer_embedding_wgrad_compute=True, wgrad_deferral_limit=22, + overlap_param_gather_with_optimizer_step=True, + align_param_gather=True, ) ) diff --git a/nemo/collections/llm/recipes/llama3_70b_16k.py b/nemo/collections/llm/recipes/llama3_70b_16k.py index c8c1957d7bdc..928f961f7cf3 100644 --- a/nemo/collections/llm/recipes/llama3_70b_16k.py +++ b/nemo/collections/llm/recipes/llama3_70b_16k.py @@ -49,7 +49,7 @@ def model() -> run.Config[pl.LightningModule]: def trainer( - num_nodes: int = 2, + num_nodes: int = 4, num_gpus_per_node: int = 8, ) -> run.Config: """ @@ -58,8 +58,8 @@ def trainer( This function sets up the distributed training strategy optimized for the large 70B model with longer sequences. Args: - num_nodes (int): Number of compute nodes to use. - num_gpus_per_node (int): Number of GPUs per node. + num_nodes (int, optional): Number of compute nodes to use. Defaults to 4. + num_gpus_per_node (int, optional): Number of GPUs per node. Defaults to 8. Returns: run.Config: Configuration for the NeMo Lightning Trainer. @@ -76,10 +76,10 @@ def trainer( This configuration uses extensive parallelism to handle the large model size and longer sequence length efficiently. """ return llama3_70b.trainer( - tensor_parallelism=2, - pipeline_parallelism=4, + tensor_parallelism=8, + pipeline_parallelism=2, pipeline_parallelism_type=torch.bfloat16, - virtual_pipeline_parallelism=5, + virtual_pipeline_parallelism=None, context_parallelism=2, sequence_parallelism=True, num_nodes=num_nodes, @@ -91,7 +91,7 @@ def trainer( def pretrain_recipe( dir: Optional[str] = None, name: str = "default", - num_nodes: int = 2, + num_nodes: int = 4, num_gpus_per_node: int = 8, ) -> run.Partial: """ @@ -103,8 +103,8 @@ def pretrain_recipe( Args: dir (Optional[str]): Directory for saving logs and checkpoints. name (str): Name of the pre-training run. - num_nodes (int): Number of compute nodes to use. - num_gpus_per_node (int): Number of GPUs per node. + num_nodes (int, optional): Number of compute nodes to use. Defaults to 4. + num_gpus_per_node (int, optional): Number of GPUs per node. Defaults to 8. Returns: run.Partial: Partial configuration for pre-training. 
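An illustrative aside, not part of the diff: a quick sanity check on the new 16k-sequence layout for the 70B model (TP=8, PP=2, CP=2 on 4 nodes of 8 GPUs). The product of the model-parallel sizes must divide the world size; here it consumes all 32 GPUs, leaving a single data-parallel replica.

tp, pp, cp = 8, 2, 2
num_nodes, gpus_per_node = 4, 8

world_size = num_nodes * gpus_per_node    # 32 GPUs
model_parallel = tp * pp * cp             # 32-way model parallelism
assert world_size % model_parallel == 0
print("data-parallel replicas:", world_size // model_parallel)   # 1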
diff --git a/nemo/collections/llm/recipes/llama3_70b_64k.py b/nemo/collections/llm/recipes/llama3_70b_64k.py index 5d9845d9aaa7..ffadf5ca8084 100644 --- a/nemo/collections/llm/recipes/llama3_70b_64k.py +++ b/nemo/collections/llm/recipes/llama3_70b_64k.py @@ -21,7 +21,6 @@ from nemo.collections.llm.api import finetune, pretrain from nemo.collections.llm.gpt.data.mock import MockDataModule -from nemo.collections.llm.gpt.data.squad import SquadDataModule from nemo.collections.llm.recipes import llama3_70b from nemo.utils.exp_manager import TimingCallback @@ -59,8 +58,8 @@ def trainer( This function sets up the distributed training strategy optimized for the large 70B model with long sequences. Args: - num_nodes (int): Number of compute nodes to use. - num_gpus_per_node (int): Number of GPUs per node. + num_nodes (int, optional): Number of compute nodes to use. Defaults to 32. + num_gpus_per_node (int, optional): Number of GPUs per node. Defaults to 8. Returns: run.Config: Configuration for the NeMo Lightning Trainer. @@ -81,7 +80,7 @@ def trainer( tensor_parallelism=8, pipeline_parallelism=4, pipeline_parallelism_type=torch.bfloat16, - virtual_pipeline_parallelism=5, + virtual_pipeline_parallelism=None, context_parallelism=8, sequence_parallelism=True, num_nodes=num_nodes, @@ -106,8 +105,8 @@ def pretrain_recipe( Args: dir (Optional[str]): Directory for saving logs and checkpoints. name (str): Name of the pre-training run. - num_nodes (int): Number of compute nodes to use. - num_gpus_per_node (int): Number of GPUs per node. + num_nodes (int, optional): Number of compute nodes to use. Defaults to 32. + num_gpus_per_node (int, optional): Number of GPUs per node. Defaults to 8. Returns: run.Partial: Partial configuration for pre-training. diff --git a/nemo/collections/llm/recipes/llama3_8b.py b/nemo/collections/llm/recipes/llama3_8b.py index 4b2934739529..269eb7865dcf 100644 --- a/nemo/collections/llm/recipes/llama3_8b.py +++ b/nemo/collections/llm/recipes/llama3_8b.py @@ -117,6 +117,7 @@ def trainer( grad_reduce_in_fp32=True, overlap_grad_reduce=True, overlap_param_gather=True, + average_in_collective=True, ), ) @@ -142,7 +143,12 @@ def trainer( @run.cli.factory(target=pretrain, name=NAME) def pretrain_recipe( - dir: Optional[str] = None, name: str = "default", num_nodes: int = 1, num_gpus_per_node: int = 8, fn=pretrain + dir: Optional[str] = None, + name: str = "default", + num_nodes: int = 1, + num_gpus_per_node: int = 8, + performance_mode: bool = False, + fn: Callable = pretrain, ) -> run.Partial: """ Create a pre-training recipe for Llama3 8B model. @@ -155,6 +161,7 @@ def pretrain_recipe( name (str): Name of the pre-training run. num_nodes (int): Number of compute nodes to use. num_gpus_per_node (int): Number of GPUs per node. + performance_mode (bool): If true, enables optimizations for maximum performance. fn (Callable): The pre-training function to use. Returns: @@ -173,7 +180,7 @@ def pretrain_recipe( For more details on pre-training LLMs with NeMo, see the pre-training guide in the `examples/llm/pretrain/` directory. 
""" - return run.Partial( + recipe = run.Partial( fn, model=model(), trainer=trainer( @@ -187,44 +194,29 @@ def pretrain_recipe( resume=default_resume(), ) + if performance_mode: + recipe = pretrain_performance_optimizations(recipe) -@run.cli.factory(target=pretrain, name=NAME + "_optimized") -def pretrain_recipe_performance( - dir: Optional[str] = None, - name: str = "default", - num_nodes: int = 1, - num_gpus_per_node: int = 8, - fn: Callable = pretrain, -) -> run.Partial: + return recipe + + +def pretrain_performance_optimizations(recipe: run.Partial) -> run.Partial: """ Create a performance-optimized pre-training recipe for Llama3 8B model. - This recipe enables performance optimizations that may not be suitable for all use cases. + This method enables performance optimizations that may not be suitable for all use cases. It builds upon the standard pre-training recipe and adds additional performance enhancements. Args: - dir (Optional[str]): Directory for saving logs and checkpoints. - name (str): Name of the pre-training run. - num_nodes (int): Number of compute nodes to use. - num_gpus_per_node (int): Number of GPUs per node. - fn (Callable): The pre-training function to use. + recipe (run.Partial): Base pre-train recipe to which performance optimizations will be added Returns: run.Partial: Partial configuration for performance-optimized pre-training. - Examples: - $ nemo llm pretrain --factory llama3_8b_optimized - - Python API usage: - >>> recipe = pretrain_recipe_performance(name="llama3_8b_perf", num_nodes=4) - >>> print(recipe) - Note: - Use this recipe with caution and only when you need maximum performance. + Use this method with caution and only when you need maximum performance. It may not be suitable for all hardware configurations or use cases. """ - recipe = pretrain_recipe(name=name, dir=dir, num_nodes=num_nodes, num_gpus_per_node=num_gpus_per_node, fn=fn) - recipe.trainer.callbacks.append( run.Config( MegatronCommOverlapCallback, diff --git a/nemo/collections/llm/recipes/llama3_8b_16k.py b/nemo/collections/llm/recipes/llama3_8b_16k.py index 0b42b392827a..d6c1677a3b4b 100644 --- a/nemo/collections/llm/recipes/llama3_8b_16k.py +++ b/nemo/collections/llm/recipes/llama3_8b_16k.py @@ -49,7 +49,7 @@ def model() -> run.Config[pl.LightningModule]: def trainer( - num_nodes: int = 1, + num_nodes: int = 2, num_gpus_per_node: int = 8, ) -> run.Config: """ @@ -58,8 +58,8 @@ def trainer( This function sets up the distributed training strategy optimized for longer sequences. Args: - num_nodes (int): Number of compute nodes to use. - num_gpus_per_node (int): Number of GPUs per node. + num_nodes (int, optional): Number of compute nodes to use. Defaults to 2. + num_gpus_per_node (int, optional): Number of GPUs per node. Defaults to 8. Returns: run.Config: Configuration for the NeMo Lightning Trainer. @@ -76,10 +76,10 @@ def trainer( This configuration uses increased parallelism to handle the longer sequence length efficiently. 
""" return llama3_8b.trainer( - tensor_parallelism=2, - pipeline_parallelism=4, + tensor_parallelism=4, + pipeline_parallelism=2, pipeline_parallelism_type=torch.bfloat16, - virtual_pipeline_parallelism=5, + virtual_pipeline_parallelism=None, context_parallelism=2, sequence_parallelism=True, num_nodes=num_nodes, @@ -91,7 +91,7 @@ def trainer( def pretrain_recipe( dir: Optional[str] = None, name: str = "default", - num_nodes: int = 1, + num_nodes: int = 2, num_gpus_per_node: int = 8, ) -> run.Partial: """ @@ -103,8 +103,8 @@ def pretrain_recipe( Args: dir (Optional[str]): Directory for saving logs and checkpoints. name (str): Name of the pre-training run. - num_nodes (int): Number of compute nodes to use. - num_gpus_per_node (int): Number of GPUs per node. + num_nodes (int, optional): Number of compute nodes to use. Defaults to 2. + num_gpus_per_node (int, optional): Number of GPUs per node. Defaults to 8. Returns: run.Partial: Partial configuration for pre-training. diff --git a/nemo/collections/llm/recipes/llama3_8b_64k.py b/nemo/collections/llm/recipes/llama3_8b_64k.py index 38f787113bf5..692347ea8dd0 100644 --- a/nemo/collections/llm/recipes/llama3_8b_64k.py +++ b/nemo/collections/llm/recipes/llama3_8b_64k.py @@ -49,7 +49,7 @@ def model() -> run.Config[pl.LightningModule]: def trainer( - num_nodes: int = 1, + num_nodes: int = 4, num_gpus_per_node: int = 8, ) -> run.Config: """ @@ -58,8 +58,8 @@ def trainer( This function sets up the distributed training strategy optimized for long sequences. Args: - num_nodes (int): Number of compute nodes to use. - num_gpus_per_node (int): Number of GPUs per node. + num_nodes (int, optional): Number of compute nodes to use. Defaults to 4. + num_gpus_per_node (int, optional): Number of GPUs per node. Defaults to 8. Returns: run.Config: Configuration for the NeMo Lightning Trainer. @@ -69,17 +69,17 @@ def trainer( $ nemo llm pretrain trainer=llama3_8b_64k ... Python API usage: - >>> trainer_config = trainer(num_nodes=2, num_gpus_per_node=8) + >>> trainer_config = trainer(num_nodes=4, num_gpus_per_node=8) >>> print(trainer_config) Note: This configuration uses significantly increased parallelism to handle the long sequence length efficiently. """ return llama3_8b.trainer( - tensor_parallelism=2, - pipeline_parallelism=4, + tensor_parallelism=4, + pipeline_parallelism=2, pipeline_parallelism_type=torch.bfloat16, - virtual_pipeline_parallelism=5, + virtual_pipeline_parallelism=None, context_parallelism=4, sequence_parallelism=True, num_nodes=num_nodes, @@ -91,7 +91,7 @@ def trainer( def pretrain_recipe( dir: Optional[str] = None, name: str = "default", - num_nodes: int = 1, + num_nodes: int = 4, num_gpus_per_node: int = 8, ) -> run.Partial: """ @@ -103,8 +103,8 @@ def pretrain_recipe( Args: dir (Optional[str]): Directory for saving logs and checkpoints. name (str): Name of the pre-training run. - num_nodes (int): Number of compute nodes to use. - num_gpus_per_node (int): Number of GPUs per node. + num_nodes (int, optional): Number of compute nodes to use. Defaults to 4. + num_gpus_per_node (int, optional): Number of GPUs per node. Defaults to 8. Returns: run.Partial: Partial configuration for pre-training. 
@@ -112,10 +112,10 @@ def pretrain_recipe( Examples: CLI usage: $ nemo llm pretrain --factory llama3_8b_64k - $ nemo llm pretrain --factory "llama3_8b_64k(num_nodes=2, name='my_64k_pretrain')" + $ nemo llm pretrain --factory "llama3_8b_64k(num_nodes=4, name='my_64k_pretrain')" Python API usage: - >>> recipe = pretrain_recipe(name="llama3_8b_64k_pretrain", num_nodes=2) + >>> recipe = pretrain_recipe(name="llama3_8b_64k_pretrain", num_nodes=4) >>> print(recipe) Note: diff --git a/nemo/collections/llm/recipes/mamba2_130m.py b/nemo/collections/llm/recipes/mamba2_130m.py new file mode 100644 index 000000000000..08640604a112 --- /dev/null +++ b/nemo/collections/llm/recipes/mamba2_130m.py @@ -0,0 +1,321 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + + +from typing import Optional + +import nemo_run as run +import pytorch_lightning as pl +import torch +from megatron.core.distributed import DistributedDataParallelConfig +from pytorch_lightning.callbacks.callback import Callback + +from nemo import lightning as nl +from nemo.collections import llm +from nemo.collections.llm.api import finetune, pretrain +from nemo.collections.llm.gpt.data.mock import MockDataModule +from nemo.collections.llm.recipes.log.default import default_log, default_resume, tensorboard_logger +from nemo.collections.llm.recipes.optim.adam import distributed_fused_adam_with_cosine_annealing +from nemo.collections.llm.recipes.precision.mixed_precision import bf16_mixed +from nemo.collections.nlp.modules.common.tokenizer_utils import get_nmt_tokenizer +from nemo.utils.exp_manager import TimingCallback + +NAME = "mamba2_130m" + + +@run.cli.factory(name=NAME) +def tokenizer(tokenizer_model: str = None) -> run.Config[pl.LightningModule]: + + return run.Config( + get_nmt_tokenizer, + library='huggingface', + model_name="EleutherAI/gpt-neox-20b", + tokenizer_model=tokenizer_model, + use_fast=True, + ) + + +@run.cli.factory(name=NAME) +def model(tokenizer_model: str = None) -> run.Config[pl.LightningModule]: + """ + Factory function to create a Mamba2 130M model configuration. + + Returns: + run.Config[pl.LightningModule]: Configuration for the Mamba2 130M model. + + Examples: + CLI usage: + $ nemo llm pretrain model=mamba2_130m ... + + Python API usage: + >>> model_config = model() + >>> print(model_config) + """ + return run.Config( + llm.GPTModel, config=run.Config(llm.BaseMambaConfig130M), tokenizer=tokenizer(tokenizer_model=tokenizer_model) + ) + + +def trainer( + tensor_parallelism: int = 1, + pipeline_parallelism: int = 1, + pipeline_parallelism_type: Optional[torch.dtype] = None, + virtual_pipeline_parallelism: Optional[int] = None, + context_parallelism: int = 1, + sequence_parallelism: bool = False, + num_nodes: int = 1, + num_gpus_per_node: int = 8, + max_steps: int = 1168251, + callbacks: Optional[list[run.Config[Callback]]] = None, +) -> run.Config[nl.Trainer]: + """ + Configure the NeMo Lightning Trainer for Mamba2 130M model. 
+ + This function sets up the distributed training strategy and other training parameters. + + Args: + tensor_parallelism (int): Degree of tensor model parallelism. + pipeline_parallelism (int): Degree of pipeline model parallelism. + pipeline_parallelism_type (Optional[torch.dtype]): Data type for pipeline parallelism. + virtual_pipeline_parallelism (Optional[int]): Size of virtual pipeline parallelism. + context_parallelism (int): Degree of context parallelism. + sequence_parallelism (bool): Whether to use sequence parallelism. + num_nodes (int): Number of compute nodes to use. + num_gpus_per_node (int): Number of GPUs per node. + max_steps (int): Maximum number of training steps. + callbacks (Optional[list[run.Config[Callback]]]): List of callback configurations. + + Returns: + run.Config[nl.Trainer]: Configuration for the NeMo Lightning Trainer. + + Examples: + CLI usage: + $ nemo llm pretrain trainer=mamba2_130m ... + + Python API usage: + >>> trainer_config = trainer(num_nodes=1, num_gpus_per_node=1) + >>> print(trainer_config) + + Note: + For more information on distributed training strategies, refer to the + NeMo documentation on multi-GPU and multi-node training. + """ + strategy = run.Config( + nl.MegatronStrategy, + tensor_model_parallel_size=tensor_parallelism, + pipeline_model_parallel_size=pipeline_parallelism, + pipeline_dtype=pipeline_parallelism_type, + virtual_pipeline_model_parallel_size=virtual_pipeline_parallelism, + context_parallel_size=context_parallelism, + sequence_parallel=sequence_parallelism, + gradient_as_bucket_view=True, + ckpt_async_save=False, + ckpt_parallel_load=True, + ddp=run.Config( + DistributedDataParallelConfig, + check_for_nan_in_grad=True, + grad_reduce_in_fp32=True, + overlap_grad_reduce=True, + overlap_param_gather=True, + ), + ) + + trainer = run.Config( + nl.Trainer, + accelerator="gpu", + accumulate_grad_batches=1, + callbacks=callbacks, + devices=num_gpus_per_node, + limit_test_batches=50, + limit_val_batches=32, + log_every_n_steps=10, + max_steps=max_steps, + num_nodes=num_nodes, + plugins=bf16_mixed(), + strategy=strategy, + use_distributed_sampler=False, + val_check_interval=2000, + ) + + return trainer + + +@run.cli.factory(target=pretrain, name=NAME) +def pretrain_recipe( + dir: Optional[str] = None, + name: str = "default", + tokenizer_model: str = None, + num_nodes: int = 1, + num_gpus_per_node: int = 8, + fn=pretrain, +) -> run.Partial: + """ + Create a pre-training recipe for Mamba2 130M model. + + This function sets up a complete configuration for pre-training, including + model, trainer, data, logging, optimization, and resumption settings. + + Args: + dir (Optional[str]): Directory for saving logs and checkpoints. + name (str): Name of the pre-training run. + num_nodes (int): Number of compute nodes to use. + num_gpus_per_node (int): Number of GPUs per node. + fn (Callable): The pre-training function to use. + + Returns: + run.Partial: Partial configuration for pre-training. + + Examples: + CLI usage: + $ nemo llm pretrain --factory mamba2_130M + $ nemo llm pretrain --factory "mamba2_130M(num_nodes=1, name='my_pretrain')" + + Python API usage: + >>> recipe = pretrain_recipe(name="mamba2_130M_pretrain", num_nodes=1) + >>> print(recipe) + + Note: + For more details on pre-training LLMs with NeMo, see the pre-training + guide in the `examples/llm/pretrain/` directory. 
+ """ + return run.Partial( + fn, + model=model(), + trainer=trainer( + num_nodes=num_nodes, + num_gpus_per_node=num_gpus_per_node, + callbacks=[run.Config(TimingCallback)], + ), + data=run.Config( + MockDataModule, + seq_length=4096, + global_batch_size=8, + micro_batch_size=1, + tokenizer=tokenizer(tokenizer_model=tokenizer_model), + ), + log=default_log(dir=dir, name=name, tensorboard_logger=tensorboard_logger(name=name)), + optim=distributed_fused_adam_with_cosine_annealing(max_lr=3e-4), + resume=default_resume(), + ) + + +@run.cli.factory(target=finetune, name=NAME) +def finetune_recipe( + dir: Optional[str] = None, + name: str = "default", + resume_path: str = None, + tokenizer_model: str = None, + num_nodes: int = 1, + num_gpus_per_node: int = 8, + gbs: int = 8, + mbs: int = 1, + peft_scheme: Optional[str] = 'none', +) -> run.Partial: + """ + Create a fine-tuning recipe for Mamba2 130M model. + + This function sets up a complete configuration for fine-tuning, including + model, trainer, data, logging, optimization, and resumption settings. + + Args: + dir (Optional[str]): Directory for saving logs and checkpoints. + name (str): Name of the fine-tuning run. + resume_path (str): Path to the NeMo checkpoint (refer to notes below + on how to convert a pytorch checkpoint to NeMo) + tokenizer_model (str): Path to tokenizer model (defaults to None) + num_nodes (int): Number of compute nodes to use. + num_gpus_per_node (int): Number of GPUs per node. + Returns: + run.Partial: Partial configuration for fine-tuning. + + Examples: + CLI usage: + $ nemo llm finetune --factory mamba2_130m + + Python API usage: + >>> recipe = finetune_recipe(name="mamba2_130m_finetune", num_nodes=1) + >>> print(recipe) + + Note: + This recipe uses the SQuAD dataset for fine-tuning. For more information + on fine-tuning LLMs with NeMo, see the fine-tuning guide in the + `examples/llm/finetune/` directory. 
+ For converting an SSM pytorch checkpoint, use the following line of python code: + + llm.GPTModel(llm.BaseMambaConfig130M(), tokenizer=tokenizer()).import_ckpt( + path="pytorch://ABSOLUTE_PATH_TO_CKPT/your_pytorch_state_dict_file", + model_config=llm.BaseMambaConfig130M()) + This line will cache the nemo checkpoint to following directory: + /root/.cache/nemo/models/your_pytorch_state_dict_file + + """ + nemo_resume = run.Config( + nl.AutoResume, + restore_config=run.Config(nl.RestoreConfig, path=resume_path), + ) + strategy = run.Config( + nl.MegatronStrategy, + tensor_model_parallel_size=1, + pipeline_model_parallel_size=1, + gradient_as_bucket_view=True, + ckpt_load_optimizer=False, + ckpt_save_optimizer=False, + ckpt_async_save=False, + ) + checkpoint_callback = run.Config( + nl.ModelCheckpoint, + every_n_train_steps=10, + dirpath=dir, + ) + trainer = run.Config( + nl.Trainer, + accelerator="gpu", + accumulate_grad_batches=1, + devices=num_gpus_per_node, + limit_test_batches=10, + limit_val_batches=10, + log_every_n_steps=20, + max_steps=100, + num_nodes=num_nodes, + plugins=run.Config( + nl.MegatronMixedPrecision, + precision="bf16-mixed", + params_dtype=torch.bfloat16, + ), + callbacks=[checkpoint_callback], + strategy=strategy, + use_distributed_sampler=False, + val_check_interval=20, + ) + recipe = run.Partial( + llm.finetune, + model=model(tokenizer_model=tokenizer_model), + trainer=trainer, + data=run.Config( + llm.SquadDataModule, + seq_length=2048, + global_batch_size=gbs, + micro_batch_size=mbs, + tokenizer=tokenizer(tokenizer_model=tokenizer_model), + ), + log=llm.default_log(dir=dir, name=name, tensorboard_logger=tensorboard_logger(name=name)), + optim=distributed_fused_adam_with_cosine_annealing(max_lr=1e-4, min_lr=0, warmup_steps=50), + resume=nemo_resume, + ) + if peft_scheme is None or peft_scheme.lower() == 'none': + recipe.trainer.strategy.tensor_model_parallel_size = 1 + recipe.optim.config.lr = 5e-6 + else: + raise ValueError(f"Unrecognized peft scheme: {peft_scheme}") + return recipe diff --git a/nemo/collections/llm/recipes/mamba2_1_3b.py b/nemo/collections/llm/recipes/mamba2_1_3b.py new file mode 100644 index 000000000000..58eaf049b059 --- /dev/null +++ b/nemo/collections/llm/recipes/mamba2_1_3b.py @@ -0,0 +1,321 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
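[Editor's note] The new Mamba2 recipe modules (130M here, and 370M/780M/1.3B/2.7B/8B below) all follow the same pattern, so a single usage sketch suffices. It assumes the factory signatures exactly as added in mamba2_130m.py above; the checkpoint path is a placeholder, not a real file.

from nemo.collections.llm.recipes import mamba2_130m

# Pre-training: mock data module, HuggingFace gpt-neox tokenizer by default.
pretrain = mamba2_130m.pretrain_recipe(
    name="mamba2_130m_pretrain",
    num_nodes=1,
    num_gpus_per_node=8,
)

# Fine-tuning on SQuAD from a converted NeMo checkpoint (see the import_ckpt note in the
# docstring above for converting a PyTorch state dict first).
finetune = mamba2_130m.finetune_recipe(
    name="mamba2_130m_finetune",
    resume_path="/results/mamba2_130m_converted",  # placeholder path to the cached NeMo checkpoint
    tokenizer_model=None,                          # None falls back to the default HF tokenizer
    num_nodes=1,
    num_gpus_per_node=8,
    peft_scheme="none",
)

Either recipe can then be launched through the nemo llm CLI factories registered above, as the docstrings describe.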
+ + +from typing import Optional + +import nemo_run as run +import pytorch_lightning as pl +import torch +from megatron.core.distributed import DistributedDataParallelConfig +from pytorch_lightning.callbacks.callback import Callback + +from nemo import lightning as nl +from nemo.collections import llm +from nemo.collections.llm.api import finetune, pretrain +from nemo.collections.llm.gpt.data.mock import MockDataModule +from nemo.collections.llm.recipes.log.default import default_log, default_resume, tensorboard_logger +from nemo.collections.llm.recipes.optim.adam import distributed_fused_adam_with_cosine_annealing +from nemo.collections.llm.recipes.precision.mixed_precision import bf16_mixed +from nemo.collections.nlp.modules.common.tokenizer_utils import get_nmt_tokenizer +from nemo.utils.exp_manager import TimingCallback + +NAME = "mamba2_1_3b" + + +@run.cli.factory(name=NAME) +def tokenizer(tokenizer_model: str = None) -> run.Config[pl.LightningModule]: + + return run.Config( + get_nmt_tokenizer, + library='huggingface', + model_name="EleutherAI/gpt-neox-20b", + tokenizer_model=tokenizer_model, + use_fast=True, + ) + + +@run.cli.factory(name=NAME) +def model(tokenizer_model: str = None) -> run.Config[pl.LightningModule]: + """ + Factory function to create a Mamba2 1.3B model configuration. + + Returns: + run.Config[pl.LightningModule]: Configuration for the Mamba2 1.3B model. + + Examples: + CLI usage: + $ nemo llm pretrain model=mamba2_1_3B ... + + Python API usage: + >>> model_config = model() + >>> print(model_config) + """ + return run.Config( + llm.GPTModel, config=run.Config(llm.BaseMambaConfig1_3B), tokenizer=tokenizer(tokenizer_model=tokenizer_model) + ) + + +def trainer( + tensor_parallelism: int = 1, + pipeline_parallelism: int = 1, + pipeline_parallelism_type: Optional[torch.dtype] = None, + virtual_pipeline_parallelism: Optional[int] = None, + context_parallelism: int = 1, + sequence_parallelism: bool = False, + num_nodes: int = 1, + num_gpus_per_node: int = 8, + max_steps: int = 1168251, + callbacks: Optional[list[run.Config[Callback]]] = None, +) -> run.Config[nl.Trainer]: + """ + Configure the NeMo Lightning Trainer for Mamba2 1.3B model. + + This function sets up the distributed training strategy and other training parameters. + + Args: + tensor_parallelism (int): Degree of tensor model parallelism. + pipeline_parallelism (int): Degree of pipeline model parallelism. + pipeline_parallelism_type (Optional[torch.dtype]): Data type for pipeline parallelism. + virtual_pipeline_parallelism (Optional[int]): Size of virtual pipeline parallelism. + context_parallelism (int): Degree of context parallelism. + sequence_parallelism (bool): Whether to use sequence parallelism. + num_nodes (int): Number of compute nodes to use. + num_gpus_per_node (int): Number of GPUs per node. + max_steps (int): Maximum number of training steps. + callbacks (Optional[list[run.Config[Callback]]]): List of callback configurations. + + Returns: + run.Config[nl.Trainer]: Configuration for the NeMo Lightning Trainer. + + Examples: + CLI usage: + $ nemo llm pretrain trainer=mamba2_1_3b ... + + Python API usage: + >>> trainer_config = trainer(num_nodes=1, num_gpus_per_node=1) + >>> print(trainer_config) + + Note: + For more information on distributed training strategies, refer to the + NeMo documentation on multi-GPU and multi-node training. 
+ """ + strategy = run.Config( + nl.MegatronStrategy, + tensor_model_parallel_size=tensor_parallelism, + pipeline_model_parallel_size=pipeline_parallelism, + pipeline_dtype=pipeline_parallelism_type, + virtual_pipeline_model_parallel_size=virtual_pipeline_parallelism, + context_parallel_size=context_parallelism, + sequence_parallel=sequence_parallelism, + gradient_as_bucket_view=True, + ckpt_async_save=False, + ckpt_parallel_load=True, + ddp=run.Config( + DistributedDataParallelConfig, + check_for_nan_in_grad=True, + grad_reduce_in_fp32=True, + overlap_grad_reduce=True, + overlap_param_gather=True, + ), + ) + + trainer = run.Config( + nl.Trainer, + accelerator="gpu", + accumulate_grad_batches=1, + callbacks=callbacks, + devices=num_gpus_per_node, + limit_test_batches=50, + limit_val_batches=32, + log_every_n_steps=10, + max_steps=max_steps, + num_nodes=num_nodes, + plugins=bf16_mixed(), + strategy=strategy, + use_distributed_sampler=False, + val_check_interval=2000, + ) + + return trainer + + +@run.cli.factory(target=pretrain, name=NAME) +def pretrain_recipe( + dir: Optional[str] = None, + name: str = "default", + tokenizer_model: str = None, + num_nodes: int = 1, + num_gpus_per_node: int = 8, + fn=pretrain, +) -> run.Partial: + """ + Create a pre-training recipe for Mamba2 1.3B model. + + This function sets up a complete configuration for pre-training, including + model, trainer, data, logging, optimization, and resumption settings. + + Args: + dir (Optional[str]): Directory for saving logs and checkpoints. + name (str): Name of the pre-training run. + num_nodes (int): Number of compute nodes to use. + num_gpus_per_node (int): Number of GPUs per node. + fn (Callable): The pre-training function to use. + + Returns: + run.Partial: Partial configuration for pre-training. + + Examples: + CLI usage: + $ nemo llm pretrain --factory mamba2_1_3b + $ nemo llm pretrain --factory "mamba2_1_3b(num_nodes=1, name='my_pretrain')" + + Python API usage: + >>> recipe = pretrain_recipe(name="mamba2_1_3b_pretrain", num_nodes=1) + >>> print(recipe) + + Note: + For more details on pre-training LLMs with NeMo, see the pre-training + guide in the `examples/llm/pretrain/` directory. + """ + return run.Partial( + fn, + model=model(), + trainer=trainer( + num_nodes=num_nodes, + num_gpus_per_node=num_gpus_per_node, + callbacks=[run.Config(TimingCallback)], + ), + data=run.Config( + MockDataModule, + seq_length=4096, + global_batch_size=8, + micro_batch_size=1, + tokenizer=tokenizer(tokenizer_model=tokenizer_model), + ), + log=default_log(dir=dir, name=name, tensorboard_logger=tensorboard_logger(name=name)), + optim=distributed_fused_adam_with_cosine_annealing(max_lr=3e-4), + resume=default_resume(), + ) + + +@run.cli.factory(target=finetune, name=NAME) +def finetune_recipe( + dir: Optional[str] = None, + name: str = "default", + resume_path: str = None, + tokenizer_model: str = None, + num_nodes: int = 1, + num_gpus_per_node: int = 8, + gbs: int = 8, + mbs: int = 1, + peft_scheme: Optional[str] = 'none', +) -> run.Partial: + """ + Create a fine-tuning recipe for Mamba2 1.3B model. + + This function sets up a complete configuration for fine-tuning, including + model, trainer, data, logging, optimization, and resumption settings. + + Args: + dir (Optional[str]): Directory for saving logs and checkpoints. + name (str): Name of the fine-tuning run. 
+ resume_path (str): Path to the NeMo checkpoint (refer to notes below + on how to convert a pytorch checkpoint to NeMo) + tokenizer_model (str): Path to tokenizer model (defaults to None) + num_nodes (int): Number of compute nodes to use. + num_gpus_per_node (int): Number of GPUs per node. + Returns: + run.Partial: Partial configuration for fine-tuning. + + Examples: + CLI usage: + $ nemo llm finetune --factory mamba2_1_3b + + Python API usage: + >>> recipe = finetune_recipe(name="mamba2_1_3b_finetune", num_nodes=1) + >>> print(recipe) + + Note: + This recipe uses the SQuAD dataset for fine-tuning. For more information + on fine-tuning LLMs with NeMo, see the fine-tuning guide in the + `examples/llm/finetune/` directory. + For converting an SSM pytorch checkpoint, use the following line of python code: + + llm.GPTModel(llm.BaseMambaConfig1_3B(), tokenizer=tokenizer()).import_ckpt( + path="pytorch://ABSOLUTE_PATH_TO_CKPT/your_pytorch_state_dict_file", + model_config=llm.BaseMambaConfig1_3B()) + This line will cache the nemo checkpoint to following directory: + /root/.cache/nemo/models/your_pytorch_state_dict_file + + """ + nemo_resume = run.Config( + nl.AutoResume, + restore_config=run.Config(nl.RestoreConfig, path=resume_path), + ) + strategy = run.Config( + nl.MegatronStrategy, + tensor_model_parallel_size=1, + pipeline_model_parallel_size=1, + gradient_as_bucket_view=True, + ckpt_load_optimizer=False, + ckpt_save_optimizer=False, + ckpt_async_save=False, + ) + checkpoint_callback = run.Config( + nl.ModelCheckpoint, + every_n_train_steps=10, + dirpath=dir, + ) + trainer = run.Config( + nl.Trainer, + accelerator="gpu", + accumulate_grad_batches=1, + devices=num_gpus_per_node, + limit_test_batches=10, + limit_val_batches=10, + log_every_n_steps=20, + max_steps=100, + num_nodes=num_nodes, + plugins=run.Config( + nl.MegatronMixedPrecision, + precision="bf16-mixed", + params_dtype=torch.bfloat16, + ), + callbacks=[checkpoint_callback], + strategy=strategy, + use_distributed_sampler=False, + val_check_interval=20, + ) + recipe = run.Partial( + llm.finetune, + model=model(tokenizer_model=tokenizer_model), + trainer=trainer, + data=run.Config( + llm.SquadDataModule, + seq_length=2048, + global_batch_size=gbs, + micro_batch_size=mbs, + tokenizer=tokenizer(tokenizer_model=tokenizer_model), + ), + log=llm.default_log(dir=dir, name=name, tensorboard_logger=tensorboard_logger(name=name)), + optim=distributed_fused_adam_with_cosine_annealing(max_lr=1e-4, min_lr=0, warmup_steps=50), + resume=nemo_resume, + ) + if peft_scheme is None or peft_scheme.lower() == 'none': + recipe.trainer.strategy.tensor_model_parallel_size = 1 + recipe.optim.config.lr = 5e-6 + else: + raise ValueError(f"Unrecognized peft scheme: {peft_scheme}") + return recipe diff --git a/nemo/collections/llm/recipes/mamba2_2_7b.py b/nemo/collections/llm/recipes/mamba2_2_7b.py new file mode 100644 index 000000000000..5cb37c6a02a5 --- /dev/null +++ b/nemo/collections/llm/recipes/mamba2_2_7b.py @@ -0,0 +1,321 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+# See the License for the specific language governing permissions and +# limitations under the License. + + +from typing import Optional + +import nemo_run as run +import pytorch_lightning as pl +import torch +from megatron.core.distributed import DistributedDataParallelConfig +from pytorch_lightning.callbacks.callback import Callback + +from nemo import lightning as nl +from nemo.collections import llm +from nemo.collections.llm.api import finetune, pretrain +from nemo.collections.llm.gpt.data.mock import MockDataModule +from nemo.collections.llm.recipes.log.default import default_log, default_resume, tensorboard_logger +from nemo.collections.llm.recipes.optim.adam import distributed_fused_adam_with_cosine_annealing +from nemo.collections.llm.recipes.precision.mixed_precision import bf16_mixed +from nemo.collections.nlp.modules.common.tokenizer_utils import get_nmt_tokenizer +from nemo.utils.exp_manager import TimingCallback + +NAME = "mamba2_2_7b" + + +@run.cli.factory(name=NAME) +def tokenizer(tokenizer_model: str = None) -> run.Config[pl.LightningModule]: + + return run.Config( + get_nmt_tokenizer, + library='huggingface', + model_name="EleutherAI/gpt-neox-20b", + tokenizer_model=tokenizer_model, + use_fast=True, + ) + + +@run.cli.factory(name=NAME) +def model(tokenizer_model: str = None) -> run.Config[pl.LightningModule]: + """ + Factory function to create a Mamba2 2.7B model configuration. + + Returns: + run.Config[pl.LightningModule]: Configuration for the Mamba2 2.7B model. + + Examples: + CLI usage: + $ nemo llm pretrain model=mamba2_2_7B ... + + Python API usage: + >>> model_config = model() + >>> print(model_config) + """ + return run.Config( + llm.GPTModel, config=run.Config(llm.BaseMambaConfig2_7B), tokenizer=tokenizer(tokenizer_model=tokenizer_model) + ) + + +def trainer( + tensor_parallelism: int = 1, + pipeline_parallelism: int = 1, + pipeline_parallelism_type: Optional[torch.dtype] = None, + virtual_pipeline_parallelism: Optional[int] = None, + context_parallelism: int = 1, + sequence_parallelism: bool = False, + num_nodes: int = 1, + num_gpus_per_node: int = 8, + max_steps: int = 1168251, + callbacks: Optional[list[run.Config[Callback]]] = None, +) -> run.Config[nl.Trainer]: + """ + Configure the NeMo Lightning Trainer for Mamba2 2.7B model. + + This function sets up the distributed training strategy and other training parameters. + + Args: + tensor_parallelism (int): Degree of tensor model parallelism. + pipeline_parallelism (int): Degree of pipeline model parallelism. + pipeline_parallelism_type (Optional[torch.dtype]): Data type for pipeline parallelism. + virtual_pipeline_parallelism (Optional[int]): Size of virtual pipeline parallelism. + context_parallelism (int): Degree of context parallelism. + sequence_parallelism (bool): Whether to use sequence parallelism. + num_nodes (int): Number of compute nodes to use. + num_gpus_per_node (int): Number of GPUs per node. + max_steps (int): Maximum number of training steps. + callbacks (Optional[list[run.Config[Callback]]]): List of callback configurations. + + Returns: + run.Config[nl.Trainer]: Configuration for the NeMo Lightning Trainer. + + Examples: + CLI usage: + $ nemo llm pretrain trainer=mamba2_2_7b ... + + Python API usage: + >>> trainer_config = trainer(num_nodes=1, num_gpus_per_node=1) + >>> print(trainer_config) + + Note: + For more information on distributed training strategies, refer to the + NeMo documentation on multi-GPU and multi-node training. 
+ """ + strategy = run.Config( + nl.MegatronStrategy, + tensor_model_parallel_size=tensor_parallelism, + pipeline_model_parallel_size=pipeline_parallelism, + pipeline_dtype=pipeline_parallelism_type, + virtual_pipeline_model_parallel_size=virtual_pipeline_parallelism, + context_parallel_size=context_parallelism, + sequence_parallel=sequence_parallelism, + gradient_as_bucket_view=True, + ckpt_async_save=False, + ckpt_parallel_load=True, + ddp=run.Config( + DistributedDataParallelConfig, + check_for_nan_in_grad=True, + grad_reduce_in_fp32=True, + overlap_grad_reduce=True, + overlap_param_gather=True, + ), + ) + + trainer = run.Config( + nl.Trainer, + accelerator="gpu", + accumulate_grad_batches=1, + callbacks=callbacks, + devices=num_gpus_per_node, + limit_test_batches=50, + limit_val_batches=32, + log_every_n_steps=10, + max_steps=max_steps, + num_nodes=num_nodes, + plugins=bf16_mixed(), + strategy=strategy, + use_distributed_sampler=False, + val_check_interval=2000, + ) + + return trainer + + +@run.cli.factory(target=pretrain, name=NAME) +def pretrain_recipe( + dir: Optional[str] = None, + name: str = "default", + tokenizer_model: str = None, + num_nodes: int = 1, + num_gpus_per_node: int = 8, + fn=pretrain, +) -> run.Partial: + """ + Create a pre-training recipe for Mamba2 2.7B model. + + This function sets up a complete configuration for pre-training, including + model, trainer, data, logging, optimization, and resumption settings. + + Args: + dir (Optional[str]): Directory for saving logs and checkpoints. + name (str): Name of the pre-training run. + num_nodes (int): Number of compute nodes to use. + num_gpus_per_node (int): Number of GPUs per node. + fn (Callable): The pre-training function to use. + + Returns: + run.Partial: Partial configuration for pre-training. + + Examples: + CLI usage: + $ nemo llm pretrain --factory mamba2_2_7b + $ nemo llm pretrain --factory "mamba2_2_7b(num_nodes=1, name='my_pretrain')" + + Python API usage: + >>> recipe = pretrain_recipe(name="mamba2_2_7b_pretrain", num_nodes=1) + >>> print(recipe) + + Note: + For more details on pre-training LLMs with NeMo, see the pre-training + guide in the `examples/llm/pretrain/` directory. + """ + return run.Partial( + fn, + model=model(), + trainer=trainer( + num_nodes=num_nodes, + num_gpus_per_node=num_gpus_per_node, + callbacks=[run.Config(TimingCallback)], + ), + data=run.Config( + MockDataModule, + seq_length=4096, + global_batch_size=8, + micro_batch_size=1, + tokenizer=tokenizer(tokenizer_model=tokenizer_model), + ), + log=default_log(dir=dir, name=name, tensorboard_logger=tensorboard_logger(name=name)), + optim=distributed_fused_adam_with_cosine_annealing(max_lr=3e-4), + resume=default_resume(), + ) + + +@run.cli.factory(target=finetune, name=NAME) +def finetune_recipe( + dir: Optional[str] = None, + name: str = "default", + resume_path: str = None, + tokenizer_model: str = None, + num_nodes: int = 1, + num_gpus_per_node: int = 8, + gbs: int = 8, + mbs: int = 1, + peft_scheme: Optional[str] = 'none', +) -> run.Partial: + """ + Create a fine-tuning recipe for Mamba2 2.7B model. + + This function sets up a complete configuration for fine-tuning, including + model, trainer, data, logging, optimization, and resumption settings. + + Args: + dir (Optional[str]): Directory for saving logs and checkpoints. + name (str): Name of the fine-tuning run. 
+ resume_path (str): Path to the NeMo checkpoint (refer to notes below + on how to convert a pytorch checkpoint to NeMo) + tokenizer_model (str): Path to tokenizer model (defaults to None) + num_nodes (int): Number of compute nodes to use. + num_gpus_per_node (int): Number of GPUs per node. + Returns: + run.Partial: Partial configuration for fine-tuning. + + Examples: + CLI usage: + $ nemo llm finetune --factory mamba2_2_7b + + Python API usage: + >>> recipe = finetune_recipe(name="mamba2_2_7b_finetune", num_nodes=1) + >>> print(recipe) + + Note: + This recipe uses the SQuAD dataset for fine-tuning. For more information + on fine-tuning LLMs with NeMo, see the fine-tuning guide in the + `examples/llm/finetune/` directory. + For converting an SSM pytorch checkpoint, use the following line of python code: + + llm.GPTModel(llm.BaseMambaConfig2_7B(), tokenizer=tokenizer()).import_ckpt( + path="pytorch://ABSOLUTE_PATH_TO_CKPT/your_pytorch_state_dict_file", + model_config=llm.BaseMambaConfig2_7B()) + This line will cache the nemo checkpoint to following directory: + /root/.cache/nemo/models/your_pytorch_state_dict_file + + """ + nemo_resume = run.Config( + nl.AutoResume, + restore_config=run.Config(nl.RestoreConfig, path=resume_path), + ) + strategy = run.Config( + nl.MegatronStrategy, + tensor_model_parallel_size=1, + pipeline_model_parallel_size=1, + gradient_as_bucket_view=True, + ckpt_load_optimizer=False, + ckpt_save_optimizer=False, + ckpt_async_save=False, + ) + checkpoint_callback = run.Config( + nl.ModelCheckpoint, + every_n_train_steps=10, + dirpath=dir, + ) + trainer = run.Config( + nl.Trainer, + accelerator="gpu", + accumulate_grad_batches=1, + devices=num_gpus_per_node, + limit_test_batches=10, + limit_val_batches=10, + log_every_n_steps=20, + max_steps=100, + num_nodes=num_nodes, + plugins=run.Config( + nl.MegatronMixedPrecision, + precision="bf16-mixed", + params_dtype=torch.bfloat16, + ), + callbacks=[checkpoint_callback], + strategy=strategy, + use_distributed_sampler=False, + val_check_interval=20, + ) + recipe = run.Partial( + llm.finetune, + model=model(tokenizer_model=tokenizer_model), + trainer=trainer, + data=run.Config( + llm.SquadDataModule, + seq_length=2048, + global_batch_size=gbs, + micro_batch_size=mbs, + tokenizer=tokenizer(tokenizer_model=tokenizer_model), + ), + log=llm.default_log(dir=dir, name=name, tensorboard_logger=tensorboard_logger(name=name)), + optim=distributed_fused_adam_with_cosine_annealing(max_lr=1e-4, min_lr=0, warmup_steps=50), + resume=nemo_resume, + ) + if peft_scheme is None or peft_scheme.lower() == 'none': + recipe.trainer.strategy.tensor_model_parallel_size = 1 + recipe.optim.config.lr = 5e-6 + else: + raise ValueError(f"Unrecognized peft scheme: {peft_scheme}") + return recipe diff --git a/nemo/collections/llm/recipes/mamba2_370m.py b/nemo/collections/llm/recipes/mamba2_370m.py new file mode 100644 index 000000000000..bb8bddc4045a --- /dev/null +++ b/nemo/collections/llm/recipes/mamba2_370m.py @@ -0,0 +1,321 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+# See the License for the specific language governing permissions and +# limitations under the License. + + +from typing import Optional + +import nemo_run as run +import pytorch_lightning as pl +import torch +from megatron.core.distributed import DistributedDataParallelConfig +from pytorch_lightning.callbacks.callback import Callback + +from nemo import lightning as nl +from nemo.collections import llm +from nemo.collections.llm.api import finetune, pretrain +from nemo.collections.llm.gpt.data.mock import MockDataModule +from nemo.collections.llm.recipes.log.default import default_log, default_resume, tensorboard_logger +from nemo.collections.llm.recipes.optim.adam import distributed_fused_adam_with_cosine_annealing +from nemo.collections.llm.recipes.precision.mixed_precision import bf16_mixed +from nemo.collections.nlp.modules.common.tokenizer_utils import get_nmt_tokenizer +from nemo.utils.exp_manager import TimingCallback + +NAME = "mamba2_370m" + + +@run.cli.factory(name=NAME) +def tokenizer(tokenizer_model: str = None) -> run.Config[pl.LightningModule]: + + return run.Config( + get_nmt_tokenizer, + library='huggingface', + model_name="EleutherAI/gpt-neox-20b", + tokenizer_model=tokenizer_model, + use_fast=True, + ) + + +@run.cli.factory(name=NAME) +def model(tokenizer_model: str = None) -> run.Config[pl.LightningModule]: + """ + Factory function to create a Mamba2 370M model configuration. + + Returns: + run.Config[pl.LightningModule]: Configuration for the Mamba2 370M model. + + Examples: + CLI usage: + $ nemo llm pretrain model=mamba2_370m ... + + Python API usage: + >>> model_config = model() + >>> print(model_config) + """ + return run.Config( + llm.GPTModel, config=run.Config(llm.BaseMambaConfig370M), tokenizer=tokenizer(tokenizer_model=tokenizer_model) + ) + + +def trainer( + tensor_parallelism: int = 1, + pipeline_parallelism: int = 1, + pipeline_parallelism_type: Optional[torch.dtype] = None, + virtual_pipeline_parallelism: Optional[int] = None, + context_parallelism: int = 1, + sequence_parallelism: bool = False, + num_nodes: int = 1, + num_gpus_per_node: int = 8, + max_steps: int = 1168251, + callbacks: Optional[list[run.Config[Callback]]] = None, +) -> run.Config[nl.Trainer]: + """ + Configure the NeMo Lightning Trainer for Mamba2 370M model. + + This function sets up the distributed training strategy and other training parameters. + + Args: + tensor_parallelism (int): Degree of tensor model parallelism. + pipeline_parallelism (int): Degree of pipeline model parallelism. + pipeline_parallelism_type (Optional[torch.dtype]): Data type for pipeline parallelism. + virtual_pipeline_parallelism (Optional[int]): Size of virtual pipeline parallelism. + context_parallelism (int): Degree of context parallelism. + sequence_parallelism (bool): Whether to use sequence parallelism. + num_nodes (int): Number of compute nodes to use. + num_gpus_per_node (int): Number of GPUs per node. + max_steps (int): Maximum number of training steps. + callbacks (Optional[list[run.Config[Callback]]]): List of callback configurations. + + Returns: + run.Config[nl.Trainer]: Configuration for the NeMo Lightning Trainer. + + Examples: + CLI usage: + $ nemo llm pretrain trainer=mamba2_370m ... + + Python API usage: + >>> trainer_config = trainer(num_nodes=1, num_gpus_per_node=1) + >>> print(trainer_config) + + Note: + For more information on distributed training strategies, refer to the + NeMo documentation on multi-GPU and multi-node training. 
+ """ + strategy = run.Config( + nl.MegatronStrategy, + tensor_model_parallel_size=tensor_parallelism, + pipeline_model_parallel_size=pipeline_parallelism, + pipeline_dtype=pipeline_parallelism_type, + virtual_pipeline_model_parallel_size=virtual_pipeline_parallelism, + context_parallel_size=context_parallelism, + sequence_parallel=sequence_parallelism, + gradient_as_bucket_view=True, + ckpt_async_save=False, + ckpt_parallel_load=True, + ddp=run.Config( + DistributedDataParallelConfig, + check_for_nan_in_grad=True, + grad_reduce_in_fp32=True, + overlap_grad_reduce=True, + overlap_param_gather=True, + ), + ) + + trainer = run.Config( + nl.Trainer, + accelerator="gpu", + accumulate_grad_batches=1, + callbacks=callbacks, + devices=num_gpus_per_node, + limit_test_batches=50, + limit_val_batches=32, + log_every_n_steps=10, + max_steps=max_steps, + num_nodes=num_nodes, + plugins=bf16_mixed(), + strategy=strategy, + use_distributed_sampler=False, + val_check_interval=2000, + ) + + return trainer + + +@run.cli.factory(target=pretrain, name=NAME) +def pretrain_recipe( + dir: Optional[str] = None, + name: str = "default", + tokenizer_model: str = None, + num_nodes: int = 1, + num_gpus_per_node: int = 8, + fn=pretrain, +) -> run.Partial: + """ + Create a pre-training recipe for Mamba2 370M model. + + This function sets up a complete configuration for pre-training, including + model, trainer, data, logging, optimization, and resumption settings. + + Args: + dir (Optional[str]): Directory for saving logs and checkpoints. + name (str): Name of the pre-training run. + num_nodes (int): Number of compute nodes to use. + num_gpus_per_node (int): Number of GPUs per node. + fn (Callable): The pre-training function to use. + + Returns: + run.Partial: Partial configuration for pre-training. + + Examples: + CLI usage: + $ nemo llm pretrain --factory mamba2_370M + $ nemo llm pretrain --factory "mamba2_370M(num_nodes=1, name='my_pretrain')" + + Python API usage: + >>> recipe = pretrain_recipe(name="mamba2_370M_pretrain", num_nodes=1) + >>> print(recipe) + + Note: + For more details on pre-training LLMs with NeMo, see the pre-training + guide in the `examples/llm/pretrain/` directory. + """ + return run.Partial( + fn, + model=model(), + trainer=trainer( + num_nodes=num_nodes, + num_gpus_per_node=num_gpus_per_node, + callbacks=[run.Config(TimingCallback)], + ), + data=run.Config( + MockDataModule, + seq_length=4096, + global_batch_size=8, + micro_batch_size=1, + tokenizer=tokenizer(tokenizer_model=tokenizer_model), + ), + log=default_log(dir=dir, name=name, tensorboard_logger=tensorboard_logger(name=name)), + optim=distributed_fused_adam_with_cosine_annealing(max_lr=3e-4), + resume=default_resume(), + ) + + +@run.cli.factory(target=finetune, name=NAME) +def finetune_recipe( + dir: Optional[str] = None, + name: str = "default", + resume_path: str = None, + tokenizer_model: str = None, + num_nodes: int = 1, + num_gpus_per_node: int = 8, + gbs: int = 8, + mbs: int = 1, + peft_scheme: Optional[str] = 'none', +) -> run.Partial: + """ + Create a fine-tuning recipe for Mamba2 370M model. + + This function sets up a complete configuration for fine-tuning, including + model, trainer, data, logging, optimization, and resumption settings. + + Args: + dir (Optional[str]): Directory for saving logs and checkpoints. + name (str): Name of the fine-tuning run. 
+ resume_path (str): Path to the NeMo checkpoint (refer to notes below + on how to convert a pytorch checkpoint to NeMo) + tokenizer_model (str): Path to tokenizer model (defaults to None) + num_nodes (int): Number of compute nodes to use. + num_gpus_per_node (int): Number of GPUs per node. + Returns: + run.Partial: Partial configuration for fine-tuning. + + Examples: + CLI usage: + $ nemo llm finetune --factory mamba2_370m + + Python API usage: + >>> recipe = finetune_recipe(name="mamba2_370m_finetune", num_nodes=1) + >>> print(recipe) + + Note: + This recipe uses the SQuAD dataset for fine-tuning. For more information + on fine-tuning LLMs with NeMo, see the fine-tuning guide in the + `examples/llm/finetune/` directory. + For converting an SSM pytorch checkpoint, use the following line of python code: + + llm.GPTModel(llm.BaseMambaConfig370M(), tokenizer=tokenizer()).import_ckpt( + path="pytorch://ABSOLUTE_PATH_TO_CKPT/your_pytorch_state_dict_file", + model_config=llm.BaseMambaConfig370M()) + This line will cache the nemo checkpoint to following directory: + /root/.cache/nemo/models/your_pytorch_state_dict_file + + """ + nemo_resume = run.Config( + nl.AutoResume, + restore_config=run.Config(nl.RestoreConfig, path=resume_path), + ) + strategy = run.Config( + nl.MegatronStrategy, + tensor_model_parallel_size=1, + pipeline_model_parallel_size=1, + gradient_as_bucket_view=True, + ckpt_load_optimizer=False, + ckpt_save_optimizer=False, + ckpt_async_save=False, + ) + checkpoint_callback = run.Config( + nl.ModelCheckpoint, + every_n_train_steps=10, + dirpath=dir, + ) + trainer = run.Config( + nl.Trainer, + accelerator="gpu", + accumulate_grad_batches=1, + devices=num_gpus_per_node, + limit_test_batches=10, + limit_val_batches=10, + log_every_n_steps=20, + max_steps=100, + num_nodes=num_nodes, + plugins=run.Config( + nl.MegatronMixedPrecision, + precision="bf16-mixed", + params_dtype=torch.bfloat16, + ), + callbacks=[checkpoint_callback], + strategy=strategy, + use_distributed_sampler=False, + val_check_interval=20, + ) + recipe = run.Partial( + llm.finetune, + model=model(tokenizer_model=tokenizer_model), + trainer=trainer, + data=run.Config( + llm.SquadDataModule, + seq_length=2048, + global_batch_size=gbs, + micro_batch_size=mbs, + tokenizer=tokenizer(tokenizer_model=tokenizer_model), + ), + log=llm.default_log(dir=dir, name=name, tensorboard_logger=tensorboard_logger(name=name)), + optim=distributed_fused_adam_with_cosine_annealing(max_lr=1e-4, min_lr=0, warmup_steps=50), + resume=nemo_resume, + ) + if peft_scheme is None or peft_scheme.lower() == 'none': + recipe.trainer.strategy.tensor_model_parallel_size = 1 + recipe.optim.config.lr = 5e-6 + else: + raise ValueError(f"Unrecognized peft scheme: {peft_scheme}") + return recipe diff --git a/nemo/collections/llm/recipes/mamba2_780m.py b/nemo/collections/llm/recipes/mamba2_780m.py new file mode 100644 index 000000000000..2f6ab6717ae1 --- /dev/null +++ b/nemo/collections/llm/recipes/mamba2_780m.py @@ -0,0 +1,321 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+# See the License for the specific language governing permissions and +# limitations under the License. + + +from typing import Optional + +import nemo_run as run +import pytorch_lightning as pl +import torch +from megatron.core.distributed import DistributedDataParallelConfig +from pytorch_lightning.callbacks.callback import Callback + +from nemo import lightning as nl +from nemo.collections import llm +from nemo.collections.llm.api import finetune, pretrain +from nemo.collections.llm.gpt.data.mock import MockDataModule +from nemo.collections.llm.recipes.log.default import default_log, default_resume, tensorboard_logger +from nemo.collections.llm.recipes.optim.adam import distributed_fused_adam_with_cosine_annealing +from nemo.collections.llm.recipes.precision.mixed_precision import bf16_mixed +from nemo.collections.nlp.modules.common.tokenizer_utils import get_nmt_tokenizer +from nemo.utils.exp_manager import TimingCallback + +NAME = "mamba2_780m" + + +@run.cli.factory(name=NAME) +def tokenizer(tokenizer_model: str = None) -> run.Config[pl.LightningModule]: + + return run.Config( + get_nmt_tokenizer, + library='huggingface', + model_name="EleutherAI/gpt-neox-20b", + tokenizer_model=tokenizer_model, + use_fast=True, + ) + + +@run.cli.factory(name=NAME) +def model(tokenizer_model: str = None) -> run.Config[pl.LightningModule]: + """ + Factory function to create a Mamba2 780M model configuration. + + Returns: + run.Config[pl.LightningModule]: Configuration for the Mamba2 780M model. + + Examples: + CLI usage: + $ nemo llm pretrain model=mamba2_780m ... + + Python API usage: + >>> model_config = model() + >>> print(model_config) + """ + return run.Config( + llm.GPTModel, config=run.Config(llm.BaseMambaConfig780M), tokenizer=tokenizer(tokenizer_model=tokenizer_model) + ) + + +def trainer( + tensor_parallelism: int = 1, + pipeline_parallelism: int = 1, + pipeline_parallelism_type: Optional[torch.dtype] = None, + virtual_pipeline_parallelism: Optional[int] = None, + context_parallelism: int = 1, + sequence_parallelism: bool = False, + num_nodes: int = 1, + num_gpus_per_node: int = 8, + max_steps: int = 1168251, + callbacks: Optional[list[run.Config[Callback]]] = None, +) -> run.Config[nl.Trainer]: + """ + Configure the NeMo Lightning Trainer for Mamba2 780M model. + + This function sets up the distributed training strategy and other training parameters. + + Args: + tensor_parallelism (int): Degree of tensor model parallelism. + pipeline_parallelism (int): Degree of pipeline model parallelism. + pipeline_parallelism_type (Optional[torch.dtype]): Data type for pipeline parallelism. + virtual_pipeline_parallelism (Optional[int]): Size of virtual pipeline parallelism. + context_parallelism (int): Degree of context parallelism. + sequence_parallelism (bool): Whether to use sequence parallelism. + num_nodes (int): Number of compute nodes to use. + num_gpus_per_node (int): Number of GPUs per node. + max_steps (int): Maximum number of training steps. + callbacks (Optional[list[run.Config[Callback]]]): List of callback configurations. + + Returns: + run.Config[nl.Trainer]: Configuration for the NeMo Lightning Trainer. + + Examples: + CLI usage: + $ nemo llm pretrain trainer=mamba2_780m ... + + Python API usage: + >>> trainer_config = trainer(num_nodes=1, num_gpus_per_node=1) + >>> print(trainer_config) + + Note: + For more information on distributed training strategies, refer to the + NeMo documentation on multi-GPU and multi-node training. 
+ """ + strategy = run.Config( + nl.MegatronStrategy, + tensor_model_parallel_size=tensor_parallelism, + pipeline_model_parallel_size=pipeline_parallelism, + pipeline_dtype=pipeline_parallelism_type, + virtual_pipeline_model_parallel_size=virtual_pipeline_parallelism, + context_parallel_size=context_parallelism, + sequence_parallel=sequence_parallelism, + gradient_as_bucket_view=True, + ckpt_async_save=False, + ckpt_parallel_load=True, + ddp=run.Config( + DistributedDataParallelConfig, + check_for_nan_in_grad=True, + grad_reduce_in_fp32=True, + overlap_grad_reduce=True, + overlap_param_gather=True, + ), + ) + + trainer = run.Config( + nl.Trainer, + accelerator="gpu", + accumulate_grad_batches=1, + callbacks=callbacks, + devices=num_gpus_per_node, + limit_test_batches=50, + limit_val_batches=32, + log_every_n_steps=10, + max_steps=max_steps, + num_nodes=num_nodes, + plugins=bf16_mixed(), + strategy=strategy, + use_distributed_sampler=False, + val_check_interval=2000, + ) + + return trainer + + +@run.cli.factory(target=pretrain, name=NAME) +def pretrain_recipe( + dir: Optional[str] = None, + name: str = "default", + tokenizer_model: str = None, + num_nodes: int = 1, + num_gpus_per_node: int = 8, + fn=pretrain, +) -> run.Partial: + """ + Create a pre-training recipe for Mamba2 780M model. + + This function sets up a complete configuration for pre-training, including + model, trainer, data, logging, optimization, and resumption settings. + + Args: + dir (Optional[str]): Directory for saving logs and checkpoints. + name (str): Name of the pre-training run. + num_nodes (int): Number of compute nodes to use. + num_gpus_per_node (int): Number of GPUs per node. + fn (Callable): The pre-training function to use. + + Returns: + run.Partial: Partial configuration for pre-training. + + Examples: + CLI usage: + $ nemo llm pretrain --factory mamba2_780M + $ nemo llm pretrain --factory "mamba2_780M(num_nodes=1, name='my_pretrain')" + + Python API usage: + >>> recipe = pretrain_recipe(name="mamba2_780M_pretrain", num_nodes=1) + >>> print(recipe) + + Note: + For more details on pre-training LLMs with NeMo, see the pre-training + guide in the `examples/llm/pretrain/` directory. + """ + return run.Partial( + fn, + model=model(), + trainer=trainer( + num_nodes=num_nodes, + num_gpus_per_node=num_gpus_per_node, + callbacks=[run.Config(TimingCallback)], + ), + data=run.Config( + MockDataModule, + seq_length=4096, + global_batch_size=8, + micro_batch_size=1, + tokenizer=tokenizer(tokenizer_model=tokenizer_model), + ), + log=default_log(dir=dir, name=name, tensorboard_logger=tensorboard_logger(name=name)), + optim=distributed_fused_adam_with_cosine_annealing(max_lr=3e-4), + resume=default_resume(), + ) + + +@run.cli.factory(target=finetune, name=NAME) +def finetune_recipe( + dir: Optional[str] = None, + name: str = "default", + resume_path: str = None, + tokenizer_model: str = None, + num_nodes: int = 1, + num_gpus_per_node: int = 8, + gbs: int = 8, + mbs: int = 1, + peft_scheme: Optional[str] = 'none', +) -> run.Partial: + """ + Create a fine-tuning recipe for Mamba2 780M model. + + This function sets up a complete configuration for fine-tuning, including + model, trainer, data, logging, optimization, and resumption settings. + + Args: + dir (Optional[str]): Directory for saving logs and checkpoints. + name (str): Name of the fine-tuning run. 
+ resume_path (str): Path to the NeMo checkpoint (refer to notes below + on how to convert a pytorch checkpoint to NeMo) + tokenizer_model (str): Path to tokenizer model (defaults to None) + num_nodes (int): Number of compute nodes to use. + num_gpus_per_node (int): Number of GPUs per node. + Returns: + run.Partial: Partial configuration for fine-tuning. + + Examples: + CLI usage: + $ nemo llm finetune --factory mamba2_780m + + Python API usage: + >>> recipe = finetune_recipe(name="mamba2_780m_finetune", num_nodes=1) + >>> print(recipe) + + Note: + This recipe uses the SQuAD dataset for fine-tuning. For more information + on fine-tuning LLMs with NeMo, see the fine-tuning guide in the + `examples/llm/finetune/` directory. + For converting an SSM pytorch checkpoint, use the following line of python code: + + llm.GPTModel(llm.BaseMambaConfig780M(), tokenizer=tokenizer()).import_ckpt( + path="pytorch://ABSOLUTE_PATH_TO_CKPT/your_pytorch_state_dict_file", + model_config=llm.BaseMambaConfig780M()) + This line will cache the nemo checkpoint to following directory: + /root/.cache/nemo/models/your_pytorch_state_dict_file + + """ + nemo_resume = run.Config( + nl.AutoResume, + restore_config=run.Config(nl.RestoreConfig, path=resume_path), + ) + strategy = run.Config( + nl.MegatronStrategy, + tensor_model_parallel_size=1, + pipeline_model_parallel_size=1, + gradient_as_bucket_view=True, + ckpt_load_optimizer=False, + ckpt_save_optimizer=False, + ckpt_async_save=False, + ) + checkpoint_callback = run.Config( + nl.ModelCheckpoint, + every_n_train_steps=10, + dirpath=dir, + ) + trainer = run.Config( + nl.Trainer, + accelerator="gpu", + accumulate_grad_batches=1, + devices=num_gpus_per_node, + limit_test_batches=10, + limit_val_batches=10, + log_every_n_steps=20, + max_steps=100, + num_nodes=num_nodes, + plugins=run.Config( + nl.MegatronMixedPrecision, + precision="bf16-mixed", + params_dtype=torch.bfloat16, + ), + callbacks=[checkpoint_callback], + strategy=strategy, + use_distributed_sampler=False, + val_check_interval=20, + ) + recipe = run.Partial( + llm.finetune, + model=model(tokenizer_model=tokenizer_model), + trainer=trainer, + data=run.Config( + llm.SquadDataModule, + seq_length=2048, + global_batch_size=gbs, + micro_batch_size=mbs, + tokenizer=tokenizer(tokenizer_model=tokenizer_model), + ), + log=llm.default_log(dir=dir, name=name, tensorboard_logger=tensorboard_logger(name=name)), + optim=distributed_fused_adam_with_cosine_annealing(max_lr=1e-4, min_lr=0, warmup_steps=50), + resume=nemo_resume, + ) + if peft_scheme is None or peft_scheme.lower() == 'none': + recipe.trainer.strategy.tensor_model_parallel_size = 1 + recipe.optim.config.lr = 5e-6 + else: + raise ValueError(f"Unrecognized peft scheme: {peft_scheme}") + return recipe diff --git a/nemo/collections/llm/recipes/mamba2_8b.py b/nemo/collections/llm/recipes/mamba2_8b.py new file mode 100644 index 000000000000..58883deba732 --- /dev/null +++ b/nemo/collections/llm/recipes/mamba2_8b.py @@ -0,0 +1,321 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+# See the License for the specific language governing permissions and +# limitations under the License. + + +from typing import Optional + +import nemo_run as run +import pytorch_lightning as pl +import torch +from megatron.core.distributed import DistributedDataParallelConfig +from pytorch_lightning.callbacks.callback import Callback + +from nemo import lightning as nl +from nemo.collections import llm +from nemo.collections.llm.api import finetune, pretrain +from nemo.collections.llm.gpt.data.mock import MockDataModule +from nemo.collections.llm.recipes.log.default import default_log, default_resume, tensorboard_logger +from nemo.collections.llm.recipes.optim.adam import distributed_fused_adam_with_cosine_annealing +from nemo.collections.llm.recipes.precision.mixed_precision import bf16_mixed +from nemo.collections.nlp.modules.common.tokenizer_utils import get_nmt_tokenizer +from nemo.utils.exp_manager import TimingCallback + +NAME = "mamba2_8b" + + +@run.cli.factory(name=NAME) +def tokenizer(tokenizer_model: str = None) -> run.Config[pl.LightningModule]: + + return run.Config( + get_nmt_tokenizer, + library='megatron', + model_name="GPTSentencePieceTokenizer", + tokenizer_model=tokenizer_model, + use_fast=True, + ) + + +@run.cli.factory(name=NAME) +def model(tokenizer_model: str = None) -> run.Config[pl.LightningModule]: + """ + Factory function to create a Mamba2 8B model configuration. + + Returns: + run.Config[pl.LightningModule]: Configuration for the Mamba2 Hybrid 8B model. + + Examples: + CLI usage: + $ nemo llm pretrain model=mamba2_8b ... + + Python API usage: + >>> model_config = model() + >>> print(model_config) + """ + return run.Config( + llm.GPTModel, config=run.Config(llm.NVIDIAMambaConfig8B), tokenizer=tokenizer(tokenizer_model=tokenizer_model) + ) + + +def trainer( + tensor_parallelism: int = 8, + pipeline_parallelism: int = 1, + pipeline_parallelism_type: Optional[torch.dtype] = None, + virtual_pipeline_parallelism: Optional[int] = None, + context_parallelism: int = 1, + sequence_parallelism: bool = False, + num_nodes: int = 1, + num_gpus_per_node: int = 8, + max_steps: int = 1168251, + callbacks: Optional[list[run.Config[Callback]]] = None, +) -> run.Config[nl.Trainer]: + """ + Configure the NeMo Lightning Trainer for Mamba2 Hybrid 8B model. + + This function sets up the distributed training strategy and other training parameters. + + Args: + tensor_parallelism (int): Degree of tensor model parallelism. + pipeline_parallelism (int): Degree of pipeline model parallelism. + pipeline_parallelism_type (Optional[torch.dtype]): Data type for pipeline parallelism. + virtual_pipeline_parallelism (Optional[int]): Size of virtual pipeline parallelism. + context_parallelism (int): Degree of context parallelism. + sequence_parallelism (bool): Whether to use sequence parallelism. + num_nodes (int): Number of compute nodes to use. + num_gpus_per_node (int): Number of GPUs per node. + max_steps (int): Maximum number of training steps. + callbacks (Optional[list[run.Config[Callback]]]): List of callback configurations. + + Returns: + run.Config[nl.Trainer]: Configuration for the NeMo Lightning Trainer. + + Examples: + CLI usage: + $ nemo llm pretrain trainer=mamba2_8b ... + + Python API usage: + >>> trainer_config = trainer(num_nodes=1, num_gpus_per_node=1) + >>> print(trainer_config) + + Note: + For more information on distributed training strategies, refer to the + NeMo documentation on multi-GPU and multi-node training. 
+ """ + strategy = run.Config( + nl.MegatronStrategy, + tensor_model_parallel_size=tensor_parallelism, + pipeline_model_parallel_size=pipeline_parallelism, + pipeline_dtype=pipeline_parallelism_type, + virtual_pipeline_model_parallel_size=virtual_pipeline_parallelism, + context_parallel_size=context_parallelism, + sequence_parallel=sequence_parallelism, + gradient_as_bucket_view=True, + ckpt_async_save=False, + ckpt_parallel_load=True, + ddp=run.Config( + DistributedDataParallelConfig, + check_for_nan_in_grad=True, + grad_reduce_in_fp32=True, + overlap_grad_reduce=True, + overlap_param_gather=True, + ), + ) + + trainer = run.Config( + nl.Trainer, + accelerator="gpu", + accumulate_grad_batches=1, + callbacks=callbacks, + devices=num_gpus_per_node, + limit_test_batches=50, + limit_val_batches=32, + log_every_n_steps=10, + max_steps=max_steps, + num_nodes=num_nodes, + plugins=bf16_mixed(), + strategy=strategy, + use_distributed_sampler=False, + val_check_interval=2000, + ) + + return trainer + + +@run.cli.factory(target=pretrain, name=NAME) +def pretrain_recipe( + dir: Optional[str] = None, + name: str = "default", + tokenizer_model: str = None, + num_nodes: int = 1, + num_gpus_per_node: int = 8, + fn=pretrain, +) -> run.Partial: + """ + Create a pre-training recipe for Mamba2 Hybrid 8B model. + + This function sets up a complete configuration for pre-training, including + model, trainer, data, logging, optimization, and resumption settings. + + Args: + dir (Optional[str]): Directory for saving logs and checkpoints. + name (str): Name of the pre-training run. + num_nodes (int): Number of compute nodes to use. + num_gpus_per_node (int): Number of GPUs per node. + fn (Callable): The pre-training function to use. + + Returns: + run.Partial: Partial configuration for pre-training. + + Examples: + CLI usage: + $ nemo llm pretrain --factory mamba2_8b + $ nemo llm pretrain --factory "mamba2_8b(num_nodes=1, name='my_pretrain')" + + Python API usage: + >>> recipe = pretrain_recipe(name="mamba2_8b_pretrain", num_nodes=1) + >>> print(recipe) + + Note: + For more details on pre-training LLMs with NeMo, see the pre-training + guide in the `examples/llm/pretrain/` directory. + """ + return run.Partial( + fn, + model=model(), + trainer=trainer( + num_nodes=num_nodes, + num_gpus_per_node=num_gpus_per_node, + callbacks=[run.Config(TimingCallback)], + ), + data=run.Config( + MockDataModule, + seq_length=4096, + global_batch_size=8, + micro_batch_size=1, + tokenizer=tokenizer(tokenizer_model=tokenizer_model), + ), + log=default_log(dir=dir, name=name, tensorboard_logger=tensorboard_logger(name=name)), + optim=distributed_fused_adam_with_cosine_annealing(max_lr=3e-4), + resume=default_resume(), + ) + + +@run.cli.factory(target=finetune, name=NAME) +def finetune_recipe( + resume_path, + tokenizer_model, + dir: Optional[str] = None, + name: str = "default", + num_nodes: int = 1, + num_gpus_per_node: int = 8, + gbs: int = 8, + mbs: int = 1, + peft_scheme: Optional[str] = 'none', +) -> run.Partial: + """ + Create a fine-tuning recipe for Mamba2 8B model. + + This function sets up a complete configuration for fine-tuning, including + model, trainer, data, logging, optimization, and resumption settings. + + Args: + dir (Optional[str]): Directory for saving logs and checkpoints. + name (str): Name of the fine-tuning run. 
+ resume_path (str): Path to the NeMo checkpoint (refer to notes below + on how to convert a pytorch checkpoint to NeMo) + tokenizer_model (str): Path to tokenizer model (defaults to None) + num_nodes (int): Number of compute nodes to use. + num_gpus_per_node (int): Number of GPUs per node. + Returns: + run.Partial: Partial configuration for fine-tuning. + + Examples: + CLI usage: + $ nemo llm finetune --factory mamba2_8b + + Python API usage: + >>> recipe = finetune_recipe(name="mamba2_8b_finetune", num_nodes=1) + >>> print(recipe) + + Note: + This recipe uses the SQuAD dataset for fine-tuning. For more information + on fine-tuning LLMs with NeMo, see the fine-tuning guide in the + `examples/llm/finetune/` directory. + For converting an SSM pytorch checkpoint, use the following line of python code: + + llm.GPTModel(llm.NVIDIAMambaConfig8B(), tokenizer=tokenizer(tokenizer_model=tokenizer_model)).import_ckpt( + path="pytorch://ABSOLUTE_PATH_TO_CKPT/your_pytorch_state_dict_file", + model_config=llm.NVIDIAMambaConfig8B()) + This line will cache the nemo checkpoint to following directory: + /root/.cache/nemo/models/your_pytorch_state_dict_file + + """ + nemo_resume = run.Config( + nl.AutoResume, + restore_config=run.Config(nl.RestoreConfig, path=resume_path), + ) + strategy = run.Config( + nl.MegatronStrategy, + tensor_model_parallel_size=8, + pipeline_model_parallel_size=1, + gradient_as_bucket_view=True, + ckpt_load_optimizer=False, + ckpt_save_optimizer=False, + ckpt_async_save=False, + ) + checkpoint_callback = run.Config( + nl.ModelCheckpoint, + every_n_train_steps=10, + dirpath=dir, + ) + trainer = run.Config( + nl.Trainer, + accelerator="gpu", + accumulate_grad_batches=1, + devices=num_gpus_per_node, + limit_test_batches=10, + limit_val_batches=10, + log_every_n_steps=20, + max_steps=100, + num_nodes=num_nodes, + plugins=run.Config( + nl.MegatronMixedPrecision, + precision="bf16-mixed", + params_dtype=torch.bfloat16, + ), + callbacks=[checkpoint_callback], + strategy=strategy, + use_distributed_sampler=False, + val_check_interval=20, + ) + recipe = run.Partial( + llm.finetune, + model=model(tokenizer_model=tokenizer_model), + trainer=trainer, + data=run.Config( + llm.SquadDataModule, + seq_length=2048, + global_batch_size=gbs, + micro_batch_size=mbs, + tokenizer=tokenizer(tokenizer_model=tokenizer_model), + ), + log=llm.default_log(dir=dir, name=name, tensorboard_logger=tensorboard_logger(name=name)), + optim=distributed_fused_adam_with_cosine_annealing(max_lr=1e-4, min_lr=0, warmup_steps=50), + resume=nemo_resume, + ) + if peft_scheme is None or peft_scheme.lower() == 'none': + recipe.trainer.strategy.tensor_model_parallel_size = 8 + recipe.optim.config.lr = 5e-6 + else: + raise ValueError(f"Unrecognized peft scheme: {peft_scheme}") + return recipe diff --git a/nemo/collections/llm/recipes/mamba2_hybrid_8b.py b/nemo/collections/llm/recipes/mamba2_hybrid_8b.py new file mode 100644 index 000000000000..eff37da46fca --- /dev/null +++ b/nemo/collections/llm/recipes/mamba2_hybrid_8b.py @@ -0,0 +1,323 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+
+from typing import Optional
+
+import nemo_run as run
+import pytorch_lightning as pl
+import torch
+from megatron.core.distributed import DistributedDataParallelConfig
+from pytorch_lightning.callbacks.callback import Callback
+
+from nemo import lightning as nl
+from nemo.collections import llm
+from nemo.collections.llm.api import finetune, pretrain
+from nemo.collections.llm.gpt.data.mock import MockDataModule
+from nemo.collections.llm.recipes.log.default import default_log, default_resume, tensorboard_logger
+from nemo.collections.llm.recipes.optim.adam import distributed_fused_adam_with_cosine_annealing
+from nemo.collections.llm.recipes.precision.mixed_precision import bf16_mixed
+from nemo.collections.nlp.modules.common.tokenizer_utils import get_nmt_tokenizer
+from nemo.utils.exp_manager import TimingCallback
+
+NAME = "mamba2_hybrid_8b"
+
+
+@run.cli.factory(name=NAME)
+def tokenizer(tokenizer_model: str = None) -> run.Config[pl.LightningModule]:
+
+    return run.Config(
+        get_nmt_tokenizer,
+        library='megatron',
+        model_name="GPTSentencePieceTokenizer",
+        tokenizer_model=tokenizer_model,
+        use_fast=True,
+    )
+
+
+@run.cli.factory(name=NAME)
+def model(tokenizer_model: str = None) -> run.Config[pl.LightningModule]:
+    """
+    Factory function to create a Mamba2 Hybrid 8B model configuration.
+
+    Returns:
+        run.Config[pl.LightningModule]: Configuration for the Mamba2 Hybrid 8B model.
+
+    Examples:
+        CLI usage:
+            $ nemo llm pretrain model=mamba2_hybrid_8b ...
+
+        Python API usage:
+            >>> model_config = model()
+            >>> print(model_config)
+    """
+    return run.Config(
+        llm.GPTModel,
+        config=run.Config(llm.NVIDIAMambaHybridConfig8B),
+        tokenizer=tokenizer(tokenizer_model=tokenizer_model),
+    )
+
+
+def trainer(
+    tensor_parallelism: int = 8,
+    pipeline_parallelism: int = 1,
+    pipeline_parallelism_type: Optional[torch.dtype] = None,
+    virtual_pipeline_parallelism: Optional[int] = None,
+    context_parallelism: int = 1,
+    sequence_parallelism: bool = False,
+    num_nodes: int = 1,
+    num_gpus_per_node: int = 8,
+    max_steps: int = 1168251,
+    callbacks: Optional[list[run.Config[Callback]]] = None,
+) -> run.Config[nl.Trainer]:
+    """
+    Configure the NeMo Lightning Trainer for Mamba2 Hybrid 8B model.
+
+    This function sets up the distributed training strategy and other training parameters.
+
+    Args:
+        tensor_parallelism (int): Degree of tensor model parallelism.
+        pipeline_parallelism (int): Degree of pipeline model parallelism.
+        pipeline_parallelism_type (Optional[torch.dtype]): Data type for pipeline parallelism.
+        virtual_pipeline_parallelism (Optional[int]): Size of virtual pipeline parallelism.
+        context_parallelism (int): Degree of context parallelism.
+        sequence_parallelism (bool): Whether to use sequence parallelism.
+        num_nodes (int): Number of compute nodes to use.
+        num_gpus_per_node (int): Number of GPUs per node.
+        max_steps (int): Maximum number of training steps.
+        callbacks (Optional[list[run.Config[Callback]]]): List of callback configurations.
+
+    Returns:
+        run.Config[nl.Trainer]: Configuration for the NeMo Lightning Trainer.
+ + Examples: + CLI usage: + $ nemo llm pretrain trainer=mamba2_hybrid_8b ... + + Python API usage: + >>> trainer_config = trainer(num_nodes=1, num_gpus_per_node=1) + >>> print(trainer_config) + + Note: + For more information on distributed training strategies, refer to the + NeMo documentation on multi-GPU and multi-node training. + """ + strategy = run.Config( + nl.MegatronStrategy, + tensor_model_parallel_size=tensor_parallelism, + pipeline_model_parallel_size=pipeline_parallelism, + pipeline_dtype=pipeline_parallelism_type, + virtual_pipeline_model_parallel_size=virtual_pipeline_parallelism, + context_parallel_size=context_parallelism, + sequence_parallel=sequence_parallelism, + gradient_as_bucket_view=True, + ckpt_async_save=False, + ckpt_parallel_load=True, + ddp=run.Config( + DistributedDataParallelConfig, + check_for_nan_in_grad=True, + grad_reduce_in_fp32=True, + overlap_grad_reduce=True, + overlap_param_gather=True, + ), + ) + + trainer = run.Config( + nl.Trainer, + accelerator="gpu", + accumulate_grad_batches=1, + callbacks=callbacks, + devices=num_gpus_per_node, + limit_test_batches=50, + limit_val_batches=32, + log_every_n_steps=10, + max_steps=max_steps, + num_nodes=num_nodes, + plugins=bf16_mixed(), + strategy=strategy, + use_distributed_sampler=False, + val_check_interval=2000, + ) + + return trainer + + +@run.cli.factory(target=pretrain, name=NAME) +def pretrain_recipe( + dir: Optional[str] = None, + name: str = "default", + tokenizer_model: str = None, + num_nodes: int = 1, + num_gpus_per_node: int = 8, + fn=pretrain, +) -> run.Partial: + """ + Create a pre-training recipe for Mamba2 Hybrid 8B model. + + This function sets up a complete configuration for pre-training, including + model, trainer, data, logging, optimization, and resumption settings. + + Args: + dir (Optional[str]): Directory for saving logs and checkpoints. + name (str): Name of the pre-training run. + num_nodes (int): Number of compute nodes to use. + num_gpus_per_node (int): Number of GPUs per node. + fn (Callable): The pre-training function to use. + + Returns: + run.Partial: Partial configuration for pre-training. + + Examples: + CLI usage: + $ nemo llm pretrain --factory mamba2_hybrid_8b + $ nemo llm pretrain --factory "mamba2_hybrid_8b(num_nodes=1, name='my_pretrain')" + + Python API usage: + >>> recipe = pretrain_recipe(name="mamba2_hybrid_8b_pretrain", num_nodes=1) + >>> print(recipe) + + Note: + For more details on pre-training LLMs with NeMo, see the pre-training + guide in the `examples/llm/pretrain/` directory. + """ + return run.Partial( + fn, + model=model(), + trainer=trainer( + num_nodes=num_nodes, + num_gpus_per_node=num_gpus_per_node, + callbacks=[run.Config(TimingCallback)], + ), + data=run.Config( + MockDataModule, + seq_length=4096, + global_batch_size=8, + micro_batch_size=1, + tokenizer=tokenizer(tokenizer_model=tokenizer_model), + ), + log=default_log(dir=dir, name=name, tensorboard_logger=tensorboard_logger(name=name)), + optim=distributed_fused_adam_with_cosine_annealing(max_lr=3e-4), + resume=default_resume(), + ) + + +@run.cli.factory(target=finetune, name=NAME) +def finetune_recipe( + resume_path, + tokenizer_model, + dir: Optional[str] = None, + name: str = "default", + num_nodes: int = 1, + num_gpus_per_node: int = 8, + gbs: int = 8, + mbs: int = 1, + peft_scheme: Optional[str] = 'none', +) -> run.Partial: + """ + Create a fine-tuning recipe for Mamba2 Hybrid 8B model. 
+ + This function sets up a complete configuration for fine-tuning, including + model, trainer, data, logging, optimization, and resumption settings. + + Args: + dir (Optional[str]): Directory for saving logs and checkpoints. + name (str): Name of the fine-tuning run. + resume_path (str): Path to the NeMo checkpoint (refer to notes below + on how to convert a pytorch checkpoint to NeMo) + tokenizer_model (str): Path to tokenizer model (defaults to None) + num_nodes (int): Number of compute nodes to use. + num_gpus_per_node (int): Number of GPUs per node. + Returns: + run.Partial: Partial configuration for fine-tuning. + + Examples: + CLI usage: + $ nemo llm finetune --factory mamba2_hybrid_8b + + Python API usage: + >>> recipe = finetune_recipe(name="mamba2_hybrid_8b_finetune", num_nodes=1) + >>> print(recipe) + + Note: + This recipe uses the SQuAD dataset for fine-tuning. For more information + on fine-tuning LLMs with NeMo, see the fine-tuning guide in the + `examples/llm/finetune/` directory. + For converting an SSM pytorch checkpoint, use the following line of python code: + + llm.GPTModel(llm.NVIDIAMambaHybridConfig8B(), tokenizer=tokenizer(tokenizer_model=tokenizer_model)).import_ckpt( + path="pytorch://ABSOLUTE_PATH_TO_CKPT/your_pytorch_state_dict_file", + model_config=llm.NVIDIAMambaHybridConfig8B()) + This line will cache the nemo checkpoint to following directory: + /root/.cache/nemo/models/your_pytorch_state_dict_file + + """ + nemo_resume = run.Config( + nl.AutoResume, + restore_config=run.Config(nl.RestoreConfig, path=resume_path), + ) + strategy = run.Config( + nl.MegatronStrategy, + tensor_model_parallel_size=8, + pipeline_model_parallel_size=1, + gradient_as_bucket_view=True, + ckpt_load_optimizer=False, + ckpt_save_optimizer=False, + ckpt_async_save=False, + ) + checkpoint_callback = run.Config( + nl.ModelCheckpoint, + every_n_train_steps=10, + dirpath=dir, + ) + trainer = run.Config( + nl.Trainer, + accelerator="gpu", + accumulate_grad_batches=1, + devices=num_gpus_per_node, + limit_test_batches=10, + limit_val_batches=10, + log_every_n_steps=20, + max_steps=100, + num_nodes=num_nodes, + plugins=run.Config( + nl.MegatronMixedPrecision, + precision="bf16-mixed", + params_dtype=torch.bfloat16, + ), + callbacks=[checkpoint_callback], + strategy=strategy, + use_distributed_sampler=False, + val_check_interval=20, + ) + recipe = run.Partial( + llm.finetune, + model=model(tokenizer_model=tokenizer_model), + trainer=trainer, + data=run.Config( + llm.SquadDataModule, + seq_length=2048, + global_batch_size=gbs, + micro_batch_size=mbs, + tokenizer=tokenizer(tokenizer_model=tokenizer_model), + ), + log=llm.default_log(dir=dir, name=name, tensorboard_logger=tensorboard_logger(name=name)), + optim=distributed_fused_adam_with_cosine_annealing(max_lr=1e-4, min_lr=0, warmup_steps=50), + resume=nemo_resume, + ) + if peft_scheme is None or peft_scheme.lower() == 'none': + recipe.trainer.strategy.tensor_model_parallel_size = 8 + recipe.optim.config.lr = 5e-6 + else: + raise ValueError(f"Unrecognized peft scheme: {peft_scheme}") + return recipe diff --git a/nemo/collections/llm/recipes/mistral.py b/nemo/collections/llm/recipes/mistral_7b.py similarity index 99% rename from nemo/collections/llm/recipes/mistral.py rename to nemo/collections/llm/recipes/mistral_7b.py index 2b8c42e54ee7..6e82df598140 100644 --- a/nemo/collections/llm/recipes/mistral.py +++ b/nemo/collections/llm/recipes/mistral_7b.py @@ -33,7 +33,7 @@ from nemo.collections.llm.recipes.precision.mixed_precision import bf16_mixed 
from nemo.utils.exp_manager import TimingCallback -NAME = "mistral" +NAME = "mistral_7b" @run.cli.factory(name=NAME) diff --git a/nemo/collections/llm/recipes/mistral_nemo_12b.py b/nemo/collections/llm/recipes/mistral_nemo_12b.py new file mode 100644 index 000000000000..e74fa5435b62 --- /dev/null +++ b/nemo/collections/llm/recipes/mistral_nemo_12b.py @@ -0,0 +1,285 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + + +from typing import Callable, Optional + +import nemo_run as run +import pytorch_lightning as pl +import torch +from megatron.core.distributed import DistributedDataParallelConfig +from pytorch_lightning.callbacks.callback import Callback + +from nemo import lightning as nl +from nemo.collections.llm.api import finetune, pretrain +from nemo.collections.llm.gpt.data.mock import MockDataModule +from nemo.collections.llm.gpt.data.squad import SquadDataModule +from nemo.collections.llm.gpt.model.mistral import MistralModel, MistralNeMoConfig12B +from nemo.collections.llm.peft.lora import LoRA +from nemo.collections.llm.recipes.finetune_default import default_finetune_recipe +from nemo.collections.llm.recipes.log.default import default_log, default_resume, tensorboard_logger +from nemo.collections.llm.recipes.optim.adam import distributed_fused_adam_with_cosine_annealing +from nemo.collections.llm.recipes.precision.mixed_precision import bf16_mixed +from nemo.lightning.pytorch.callbacks.megatron_comm_overlap import MegatronCommOverlapCallback +from nemo.utils.exp_manager import TimingCallback + +NAME = "mistral_nemo_base_12b" + + +@run.cli.factory(name=NAME) +def model() -> run.Config[pl.LightningModule]: + """ + Factory function to create a Mistral-Nemo-Base-12B model configuration. + + Returns: + run.Config[pl.LightningModule]: Configuration for the Mistral-Nemo-Base-12B model. + + Examples: + CLI usage: + $ nemo llm pretrain model=mistral_nemo_base_12b ... + + Python API usage: + >>> model_config = model() + >>> print(model_config) + """ + return run.Config(MistralModel, config=run.Config(MistralNeMoConfig12B)) + + +def trainer( + tensor_parallelism: int = 2, + pipeline_parallelism: int = 1, + pipeline_parallelism_type: Optional[torch.dtype] = None, + virtual_pipeline_parallelism: Optional[int] = None, + context_parallelism: int = 2, + sequence_parallelism: bool = True, + num_nodes: int = 1, + num_gpus_per_node: int = 8, + max_steps: int = 1168251, + callbacks: Optional[list[run.Config[Callback]]] = None, +) -> run.Config[nl.Trainer]: + """ + Configure the NeMo Lightning Trainer for Mistral-Nemo-Base-12B model. + + This function sets up the distributed training strategy and other training parameters. + + Args: + tensor_parallelism (int): Degree of tensor model parallelism. + pipeline_parallelism (int): Degree of pipeline model parallelism. + pipeline_parallelism_type (Optional[torch.dtype]): Data type for pipeline parallelism. + virtual_pipeline_parallelism (Optional[int]): Size of virtual pipeline parallelism. 
+ context_parallelism (int): Degree of context parallelism. + sequence_parallelism (bool): Whether to use sequence parallelism. + num_nodes (int): Number of compute nodes to use. + num_gpus_per_node (int): Number of GPUs per node. + max_steps (int): Maximum number of training steps. + callbacks (Optional[list[run.Config[Callback]]]): List of callback configurations. + + Returns: + run.Config[nl.Trainer]: Configuration for the NeMo Lightning Trainer. + + Examples: + CLI usage: + $ nemo llm pretrain trainer=mistral_nemo_base_12b ... + + Python API usage: + >>> trainer_config = trainer(num_nodes=2, num_gpus_per_node=8) + >>> print(trainer_config) + + Note: + For more information on distributed training strategies, refer to the + NeMo documentation on multi-GPU and multi-node training. + """ + strategy = run.Config( + nl.MegatronStrategy, + tensor_model_parallel_size=tensor_parallelism, + pipeline_model_parallel_size=pipeline_parallelism, + pipeline_dtype=pipeline_parallelism_type, + virtual_pipeline_model_parallel_size=virtual_pipeline_parallelism, + context_parallel_size=context_parallelism, + sequence_parallel=sequence_parallelism, + gradient_as_bucket_view=True, + ckpt_async_save=True, + ckpt_parallel_load=True, + ddp=run.Config( + DistributedDataParallelConfig, + check_for_nan_in_grad=True, + grad_reduce_in_fp32=True, + overlap_grad_reduce=True, + overlap_param_gather=True, + ), + ) + + trainer = run.Config( + nl.Trainer, + accelerator="gpu", + accumulate_grad_batches=1, + callbacks=callbacks, + devices=num_gpus_per_node, + limit_test_batches=50, + limit_val_batches=32, + log_every_n_steps=10, + max_steps=max_steps, + num_nodes=num_nodes, + plugins=bf16_mixed(), + strategy=strategy, + use_distributed_sampler=False, + val_check_interval=2000, + ) + + return trainer + + +@run.cli.factory(target=pretrain, name=NAME) +def pretrain_recipe( + dir: Optional[str] = None, name: str = "default", num_nodes: int = 1, num_gpus_per_node: int = 8, fn=pretrain +) -> run.Partial: + """ + Create a pre-training recipe for Mistral-Nemo-Base-12B model. + + This function sets up a complete configuration for pre-training, including + model, trainer, data, logging, optimization, and resumption settings. + + Args: + dir (Optional[str]): Directory for saving logs and checkpoints. + name (str): Name of the pre-training run. + num_nodes (int): Number of compute nodes to use. + num_gpus_per_node (int): Number of GPUs per node. + fn (Callable): The pre-training function to use. + + Returns: + run.Partial: Partial configuration for pre-training. + + Examples: + CLI usage: + $ nemo llm pretrain --factory mistral_nemo_base_12b + $ nemo llm pretrain --factory "mistral_nemo_base_12b(num_nodes=2, name='my_pretrain')" + + Python API usage: + >>> recipe = pretrain_recipe(name="mistral_nemo_base_12b", num_nodes=2) + >>> print(recipe) + + Note: + For more details on pre-training LLMs with NeMo, see the pre-training + guide in the `examples/llm/pretrain/` directory. 
+ """ + return run.Partial( + fn, + model=model(), + trainer=trainer( + num_nodes=num_nodes, + num_gpus_per_node=num_gpus_per_node, + callbacks=[run.Config(TimingCallback)], + ), + data=run.Config(MockDataModule, seq_length=8192, global_batch_size=512, micro_batch_size=1), + log=default_log(dir=dir, name=name, tensorboard_logger=tensorboard_logger(name=name)), + optim=distributed_fused_adam_with_cosine_annealing(max_lr=3e-4), + resume=default_resume(), + ) + + +@run.cli.factory(target=pretrain, name=NAME + "_optimized") +def pretrain_recipe_performance( + dir: Optional[str] = None, + name: str = "default", + num_nodes: int = 1, + num_gpus_per_node: int = 8, + fn: Callable = pretrain, +) -> run.Partial: + """ + Create a performance-optimized pre-training recipe for Mistral-Nemo-Base-12B model. + + This recipe enables performance optimizations that may not be suitable for all use cases. + It builds upon the standard pre-training recipe and adds additional performance enhancements. + + Args: + dir (Optional[str]): Directory for saving logs and checkpoints. + name (str): Name of the pre-training run. + num_nodes (int): Number of compute nodes to use. + num_gpus_per_node (int): Number of GPUs per node. + fn (Callable): The pre-training function to use. + + Returns: + run.Partial: Partial configuration for performance-optimized pre-training. + + Examples: + $ nemo llm pretrain --factory mistral_nemo_base_12b_optimized + + Python API usage: + >>> recipe = pretrain_recipe_performance(name="mistral_nemo_base_12b_perf", num_nodes=4) + >>> print(recipe) + + Note: + Use this recipe with caution and only when you need maximum performance. + It may not be suitable for all hardware configurations or use cases. + """ + recipe = pretrain_recipe(name=name, dir=dir, num_nodes=num_nodes, num_gpus_per_node=num_gpus_per_node, fn=fn) + + recipe.trainer.callbacks.append( + run.Config( + MegatronCommOverlapCallback, + tp_comm_overlap=True, + ) + ) + return recipe + + +@run.cli.factory(target=finetune, name=NAME) +def finetune_recipe( + dir: Optional[str] = None, + name: str = "default", + num_nodes: int = 1, + num_gpus_per_node: int = 8, + peft_scheme: Optional[str] = 'lora', +) -> run.Partial: + """ + Create a fine-tuning recipe for Mistral-Nemo-Base-12B model. + + This function sets up a complete configuration for fine-tuning, including + model, trainer, data, logging, optimization, and resumption settings. + The recipe uses LoRA (Low-Rank Adaptation) for efficient fine-tuning, unless peft_scheme is set to None. + + Args: + dir (Optional[str]): Directory for saving logs and checkpoints. + name (str): Name of the fine-tuning run. + num_nodes (int): Number of compute nodes to use. + num_gpus_per_node (int): Number of GPUs per node. + peft_scheme (Optional[str]): Name of the peft scheme to use for fine-tuning. Allowed values: 'lora', 'none'/None. + + Returns: + run.Partial: Partial configuration for fine-tuning. + + Examples: + CLI usage: + $ nemo llm finetune --factory mistral_nemo_base_12b + + Python API usage: + >>> recipe = finetune_recipe(name="mistral_nemo_base_12b_finetune", num_nodes=2) + >>> print(recipe) + + Note: + This recipe uses the SQuAD dataset for fine-tuning. For more information + on fine-tuning LLMs with NeMo, see the fine-tuning guide in the + `examples/llm/finetune/` directory. 
+ """ + recipe = default_finetune_recipe( + model(), "mistralai/Mistral-Nemo-Base-2407", dir, name, num_nodes, num_gpus_per_node + ) + if peft_scheme is None or peft_scheme.lower() == 'none': + recipe.optim.config.lr = 5e-6 + elif peft_scheme.lower() == 'lora': + recipe.peft = run.Config(LoRA, target_modules=['linear_qkv', 'linear_proj'], dim=32) + recipe.optim.config.lr = 1e-4 + else: + raise ValueError(f"Unrecognized peft scheme: {peft_scheme}") + return recipe diff --git a/nemo/collections/llm/recipes/mixtral_8x22b.py b/nemo/collections/llm/recipes/mixtral_8x22b.py index 222a37d7a0c5..1bfef9be5582 100644 --- a/nemo/collections/llm/recipes/mixtral_8x22b.py +++ b/nemo/collections/llm/recipes/mixtral_8x22b.py @@ -13,7 +13,7 @@ # limitations under the License. -from typing import Optional +from typing import Callable, Optional import nemo_run as run import pytorch_lightning as pl @@ -117,6 +117,9 @@ def trainer( DistributedDataParallelConfig, check_for_nan_in_grad=True, grad_reduce_in_fp32=True, + overlap_grad_reduce=True, + overlap_param_gather=True, + average_in_collective=True, ), ) @@ -142,7 +145,12 @@ def trainer( @run.cli.factory(target=pretrain, name=NAME) def pretrain_recipe( - dir: Optional[str] = None, name: str = "default", num_nodes: int = 16, num_gpus_per_node: int = 8, fn=pretrain + dir: Optional[str] = None, + name: str = "default", + num_nodes: int = 16, + num_gpus_per_node: int = 8, + performance_mode: bool = False, + fn: Callable = pretrain, ) -> run.Partial: """ Create a pre-training recipe for Mixtral 8x22B model. @@ -155,6 +163,7 @@ def pretrain_recipe( name (str): Name of the pre-training run. num_nodes (int): Number of compute nodes to use. num_gpus_per_node (int): Number of GPUs per node. + performance_mode (bool): If true, enables optimizations for maximum performance. fn (Callable): The pre-training function to use. Returns: @@ -169,7 +178,7 @@ def pretrain_recipe( >>> recipe = pretrain_recipe(name="mixtral_pretrain", num_nodes=16) >>> print(recipe) """ - return run.Partial( + recipe = run.Partial( fn, model=model(), trainer=trainer( @@ -181,44 +190,44 @@ def pretrain_recipe( resume=default_resume(), ) + if performance_mode: + recipe = pretrain_performance_optimizations(recipe) -@run.cli.factory(target=pretrain, name=NAME + "_performance") -def pretrain_recipe_performance( - dir: Optional[str] = None, name: str = "default", num_nodes: int = 8, num_gpus_per_node: int = 8, fn=pretrain -) -> run.Partial: + return recipe + + +def pretrain_performance_optimizations(recipe: run.Partial) -> run.Partial: """ Create a performance-optimized pre-training recipe for Mixtral 8x22B model. - This recipe enables performance optimizations that may not be suitable for all use cases. + This method enables performance optimizations that may not be suitable for all use cases. It builds upon the standard pre-training recipe and adds additional performance enhancements. Args: - dir (Optional[str]): Directory for saving logs and checkpoints. - name (str): Name of the pre-training run. - num_nodes (int): Number of compute nodes to use. - num_gpus_per_node (int): Number of GPUs per node. - fn (Callable): The pre-training function to use. + recipe (run.Partial): Base pre-train recipe to which performance optimizations will be added Returns: run.Partial: Partial configuration for performance-optimized pre-training. 
- Examples: - CLI usage: - $ nemo llm pretrain --factory "mixtral_8x22b.pretrain_recipe_performance(num_nodes=8, name='perf_pretrain')" - - Python API usage: - >>> recipe = pretrain_recipe_performance(name="mixtral_8x22b_perf", num_nodes=8) - >>> print(recipe) - Note: - Use this recipe with caution and only when you need maximum performance. + Use this method with caution and only when you need maximum performance. It may not be suitable for all hardware configurations or use cases. """ - recipe = pretrain_recipe(name=name, dir=dir, num_nodes=num_nodes, num_gpus_per_node=num_gpus_per_node, fn=fn) + + # 'overlap_param_gather_with_optimizer_step' and 'align_param_gather' params are set automatically + # by MegatronCommOverlapCallback. They are added here for user's knowledge. + # overlap_param_gather_with_optimizer_step- Overlap param all-gather of first bucket with optimizer step. + # align_param_gather- If true, all PP stages launch param all-gathers simultaneously, else + # each PP stage launches independently as needed. + recipe.trainer.callbacks.extend( [ - run.Config(MegatronTokenDropCallback), - run.Config(MegatronCommOverlapCallback), + run.Config( + MegatronTokenDropCallback, + ), + run.Config( + MegatronCommOverlapCallback, overlap_param_gather_with_optimizer_step=True, align_param_gather=True + ), ] ) diff --git a/nemo/collections/llm/recipes/mixtral_8x7b.py b/nemo/collections/llm/recipes/mixtral_8x7b.py index d0609761feea..8e39e73aab76 100644 --- a/nemo/collections/llm/recipes/mixtral_8x7b.py +++ b/nemo/collections/llm/recipes/mixtral_8x7b.py @@ -13,7 +13,7 @@ # limitations under the License. -from typing import Optional +from typing import Callable, Optional import nemo_run as run import pytorch_lightning as pl @@ -116,6 +116,7 @@ def trainer( grad_reduce_in_fp32=True, overlap_grad_reduce=True, overlap_param_gather=True, + average_in_collective=True, ), ) @@ -141,7 +142,12 @@ def trainer( @run.cli.factory(target=pretrain, name=NAME) def pretrain_recipe( - dir: Optional[str] = None, name: str = "default", num_nodes: int = 8, num_gpus_per_node: int = 8, fn=pretrain + dir: Optional[str] = None, + name: str = "default", + num_nodes: int = 8, + num_gpus_per_node: int = 8, + performance_mode: bool = False, + fn: Callable = pretrain, ) -> run.Partial: """ Create a pre-training recipe for Mixtral 8x7B model. @@ -154,6 +160,7 @@ def pretrain_recipe( name (str): Name of the pre-training run. num_nodes (int): Number of compute nodes to use. num_gpus_per_node (int): Number of GPUs per node. + performance_mode (bool): If true, enables optimizations for maximum performance. fn (Callable): The pre-training function to use. Returns: @@ -168,7 +175,7 @@ def pretrain_recipe( >>> recipe = pretrain_recipe(name="mixtral_8x7b_pretrain", num_nodes=8) >>> print(recipe) """ - return run.Partial( + recipe = run.Partial( fn, model=model(), trainer=trainer( @@ -180,44 +187,44 @@ def pretrain_recipe( resume=default_resume(), ) + if performance_mode: + recipe = pretrain_performance_optimizations(recipe) -@run.cli.factory(target=pretrain, name=NAME + "_performance") -def pretrain_recipe_performance( - dir: Optional[str] = None, name: str = "default", num_nodes: int = 8, num_gpus_per_node: int = 8, fn=pretrain -) -> run.Partial: + return recipe + + +def pretrain_performance_optimizations(recipe: run.Partial) -> run.Partial: """ Create a performance-optimized pre-training recipe for Mixtral 8x7B model. - This recipe enables performance optimizations that may not be suitable for all use cases. 
+ This method enables performance optimizations that may not be suitable for all use cases. It builds upon the standard pre-training recipe and adds additional performance enhancements. Args: - dir (Optional[str]): Directory for saving logs and checkpoints. - name (str): Name of the pre-training run. - num_nodes (int): Number of compute nodes to use. - num_gpus_per_node (int): Number of GPUs per node. - fn (Callable): The pre-training function to use. + recipe (run.Partial): Base pre-train recipe to which performance optimizations will be added Returns: run.Partial: Partial configuration for performance-optimized pre-training. - Examples: - CLI usage: - $ nemo llm pretrain --factory "mixtral_8x3b.pretrain_recipe_performance(num_nodes=8, name='perf_pretrain')" - - Python API usage: - >>> recipe = pretrain_recipe_performance(name="mixtral_8x7b_perf", num_nodes=8) - >>> print(recipe) - Note: - Use this recipe with caution and only when you need maximum performance. + Use this method with caution and only when you need maximum performance. It may not be suitable for all hardware configurations or use cases. """ - recipe = pretrain_recipe(name=name, dir=dir, num_nodes=num_nodes, num_gpus_per_node=num_gpus_per_node, fn=fn) + + # 'overlap_param_gather_with_optimizer_step' and 'align_param_gather' params are set automatically + # by MegatronCommOverlapCallback. They are added here for user's knowledge. + # overlap_param_gather_with_optimizer_step- Overlap param all-gather of first bucket with optimizer step. + # align_param_gather- If true, all PP stages launch param all-gathers simultaneously, else + # each PP stage launches independently as needed. + recipe.trainer.callbacks.extend( [ run.Config(MegatronTokenDropCallback), - run.Config(MegatronCommOverlapCallback), + run.Config( + MegatronCommOverlapCallback, + overlap_param_gather_with_optimizer_step=True, + align_param_gather=True, + ), ] ) diff --git a/nemo/collections/llm/recipes/mixtral_8x7b_16k.py b/nemo/collections/llm/recipes/mixtral_8x7b_16k.py index 8b26a8c7c3e3..7cbfaf723544 100644 --- a/nemo/collections/llm/recipes/mixtral_8x7b_16k.py +++ b/nemo/collections/llm/recipes/mixtral_8x7b_16k.py @@ -51,7 +51,7 @@ def model() -> run.Config[pl.LightningModule]: def trainer( - num_nodes: int = 2, + num_nodes: int = 4, num_gpus_per_node: int = 8, ) -> run.Config: """ @@ -60,8 +60,8 @@ def trainer( This function sets up the distributed training strategy optimized for longer sequences. Args: - num_nodes (int): Number of compute nodes to use. - num_gpus_per_node (int): Number of GPUs per node. + num_nodes (int, optional): Number of compute nodes to use. Defaults to 4. + num_gpus_per_node (int, optional): Number of GPUs per node. Defaults to 8. Returns: run.Config: Configuration for the NeMo Lightning Trainer. @@ -71,17 +71,17 @@ def trainer( $ nemo llm pretrain trainer=mixtral_8x7b_16k ... Python API usage: - >>> trainer_config = trainer(num_nodes=2, num_gpus_per_node=8) + >>> trainer_config = trainer(num_nodes=4, num_gpus_per_node=8) >>> print(trainer_config) Note: This configuration uses increased parallelism to handle the longer sequence length efficiently. 
""" return mixtral_8x7b.trainer( - tensor_parallelism=2, - pipeline_parallelism=4, + tensor_parallelism=4, + pipeline_parallelism=2, pipeline_parallelism_type=torch.bfloat16, - virtual_pipeline_parallelism=8, + virtual_pipeline_parallelism=None, context_parallelism=4, sequence_parallelism=True, expert_parallelism=1, @@ -95,7 +95,7 @@ def trainer( def pretrain_recipe( dir: Optional[str] = None, name: str = "default", - num_nodes: int = 2, + num_nodes: int = 4, num_gpus_per_node: int = 8, ) -> run.Partial: """ @@ -107,8 +107,8 @@ def pretrain_recipe( Args: dir (Optional[str]): Directory for saving logs and checkpoints. name (str): Name of the pre-training run. - num_nodes (int): Number of compute nodes to use. - num_gpus_per_node (int): Number of GPUs per node. + num_nodes (int, optional): Number of compute nodes to use. Defaults to 4. + num_gpus_per_node (int, optional): Number of GPUs per node. Defaults to 8. Returns: run.Partial: Partial configuration for pre-training. @@ -116,10 +116,10 @@ def pretrain_recipe( Examples: CLI usage: $ nemo llm pretrain --factory mixtral_8x7b_16k - $ nemo llm pretrain --factory "mixtral_8x7b_16k(num_nodes=2, name='my_16k_pretrain')" + $ nemo llm pretrain --factory "mixtral_8x7b_16k(num_nodes=4, name='my_16k_pretrain')" Python API usage: - >>> recipe = pretrain_recipe(name="mixtral_8x7b_16k_pretrain", num_nodes=2) + >>> recipe = pretrain_recipe(name="mixtral_8x7b_16k_pretrain", num_nodes=4) >>> print(recipe) """ recipe = mixtral_8x7b.pretrain_recipe(name=name, dir=dir, num_nodes=num_nodes, num_gpus_per_node=num_gpus_per_node) diff --git a/nemo/collections/llm/recipes/mixtral_8x7b_64k.py b/nemo/collections/llm/recipes/mixtral_8x7b_64k.py index 6c8f7077fba3..3606be5ec12b 100644 --- a/nemo/collections/llm/recipes/mixtral_8x7b_64k.py +++ b/nemo/collections/llm/recipes/mixtral_8x7b_64k.py @@ -21,7 +21,6 @@ from nemo.collections.llm.api import finetune, pretrain from nemo.collections.llm.gpt.data.mock import MockDataModule -from nemo.collections.llm.gpt.data.squad import SquadDataModule from nemo.collections.llm.recipes import mixtral_8x7b from nemo.utils.exp_manager import TimingCallback @@ -59,8 +58,8 @@ def trainer( This function sets up the distributed training strategy optimized for very long sequences. Args: - num_nodes (int): Number of compute nodes to use. - num_gpus_per_node (int): Number of GPUs per node. + num_nodes (int, optional): Number of compute nodes to use. Defaults to 8. + num_gpus_per_node (int, optional): Number of GPUs per node. Defaults to 8. Returns: run.Config: Configuration for the NeMo Lightning Trainer. @@ -78,11 +77,11 @@ def trainer( It requires a substantial amount of computational resources. """ return mixtral_8x7b.trainer( - tensor_parallelism=4, + tensor_parallelism=8, pipeline_parallelism=4, pipeline_parallelism_type=torch.bfloat16, - virtual_pipeline_parallelism=4, - context_parallelism=8, + virtual_pipeline_parallelism=None, + context_parallelism=4, sequence_parallelism=True, expert_parallelism=1, num_nodes=num_nodes, @@ -107,8 +106,8 @@ def pretrain_recipe( Args: dir (Optional[str]): Directory for saving logs and checkpoints. name (str): Name of the pre-training run. - num_nodes (int): Number of compute nodes to use. - num_gpus_per_node (int): Number of GPUs per node. + num_nodes (int, optional): Number of compute nodes to use. Defaults to 16. + num_gpus_per_node (int, optional): Number of GPUs per node. Defaults to 8. Returns: run.Partial: Partial configuration for pre-training. 
diff --git a/nemo/collections/llm/recipes/nemotron.py b/nemo/collections/llm/recipes/nemotron.py index 1dd1ef2f83bc..aedf3fcf2954 100644 --- a/nemo/collections/llm/recipes/nemotron.py +++ b/nemo/collections/llm/recipes/nemotron.py @@ -17,6 +17,7 @@ import nemo_run as run import pytorch_lightning as pl import torch +from megatron.core.distributed import DistributedDataParallelConfig from pytorch_lightning.callbacks.callback import Callback from nemo import lightning as nl @@ -124,6 +125,14 @@ def nemotron_trainer( ckpt_include_optimizer=True, ckpt_async_save=True, ckpt_parallel_load=True, + ddp=run.Config( + DistributedDataParallelConfig, + check_for_nan_in_grad=True, + grad_reduce_in_fp32=True, + overlap_grad_reduce=True, + overlap_param_gather=True, + average_in_collective=True, + ), ) precision_plugin = None diff --git a/nemo/collections/llm/recipes/nemotron3_8b.py b/nemo/collections/llm/recipes/nemotron3_8b.py index 3cdb647b5f84..7dcebe17f872 100644 --- a/nemo/collections/llm/recipes/nemotron3_8b.py +++ b/nemo/collections/llm/recipes/nemotron3_8b.py @@ -12,7 +12,7 @@ # See the License for the specific language governing permissions and # limitations under the License. -from typing import Optional +from typing import Callable, Optional import nemo_run as run import pytorch_lightning as pl @@ -26,6 +26,7 @@ from nemo.collections.llm.recipes.log.default import default_log, default_resume, tensorboard_logger from nemo.collections.llm.recipes.nemotron import nemotron_model, nemotron_trainer from nemo.collections.llm.recipes.optim.adam import distributed_fused_adam_with_cosine_annealing +from nemo.lightning.pytorch.callbacks.megatron_comm_overlap import MegatronCommOverlapCallback from nemo.utils.exp_manager import TimingCallback NAME = "nemotron3_8b" @@ -82,6 +83,7 @@ def pretrain_recipe( constant_steps=0, min_lr=3.0e-5, max_lr=3e-4, + performance_mode: bool = False, # Training function fn=pretrain, ) -> run.Partial: @@ -117,6 +119,7 @@ def pretrain_recipe( constant_steps (int): Number of constant steps. min_lr (float): Minimum learning rate. max_lr (float): Maximum learning rate. + performance_mode (bool): If true, enables optimizations for maximum performance. fn (Callable): The pre-training function to use. Returns: @@ -134,7 +137,7 @@ def pretrain_recipe( Note: This recipe uses a mock dataset, look for the finetune examples to see how to change the dataset. """ - return run.Partial( + recipe = run.Partial( fn, model=model(), trainer=nemotron_trainer( @@ -173,6 +176,38 @@ def pretrain_recipe( resume=default_resume(), ) + if performance_mode: + recipe = pretrain_performance_optimizations(recipe) + + return recipe + + +def pretrain_performance_optimizations(recipe: run.Partial) -> run.Partial: + """ + Create a performance-optimized pre-training recipe for Nemotron3 8B model. + + This method enables performance optimizations that may not be suitable for all use cases. + It builds upon the standard pre-training recipe and adds additional performance enhancements. + + Args: + recipe (run.Partial): Base pre-train recipe to which performance optimizations will be added + + Returns: + run.Partial: Partial configuration for performance-optimized pre-training. + + Note: + Use this method with caution and only when you need maximum performance. + It may not be suitable for all hardware configurations or use cases. 
+ """ + + recipe.trainer.callbacks.append( + run.Config( + MegatronCommOverlapCallback, + tp_comm_overlap=True, + ) + ) + return recipe + @run.cli.factory(name=NAME + "_nemo") def nemo_resume() -> run.Config[nl.AutoResume]: diff --git a/nemo/collections/llm/recipes/nemotron4_15b.py b/nemo/collections/llm/recipes/nemotron4_15b.py index c0acae6b13f0..16ae7b2b1e79 100644 --- a/nemo/collections/llm/recipes/nemotron4_15b.py +++ b/nemo/collections/llm/recipes/nemotron4_15b.py @@ -12,7 +12,7 @@ # See the License for the specific language governing permissions and # limitations under the License. -from typing import Optional +from typing import Callable, Optional import nemo_run as run import pytorch_lightning as pl @@ -23,6 +23,7 @@ from nemo.collections.llm.recipes.log.default import default_log, default_resume, tensorboard_logger from nemo.collections.llm.recipes.nemotron import nemotron_model, nemotron_trainer from nemo.collections.llm.recipes.optim.adam import distributed_fused_adam_with_cosine_annealing +from nemo.lightning.pytorch.callbacks.megatron_comm_overlap import MegatronCommOverlapCallback from nemo.utils.exp_manager import TimingCallback NAME = "nemotron4_15b" @@ -79,6 +80,7 @@ def pretrain_recipe( constant_steps=0, min_lr=4.5e-5, max_lr=4.5e-5, + performance_mode: bool = False, # Training function fn=pretrain, ) -> run.Partial: @@ -114,6 +116,7 @@ def pretrain_recipe( constant_steps (int): Number of constant steps. min_lr (float): Minimum learning rate. max_lr (float): Maximum learning rate. + performance_mode (bool): If true, enables optimizations for maximum performance. fn (Callable): The pre-training function to use. Returns: @@ -131,7 +134,7 @@ def pretrain_recipe( Note: This recipe uses a mock dataset, look for the finetune examples to see how to change the dataset. """ - return run.Partial( + recipe = run.Partial( fn, model=model(), trainer=nemotron_trainer( @@ -169,3 +172,34 @@ def pretrain_recipe( ), resume=default_resume(), ) + + if performance_mode: + recipe = pretrain_performance_optimizations(recipe) + + return recipe + + +def pretrain_performance_optimizations(recipe: run.Partial) -> run.Partial: + """ + Create a performance-optimized pre-training recipe for Nemotron4 15B model. + + This method enables performance optimizations that may not be suitable for all use cases. + It builds upon the standard pre-training recipe and adds additional performance enhancements. + + Args: + recipe (run.Partial): Base pre-train recipe to which performance optimizations will be added + + Returns: + run.Partial: Partial configuration for performance-optimized pre-training. + + Note: + Use this method with caution and only when you need maximum performance. + It may not be suitable for all hardware configurations or use cases. 
+ """ + recipe.trainer.callbacks.append( + run.Config( + MegatronCommOverlapCallback, + tp_comm_overlap=True, + ) + ) + return recipe diff --git a/nemo/collections/llm/recipes/nemotron4_15b_16k.py b/nemo/collections/llm/recipes/nemotron4_15b_16k.py index d0e9d939d8e7..75eced72761f 100644 --- a/nemo/collections/llm/recipes/nemotron4_15b_16k.py +++ b/nemo/collections/llm/recipes/nemotron4_15b_16k.py @@ -56,7 +56,7 @@ def pretrain_recipe( # Trainer tensor_parallelism: int = 2, pipeline_parallelism: int = 2, - pipeline_parallelism_type: Optional[torch.dtype] = None, + pipeline_parallelism_type: Optional[torch.dtype] = torch.bfloat16, virtual_pipeline_parallelism: Optional[int] = None, context_parallelism: int = 2, sequence_parallelism: bool = True, diff --git a/nemo/collections/llm/recipes/nemotron4_15b_64k.py b/nemo/collections/llm/recipes/nemotron4_15b_64k.py index c3f4575a1fd6..8286778aa7ba 100644 --- a/nemo/collections/llm/recipes/nemotron4_15b_64k.py +++ b/nemo/collections/llm/recipes/nemotron4_15b_64k.py @@ -56,7 +56,7 @@ def pretrain_recipe( # Trainer tensor_parallelism: int = 4, pipeline_parallelism: int = 2, - pipeline_parallelism_type: Optional[torch.dtype] = None, + pipeline_parallelism_type: Optional[torch.dtype] = torch.bfloat16, virtual_pipeline_parallelism: Optional[int] = None, context_parallelism: int = 4, sequence_parallelism: bool = True, diff --git a/nemo/collections/llm/recipes/nemotron4_22b.py b/nemo/collections/llm/recipes/nemotron4_22b.py index ba07bae241d8..a20afedfea56 100644 --- a/nemo/collections/llm/recipes/nemotron4_22b.py +++ b/nemo/collections/llm/recipes/nemotron4_22b.py @@ -12,7 +12,7 @@ # See the License for the specific language governing permissions and # limitations under the License. -from typing import Optional +from typing import Callable, Optional import nemo_run as run import pytorch_lightning as pl @@ -23,6 +23,7 @@ from nemo.collections.llm.recipes.log.default import default_log, default_resume, tensorboard_logger from nemo.collections.llm.recipes.nemotron import nemotron_model, nemotron_trainer from nemo.collections.llm.recipes.optim.adam import distributed_fused_adam_with_cosine_annealing +from nemo.lightning.pytorch.callbacks.megatron_comm_overlap import MegatronCommOverlapCallback from nemo.utils.exp_manager import TimingCallback NAME = "nemotron4_22b" @@ -56,7 +57,7 @@ def pretrain_recipe( # Trainer tensor_parallelism: int = 2, pipeline_parallelism: int = 4, - pipeline_parallelism_type: Optional[torch.dtype] = None, + pipeline_parallelism_type: Optional[torch.dtype] = torch.bfloat16, virtual_pipeline_parallelism: Optional[int] = 10, context_parallelism: int = 1, sequence_parallelism: bool = False, @@ -79,6 +80,7 @@ def pretrain_recipe( constant_steps=0, min_lr=1e-5, max_lr=1e-4, + performance_mode: bool = False, # Training function fn=pretrain, ) -> run.Partial: @@ -114,6 +116,7 @@ def pretrain_recipe( constant_steps (int): Number of constant steps. min_lr (float): Minimum learning rate. max_lr (float): Maximum learning rate. + performance_mode (bool): If true, enables optimizations for maximum performance. fn (Callable): The pre-training function to use. Returns: @@ -131,7 +134,7 @@ def pretrain_recipe( Note: This recipe uses a mock dataset, look for the finetune examples to see how to change the dataset. 
""" - return run.Partial( + recipe = run.Partial( fn, model=model(), trainer=nemotron_trainer( @@ -169,3 +172,45 @@ def pretrain_recipe( ), resume=default_resume(), ) + + if performance_mode: + recipe = pretrain_performance_optimizations(recipe) + + return recipe + + +def pretrain_performance_optimizations(recipe: run.Partial) -> run.Partial: + """ + Create a performance-optimized pre-training recipe for Nemotron4 22B model. + + This method enables performance optimizations that may not be suitable for all use cases. + It builds upon the standard pre-training recipe and adds additional performance enhancements. + + Args: + recipe (run.Partial): Base pre-train recipe to which performance optimizations will be added + + Returns: + run.Partial: Partial configuration for performance-optimized pre-training. + + Note: + Use this method with caution and only when you need maximum performance. + It may not be suitable for all hardware configurations or use cases. + """ + + # 'overlap_param_gather_with_optimizer_step' and 'align_param_gather' params are set automatically + # by MegatronCommOverlapCallback. They are added here for user's knowledge. + # overlap_param_gather_with_optimizer_step- Overlap param all-gather of first bucket with optimizer step. + # align_param_gather- If true, all PP stages launch param all-gathers simultaneously, else + # each PP stage launches independently as needed. + + recipe.trainer.callbacks.append( + run.Config( + MegatronCommOverlapCallback, + tp_comm_overlap=True, + defer_embedding_wgrad_compute=True, + wgrad_deferral_limit=22, + overlap_param_gather_with_optimizer_step=True, + align_param_gather=True, + ) + ) + return recipe diff --git a/nemo/collections/llm/recipes/nemotron4_22b_16k.py b/nemo/collections/llm/recipes/nemotron4_22b_16k.py index 614004d12aa3..42f258c6057d 100644 --- a/nemo/collections/llm/recipes/nemotron4_22b_16k.py +++ b/nemo/collections/llm/recipes/nemotron4_22b_16k.py @@ -57,7 +57,7 @@ def pretrain_recipe( tensor_parallelism: int = 4, pipeline_parallelism: int = 1, pipeline_parallelism_type: Optional[torch.dtype] = None, - virtual_pipeline_parallelism: Optional[int] = 10, + virtual_pipeline_parallelism: Optional[int] = None, context_parallelism: int = 2, sequence_parallelism: bool = True, num_nodes: int = 1, diff --git a/nemo/collections/llm/recipes/nemotron4_22b_64k.py b/nemo/collections/llm/recipes/nemotron4_22b_64k.py index 57211e5dddc1..67d60a6e1c90 100644 --- a/nemo/collections/llm/recipes/nemotron4_22b_64k.py +++ b/nemo/collections/llm/recipes/nemotron4_22b_64k.py @@ -56,9 +56,9 @@ def pretrain_recipe( # Trainer tensor_parallelism: int = 4, pipeline_parallelism: int = 2, - pipeline_parallelism_type: Optional[torch.dtype] = None, - virtual_pipeline_parallelism: Optional[int] = 10, - context_parallelism: int = 2, + pipeline_parallelism_type: Optional[torch.dtype] = torch.bfloat16, + virtual_pipeline_parallelism: Optional[int] = None, + context_parallelism: int = 4, sequence_parallelism: bool = True, num_nodes: int = 4, num_gpus_per_node: int = 8, @@ -122,10 +122,10 @@ def pretrain_recipe( Examples: CLI usage: $ nemo llm pretrain --factory nemotron4_22b_64k - $ nemo llm pretrain --factory "nemotron4_22b_64k(num_nodes=1, name='my_nemotron_pretrain')" + $ nemo llm pretrain --factory "nemotron4_22b_64k(num_nodes=2, name='my_nemotron_pretrain')" Python API usage: - >>> recipe = pretrain_recipe(name="nemotron_pretrain", num_nodes=1) + >>> recipe = pretrain_recipe(name="nemotron_pretrain", num_nodes=2) >>> print(recipe) Note: diff --git 
a/nemo/collections/llm/recipes/nemotron4_340b.py b/nemo/collections/llm/recipes/nemotron4_340b.py index 238acb0dac3c..8268b2a87791 100644 --- a/nemo/collections/llm/recipes/nemotron4_340b.py +++ b/nemo/collections/llm/recipes/nemotron4_340b.py @@ -12,7 +12,7 @@ # See the License for the specific language governing permissions and # limitations under the License. -from typing import Optional +from typing import Callable, Optional import nemo_run as run import pytorch_lightning as pl @@ -26,6 +26,7 @@ from nemo.collections.llm.recipes.log.default import default_log, default_resume, tensorboard_logger from nemo.collections.llm.recipes.nemotron import nemotron_model, nemotron_trainer from nemo.collections.llm.recipes.optim.adam import distributed_fused_adam_with_cosine_annealing +from nemo.lightning.pytorch.callbacks.megatron_comm_overlap import MegatronCommOverlapCallback from nemo.utils.exp_manager import TimingCallback NAME = "nemotron4_340b" @@ -41,7 +42,7 @@ def model() -> run.Config[pl.LightningModule]: Examples: CLI usage: - $ nemo llm pretrain model=nemotron4_340 ... + $ nemo llm pretrain model=nemotron4_340b ... Python API usage: >>> model_config = model() @@ -59,7 +60,7 @@ def pretrain_recipe( # Trainer tensor_parallelism: int = 8, pipeline_parallelism: int = 12, - pipeline_parallelism_type: Optional[torch.dtype] = None, + pipeline_parallelism_type: Optional[torch.dtype] = torch.bfloat16, virtual_pipeline_parallelism: Optional[int] = 8, context_parallelism: int = 1, sequence_parallelism: bool = False, @@ -82,6 +83,7 @@ def pretrain_recipe( constant_steps=0, min_lr=1.0e-5, max_lr=1.0e-4, + performance_mode: bool = False, # Training function fn=pretrain, ) -> run.Partial: @@ -117,6 +119,7 @@ def pretrain_recipe( constant_steps (int): Number of constant steps. min_lr (float): Minimum learning rate. max_lr (float): Maximum learning rate. + performance_mode (bool): If true, enables optimizations for maximum performance. fn (Callable): The pre-training function to use. Returns: @@ -124,8 +127,8 @@ def pretrain_recipe( Examples: CLI usage: - $ nemo llm pretrain --factory nemotron4_340 - $ nemo llm pretrain --factory "nemotron4_340(num_nodes=1, name='my_nemotron_pretrain')" + $ nemo llm pretrain --factory nemotron4_340b + $ nemo llm pretrain --factory "nemotron4_340b(num_nodes=1, name='my_nemotron_pretrain')" Python API usage: >>> recipe = pretrain_recipe(name="nemotron_pretrain", num_nodes=1) @@ -134,7 +137,7 @@ def pretrain_recipe( Note: This recipe uses a mock dataset, look for the finetune examples to see how to change the dataset. """ - return run.Partial( + recipe = run.Partial( fn, model=model(), trainer=nemotron_trainer( @@ -173,6 +176,48 @@ def pretrain_recipe( resume=default_resume(), ) + if performance_mode: + recipe = pretrain_performance_optimizations(recipe) + + return recipe + + +def pretrain_performance_optimizations(recipe: run.Partial) -> run.Partial: + """ + Create a performance-optimized pre-training recipe for Nemotron4 340B model. + + This method enables performance optimizations that may not be suitable for all use cases. + It builds upon the standard pre-training recipe and adds additional performance enhancements. + + Args: + recipe (run.Partial): Base pre-train recipe to which performance optimizations will be added + + Returns: + run.Partial: Partial configuration for performance-optimized pre-training. + + Note: + Use this method with caution and only when you need maximum performance. + It may not be suitable for all hardware configurations or use cases. 
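For illustration, a minimal usage sketch (not part of the patch) of the performance_mode switch added to these Nemotron recipe modules; it assumes the factory module is importable from the file path shown in this diff, and the recipe names and num_nodes values below are illustrative only, not defaults taken from the code:

# Sketch only, not part of the patch: the same pretrain_recipe() call with and
# without the performance-optimized callback stack.
from nemo.collections.llm.recipes import nemotron4_340b

# Standard recipe: no MegatronCommOverlapCallback is appended.
recipe = nemotron4_340b.pretrain_recipe(name="nemotron_pretrain", num_nodes=4)

# performance_mode=True routes the recipe through pretrain_performance_optimizations(),
# which appends MegatronCommOverlapCallback with tp_comm_overlap, deferred embedding
# wgrad compute, and overlapped/aligned param all-gathers.
perf_recipe = nemotron4_340b.pretrain_recipe(
    name="nemotron_pretrain_perf", num_nodes=4, performance_mode=True
)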
+ """ + + # 'overlap_param_gather_with_optimizer_step' and 'align_param_gather' params are set automatically + # by MegatronCommOverlapCallback. They are added here for user's knowledge. + # overlap_param_gather_with_optimizer_step- Overlap param all-gather of first bucket with optimizer step. + # align_param_gather- If true, all PP stages launch param all-gathers simultaneously, else + # each PP stage launches independently as needed. + + recipe.trainer.callbacks.append( + run.Config( + MegatronCommOverlapCallback, + tp_comm_overlap=True, + defer_embedding_wgrad_compute=True, + wgrad_deferral_limit=22, + overlap_param_gather_with_optimizer_step=True, + align_param_gather=True, + ) + ) + return recipe + @run.cli.factory(name=NAME + "_nemo") def nemo_resume() -> run.Config[nl.AutoResume]: @@ -207,7 +252,7 @@ def finetune_recipe( # Trainer tensor_parallelism: int = 8, pipeline_parallelism: int = 12, - pipeline_parallelism_type: Optional[torch.dtype] = None, + pipeline_parallelism_type: Optional[torch.dtype] = torch.bfloat16, virtual_pipeline_parallelism: Optional[int] = 8, context_parallelism: int = 1, sequence_parallelism: bool = False, @@ -272,8 +317,8 @@ def finetune_recipe( Examples: CLI usage: - $ nemo llm finetune --factory nemotron4_340 - $ nemo llm finetune --factory "nemotron4_340(name='my_nemotron4_340_finetune', num_nodes=4)" + $ nemo llm finetune --factory nemotron4_340b + $ nemo llm finetune --factory "nemotron4_340b(name='my_nemotron4_340_finetune', num_nodes=4)" Python API usage: >>> recipe = finetune_recipe(name="my_nemotron4_340_finetune", num_nodes=4) diff --git a/nemo/collections/llm/t5/data/fine_tuning.py b/nemo/collections/llm/t5/data/fine_tuning.py index b1315f7a708a..9326dabe7b84 100644 --- a/nemo/collections/llm/t5/data/fine_tuning.py +++ b/nemo/collections/llm/t5/data/fine_tuning.py @@ -61,8 +61,6 @@ def __init__( from nemo.collections.nlp.modules.common.tokenizer_utils import get_nmt_tokenizer self.tokenizer = tokenizer or get_nmt_tokenizer("megatron", "BertWordPieceCase") - additional_tokens = {'additional_special_tokens': [f'' for i in range(100)]} - self.tokenizer.add_special_tokens(additional_tokens) self.memmap_workers = memmap_workers self.num_workers = num_workers diff --git a/nemo/collections/llm/t5/data/pre_training.py b/nemo/collections/llm/t5/data/pre_training.py index 2c73e0b78b11..e6f619972284 100644 --- a/nemo/collections/llm/t5/data/pre_training.py +++ b/nemo/collections/llm/t5/data/pre_training.py @@ -130,10 +130,6 @@ def __init__( # add additional tokens for T5 tokenizer from nemo.collections.nlp.modules.common.tokenizer_utils import get_nmt_tokenizer - self.tokenizer = tokenizer or get_nmt_tokenizer("megatron", "BertWordPieceCase") - additional_tokens = {'additional_special_tokens': [f'' for i in range(100)]} - self.tokenizer.add_special_tokens(additional_tokens) - self.data_sampler = MegatronDataSampler( seq_len=self.seq_length, micro_batch_size=micro_batch_size, diff --git a/nemo/collections/llm/t5/model/t5.py b/nemo/collections/llm/t5/model/t5.py index dcba70bc8986..83dc2029a513 100644 --- a/nemo/collections/llm/t5/model/t5.py +++ b/nemo/collections/llm/t5/model/t5.py @@ -11,8 +11,6 @@ from torch import nn from nemo.collections.llm import fn -from nemo.collections.nlp.modules.common.megatron.token_level_encoder_decoder import AttnMaskType -from nemo.collections.nlp.modules.common.megatron.utils import build_attention_mask_3d from nemo.lightning import get_vocab_size, io from nemo.lightning.megatron_parallel import MaskedTokenLossReduction from 
nemo.lightning.pytorch.optim import MegatronOptimizerModule, OptimizerModule @@ -32,6 +30,9 @@ def t5_data_step(dataloader_iter) -> Dict[str, torch.Tensor]: from megatron.core import parallel_state + from nemo.collections.nlp.modules.common.megatron.token_level_encoder_decoder import AttnMaskType + from nemo.collections.nlp.modules.common.megatron.utils import build_attention_mask_3d + batch = next(dataloader_iter) _batch: dict diff --git a/nemo/collections/multimodal/models/multimodal_llm/neva/neva_model.py b/nemo/collections/multimodal/models/multimodal_llm/neva/neva_model.py index bb7598421c33..5291497f92c3 100644 --- a/nemo/collections/multimodal/models/multimodal_llm/neva/neva_model.py +++ b/nemo/collections/multimodal/models/multimodal_llm/neva/neva_model.py @@ -1194,11 +1194,16 @@ def loss_func(self, loss_mask, output_tensor): return loss def setup(self, stage=None): - """PTL hook that is executed after DDP spawns. - We setup datasets here as megatron datasets require DDP to instantiate. - See https://pytorch-lightning.readthedocs.io/en/latest/common/lightning_module.html#setup for more information. + """ + PTL hook that is executed after DDP spawns. + + We setup datasets here as Megatron datasets require DDP to instantiate. + See the PyTorch Lightning documentation for more information: + https://pytorch-lightning.readthedocs.io/en/latest/common/lightning_module.html#setup + Args: - stage (str, optional): Can be 'fit', 'validate', 'test' or 'predict'. Defaults to None. + stage (str, optional): + Can be 'fit', 'validate', 'test', or 'predict'. Defaults to None. """ num_parameters_on_device, total_num_parameters = self._get_total_params_across_model_parallel_groups_gpt_bert() diff --git a/nemo/collections/multimodal/models/text_to_image/controlnet/controlnet.py b/nemo/collections/multimodal/models/text_to_image/controlnet/controlnet.py index 3b795aa7618c..158fa7595782 100644 --- a/nemo/collections/multimodal/models/text_to_image/controlnet/controlnet.py +++ b/nemo/collections/multimodal/models/text_to_image/controlnet/controlnet.py @@ -889,11 +889,16 @@ def validation_step(self, batch, batch_idx): self.log_dict(val_loss_dict, prog_bar=False, logger=True, on_step=False, on_epoch=True) def setup(self, stage=None): - """PTL hook that is executed after DDP spawns. - We setup datasets here as megatron datasets require DDP to instantiate. - See https://pytorch-lightning.readthedocs.io/en/latest/common/lightning_module.html#setup for more information. + """ + PTL hook that is executed after DDP spawns. + + We setup datasets here as Megatron datasets require DDP to instantiate. + See the PyTorch Lightning documentation for more information: + https://pytorch-lightning.readthedocs.io/en/latest/common/lightning_module.html#setup + Args: - stage (str, optional): Can be 'fit', 'validate', 'test' or 'predict'. Defaults to None. + stage (str, optional): + Can be 'fit', 'validate', 'test', or 'predict'. Defaults to None. 
""" self.model.rng.manual_seed(self.cfg.seed + 100 * parallel_state.get_data_parallel_rank()) diff --git a/nemo/collections/multimodal/models/text_to_image/dreambooth/dreambooth.py b/nemo/collections/multimodal/models/text_to_image/dreambooth/dreambooth.py index 24712ed30021..47548b02961d 100644 --- a/nemo/collections/multimodal/models/text_to_image/dreambooth/dreambooth.py +++ b/nemo/collections/multimodal/models/text_to_image/dreambooth/dreambooth.py @@ -424,11 +424,16 @@ def fwd_output_only_func(batch, model): return fwd_output_only_func def setup(self, stage=None): - """PTL hook that is executed after DDP spawns. - We setup datasets here as megatron datasets require DDP to instantiate. - See https://pytorch-lightning.readthedocs.io/en/latest/common/lightning_module.html#setup for more information. + """ + PTL hook that is executed after DDP spawns. + + We setup datasets here as Megatron datasets require DDP to instantiate. + See the PyTorch Lightning documentation for more information: + https://pytorch-lightning.readthedocs.io/en/latest/common/lightning_module.html#setup + Args: - stage (str, optional): Can be 'fit', 'validate', 'test' or 'predict'. Defaults to None. + stage (str, optional): + Can be 'fit', 'validate', 'test', or 'predict'. Defaults to None. """ self.model.rng.manual_seed(self.cfg.seed + 100 * parallel_state.get_data_parallel_rank()) diff --git a/nemo/collections/multimodal/models/text_to_image/imagen/imagen.py b/nemo/collections/multimodal/models/text_to_image/imagen/imagen.py index b7cf6d629d65..ed9be58178c4 100644 --- a/nemo/collections/multimodal/models/text_to_image/imagen/imagen.py +++ b/nemo/collections/multimodal/models/text_to_image/imagen/imagen.py @@ -483,11 +483,16 @@ def validation_step(self, dataloader_iter): return loss def setup(self, stage=None): - """PTL hook that is executed after DDP spawns. - We setup datasets here as megatron datasets require DDP to instantiate. - See https://pytorch-lightning.readthedocs.io/en/latest/common/lightning_module.html#setup for more information. + """ + PTL hook that is executed after DDP spawns. + + We setup datasets here as Megatron datasets require DDP to instantiate. + See the PyTorch Lightning documentation for more information: + https://pytorch-lightning.readthedocs.io/en/latest/common/lightning_module.html#setup + Args: - stage (str, optional): Can be 'fit', 'validate', 'test' or 'predict'. Defaults to None. + stage (str, optional): + Can be 'fit', 'validate', 'test', or 'predict'. Defaults to None. """ # log number of parameters diff --git a/nemo/collections/multimodal/models/text_to_image/stable_diffusion/diffusion_engine.py b/nemo/collections/multimodal/models/text_to_image/stable_diffusion/diffusion_engine.py index 77a8caa58b40..8b18fe2b25fe 100644 --- a/nemo/collections/multimodal/models/text_to_image/stable_diffusion/diffusion_engine.py +++ b/nemo/collections/multimodal/models/text_to_image/stable_diffusion/diffusion_engine.py @@ -589,11 +589,16 @@ def validation_step(self, dataloader_iter, batch_idx): return loss def setup(self, stage=None): - """PTL hook that is executed after DDP spawns. - We setup datasets here as megatron datasets require DDP to instantiate. - See https://pytorch-lightning.readthedocs.io/en/latest/common/lightning_module.html#setup for more information. + """ + PTL hook that is executed after DDP spawns. + + We setup datasets here as Megatron datasets require DDP to instantiate. 
+ See the PyTorch Lightning documentation for more information: + https://pytorch-lightning.readthedocs.io/en/latest/common/lightning_module.html#setup + Args: - stage (str, optional): Can be 'fit', 'validate', 'test' or 'predict'. Defaults to None. + stage (str, optional): + Can be 'fit', 'validate', 'test', or 'predict'. Defaults to None. """ self.model.rng.manual_seed(self.cfg.seed + 100 * parallel_state.get_data_parallel_rank()) diff --git a/nemo/collections/multimodal/models/text_to_image/stable_diffusion/ldm/ddpm.py b/nemo/collections/multimodal/models/text_to_image/stable_diffusion/ldm/ddpm.py index f0a6b9640540..744dc6945394 100644 --- a/nemo/collections/multimodal/models/text_to_image/stable_diffusion/ldm/ddpm.py +++ b/nemo/collections/multimodal/models/text_to_image/stable_diffusion/ldm/ddpm.py @@ -1980,11 +1980,16 @@ def validation_step(self, dataloader_iter): return loss def setup(self, stage=None): - """PTL hook that is executed after DDP spawns. - We setup datasets here as megatron datasets require DDP to instantiate. - See https://pytorch-lightning.readthedocs.io/en/latest/common/lightning_module.html#setup for more information. + """ + PTL hook that is executed after DDP spawns. + + We setup datasets here as Megatron datasets require DDP to instantiate. + See the PyTorch Lightning documentation for more information: + https://pytorch-lightning.readthedocs.io/en/latest/common/lightning_module.html#setup + Args: - stage (str, optional): Can be 'fit', 'validate', 'test' or 'predict'. Defaults to None. + stage (str, optional): + Can be 'fit', 'validate', 'test', or 'predict'. Defaults to None. """ if self.model.rng: self.model.rng.manual_seed(self.cfg.seed + 100 * parallel_state.get_data_parallel_rank()) diff --git a/nemo/collections/multimodal/models/vision_language_foundation/clip/megatron_clip_models.py b/nemo/collections/multimodal/models/vision_language_foundation/clip/megatron_clip_models.py index 2c3b30f2fc74..a9e51610bedd 100644 --- a/nemo/collections/multimodal/models/vision_language_foundation/clip/megatron_clip_models.py +++ b/nemo/collections/multimodal/models/vision_language_foundation/clip/megatron_clip_models.py @@ -1337,11 +1337,16 @@ def build_train_valid_test_datasets(self): return self._train_ds, self._validation_ds, self._test_ds def setup(self, stage=None): - """PTL hook that is executed after DDP spawns. - We setup datasets here as megatron datasets require DDP to instantiate. - See https://pytorch-lightning.readthedocs.io/en/latest/common/lightning_module.html#setup for more information. + """ + PTL hook that is executed after DDP spawns. + + We setup datasets here as Megatron datasets require DDP to instantiate. + See the PyTorch Lightning documentation for more information: + https://pytorch-lightning.readthedocs.io/en/latest/common/lightning_module.html#setup + Args: - stage (str, optional): Can be 'fit', 'validate', 'test' or 'predict'. Defaults to None. + stage (str, optional): + Can be 'fit', 'validate', 'test', or 'predict'. Defaults to None. 
""" # log number of parameters diff --git a/nemo/collections/nlp/data/language_modeling/text_memmap_dataset.py b/nemo/collections/nlp/data/language_modeling/text_memmap_dataset.py index 4882708f698f..f62613db891b 100644 --- a/nemo/collections/nlp/data/language_modeling/text_memmap_dataset.py +++ b/nemo/collections/nlp/data/language_modeling/text_memmap_dataset.py @@ -127,7 +127,7 @@ def __init__( index_mapping_dir=index_mapping_dir, ) - if is_distributed: + if is_distributed and not _lightning_prepare_data(): torch.distributed.barrier() if is_distributed and AppState().local_rank == 0: @@ -152,7 +152,7 @@ def __init__( index_mapping_dir=index_mapping_dir, ) - if is_distributed: + if is_distributed and not _lightning_prepare_data(): torch.distributed.barrier() logging.info(f"Loading data files") @@ -260,7 +260,8 @@ def load_file(self, fn, index_mapping_dir: Optional[str] = None): raise RuntimeError(f"Missing header, expected {self._header_lines} header lines") # load meta info - idx_info_dict = pickle.load(open(idx_fn + ".info", "rb")) + with open(idx_fn + ".info", "rb") as fp: + idx_info_dict = pickle.load(fp) # test for mismatch in expected newline_int if "newline_int" in idx_info_dict: newline_int = idx_info_dict["newline_int"] @@ -378,9 +379,7 @@ def __init__( self._data_sep = data_sep def _build_data_from_text(self, text: str): - """ - - """ + """ """ _build_data_from_text = super()._build_data_from_text data = {} text_fields = text.split(self._data_sep) @@ -513,7 +512,11 @@ def _build_memmap_index_files(newline_int, build_index_fn, fn, index_mapping_dir def build_index_files( - dataset_paths, newline_int, workers=None, build_index_fn=_build_index_from_memdata, index_mapping_dir: str = None, + dataset_paths, + newline_int, + workers=None, + build_index_fn=_build_index_from_memdata, + index_mapping_dir: str = None, ): """Auxiliary method to build multiple index files""" if len(dataset_paths) < 1: @@ -528,7 +531,12 @@ def build_index_files( ctx = mp.get_context("fork") with ctx.Pool(workers) as p: build_status = p.map( - partial(_build_memmap_index_files, newline_int, build_index_fn, index_mapping_dir=index_mapping_dir,), + partial( + _build_memmap_index_files, + newline_int, + build_index_fn, + index_mapping_dir=index_mapping_dir, + ), dataset_paths, ) @@ -741,3 +749,19 @@ def get_sample_block(self, block_idx: int) -> np.ndarray: sample_block = sample_block % self.dataset_size return sample_block + + +def _lightning_prepare_data(): + """ + This function checks whether it is invoked in lightning's hook "prepare_data", which is run only on rank 0. + TextMemMapDataset contains a torch.distributed.barrier operation, so when run inside the single-process hook + prepare_data, the barrier operation would hang forever. 
+ """ + import inspect + + return any( + [ + frame.function == 'prepare_data' and 'prepare_packed_sequence_data' in frame.code_context[0] + for frame in inspect.stack() + ] + ) diff --git a/nemo/collections/nlp/modules/common/text_generation_server.py b/nemo/collections/nlp/modules/common/text_generation_server.py index 6c257317b99f..3f8e34b94134 100644 --- a/nemo/collections/nlp/modules/common/text_generation_server.py +++ b/nemo/collections/nlp/modules/common/text_generation_server.py @@ -15,11 +15,17 @@ import json import threading +import time +import uuid import torch from flask import Flask, jsonify, request from flask_restful import Api, Resource +from nemo.collections.nlp.data.language_modeling.megatron.gpt_sft_chat_dataset import ( + _get_header_conversation_type_mask_role, + get_prompt_template_example, +) from nemo.collections.nlp.modules.common.retro_inference_strategies import ( RetroModelTextGenerationStrategy, RetroQAModelTextGenerationStrategy, @@ -61,6 +67,189 @@ def send_do_generate(): choice = torch.cuda.LongTensor([GENERATE_NUM]) torch.distributed.broadcast(choice, 0) + def convert_messages(self, input_list): + output_dict = { + 'system': '', + 'conversations': [], + 'mask': 'User', + 'type': 'VALUE_TO_TEXT', + } + + # Extract the system message + for msg in input_list: + if msg['role'] == 'system': + output_dict['system'] = msg['content'] + break # Assuming only one system message + + # Build the conversations list + for msg in input_list: + if msg['role'] != 'system': + conversation_entry = { + 'from': msg['role'].capitalize(), # Capitalize 'user' and 'assistant' + 'value': msg['content'], + 'label': None, + } + output_dict['conversations'].append(conversation_entry) + + return output_dict + + def completion(self, data): + output_sentence = "" + with lock: # Need to get lock to keep multiple threads from hitting code + MegatronGenerate.send_do_generate() # Tell other ranks we're doing generate + extra = {} + if self.inference_strategy is not None: + extra['strategy'] = self.inference_strategy + + all_probs = False + add_BOS = False + top_p = data.get("top_p", 1.0) + top_k = data.get("top_k", 0) + max_tokens = data.get("max_tokens", 32) + temperature = data.get("temperature", 0.0) + logprobs = data.get("logprobs", False) + greedy = temperature == 0.0 + end_strings = ['<|endoftext|>'] + data.get("end_strings", []) + prompt = data["prompt"] + random_seed = data.get("seed", 1234) + + output = generate( + self.model, + [prompt], + tokens_to_generate=max_tokens, + all_probs=all_probs, + temperature=temperature, + add_BOS=add_BOS, + top_k=top_k, + top_p=top_p, + greedy=greedy, + repetition_penalty=1.0, + end_strings=end_strings, + min_tokens_to_generate=0, + compute_logprob=logprobs, + random_seed=random_seed, + **extra, + ) + for k in output: + if isinstance(output[k], torch.Tensor): + output[k] = output[k].tolist() + + output_sentence = output['sentences'][0][len(prompt) :] + tokens = output['tokens'][0] + logprobs = output['logprob'][0] if output['logprob'] is not None else None + num_prompt_tokens = len(prompt.split()) + num_output_sentence = len(output_sentence.split()) + + return jsonify( + { + "choices": [ + { + "finish_reason": "", + "index": 0, + "logprobs": logprobs, + "text": output_sentence, + "tokens": tokens, + } + ], + "created": int(time.time()), + "id": f"cmpl-{uuid.uuid4()}", + "model": "nemo model", + "object": "text_completion", + "usage": { + "completion_tokens": num_output_sentence, + "prompt_tokens": num_prompt_tokens, + "total_tokens": 
num_output_sentence + num_prompt_tokens, + }, + } + ) + + def chat_completion(self, data): + data['messages'] = data['messages'] + [ + {'role': 'assistant', 'content': ''} + ] # adding trailing assistant message so that prompt ends with Assistant tag. + special_tokens = self.model.cfg.data.chat_prompt_tokens + nemo_source = self.convert_messages(data['messages']) + header, conversation, data_type, mask_role = _get_header_conversation_type_mask_role( + nemo_source, special_tokens + ) + len_strip = len(special_tokens['end_of_turn'] + special_tokens['turn_start']) + conversation = conversation[:-len_strip] + # Return a response mimicking the OpenAI ChatCompletion API format + with lock: # Need to get lock to keep multiple threads from hitting code + MegatronGenerate.send_do_generate() # Tell other ranks we're doing generate + extra = {} + if self.inference_strategy is not None: + extra['strategy'] = self.inference_strategy + + all_probs = False + add_BOS = False + top_k = 0 + greedy = data['temperature'] == 0.0 + logprobs = data.get("logprobs", False) + end_strings = ['<|endoftext|>', special_tokens['turn_start'], special_tokens['label_start']] + random_seed = None + + output = generate( + self.model, + [conversation], + data.get('max_tokens', 32), + all_probs=all_probs, + temperature=data.get('temperature', 1.0), + add_BOS=add_BOS, + top_k=top_k, + top_p=data.get("top_p", 0.95), + greedy=greedy, + repetition_penalty=1.0, + end_strings=end_strings, + min_tokens_to_generate=0, + compute_logprob=logprobs, + random_seed=random_seed, + **extra, + ) + for k in output: + if isinstance(output[k], torch.Tensor): + output[k] = output[k].tolist() + + output_sentence = output['sentences'][0][len(conversation) :] + tokens = output['tokens'][0] + logprobs = output['logprob'][0] if output['logprob'] is not None else None + num_prompt_tokens = len(conversation.split()) # @adithyare only produces an approx. 
number of tokens + num_output_sentence = len(output_sentence.split()) + + return jsonify( + { + "id": f"chatcmpl-{uuid.uuid4()}", + "object": "chat.completion", + "created": int(time.time()), + "model": data.get("model", "nemo model"), + "choices": [ + { + "index": 0, + "message": {"role": "assistant", "content": output_sentence}, + "logprobs": logprobs, + "tokens": tokens, + "finish_reason": "", + } + ], + "usage": { + "prompt_tokens": num_prompt_tokens, + "completion_tokens": num_output_sentence, + "total_tokens": num_output_sentence + num_prompt_tokens, + }, + } + ) + + def post(self): + # Access the request data if needed + if request.endpoint == "oai_completions": + data = request.get_json() + return self.completion(data) + elif request.endpoint == "oai_chat_completions": + data = request.get_json() + return self.chat_completion(data) + else: + raise RuntimeError("Unknown endpoint requested.") + def put(self): logging.info("request IP: " + str(request.remote_addr)) logging.info(json.dumps(request.get_json())) @@ -135,7 +324,7 @@ def put(self): if not (0.0 <= top_p <= 1.0): return "top_p must be a positive number less than or equal to 1.0" - repetition_penalty = 1.2 + repetition_penalty = 1.0 if "repetition_penalty" in request.get_json(): repetition_penalty = request.get_json()["repetition_penalty"] if not (type(repetition_penalty) == int or type(repetition_penalty) == float): @@ -231,7 +420,24 @@ class MegatronServer(object): def __init__(self, model, inference_strategy=None): self.app = Flask(__name__, static_url_path='') api = Api(self.app) - api.add_resource(MegatronGenerate, '/generate', resource_class_args=[model, inference_strategy]) + api.add_resource( + MegatronGenerate, + '/generate', + endpoint="generate", + resource_class_kwargs={"model": model, "inference_strategy": inference_strategy}, + ) + api.add_resource( + MegatronGenerate, + '/v1/completions', + endpoint="oai_completions", + resource_class_kwargs={"model": model, "inference_strategy": inference_strategy}, + ) + api.add_resource( + MegatronGenerate, + '/v1/chat/completions', + endpoint="oai_chat_completions", + resource_class_kwargs={"model": model, "inference_strategy": inference_strategy}, + ) def run(self, url, port=5000): self.app.run(url, threaded=True, port=port, debug=False) diff --git a/nemo/collections/nlp/modules/common/tokenizer_utils.py b/nemo/collections/nlp/modules/common/tokenizer_utils.py index 4e6f9e15b839..dfc55a6c9065 100644 --- a/nemo/collections/nlp/modules/common/tokenizer_utils.py +++ b/nemo/collections/nlp/modules/common/tokenizer_utils.py @@ -69,7 +69,8 @@ def get_tokenizer( To see the list of all HuggingFace pretrained models, use: nemo_nlp.modules.common.get_huggingface_pretrained_lm_models_list() tokenizer_model: tokenizer model file of sentencepiece - special_tokens: dict of special tokens.
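The /v1/completions and /v1/chat/completions routes registered on MegatronServer above follow the OpenAI request/response shape produced by completion() and chat_completion(). A minimal client-side sketch (not part of the patch), assuming the server runs on localhost with the default port 5000 from MegatronServer.run(); the prompt text and sampling values are illustrative:

# Sketch only, not part of the patch: querying the new OpenAI-style completions route.
import requests

resp = requests.post(
    "http://localhost:5000/v1/completions",
    json={"prompt": "Deep learning is", "max_tokens": 32, "temperature": 0.0, "top_p": 1.0},
)
body = resp.json()
print(body["choices"][0]["text"])  # generated continuation after the prompt
print(body["usage"])               # approximate, whitespace-split token counts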
+ For additional special tokens besides standard special tokens (bos, eos, pad, etc.), such as sentinel tokens for T5 (<extra_id_0>, <extra_id_1>, etc.), use key 'additional_special_tokens' vocab_file: path to vocab file use_fast: (only for HuggingFace AutoTokenizer) set to True to use fast HuggingFace tokenizer bpe_dropout: (experimental) BPE dropout tries to corrupt the standard segmentation @@ -224,7 +225,11 @@ def get_nmt_tokenizer( f'Getting Megatron tokenizer for pretrained model name: {model_name}, custom vocab file: {vocab_file}, and merges file: {merges_file}' ) return get_tokenizer( - tokenizer_name=model_name, vocab_file=vocab_file, merges_file=merges_file, chat_template=chat_template + tokenizer_name=model_name, + vocab_file=vocab_file, + merges_file=merges_file, + special_tokens=special_tokens_dict, + chat_template=chat_template, ) elif library == 'tabular': from nemo.collections.common.tokenizers.tabular_tokenizer import TabularTokenizer diff --git a/nemo/collections/tts/data/dataset.py b/nemo/collections/tts/data/dataset.py index 83d2b969ea91..901b4168130f 100644 --- a/nemo/collections/tts/data/dataset.py +++ b/nemo/collections/tts/data/dataset.py @@ -204,7 +204,8 @@ def __init__( self.text_normalizer_call = None elif not PYNINI_AVAILABLE: raise ImportError( - "`nemo_text_processing` is not installed, see https://github.com/NVIDIA/NeMo-text-processing for details" + "`nemo_text_processing` is not installed, see https://github.com/NVIDIA/NeMo-text-processing for details. " + "If you wish to continue without text normalization, please remove the text_normalizer part in your TTS yaml file." ) else: self.text_normalizer_call = ( diff --git a/nemo/collections/tts/models/aligner.py b/nemo/collections/tts/models/aligner.py index 72d023e9ee10..d8e65d6e6821 100644 --- a/nemo/collections/tts/models/aligner.py +++ b/nemo/collections/tts/models/aligner.py @@ -24,6 +24,7 @@ from torch import nn from nemo.collections.tts.losses.aligner_loss import BinLoss, ForwardSumLoss +from nemo.collections.tts.models.base import NeedsNormalizer from nemo.collections.tts.parts.utils.helpers import ( binarize_attention, g2p_backward_compatible_support, @@ -41,7 +42,7 @@ HAVE_WANDB = False -class AlignerModel(ModelPT): +class AlignerModel(NeedsNormalizer, ModelPT): """Speech-to-text alignment model (https://arxiv.org/pdf/2108.10447.pdf) that is used to learn alignments between mel spectrogram and text.""" def __init__(self, cfg: DictConfig, trainer: 'Trainer' = None): @@ -77,29 +78,6 @@ def __init__(self, cfg: DictConfig, trainer: 'Trainer' = None): self.bin_loss_start_ratio = cfg.bin_loss_start_ratio self.bin_loss_warmup_epochs = cfg.bin_loss_warmup_epochs - def _setup_normalizer(self, cfg): - if "text_normalizer" in cfg: - normalizer_kwargs = {} - - if "whitelist" in cfg.text_normalizer: - normalizer_kwargs["whitelist"] = self.register_artifact( - 'text_normalizer.whitelist', cfg.text_normalizer.whitelist - ) - - try: - import nemo_text_processing - - self.normalizer = instantiate(cfg.text_normalizer, **normalizer_kwargs) - except Exception as e: - logging.error(e) - raise ImportError( - "`nemo_text_processing` not installed, see https://github.com/NVIDIA/NeMo-text-processing for more details" - ) - - self.text_normalizer_call = self.normalizer.normalize - if "text_normalizer_call_kwargs" in cfg: - self.text_normalizer_call_kwargs = cfg.text_normalizer_call_kwargs - def _setup_tokenizer(self, cfg): text_tokenizer_kwargs = {} if "g2p" in cfg.text_tokenizer: diff --git a/nemo/collections/tts/models/base.py
b/nemo/collections/tts/models/base.py index fe19ae75a3b3..b4b0ea9c43fa 100644 --- a/nemo/collections/tts/models/base.py +++ b/nemo/collections/tts/models/base.py @@ -18,6 +18,7 @@ from typing import List, Optional import torch +from hydra.utils import instantiate from omegaconf import DictConfig from tqdm import tqdm @@ -28,9 +29,39 @@ from nemo.core.neural_types.neural_type import NeuralType from nemo.utils import logging, model_utils +PYNINI_AVAILABLE = True +try: + import nemo_text_processing +except (ImportError, ModuleNotFoundError): + PYNINI_AVAILABLE = False -class SpectrogramGenerator(ModelPT, ABC): - """ Base class for all TTS models that turn text into a spectrogram """ + +class NeedsNormalizer: + """Base class for all TTS models that needs text normalization(TN)""" + + def _setup_normalizer(self, cfg): + if "text_normalizer" in cfg: + if not PYNINI_AVAILABLE: + logging.error( + "`nemo_text_processing` not installed, see https://github.com/NVIDIA/NeMo-text-processing for more details." + ) + logging.error("The normalizer will be disabled.") + return + normalizer_kwargs = {} + + if "whitelist" in cfg.text_normalizer: + normalizer_kwargs["whitelist"] = self.register_artifact( + 'text_normalizer.whitelist', cfg.text_normalizer.whitelist + ) + + self.normalizer = instantiate(cfg.text_normalizer, **normalizer_kwargs) + self.text_normalizer_call = self.normalizer.normalize + if "text_normalizer_call_kwargs" in cfg: + self.text_normalizer_call_kwargs = cfg.text_normalizer_call_kwargs + + +class SpectrogramGenerator(NeedsNormalizer, ModelPT, ABC): + """Base class for all TTS models that turn text into a spectrogram""" @abstractmethod def parse(self, str_input: str, **kwargs) -> 'torch.tensor': @@ -115,7 +146,7 @@ def list_available_models(cls) -> 'List[PretrainedModelInfo]': class GlowVocoder(Vocoder): - """ Base class for all Vocoders that use a Glow or reversible Flow-based setup. All child class are expected + """Base class for all Vocoders that use a Glow or reversible Flow-based setup. All child class are expected to have a parameter called audio_to_melspec_precessor that is an instance of nemo.collections.asr.parts.FilterbankFeatures""" @@ -175,7 +206,11 @@ def yet_another_patch(audio, n_fft, hop_length, win_length, window): return torch.sqrt(spec.pow(2).sum(-1)), torch.atan2(spec[..., -1], spec[..., 0]) self.stft = lambda x: yet_another_patch( - x, n_fft=n_fft, hop_length=hop_length, win_length=win_length, window=window, + x, + n_fft=n_fft, + hop_length=hop_length, + win_length=win_length, + window=window, ) self.istft = lambda x, y: torch.istft( torch.complex(x * torch.cos(y), x * torch.sin(y)), @@ -252,15 +287,15 @@ def list_available_models(cls) -> 'List[PretrainedModelInfo]': return list_of_models -class TextToWaveform(ModelPT, ABC): - """ Base class for all end-to-end TTS models that generate a waveform from text """ +class TextToWaveform(NeedsNormalizer, ModelPT, ABC): + """Base class for all end-to-end TTS models that generate a waveform from text""" @abstractmethod def parse(self, str_input: str, **kwargs) -> 'torch.tensor': """ - A helper function that accepts a raw python string and turns it into a tensor. The tensor should have 2 - dimensions. The first is the batch, which should be of size 1. The second should represent time. The tensor - should represent either tokenized or embedded text, depending on the model. + A helper function that accepts a raw python string and turns it into a tensor. The tensor should have 2 + dimensions. 
The first is the batch, which should be of size 1. The second should represent time. The tensor + should represent either tokenized or embedded text, depending on the model. """ @abstractmethod @@ -299,7 +334,6 @@ def convert_graphemes_to_phonemes( num_workers: int = 0, pred_field: Optional[str] = "pred_text", ) -> List[str]: - """ Main function for Inference. Converts grapheme entries from the manifest "graheme_field" to phonemes Args: diff --git a/nemo/collections/tts/models/fastpitch.py b/nemo/collections/tts/models/fastpitch.py index 3235a096a04b..b1e702c89124 100644 --- a/nemo/collections/tts/models/fastpitch.py +++ b/nemo/collections/tts/models/fastpitch.py @@ -200,28 +200,6 @@ def _get_default_text_tokenizer_conf(self): text_tokenizer: TextTokenizerConfig = TextTokenizerConfig() return OmegaConf.create(OmegaConf.to_yaml(text_tokenizer)) - def _setup_normalizer(self, cfg): - if "text_normalizer" in cfg: - normalizer_kwargs = {} - - if "whitelist" in cfg.text_normalizer: - normalizer_kwargs["whitelist"] = self.register_artifact( - 'text_normalizer.whitelist', cfg.text_normalizer.whitelist - ) - try: - import nemo_text_processing - - self.normalizer = instantiate(cfg.text_normalizer, **normalizer_kwargs) - except Exception as e: - logging.error(e) - raise ImportError( - "`nemo_text_processing` not installed, see https://github.com/NVIDIA/NeMo-text-processing for more details" - ) - - self.text_normalizer_call = self.normalizer.normalize - if "text_normalizer_call_kwargs" in cfg: - self.text_normalizer_call_kwargs = cfg.text_normalizer_call_kwargs - def _setup_tokenizer(self, cfg): text_tokenizer_kwargs = {} @@ -240,12 +218,14 @@ def _setup_tokenizer(self, cfg): if "phoneme_dict" in cfg.text_tokenizer.g2p: g2p_kwargs["phoneme_dict"] = self.register_artifact( - 'text_tokenizer.g2p.phoneme_dict', cfg.text_tokenizer.g2p.phoneme_dict, + 'text_tokenizer.g2p.phoneme_dict', + cfg.text_tokenizer.g2p.phoneme_dict, ) if "heteronyms" in cfg.text_tokenizer.g2p: g2p_kwargs["heteronyms"] = self.register_artifact( - 'text_tokenizer.g2p.heteronyms', cfg.text_tokenizer.g2p.heteronyms, + 'text_tokenizer.g2p.heteronyms', + cfg.text_tokenizer.g2p.heteronyms, ) # for backward compatability @@ -478,16 +458,25 @@ def training_step(self, batch, batch_idx): ) spec_predict = mels_pred[0].data.cpu().float().numpy() self.tb_logger.add_image( - "train_mel_predicted", plot_spectrogram_to_numpy(spec_predict), self.global_step, dataformats="HWC", + "train_mel_predicted", + plot_spectrogram_to_numpy(spec_predict), + self.global_step, + dataformats="HWC", ) if self.learn_alignment: attn = attn_hard[0].data.cpu().float().numpy().squeeze() self.tb_logger.add_image( - "train_attn", plot_alignment_to_numpy(attn.T), self.global_step, dataformats="HWC", + "train_attn", + plot_alignment_to_numpy(attn.T), + self.global_step, + dataformats="HWC", ) soft_attn = attn_soft[0].data.cpu().float().numpy().squeeze() self.tb_logger.add_image( - "train_soft_attn", plot_alignment_to_numpy(soft_attn.T), self.global_step, dataformats="HWC", + "train_soft_attn", + plot_alignment_to_numpy(soft_attn.T), + self.global_step, + dataformats="HWC", ) return loss @@ -527,7 +516,20 @@ def validation_step(self, batch, batch_idx): ) # Calculate val loss on ground truth durations to better align L2 loss in time - (mels_pred, _, _, log_durs_pred, pitch_pred, _, _, _, attn_hard_dur, pitch, energy_pred, energy_tgt,) = self( + ( + mels_pred, + _, + _, + log_durs_pred, + pitch_pred, + _, + _, + _, + attn_hard_dur, + pitch, + energy_pred, + energy_tgt, + ) = 
self( text=text, durs=durs, pitch=pitch, @@ -587,7 +589,10 @@ def on_validation_epoch_end(self): ) spec_predict = spec_predict[0].data.cpu().float().numpy() self.tb_logger.add_image( - "val_mel_predicted", plot_spectrogram_to_numpy(spec_predict), self.global_step, dataformats="HWC", + "val_mel_predicted", + plot_spectrogram_to_numpy(spec_predict), + self.global_step, + dataformats="HWC", ) self.log_train_images = True self.validation_step_outputs.clear() # free memory) @@ -598,7 +603,10 @@ def _setup_train_dataloader(self, cfg): phon_mode = self.vocab.set_phone_prob(self.vocab.phoneme_probability) with phon_mode: - dataset = instantiate(cfg.dataset, text_tokenizer=self.vocab,) + dataset = instantiate( + cfg.dataset, + text_tokenizer=self.vocab, + ) sampler = dataset.get_sampler(cfg.dataloader_params.batch_size, world_size=self.trainer.world_size) return torch.utils.data.DataLoader( @@ -611,7 +619,10 @@ def _setup_test_dataloader(self, cfg): phon_mode = self.vocab.set_phone_prob(0.0) with phon_mode: - dataset = instantiate(cfg.dataset, text_tokenizer=self.vocab,) + dataset = instantiate( + cfg.dataset, + text_tokenizer=self.vocab, + ) return torch.utils.data.DataLoader(dataset, collate_fn=dataset.collate_fn, **cfg.dataloader_params) diff --git a/nemo/collections/tts/models/mixer_tts.py b/nemo/collections/tts/models/mixer_tts.py index 1a44cd5b31c8..c260df22e3c0 100644 --- a/nemo/collections/tts/models/mixer_tts.py +++ b/nemo/collections/tts/models/mixer_tts.py @@ -123,29 +123,6 @@ def __init__(self, cfg: DictConfig, trainer: 'Trainer' = None): self.decoder = instantiate(cfg.decoder) self.proj = nn.Linear(self.decoder.d_model, cfg.n_mel_channels) - def _setup_normalizer(self, cfg): - if "text_normalizer" in cfg: - normalizer_kwargs = {} - - if "whitelist" in cfg.text_normalizer: - normalizer_kwargs["whitelist"] = self.register_artifact( - 'text_normalizer.whitelist', cfg.text_normalizer.whitelist - ) - - try: - import nemo_text_processing - - self.normalizer = instantiate(cfg.text_normalizer, **normalizer_kwargs) - except Exception as e: - logging.error(e) - raise ImportError( - "`nemo_text_processing` not installed, see https://github.com/NVIDIA/NeMo-text-processing for more details" - ) - - self.text_normalizer_call = self.normalizer.normalize - if "text_normalizer_call_kwargs" in cfg: - self.text_normalizer_call_kwargs = cfg.text_normalizer_call_kwargs - def _setup_tokenizer(self, cfg): text_tokenizer_kwargs = {} if "g2p" in cfg.text_tokenizer: @@ -163,12 +140,14 @@ def _setup_tokenizer(self, cfg): if "phoneme_dict" in cfg.text_tokenizer.g2p: g2p_kwargs["phoneme_dict"] = self.register_artifact( - 'text_tokenizer.g2p.phoneme_dict', cfg.text_tokenizer.g2p.phoneme_dict, + 'text_tokenizer.g2p.phoneme_dict', + cfg.text_tokenizer.g2p.phoneme_dict, ) if "heteronyms" in cfg.text_tokenizer.g2p: g2p_kwargs["heteronyms"] = self.register_artifact( - 'text_tokenizer.g2p.heteronyms', cfg.text_tokenizer.g2p.heteronyms, + 'text_tokenizer.g2p.heteronyms', + cfg.text_tokenizer.g2p.heteronyms, ) text_tokenizer_kwargs["g2p"] = instantiate(cfg.text_tokenizer.g2p, **g2p_kwargs) @@ -269,7 +248,10 @@ def _metrics( def run_aligner(self, text, text_len, text_mask, spect, spect_len, attn_prior): text_emb = self.symbol_emb(text) attn_soft, attn_logprob = self.aligner( - spect, text_emb.permute(0, 2, 1), mask=text_mask == 0, attn_prior=attn_prior, + spect, + text_emb.permute(0, 2, 1), + mask=text_mask == 0, + attn_prior=attn_prior, ) attn_hard = binarize_attention_parallel(attn_soft, text_len, spect_len) attn_hard_dur 
= attn_hard.sum(2)[:, 0, :] @@ -444,7 +426,16 @@ def training_step(self, batch, batch_idx): pitch = (pitch - self.pitch_mean) / self.pitch_std pitch[zero_pitch_idx] = 0.0 - (pred_spect, _, pred_log_durs, pred_pitch, attn_soft, attn_logprob, attn_hard, attn_hard_dur,) = self( + ( + pred_spect, + _, + pred_log_durs, + pred_pitch, + attn_soft, + attn_logprob, + attn_hard, + attn_hard_dur, + ) = self( text=text, text_len=text_len, pitch=pitch, @@ -454,7 +445,17 @@ def training_step(self, batch, batch_idx): lm_tokens=lm_tokens, ) - (loss, durs_loss, acc, acc_dist_1, acc_dist_3, pitch_loss, mel_loss, ctc_loss, bin_loss,) = self._metrics( + ( + loss, + durs_loss, + acc, + acc_dist_1, + acc_dist_3, + pitch_loss, + mel_loss, + ctc_loss, + bin_loss, + ) = self._metrics( pred_durs=pred_log_durs, pred_pitch=pred_pitch, true_durs=attn_hard_dur, @@ -496,7 +497,16 @@ def validation_step(self, batch, batch_idx): pitch = (pitch - self.pitch_mean) / self.pitch_std pitch[zero_pitch_idx] = 0.0 - (pred_spect, _, pred_log_durs, pred_pitch, attn_soft, attn_logprob, attn_hard, attn_hard_dur,) = self( + ( + pred_spect, + _, + pred_log_durs, + pred_pitch, + attn_soft, + attn_logprob, + attn_hard, + attn_hard_dur, + ) = self( text=text, text_len=text_len, pitch=pitch, @@ -506,7 +516,17 @@ def validation_step(self, batch, batch_idx): lm_tokens=lm_tokens, ) - (loss, durs_loss, acc, acc_dist_1, acc_dist_3, pitch_loss, mel_loss, ctc_loss, bin_loss,) = self._metrics( + ( + loss, + durs_loss, + acc, + acc_dist_1, + acc_dist_3, + pitch_loss, + mel_loss, + ctc_loss, + bin_loss, + ) = self._metrics( pred_durs=pred_log_durs, pred_pitch=pred_pitch, true_durs=attn_hard_dur, @@ -605,7 +625,9 @@ def validation_step(self, batch, batch_idx): "raw_texts": [NeuralType(optional=True)], "lm_model": NeuralType(optional=True), }, - output_types={"spect": NeuralType(('B', 'D', 'T_spec'), MelSpectrogramType()),}, + output_types={ + "spect": NeuralType(('B', 'D', 'T_spec'), MelSpectrogramType()), + }, ) def generate_spectrogram( self, @@ -694,7 +716,9 @@ def _loader(self, cfg): text_tokenizer=self.tokenizer, ) return torch.utils.data.DataLoader( # noqa - dataset=dataset, collate_fn=dataset.collate_fn, **cfg.dataloader_params, + dataset=dataset, + collate_fn=dataset.collate_fn, + **cfg.dataloader_params, ) def setup_training_data(self, cfg): @@ -749,7 +773,11 @@ def output_types(self): def input_example(self, max_text_len=10, max_lm_tokens_len=10): text = torch.randint( - low=0, high=len(self.tokenizer.tokens), size=(1, max_text_len), device=self.device, dtype=torch.long, + low=0, + high=len(self.tokenizer.tokens), + size=(1, max_text_len), + device=self.device, + dtype=torch.long, ) inputs = {'text': text} diff --git a/nemo/collections/tts/models/radtts.py b/nemo/collections/tts/models/radtts.py index 959720910f11..82f85d1ed6a2 100644 --- a/nemo/collections/tts/models/radtts.py +++ b/nemo/collections/tts/models/radtts.py @@ -296,7 +296,9 @@ def _loader(self, cfg): text_tokenizer=self.tokenizer, ) return torch.utils.data.DataLoader( # noqa - dataset=dataset, collate_fn=dataset.collate_fn, **cfg.dataloader_params, + dataset=dataset, + collate_fn=dataset.collate_fn, + **cfg.dataloader_params, ) def setup_training_data(self, cfg): @@ -315,7 +317,9 @@ def setup_test_data(self, cfg): "speaker": NeuralType(('B'), Index(), optional=True), "sigma": NeuralType(optional=True), }, - output_types={"spect": NeuralType(('B', 'D', 'T_spec'), MelSpectrogramType()),}, + output_types={ + "spect": NeuralType(('B', 'D', 'T_spec'), MelSpectrogramType()), + }, ) 
def generate_spectrogram(self, tokens: 'torch.tensor', speaker: int = 0, sigma: float = 1.0) -> torch.tensor: self.eval() @@ -350,12 +354,14 @@ def _setup_tokenizer(self, cfg): if "phoneme_dict" in cfg.text_tokenizer.g2p: g2p_kwargs["phoneme_dict"] = self.register_artifact( - 'text_tokenizer.g2p.phoneme_dict', cfg.text_tokenizer.g2p.phoneme_dict, + 'text_tokenizer.g2p.phoneme_dict', + cfg.text_tokenizer.g2p.phoneme_dict, ) if "heteronyms" in cfg.text_tokenizer.g2p: g2p_kwargs["heteronyms"] = self.register_artifact( - 'text_tokenizer.g2p.heteronyms', cfg.text_tokenizer.g2p.heteronyms, + 'text_tokenizer.g2p.heteronyms', + cfg.text_tokenizer.g2p.heteronyms, ) text_tokenizer_kwargs["g2p"] = instantiate(cfg.text_tokenizer.g2p, **g2p_kwargs) @@ -374,29 +380,6 @@ def _setup_tokenizer(self, cfg): self.text_tokenizer_pad_id = text_tokenizer_pad_id self.tokens = tokens - def _setup_normalizer(self, cfg): - if "text_normalizer" in cfg: - normalizer_kwargs = {} - - if "whitelist" in cfg.text_normalizer: - normalizer_kwargs["whitelist"] = self.register_artifact( - 'text_normalizer.whitelist', cfg.text_normalizer.whitelist - ) - - try: - import nemo_text_processing - - self.normalizer = instantiate(cfg.text_normalizer, **normalizer_kwargs) - self.text_normalizer_call = self.normalizer.normalize - except Exception as e: - logging.error(e) - raise ImportError( - "`nemo_text_processing` not installed, see https://github.com/NVIDIA/NeMo-text-processing for more details" - ) - self.text_normalizer_call = self.normalizer.normalize - if "text_normalizer_call_kwargs" in cfg: - self.text_normalizer_call_kwargs = cfg.text_normalizer_call_kwargs - def parse(self, text: str, normalize=False) -> torch.Tensor: if self.training: logging.warning("parse() is meant to be called in eval mode.") @@ -479,7 +462,11 @@ def input_example(self, max_batch=1, max_dim=400): inp[inp == pad_id] = pad_id - 1 if pad_id > 0 else pad_id + 1 inputs.update( - {'speaker_id': speaker, 'speaker_id_text': speaker, 'speaker_id_attributes': speaker,} + { + 'speaker_id': speaker, + 'speaker_id_text': speaker, + 'speaker_id_attributes': speaker, + } ) new_inputs = { 'text': inp, @@ -495,11 +482,24 @@ def input_example(self, max_batch=1, max_dim=400): return (new_inputs,) def forward_for_export( - self, text, batch_lengths, speaker_id, speaker_id_text, speaker_id_attributes, pitch, pace, volume, + self, + text, + batch_lengths, + speaker_id, + speaker_id_text, + speaker_id_attributes, + pitch, + pace, + volume, ): if self.export_config["enable_ragged_batches"]: text, pitch, pace, volume_tensor, lens = batch_from_ragged( - text, pitch, pace, batch_lengths=batch_lengths, padding_idx=self.tokenizer_pad, volume=volume, + text, + pitch, + pace, + batch_lengths=batch_lengths, + padding_idx=self.tokenizer_pad, + volume=volume, ) if volume is not None: volume = volume_tensor diff --git a/nemo/collections/tts/models/tacotron2.py b/nemo/collections/tts/models/tacotron2.py index 3fcdee9832ef..2fb005d80ca6 100644 --- a/nemo/collections/tts/models/tacotron2.py +++ b/nemo/collections/tts/models/tacotron2.py @@ -322,29 +322,6 @@ def on_validation_epoch_end(self): self.log('val_loss', avg_loss) self.validation_step_outputs.clear() # free memory - def _setup_normalizer(self, cfg): - if "text_normalizer" in cfg: - normalizer_kwargs = {} - - if "whitelist" in cfg.text_normalizer: - normalizer_kwargs["whitelist"] = self.register_artifact( - 'text_normalizer.whitelist', cfg.text_normalizer.whitelist - ) - - try: - import nemo_text_processing - - self.normalizer = 
instantiate(cfg.text_normalizer, **normalizer_kwargs) - except Exception as e: - logging.error(e) - raise ImportError( - "`nemo_text_processing` not installed, see https://github.com/NVIDIA/NeMo-text-processing for more details" - ) - - self.text_normalizer_call = self.normalizer.normalize - if "text_normalizer_call_kwargs" in cfg: - self.text_normalizer_call_kwargs = cfg.text_normalizer_call_kwargs - def _setup_tokenizer(self, cfg): text_tokenizer_kwargs = {} if "g2p" in cfg.text_tokenizer and cfg.text_tokenizer.g2p is not None: @@ -362,12 +339,14 @@ def _setup_tokenizer(self, cfg): if "phoneme_dict" in cfg.text_tokenizer.g2p: g2p_kwargs["phoneme_dict"] = self.register_artifact( - 'text_tokenizer.g2p.phoneme_dict', cfg.text_tokenizer.g2p.phoneme_dict, + 'text_tokenizer.g2p.phoneme_dict', + cfg.text_tokenizer.g2p.phoneme_dict, ) if "heteronyms" in cfg.text_tokenizer.g2p: g2p_kwargs["heteronyms"] = self.register_artifact( - 'text_tokenizer.g2p.heteronyms', cfg.text_tokenizer.g2p.heteronyms, + 'text_tokenizer.g2p.heteronyms', + cfg.text_tokenizer.g2p.heteronyms, ) text_tokenizer_kwargs["g2p"] = instantiate(cfg.text_tokenizer.g2p, **g2p_kwargs) diff --git a/nemo/collections/tts/models/vits.py b/nemo/collections/tts/models/vits.py index 319221d04ee0..4a891fa8823e 100644 --- a/nemo/collections/tts/models/vits.py +++ b/nemo/collections/tts/models/vits.py @@ -92,28 +92,6 @@ def __init__(self, cfg: DictConfig, trainer: 'Trainer' = None): self.automatic_optimization = False - def _setup_normalizer(self, cfg): - if "text_normalizer" in cfg: - normalizer_kwargs = {} - - if "whitelist" in cfg.text_normalizer: - normalizer_kwargs["whitelist"] = self.register_artifact( - 'text_normalizer.whitelist', cfg.text_normalizer.whitelist - ) - - try: - import nemo_text_processing - - self.normalizer = instantiate(cfg.text_normalizer, **normalizer_kwargs) - self.text_normalizer_call = self.normalizer.normalize - except Exception as e: - logging.error(e) - raise ImportError( - "`nemo_text_processing` not installed, see https://github.com/NVIDIA/NeMo-text-processing for more details" - ) - if "text_normalizer_call_kwargs" in cfg: - self.text_normalizer_call_kwargs = cfg.text_normalizer_call_kwargs - def _setup_tokenizer(self, cfg): text_tokenizer_kwargs = {} if "g2p" in cfg.text_tokenizer and cfg.text_tokenizer.g2p is not None: @@ -131,12 +109,14 @@ def _setup_tokenizer(self, cfg): if "phoneme_dict" in cfg.text_tokenizer.g2p: g2p_kwargs["phoneme_dict"] = self.register_artifact( - 'text_tokenizer.g2p.phoneme_dict', cfg.text_tokenizer.g2p.phoneme_dict, + 'text_tokenizer.g2p.phoneme_dict', + cfg.text_tokenizer.g2p.phoneme_dict, ) if "heteronyms" in cfg.text_tokenizer.g2p: g2p_kwargs["heteronyms"] = self.register_artifact( - 'text_tokenizer.g2p.heteronyms', cfg.text_tokenizer.g2p.heteronyms, + 'text_tokenizer.g2p.heteronyms', + cfg.text_tokenizer.g2p.heteronyms, ) text_tokenizer_kwargs["g2p"] = instantiate(cfg.text_tokenizer.g2p, **g2p_kwargs) @@ -164,8 +144,14 @@ def configure_optimizers(self): sched_config = optim_config.pop("sched", None) OmegaConf.set_struct(optim_config, True) - optim_g = instantiate(optim_config, params=self.net_g.parameters(),) - optim_d = instantiate(optim_config, params=self.net_d.parameters(),) + optim_g = instantiate( + optim_config, + params=self.net_g.parameters(), + ) + optim_d = instantiate( + optim_config, + params=self.net_d.parameters(), + ) if sched_config is not None: if sched_config.name == 'ExponentialLR': @@ -173,10 +159,14 @@ def configure_optimizers(self): scheduler_g = 
torch.optim.lr_scheduler.ExponentialLR(optim_g, gamma=sched_config.lr_decay) elif sched_config.name == 'CosineAnnealing': scheduler_g = CosineAnnealing( - optimizer=optim_g, max_steps=sched_config.max_steps, min_lr=sched_config.min_lr, + optimizer=optim_g, + max_steps=sched_config.max_steps, + min_lr=sched_config.min_lr, ) scheduler_d = CosineAnnealing( - optimizer=optim_d, max_steps=sched_config.max_steps, min_lr=sched_config.min_lr, + optimizer=optim_d, + max_steps=sched_config.max_steps, + min_lr=sched_config.min_lr, ) else: raise ValueError("Unknown optimizer.") @@ -362,7 +352,9 @@ def _loader(self, cfg): text_tokenizer=self.tokenizer, ) return torch.utils.data.DataLoader( # noqa - dataset=dataset, collate_fn=dataset.collate_fn, **cfg.dataloader_params, + dataset=dataset, + collate_fn=dataset.collate_fn, + **cfg.dataloader_params, ) def train_dataloader(self): @@ -377,7 +369,10 @@ def train_dataloader(self): train_sampler = DistributedBucketSampler(dataset, **self.cfg.train_ds.batch_sampler) dataloader = torch.utils.data.DataLoader( - dataset, collate_fn=dataset.collate_fn, batch_sampler=train_sampler, **self.cfg.train_ds.dataloader_params, + dataset, + collate_fn=dataset.collate_fn, + batch_sampler=train_sampler, + **self.cfg.train_ds.dataloader_params, ) return dataloader @@ -412,7 +407,9 @@ def list_available_models(cls) -> 'List[PretrainedModelInfo]': return list_of_models @typecheck( - input_types={"tokens": NeuralType(('B', 'T_text'), TokenIndex(), optional=True),}, + input_types={ + "tokens": NeuralType(('B', 'T_text'), TokenIndex(), optional=True), + }, output_types={"audio": NeuralType(('B', 'T_audio'), AudioSignal())}, ) def convert_text_to_waveform(self, *, tokens, speakers=None): diff --git a/nemo/core/optim/mcore_optim.py b/nemo/core/optim/mcore_optim.py index c0f3464a1e7c..76929325fdfc 100644 --- a/nemo/core/optim/mcore_optim.py +++ b/nemo/core/optim/mcore_optim.py @@ -61,7 +61,7 @@ def sharded_state_dict( model_sharded_state_dict, is_loading=is_loading, sharding_type=sharding_type ) - def step(self, closure): + def step(self, closure=None): """Clip gradients (if needed) and step the base optimizer. Always return successful since there is no overflow.""" # Apply closure diff --git a/nemo/export/quantize/quantizer.py b/nemo/export/quantize/quantizer.py index de5b07787a1f..cbf3ea39921e 100644 --- a/nemo/export/quantize/quantizer.py +++ b/nemo/export/quantize/quantizer.py @@ -18,7 +18,7 @@ import torch import torch.distributed as dist -from megatron.core import mpu, parallel_state +from megatron.core import parallel_state from megatron.core.transformer.module import Float16Module from omegaconf.omegaconf import DictConfig, open_dict diff --git a/nemo/export/tensorrt_llm.py b/nemo/export/tensorrt_llm.py index 4aae83efd6f6..fb43224d59a9 100644 --- a/nemo/export/tensorrt_llm.py +++ b/nemo/export/tensorrt_llm.py @@ -176,6 +176,7 @@ def export( multiple_profiles: bool = False, gpt_attention_plugin: str = "auto", gemm_plugin: str = "auto", + use_mcore_path: bool = False, reduce_fusion: bool = True, fp8_quantized: Optional[bool] = None, fp8_kvcache: Optional[bool] = None, @@ -213,11 +214,11 @@ def export( multiple_profiles: (bool): enables multiple profiles feature of TRT-LLM. Default = False gpt_attention_plugin (str): enable the gpt attention plugin. Default = "auto" gemm_plugin (str): enable the gpt plugin. 
Default = "auto" + use_mcore_path (bool) : Use the more recent mcore path for export reduce_fusion (bool): enables fusing extra kernels after custom TRT-LLM allReduce fp8_quantized (Optional[bool]): enables exporting to FP8 TRT-LLM checkpoints. If not set, autodetects the type. fp8_kvcache (Optional[bool]): enables FP8 KV-cache quantization. If not set, autodetects the type. """ - if n_gpus is not None: warnings.warn( "Parameter n_gpus is deprecated and will be removed in the next release. " @@ -326,56 +327,175 @@ def export( "Supported model types are: {1}.".format(model_type, self.get_supported_models_list) ) - if model_type == "gpt" or model_type == "starcoder": - model_type = "gptnext" + model, model_configs, self.tokenizer = load_nemo_model(nemo_checkpoint_path, nemo_export_dir) + if use_mcore_path: + from megatron.core.export.data_type import DataType + from megatron.core.export.export_config import ExportConfig + from megatron.core.export.model_type import ModelType + from megatron.core.export.trtllm.model_to_trllm_mapping.default_conversion_dict import ( + DEFAULT_CONVERSION_DICT, + ) + from megatron.core.export.trtllm.trtllm_helper import TRTLLMHelper + from megatron.core.transformer.transformer_config import TransformerConfig + from tensorrt_llm.layers import MoeConfig + + def get_transformer_config(nemo_model_config): + normalization = nemo_model_config.get('normalization', 'layernorm') + transformer_config_normalization = 'LayerNorm' + layernorm_zero_centered_gamma = False + if normalization == 'layernorm1p': + layernorm_zero_centered_gamma = True + elif normalization == 'rmsnorm': + transformer_config_normalization = 'RMSNorm' + + conf = TransformerConfig( + num_layers=nemo_model_config.get('num_layers'), + moe_router_topk=nemo_model_config.get('moe_router_topk', 0), + num_attention_heads=nemo_model_config.get('num_attention_heads'), + num_query_groups=nemo_model_config.get( + 'num_query_groups', nemo_model_config['num_attention_heads'] + ), + kv_channels=nemo_model_config.get("kv_channels", None), + hidden_size=nemo_model_config.get('hidden_size'), + ffn_hidden_size=nemo_model_config.get('ffn_hidden_size'), + layernorm_epsilon=nemo_model_config.get('layernorm_epsilon'), + add_bias_linear=nemo_model_config.get('bias'), + num_moe_experts=nemo_model_config.get('num_moe_experts', None), + normalization=transformer_config_normalization, + layernorm_zero_centered_gamma=layernorm_zero_centered_gamma, + ) - if model_type == "mixtral": - model_type = "llama" + return conf + + # We build the transformer config using the nemo model config. + transformer_config = get_transformer_config(model_configs) + input_model_type = getattr(ModelType, model_type) + + # MCore export supports some default conversion dictionaries + mcore_model_conversion_dict = DEFAULT_CONVERSION_DICT[input_model_type] + # All Mcore conversion dicts start with "decoder.layers.4.blah.blah" , while nemo models start with "model.decoder.layers.4.blahblah". so we append model. 
to the keys + nemo_model_conversion_dict = { + f'model.{key}': value for key, value in mcore_model_conversion_dict.items() + } + + trtllm_helper = TRTLLMHelper( + transformer_config=transformer_config, + model_type=input_model_type, + trtllm_conversion_dict=nemo_model_conversion_dict, + position_embedding_type=model_configs.get('position_embedding_type'), + max_position_embeddings=model_configs.get('max_position_embeddings'), + rotary_percentage=model_configs.get('rotary_percentage', 1.0), + rotary_base=model_configs.get('rotary_base', 10000), + moe_tp_mode=model_configs.get('moe_tp_mode', 2), + multi_query_mode=model_configs.get("multi_query_mode", False), + activation=model_configs.get('activation', "gelu"), + seq_len_interpolation_factor=model_configs.get("seq_len_interpolation_factor"), + moe_renorm_mode=model_configs.get( + 'moe_renorm_mode', MoeConfig.ExpertScaleNormalizationMode.RENORMALIZE + ), + share_embeddings_and_output_weights=model_configs.get( + "share_embeddings_and_output_weights", False + ), + ) - model, model_configs, self.tokenizer = load_nemo_model(nemo_checkpoint_path, nemo_export_dir) - weights_dicts, model_configs = model_to_trtllm_ckpt( - model=model, - nemo_model_config=model_configs, - nemo_export_dir=nemo_export_dir, - decoder_type=model_type, - dtype=dtype, - tensor_parallel_size=tensor_parallelism_size, - pipeline_parallel_size=pipeline_parallelism_size, - gpus_per_node=gpus_per_node, - use_parallel_embedding=use_parallel_embedding, - use_embedding_sharing=use_embedding_sharing, - fp8_quantized=fp8_quantized, - fp8_kvcache=fp8_kvcache, - ) + input_dtype = getattr(DataType, dtype) + export_config = ExportConfig( + tensor_parallelism_size, + pipeline_parallelism_size, + use_parallel_embedding, + use_embedding_sharing, + ) - for weight_dict, model_config in zip(weights_dicts, model_configs): - build_and_save_engine( - max_input_len=max_input_len, - max_output_len=max_output_len, - max_batch_size=max_batch_size, - model_config=model_config, - model_weights=weight_dict, - model_dir=self.model_dir, - model_type=model_type, - lora_ckpt_list=self.lora_ckpt_list, - use_lora_plugin=use_lora_plugin, - max_lora_rank=max_lora_rank, - lora_target_modules=lora_target_modules, - max_prompt_embedding_table_size=max_prompt_embedding_table_size, - paged_kv_cache=paged_kv_cache, - remove_input_padding=remove_input_padding, - paged_context_fmha=paged_context_fmha, - max_num_tokens=max_num_tokens, - opt_num_tokens=opt_num_tokens, - max_seq_len=max_seq_len, - multiple_profiles=multiple_profiles, - gpt_attention_plugin=gpt_attention_plugin, - gemm_plugin=gemm_plugin, + trtllm_model_weights_list, trtllm_model_config_list = ( + trtllm_helper.get_trtllm_pretrained_config_and_model_weights( + model_state_dict=model, + export_config=export_config, + dtype=input_dtype, + state_dict_split_by_layer_numbers=False, + ) + ) + + for trtllm_model_weights, trtllm_model_config in zip( + trtllm_model_weights_list, trtllm_model_config_list + ): + trtllm_helper.build_and_save_engine( + max_input_len=max_input_len, + max_output_len=max_output_len, + max_batch_size=max_batch_size, + engine_dir=self.model_dir, + trtllm_model_weights=trtllm_model_weights, + trtllm_model_config=trtllm_model_config, + lora_ckpt_list=self.lora_ckpt_list, + use_lora_plugin=use_lora_plugin, + max_lora_rank=max_lora_rank, + lora_target_modules=lora_target_modules, + max_prompt_embedding_table_size=max_prompt_embedding_table_size, + paged_kv_cache=paged_kv_cache, + remove_input_padding=remove_input_padding, + 
paged_context_fmha=paged_context_fmha, + use_refit=False, + max_num_tokens=max_num_tokens, + max_seq_len=max_seq_len, + opt_num_tokens=opt_num_tokens, + max_beam_width=1, + tokens_per_block=128, + multiple_profiles=multiple_profiles, + gpt_attention_plugin=gpt_attention_plugin, + gemm_plugin=gemm_plugin, + ) + else: + if model_type == "gpt" or model_type == "starcoder": + model_type = "gptnext" + + if model_type == "mixtral": + model_type = "llama" + + weights_dicts, model_configs = model_to_trtllm_ckpt( + model=model, + nemo_model_config=model_configs, + nemo_export_dir=nemo_export_dir, + decoder_type=model_type, + dtype=dtype, + tensor_parallel_size=tensor_parallelism_size, + pipeline_parallel_size=pipeline_parallelism_size, + gpus_per_node=gpus_per_node, + use_parallel_embedding=use_parallel_embedding, + use_embedding_sharing=use_embedding_sharing, + fp8_quantized=fp8_quantized, + fp8_kvcache=fp8_kvcache, ) + for weight_dict, model_config in zip(weights_dicts, model_configs): + build_and_save_engine( + max_input_len=max_input_len, + max_output_len=max_output_len, + max_batch_size=max_batch_size, + model_config=model_config, + model_weights=weight_dict, + model_dir=self.model_dir, + model_type=model_type, + lora_ckpt_list=self.lora_ckpt_list, + use_lora_plugin=use_lora_plugin, + max_lora_rank=max_lora_rank, + lora_target_modules=lora_target_modules, + max_prompt_embedding_table_size=max_prompt_embedding_table_size, + paged_kv_cache=paged_kv_cache, + remove_input_padding=remove_input_padding, + paged_context_fmha=paged_context_fmha, + max_num_tokens=max_num_tokens, + opt_num_tokens=opt_num_tokens, + max_seq_len=max_seq_len, + multiple_profiles=multiple_profiles, + gpt_attention_plugin=gpt_attention_plugin, + gemm_plugin=gemm_plugin, + ) + tokenizer_path = os.path.join(nemo_export_dir, "tokenizer.model") + tokenizer_path_nemo2 = os.path.join(nemo_export_dir, "nemo_context") if os.path.exists(tokenizer_path): shutil.copy(tokenizer_path, self.model_dir) + elif os.path.exists(tokenizer_path_nemo2): + shutil.copytree(tokenizer_path_nemo2, Path(self.model_dir) / "nemo_context") else: self.tokenizer.save_pretrained(os.path.join(self.model_dir, 'huggingface_tokenizer')) @@ -451,7 +571,6 @@ def convert_to_safe_tensors( weight_dict[k] = numpy_to_torch(v) safetensors.torch.save_file(weight_dict, os.path.join(self.model_dir, f'rank{rank}.safetensors')) - model_configs[0].to_json_file(os.path.join(self.model_dir, 'config.json')) tokenizer_path = os.path.join(nemo_export_dir, "tokenizer.model") diff --git a/nemo/export/trt_llm/converter/model_to_trt_llm_ckpt.py b/nemo/export/trt_llm/converter/model_to_trt_llm_ckpt.py index 148e67307ae1..db1aec0f5a55 100644 --- a/nemo/export/trt_llm/converter/model_to_trt_llm_ckpt.py +++ b/nemo/export/trt_llm/converter/model_to_trt_llm_ckpt.py @@ -157,7 +157,8 @@ def convert_model_to_trt_llm_ckpt( num_kv_heads = num_attention_heads export_config = { - "apply_layernorm_1p": nemo_model_config.get("normalization", "") == "layernorm1p", + "apply_layernorm_1p": nemo_model_config.get("normalization", "") == "layernorm1p" + or nemo_model_config.get("layernorm_zero_centered_gamma", False), "tp_size": training_tp_size, "split_gated_activation": nemo_model_config.get("activation", "gelu") in ["swiglu", "geglu", "fast-swiglu", "fast-geglu"] @@ -195,7 +196,7 @@ def handle_model_level_weights(model, tp_idx: int, pp_idx: int): val = val.to(storage_type).cpu() model_level_weights["transformer.vocab_embedding.weight"].append(val) - if has_lm_head and pp_idx == training_pp_size - 1: + 
if has_lm_head and pp_idx == training_pp_size - 1 and decoder_type != "gemma": val = model.get("state_dict", model)[get_layer_name("output_layer", prefix)] val = val.to(storage_type).cpu() model_level_weights["lm_head.weight"].append(val) diff --git a/nemo/export/trt_llm/nemo_ckpt_loader/nemo_file.py b/nemo/export/trt_llm/nemo_ckpt_loader/nemo_file.py index 74e0aac758da..171932d84cfb 100644 --- a/nemo/export/trt_llm/nemo_ckpt_loader/nemo_file.py +++ b/nemo/export/trt_llm/nemo_ckpt_loader/nemo_file.py @@ -17,6 +17,8 @@ import json import logging import os +import re +import shutil from io import BytesIO from pathlib import Path from typing import Dict, List, Optional, Tuple, Union @@ -114,6 +116,11 @@ def load_scaling_factors(state_dict: dict, basename: str, size: int) -> Optional return load_scales_from_bytes(bytes_list) +def filter_experts_extra_states(state_dict: dict): + pattern = r'module\.decoder\.layers\.mlp\.experts\.experts\.linear_fc\d+\._extra_state/shard_\d+\.\d+_\d+\.\d+' + return {k: v for k, v in state_dict.items() if not re.fullmatch(pattern, k)} + + def standarize_distributed_scaling_factors(state_dict: dict) -> dict: while key := get_extra_state_key(state_dict): basename, size = unpack_extra_state_key(key) @@ -144,6 +151,7 @@ def load_sharded_metadata_torch_dist(checkpoint_dir: Union[Path, TarPath], torch storage_reader=fs_reader, no_dist=True, ) + state_dict = filter_experts_extra_states(state_dict) state_dict = standarize_distributed_scaling_factors(state_dict) if not torch_tensor: @@ -277,12 +285,20 @@ def copy_tokenizer_files(config, out_dir): def get_tokenzier(tokenizer_dir_or_path: Path) -> PreTrainedTokenizer: """Loads the tokenizer from the decoded NEMO weights dir.""" - if os.path.isdir(os.path.join(tokenizer_dir_or_path, "huggingface_tokenizer")): - return AutoTokenizer.from_pretrained(os.path.join(tokenizer_dir_or_path, "huggingface_tokenizer")) + if (tokenizer_dir_or_path / "nemo_context").exists(): + from nemo.lightning import io + + tokenizer_spec = io.load_context((tokenizer_dir_or_path / "nemo_context"), subpath="model.tokenizer") + return build_tokenizer(tokenizer_spec) + else: + if os.path.isdir(os.path.join(tokenizer_dir_or_path, "huggingface_tokenizer")): + return AutoTokenizer.from_pretrained(os.path.join(tokenizer_dir_or_path, "huggingface_tokenizer")) - model_path = tokenizer_dir_or_path / "tokenizer.model" if tokenizer_dir_or_path.is_dir() else tokenizer_dir_or_path - tokenizer_config = {"library": "sentencepiece", "model": str(model_path)} - return build_tokenizer(tokenizer_config) + model_path = ( + tokenizer_dir_or_path / "tokenizer.model" if tokenizer_dir_or_path.is_dir() else tokenizer_dir_or_path + ) + tokenizer_config = {"library": "sentencepiece", "model": str(model_path)} + return build_tokenizer(tokenizer_config) def build_tokenizer(tokenizer): @@ -309,6 +325,7 @@ def build_tokenizer(tokenizer): def batch_encode_patch(self, ids): if torch.is_tensor(ids): ids = ids.cpu().numpy() + ids = ids[0] if len(ids.shape) > 1 else ids return self.ids_to_text(ids) tokenizer.bos_token_id = tokenizer.bos_id @@ -331,11 +348,13 @@ def load_nemo_model(nemo_ckpt: Union[str, Path], nemo_export_dir: Union[str, Pat else: nemo_dir = TarPath(nemo_ckpt) + tokenizer = None try: unpacked_checkpoint_dir = UnpackedNemoCheckpointDir(nemo_dir, load_checkpoints_to_cpu=True) - dist_ckpt_folder = nemo_dir / "model_weights" - if dist_ckpt_folder.exists(): + if (nemo_dir / "model_weights").exists(): + dist_ckpt_folder = nemo_dir / "model_weights" + model = 
load_sharded_metadata(dist_ckpt_folder) nemo_model_config = unpacked_checkpoint_dir.model_config @@ -350,6 +369,45 @@ def load_nemo_model(nemo_ckpt: Union[str, Path], nemo_export_dir: Union[str, Pat tokenizer_config["model"] = os.path.join(nemo_export_dir, "tokenizer.model") tokenizer = build_tokenizer(tokenizer_config) + elif (nemo_dir / "weights").exists(): + dist_ckpt_folder = nemo_dir / "weights" + model = load_sharded_metadata(dist_ckpt_folder) + io_folder = nemo_dir / "context" + + if (io_folder / "model.yaml").exists(): + with open(io_folder / "model.yaml", 'r') as stream: + config = yaml.safe_load(stream) + + nemo_model_config = {} + for k, v in config["config"].items(): + if isinstance(v, (float, int, str, bool)): + nemo_model_config[k] = v + elif k == "activation_func": + nemo_model_config["activation"] = v["_target_"].rsplit('.', 1)[-1] + else: + from nemo.lightning import io + + config = io.load_context(io_folder, subpath="model.config") + + nemo_model_config = {} + for k, v in config.__dict__.items(): + if isinstance(v, (float, int, str, bool)): + nemo_model_config[k] = v + elif k == "activation_func": + nemo_model_config["activation"] = v.__name__ + + if nemo_model_config.get("num_moe_experts") is None: + nemo_model_config["num_moe_experts"] = 0 + nemo_model_config["moe_router_topk"] = 0 + if nemo_model_config["activation"] == "silu": + nemo_model_config["activation"] = "fast-swiglu" + elif nemo_model_config["activation"] == "openai_gelu": + nemo_model_config["activation"] = "geglu" + + nemo_model_config["mcore_gpt"] = True + nemo_model_config["max_position_embeddings"] = nemo_model_config.get("seq_length", 4096) + + shutil.copytree(io_folder, nemo_export_dir / "nemo_context") else: raise Exception("Not a supported NeMo file format: only distributed MCore NeMo checkpoints are supported.") finally: diff --git a/nemo/export/trt_llm/qnemo/qnemo_to_tensorrt_llm.py b/nemo/export/trt_llm/qnemo/qnemo_to_tensorrt_llm.py index 47d6b635c14d..7a1f7a6cc31d 100644 --- a/nemo/export/trt_llm/qnemo/qnemo_to_tensorrt_llm.py +++ b/nemo/export/trt_llm/qnemo/qnemo_to_tensorrt_llm.py @@ -18,6 +18,7 @@ import warnings from typing import List, Optional +import tensorrt_llm from tensorrt_llm.models import PretrainedConfig from nemo.export.trt_llm.qnemo.utils import CONFIG_NAME, WEIGHTS_NAME @@ -65,10 +66,14 @@ def qnemo_to_tensorrt_llm( quant_algo = config.quantization.quant_algo - use_fused_mlp = quant_algo in [ - "FP8", - None, - ] and config.hidden_act in ["silu", "swiglu", "fast-swiglu", "gelu", "geglu"] + use_fused_mlp = True + if config.quantization.exclude_modules: + for module_name in config.quantization.exclude_modules: + # For AutoQuant, fc and gate might not be quantized at the same time + # TODO: relax this limitation on the TRT-LLM side + if "gate" in module_name or "fc" in module_name: + use_fused_mlp = False + use_fused_mlp = use_fused_mlp and 'RecurrentGemma' not in config.architecture use_qdq = quant_algo in ["FP8", "W8A8_SQ_PER_CHANNEL"] @@ -86,19 +91,19 @@ def qnemo_to_tensorrt_llm( build_cmd += f"--max_beam_width {max_beam_width} " build_cmd += f"--max_prompt_embedding_table_size {max_prompt_embedding_table_size} " build_cmd += f"--builder_opt {builder_opt} " - build_cmd += f"--gpt_attention_plugin {config.dtype} " - build_cmd += f"--nccl_plugin {config.dtype} " build_cmd += f"--paged_kv_cache {'enable' if paged_kv_cache else 'disable'} " build_cmd += f"--use_paged_context_fmha {'enable' if paged_context_fmha else 'disable'} " build_cmd += f"--remove_input_padding {'enable' if 
remove_input_padding else 'disable'} " build_cmd += f"--multiple_profiles {'enable' if multiple_profiles else 'disable'} " build_cmd += f"--reduce_fusion {'enable' if reduce_fusion else 'disable'} " - - if use_fused_mlp: - build_cmd += "--use_fused_mlp " if "RecurrentGemma" not in config.architecture else "" + # TODO: resolve version check for setting use_fused_mlp once we move to 0.13.0 in the NeMo container + if tensorrt_llm.__version__ >= "0.13.0": + build_cmd += f"--use_fused_mlp {'enable' if use_fused_mlp else 'disable'} " + else: + build_cmd += "--use_fused_mlp " if use_fused_mlp else "" if not use_qdq: - build_cmd += f"--gemm_plugin {config.dtype} " + build_cmd += f"--gemm_plugin auto " if max_seq_len is not None: build_cmd += f"--max_seq_len {max_seq_len} " diff --git a/nemo/export/vllm_exporter.py b/nemo/export/vllm_exporter.py index 934df815065f..24905e6e6956 100644 --- a/nemo/export/vllm_exporter.py +++ b/nemo/export/vllm_exporter.py @@ -144,6 +144,11 @@ def export( max_seq_len_to_capture=None, ) + if model_config.nemo_model_config.get("fp8", False): + LOGGER.warning( + "NeMo FP8 checkpoint detected, but exporting FP8 quantized engines is not supported for vLLM." + ) + parallel_config = ParallelConfig( pipeline_parallel_size=pipeline_parallel_size, tensor_parallel_size=tensor_parallel_size ) diff --git a/nemo/lightning/_strategy_lib.py b/nemo/lightning/_strategy_lib.py index a047e32537c1..abcd4556602f 100644 --- a/nemo/lightning/_strategy_lib.py +++ b/nemo/lightning/_strategy_lib.py @@ -68,6 +68,9 @@ def init_parallel_ranks( init_local_rank = app_state.local_rank else: init_world_size = world_size + pp = parallel_config.pipeline_model_parallel_size or 1 + if world_size < pp: + raise ValueError(f"Expected world_size ({world_size}) to be greater than/equal to pipeline size ({pp})") init_global_rank = global_rank init_local_rank = local_rank @@ -154,6 +157,10 @@ def set_model_parallel_attributes(model, parallelism): setattr(config, attr_name, getattr(parallelism, attr_name)) if hasattr(config, "__io__"): setattr(config.__io__, attr_name, getattr(parallelism, attr_name)) + if hasattr(config, '__post_init__'): + # MCore does not use args in __post_init__ + # @akoumparouli: is there a better way (e.g. reinit config)? 
+ config.__post_init__() return config @@ -603,15 +610,17 @@ def sharded_state_dict( ) return state_dict + # megatron optimizer expects McoreDDP + ddp_modules = [m.module for m in model] mcore_opt = get_megatron_optimizer( config, - list(model), + ddp_modules, no_weight_decay_cond=no_weight_decay_cond, scale_lr_cond=scale_lr_cond, lr_mult=lr_mult, ) - if getattr(model.ddp_config, "overlap_param_sync", False) and getattr( + if getattr(model.ddp_config, "overlap_param_gather", False) and getattr( model.ddp_config, "align_param_gather", False ): param_sync_func = [model_chunk.start_param_sync for model_chunk in model] diff --git a/nemo/lightning/fabric/fabric.py b/nemo/lightning/fabric/fabric.py index 60da546fd2b3..55431d940193 100644 --- a/nemo/lightning/fabric/fabric.py +++ b/nemo/lightning/fabric/fabric.py @@ -4,8 +4,9 @@ import fiddle as fdl import lightning_fabric as lb +import pytorch_lightning as pl from torch import nn -from torch.optim import Optimizer + from typing_extensions import Self, override from nemo.lightning.io.mixin import IOMixin, serialization, track_io @@ -130,6 +131,14 @@ def setup_module(self, module: nn.Module, move_to_device: bool = True, _reapply_ return out + def setup_datamodule(self, datamodule: pl.LightningDataModule, stage: str = "") -> pl.LightningDataModule: + datamodule.setup(stage) + + if hasattr(self.strategy, "process_datamodule"): + datamodule = self.strategy.process_datamodule(datamodule) + + return datamodule + @runtime_checkable class DistributedModel(Protocol[ModelT]): diff --git a/nemo/lightning/fabric/plugins.py b/nemo/lightning/fabric/plugins.py index 513d6b86e62a..4026fb9b549e 100644 --- a/nemo/lightning/fabric/plugins.py +++ b/nemo/lightning/fabric/plugins.py @@ -112,7 +112,6 @@ def convert_config(self, config: ConfigT) -> ConfigT: """Convert the config to the precision type this plugin handles. This is optional and depends on the precision limitations during optimization. - """ return update_config_with_dtype_overrides(self.dtype_config, config) @@ -122,6 +121,9 @@ def convert_module(self, module: nn.Module) -> nn.Module: This is optional and depends on the precision limitations during optimization. """ + if not hasattr(module, "module"): + return module + from megatron.core.transformer.module import Float16Module from megatron.core.utils import get_model_config @@ -141,7 +143,6 @@ def convert_optimizer(self, optimizer: Optimizer) -> Optimizer: """Convert the optimizer parameters to the precision type this plugin handles. This is optional and depends on the precision limitations during optimization. 
- """ for optim_config in get_optim_config(optimizer): assert optim_config.bf16 == self.dtype_config.bf16, "BF16 model/optim config mismatch" diff --git a/nemo/lightning/fabric/strategies.py b/nemo/lightning/fabric/strategies.py index 789a0b1f328f..695595bca4d0 100644 --- a/nemo/lightning/fabric/strategies.py +++ b/nemo/lightning/fabric/strategies.py @@ -26,6 +26,7 @@ from lightning_fabric.utilities.types import _PATH, _Stateful from megatron.core.distributed import DistributedDataParallelConfig from megatron.core.optimizer import OptimizerConfig +from pytorch_lightning import LightningDataModule from pytorch_lightning.loops.fetchers import _DataFetcher from pytorch_lightning.plugins.io.wrapper import _WrappingCheckpointIO from pytorch_lightning.utilities.combined_loader import CombinedLoader @@ -106,6 +107,7 @@ def __init__( if megatron_callbacks: self.megatron_callbacks.add(megatron_callbacks) self.output_data_idx = output_data_idx + self.data_sampler: Optional["DataSampler"] = data_sampler # used in NVIDIA NGC PyTorch containers _strategy_lib.enable_nvidia_optimizations() @@ -141,13 +143,25 @@ def _setup_distributed(self) -> None: # _strategy_lib.initialize_data(self.cluster_environment.global_rank(), self.data_config) _strategy_lib.init_model_parallel() + def process_datamodule(self, datamodule: LightningDataModule) -> LightningDataModule: + datamodule.setup() + + if not self.data_sampler and hasattr(datamodule, "data_sampler"): + self.data_sampler = datamodule.data_sampler + + if self.data_sampler: + self.data_sampler.setup(self.cluster_environment.global_rank()) + + return datamodule + @override def process_dataloader(self, dataloader: DataLoader) -> Iterator: - loader = _strategy_lib.process_dataloader(dataloader, self.data_config) + if self.data_sampler: + dataloader = self.data_sampler.transform_dataloader(dataloader) # Code taken from: https://github.com/Lightning-AI/pytorch-lightning/blob/6cbe9ceb560d798892bdae9186291acf9bf5d2e3/src/lightning/pytorch/loops/fit_loop.py#L258-L260 - output = _MegatronDataLoaderIterDataFetcher(self.data_config, output_data_idx=self.output_data_idx) - output.setup(CombinedLoader(loader, "max_size_cycle")) + output = _MegatronDataLoaderIterDataFetcher(output_data_idx=self.output_data_idx) + output.setup(CombinedLoader(dataloader, "max_size_cycle")) iter(output) return output @@ -160,6 +174,11 @@ def setup_megatron_optimizer( scale_lr_cond: Optional[Callable] = None, lr_mult: float = 1.0, ) -> Optimizer: + if hasattr(self.precision, "convert_config"): + optimizer_config = self.precision.convert_config(optimizer_config) + + assert optimizer_config.lr is not None, "Learning rate must be set in optimizer config" + return _strategy_lib.setup_megatron_optimizer( model, optimizer_config, @@ -180,16 +199,23 @@ def setup_optimizer(self, optimizer: Optimizer) -> Optimizer: @override def setup_module(self, module: Module) -> MegatronParallel: - _strategy_lib.set_model_parallel_attributes(module, self.parallelism) + from megatron.core.utils import get_model_config - # Call configure_model if it's overridden (relevant for LightningModules with lazy initialization) - if hasattr(module, "configure_model"): - module.configure_model() + _strategy_lib.set_model_parallel_attributes(module, self.parallelism) convert_module_fn = None if hasattr(self.precision, "convert_module"): convert_module_fn = self.precision.convert_module + if hasattr(self.precision, "convert_config"): + self.precision.convert_config(get_model_config(module)) + if self.ddp_config: + 
self.precision.convert_config(self.ddp_config) + + # Call configure_model if it's overridden (relevant for LightningModules with lazy initialization) + if hasattr(module, "configure_model"): + module.configure_model() + megatron_parallel = MegatronParallel( module, precision_plugin=self.precision, @@ -202,6 +228,9 @@ def setup_module(self, module: Module) -> MegatronParallel: if self._init_model_parallel: megatron_parallel.init_model_parallel() + if self.data_sampler: + megatron_parallel.callbacks.add(self.data_sampler) + if not self.ddp_config: from megatron.core import mpu @@ -321,13 +350,20 @@ def load_module_state_dict( @contextmanager def megatron_context(self) -> Generator[None, None, None]: - def monkey_patched(config): - return {"device": "meta"} - from megatron.core.extensions import transformer_engine as _te original = _te._get_extra_te_kwargs # noqa: SLF001 - _te._get_extra_te_kwargs = monkey_patched # noqa: SLF001 + + def _get_extra_te_kwargs_meta(c): + """Forces device to meta""" + kwargs = original(c) + kwargs['device'] = 'meta' + return kwargs + + _te._get_extra_te_kwargs = _get_extra_te_kwargs_meta # noqa: SLF001 + + _orig_perform_initialization = self.parallelism.perform_initialization + _orig_use_cpu_initialization = self.parallelism.use_cpu_initialization self.parallelism.perform_initialization = False self.parallelism.use_cpu_initialization = True @@ -335,6 +371,8 @@ def monkey_patched(config): yield _te._get_extra_te_kwargs = original # noqa: SLF001 + self.parallelism.perform_initialization = _orig_perform_initialization + self.parallelism.use_cpu_initialization = _orig_use_cpu_initialization @property @override @@ -364,9 +402,8 @@ def parallelism(self): # TODO: Fix this class _MegatronDataLoaderIterDataFetcher(_DataFetcher): - def __init__(self, data_config, *args: Any, output_data_idx: bool = False, **kwargs: Any) -> None: + def __init__(self, *args: Any, output_data_idx: bool = False, **kwargs: Any) -> None: super().__init__(*args, **kwargs) - self.data_config = data_config self.output_data_idx = output_data_idx self._batch: Any = None self._batch_idx: int = 0 diff --git a/nemo/lightning/io/__init__.py b/nemo/lightning/io/__init__.py index 2dcc53945fff..d53fa1e5f57e 100644 --- a/nemo/lightning/io/__init__.py +++ b/nemo/lightning/io/__init__.py @@ -1,3 +1,4 @@ +from nemo.lightning.io import registry # noqa: F401 from nemo.lightning.io.api import export_ckpt, import_ckpt, load, load_context, model_exporter, model_importer from nemo.lightning.io.capture import reinit from nemo.lightning.io.connector import Connector, ModelConnector @@ -5,7 +6,6 @@ from nemo.lightning.io.pl import TrainerContext, is_distributed_ckpt from nemo.lightning.io.state import TransformCTX, apply_transforms, state_transform - __all__ = [ "apply_transforms", "Connector", diff --git a/nemo/lightning/io/api.py b/nemo/lightning/io/api.py index 1bbbe43f8df9..643b671d1d85 100644 --- a/nemo/lightning/io/api.py +++ b/nemo/lightning/io/api.py @@ -1,9 +1,7 @@ from pathlib import Path -from typing import Any, Callable, Optional, Type, TypeVar +from typing import Callable, Optional, Type -import fiddle as fdl import pytorch_lightning as pl -from fiddle._src.experimental import serialization from nemo.lightning.io.mixin import ConnectorMixin, ConnT, ModelConnector, load from nemo.lightning.io.pl import TrainerContext diff --git a/nemo/lightning/io/artifact/base.py b/nemo/lightning/io/artifact/base.py index ec451de9753b..7d2d608c4149 100644 --- a/nemo/lightning/io/artifact/base.py +++ 
b/nemo/lightning/io/artifact/base.py @@ -6,10 +6,10 @@ class Artifact(ABC, Generic[ValueT]): - def __init__(self, attr: str, required: bool = True): + def __init__(self, attr: str, required: bool = True, skip: bool = False): self.attr = attr self.required = required - self.skip = False + self.skip = skip @abstractmethod def dump(self, value: ValueT, absolute_dir: Path, relative_dir: Path) -> ValueT: @@ -18,3 +18,6 @@ def dump(self, value: ValueT, absolute_dir: Path, relative_dir: Path) -> ValueT: @abstractmethod def load(self, path: Path) -> ValueT: pass + + def __repr__(self): + return f"{type(self).__name__}(skip= {self.skip}, attr= {self.attr}, required= {self.required})" diff --git a/nemo/lightning/io/artifact/file.py b/nemo/lightning/io/artifact/file.py index 1364468cde0a..1cd63b706c9a 100644 --- a/nemo/lightning/io/artifact/file.py +++ b/nemo/lightning/io/artifact/file.py @@ -2,6 +2,7 @@ import shutil from pathlib import Path from typing import Union +import fiddle as fdl from nemo.lightning.io.artifact.base import Artifact @@ -19,8 +20,7 @@ class FileArtifact(Artifact[str]): def dump(self, value: str, absolute_dir: Path, relative_dir: Path) -> str: if not pathize(value).exists(): # This is Artifact is just a string. - self.skip = True - return value + return fdl.Config(FileArtifact, attr=value, skip=True) new_value = copy_file(value, absolute_dir, relative_dir) return str(new_value) @@ -65,8 +65,7 @@ class DirOrStringArtifact(DirArtifact): def dump(self, value: str, absolute_dir: Path, relative_dir: Path) -> str: if not pathize(value).exists(): # This is Artifact is just a string. - self.skip = True - return value + return fdl.Config(DirOrStringArtifact, attr=value, skip=True) return super().dump(value, absolute_dir, relative_dir) def load(self, path: str) -> str: diff --git a/nemo/lightning/io/connector.py b/nemo/lightning/io/connector.py index 1165793656bd..e7ba67b277f8 100644 --- a/nemo/lightning/io/connector.py +++ b/nemo/lightning/io/connector.py @@ -1,4 +1,3 @@ -import inspect import logging import os import shutil @@ -8,6 +7,7 @@ import pytorch_lightning as pl from filelock import FileLock, Timeout from pytorch_lightning.trainer.states import TrainerFn + from nemo.lightning.ckpt_utils import ckpt_to_context_subdir, ckpt_to_weights_subdir # Dynamically inherit from the correct Path subclass based on the operating system. @@ -134,7 +134,9 @@ class ModelConnector(Connector, Generic[SourceT, TargetT]): Loads a model from the specified path, optionally using a CPU-focused strategy, and returns the model and trainer. """ - def nemo_setup(self, model: pl.LightningModule, trainer: Optional[pl.Trainer] = None) -> pl.Trainer: + def nemo_setup( + self, model: pl.LightningModule, trainer: Optional[pl.Trainer] = None, *args, **kwargs + ) -> pl.Trainer: """ Sets up the model and trainer using a specified strategy, preparing it for training or inference. @@ -150,7 +152,7 @@ def nemo_setup(self, model: pl.LightningModule, trainer: Optional[pl.Trainer] = _trainer = trainer or Trainer( devices=1, accelerator="cpu", - strategy=MegatronStrategy(ckpt_save_optimizer=False, always_save_context=True), + strategy=MegatronStrategy(ckpt_save_optimizer=False, always_save_context=True, *args, **kwargs), ) # Note: set trainer to fitting state to avoid the following code path. 
Feel free to refactor if we no longer # need to avoid this: @@ -188,7 +190,7 @@ def nemo_save(self, output_path: Path, trainer: pl.Trainer, dump_io: bool = True from nemo.utils.get_rank import is_global_rank_zero if is_global_rank_zero() and dump_io: - TrainerContext.from_trainer(trainer).io_dump(ckpt_to_context_subdir(output_path)) + TrainerContext.from_trainer(trainer).io_dump(ckpt_to_context_subdir(output_path), yaml_attrs=["model"]) def nemo_load( self, path: Path, trainer: Optional[pl.Trainer] = None, cpu: bool = True diff --git a/nemo/lightning/io/mixin.py b/nemo/lightning/io/mixin.py index 8db33d62cdfc..27cb3b18b55b 100644 --- a/nemo/lightning/io/mixin.py +++ b/nemo/lightning/io/mixin.py @@ -15,6 +15,8 @@ import fiddle._src.experimental.dataclasses as fdl_dc from cloudpickle import dump from cloudpickle import load as pickle_load +from fiddle._src import config as config_lib +from fiddle._src import partial from fiddle._src.experimental import serialization from typing_extensions import Self @@ -33,6 +35,37 @@ _thread_local = threading.local() +def _ordered_arguments_with_default(data: config_lib.Config) -> Dict[Union[int, str], Any]: + result = config_lib.ordered_arguments(data, include_defaults=True) + for key, arg in result.items(): + if isinstance(arg, config_lib.Config): + ordered_arg = _ordered_arguments_with_default(arg) + result[key] = ordered_arg + + if "__fn_or_cls__" in result: + raise ValueError( + "It is not supported to dump objects of functions/classes " "that have a __fn_or_cls__ parameter." + ) + + result["_target_"] = ( + f"{inspect.getmodule(config_lib.get_callable(data)).__name__}.{config_lib.get_callable(data).__qualname__}" # type: ignore + ) + if isinstance(data, partial.Partial): + result["_partial_"] = True + + return result + + +def _config_representer_with_defaults(dumper, data, type_name="Config"): + """Returns a YAML representation of `data`.""" + value = _ordered_arguments_with_default(data) + return dumper.represent_data(value) + + +def _partial_representer_with_defaults(dumper, data): + return _config_representer_with_defaults(dumper, data, type_name="Partial") + + class IOMixin: """ A mixin class designed to capture the arguments passed to the `__init__` method, @@ -130,7 +163,7 @@ def io_init(self, **kwargs) -> fdl.Config[Self]: def io_artifacts(cls) -> List[Artifact]: return [] - def io_dump(self, output: Path): + def io_dump(self, output: Path, yaml_attrs: list[str]): """ Serializes the configuration object (`__io__`) to a file, allowing the object state to be saved and later restored. 
Also creates an artifacts directory and stores it in a thread-local @@ -156,6 +189,11 @@ def io_dump(self, output: Path): json = serialization.dump_json(io) f.write(json) + yaml_configs = self._io_dump_yaml(io, attrs=yaml_attrs) + for attr, serialized_str in yaml_configs.items(): + _path = output_path / f"{attr}.yaml" + _path.write_text(serialized_str) + # Clear thread-local storage after io_dump is complete del _thread_local.local_artifacts_dir del _thread_local.output_path @@ -164,6 +202,29 @@ def io_dump(self, output: Path): if not any(artifacts_dir.iterdir()): shutil.rmtree(artifacts_dir) + def _io_dump_yaml(self, io: config_lib.Config, attrs: list[str]): + import yaml + + original_representers = yaml.SafeDumper.yaml_representers.copy() + + from nemo_run.config import Config, Partial + from nemo_run.core.serialization.yaml import YamlSerializer, _function_representer + + yaml.SafeDumper.add_representer(config_lib.Config, _config_representer_with_defaults) + yaml.SafeDumper.add_representer(partial.Partial, _partial_representer_with_defaults) + yaml.SafeDumper.add_representer(Config, _config_representer_with_defaults) + yaml.SafeDumper.add_representer(Partial, _partial_representer_with_defaults) + + yaml.SafeDumper.add_multi_representer(object, _function_representer) + + serializer = YamlSerializer() + result = {} + for attr in attrs: + result[attr] = serializer.serialize(getattr(io, attr)) + + yaml.SafeDumper.yaml_representers = original_representers + return result + class ConnectorMixin: """ @@ -523,8 +584,12 @@ def _io_path_elements_fn(x): def _artifact_transform_save(cfg: fdl.Config, output_path: Path, relative_dir: Path = "."): for artifact in getattr(cfg.__fn_or_cls__, "__io_artifacts__", []): # Allow optional artifacts - if artifact.skip: + if artifact.skip or (not hasattr(cfg, artifact.attr) and not artifact.required): continue + + if not hasattr(cfg, artifact.attr) and artifact.required: + raise ValueError(f"Artifact '{artifact.attr}' is required but not provided") + current_val = getattr(cfg, artifact.attr) if current_val is None: if artifact.required: @@ -544,6 +609,15 @@ def _artifact_transform_save(cfg: fdl.Config, output_path: Path, relative_dir: P def _artifact_transform_load(cfg: fdl.Config, path: Path): for artifact in getattr(cfg.__fn_or_cls__, "__io_artifacts__", []): + # We expect an artifact.attr to be a string or a fdl.Config. + # Some parameteres can be a string or a filepath. When those parameters are just strings, + # we will represent it with a fdl.Config, and will skip the rest of the loop (base-dir adjustment). + current_val = getattr(cfg, artifact.attr) + if isinstance(current_val, fdl.Config): + # artifact.attr is a string not a path. + setattr(cfg, artifact.attr, fdl.build(current_val).attr) + continue + if artifact.skip: continue current_val = getattr(cfg, artifact.attr) diff --git a/nemo/collections/llm/tokenizer.py b/nemo/lightning/io/registry.py similarity index 89% rename from nemo/collections/llm/tokenizer.py rename to nemo/lightning/io/registry.py index ef8cc53db7e5..f299ed3d4aff 100644 --- a/nemo/collections/llm/tokenizer.py +++ b/nemo/lightning/io/registry.py @@ -12,16 +12,11 @@ # See the License for the specific language governing permissions and # limitations under the License. 
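The artifact changes above replace the old in-place `self.skip = True` mutation with returning a `fdl.Config` marker whenever the attribute value is a plain string rather than an existing path; `_artifact_transform_load` then rebuilds that marker and unwraps its `attr`. Below is a minimal standalone sketch of that pattern using a toy StringOrPathArtifact class (not the NeMo classes themselves), assuming only that the fiddle package is installed:

from pathlib import Path
from typing import Optional

import fiddle as fdl


class StringOrPathArtifact:
    """Toy stand-in for the NeMo artifact classes (illustration only)."""

    def __init__(self, attr: str, required: bool = True, skip: bool = False):
        self.attr = attr
        self.required = required
        self.skip = skip

    def dump(self, value: str):
        if not Path(value).exists():
            # Plain string, not a file on disk: return a config marker instead of
            # mutating self.skip (the real FileArtifact/DirOrStringArtifact do the same).
            return fdl.Config(StringOrPathArtifact, attr=value, skip=True)
        # The real classes copy the file next to the checkpoint here; the toy just returns it.
        return value


def load_attr(dumped):
    # Mirrors the `isinstance(current_val, fdl.Config)` branch in _artifact_transform_load:
    # build the marker and unwrap the original string.
    if isinstance(dumped, fdl.Config):
        return fdl.build(dumped).attr
    return dumped


dumped = StringOrPathArtifact("tokenizer_name").dump("GPT2BPETokenizer")
assert load_attr(dumped) == "GPT2BPETokenizer"

The point of the marker is that a value such as a pretrained-model name round-trips unchanged, while values that are real files still get copied alongside the checkpoint.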
+ from nemo.lightning.io.artifact import DirOrStringArtifact, FileArtifact from nemo.lightning.io.mixin import track_io -__all__ = [] - - -def extract_name(cls): - return str(cls).split('.')[-1].rstrip('>').rstrip("'") - - +# Registers all required classes with track_io functionality try: # Track HF tokenizers from transformers import AutoTokenizer as HfAutoTokenizer @@ -36,7 +31,6 @@ def extract_name(cls): for attr_name in ['vocab_file', 'merges_file', 'tokenizer_file', 'name_or_path'] ], ) - __all__.append(extract_name(cls)) from nemo.collections.common.tokenizers import AutoTokenizer @@ -48,8 +42,8 @@ def extract_name(cls): DirOrStringArtifact("pretrained_model_name", required=False), ], ) - __all__.append("AutoTokenizer") except ImportError: + # HF tokenizers are not available, no need to track them pass @@ -57,6 +51,6 @@ def extract_name(cls): from nemo.collections.common.tokenizers import SentencePieceTokenizer track_io(SentencePieceTokenizer, artifacts=[FileArtifact("model_path")]) - __all__.append("SentencePieceTokenizer") except ImportError: + # SentencePieceTokenizer is not available, no need to track it pass diff --git a/nemo/lightning/megatron_parallel.py b/nemo/lightning/megatron_parallel.py index f373fedee10c..2a0e346ced2a 100644 --- a/nemo/lightning/megatron_parallel.py +++ b/nemo/lightning/megatron_parallel.py @@ -18,7 +18,7 @@ import inspect import queue from collections import defaultdict -from contextlib import nullcontext +from contextlib import contextmanager, nullcontext from dataclasses import dataclass from typing import ( TYPE_CHECKING, @@ -45,6 +45,7 @@ from megatron.core import parallel_state from megatron.core.distributed import DistributedDataParallel as McoreDDP from megatron.core.distributed import DistributedDataParallelConfig +from megatron.core.optimizer import OptimizerConfig from megatron.core.transformer.transformer_config import TransformerConfig from pytorch_lightning.utilities import move_data_to_device from torch import Tensor, nn @@ -221,6 +222,7 @@ def forward( seq_length: Optional[int] = None, micro_batch_size: Optional[int] = None, num_microbatches: Optional[int] = None, + step_i: Optional[int] = None, wrap_forward_step: bool = True, ) -> torch.Tensor: """The method performs the forward pass of the model. 
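The new nemo/lightning/io/registry.py centralizes the `track_io` registrations that previously lived in nemo/collections/llm/tokenizer.py: for each serializable class, it declares which constructor arguments are file/directory artifacts so `io_dump` can copy them next to the checkpoint. A hedged sketch of registering a user-defined class the same way (the MyVocabTokenizer class and its arguments are hypothetical; only `track_io`, `FileArtifact`, and `DirOrStringArtifact` come from the diff):

from typing import Optional

from nemo.lightning.io.artifact import DirOrStringArtifact, FileArtifact
from nemo.lightning.io.mixin import track_io


class MyVocabTokenizer:
    """Hypothetical user-defined tokenizer wrapper."""

    def __init__(self, vocab_file: str, pretrained_model_name: Optional[str] = None):
        self.vocab_file = vocab_file
        self.pretrained_model_name = pretrained_model_name


# Register the class so io_dump() treats `vocab_file` as a file to copy into the
# checkpoint's artifacts directory, and `pretrained_model_name` as a directory-or-string
# that may be either a local path or just a name.
track_io(
    MyVocabTokenizer,
    artifacts=[
        FileArtifact("vocab_file"),
        DirOrStringArtifact("pretrained_model_name", required=False),
    ],
)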
@@ -268,6 +270,7 @@ def forward( micro_batch_size=micro_batch_size, num_microbatches=num_microbatches, seq_length=seq_length, + step_i=step_i, ) _forward_context["step"] = step step = self.callbacks.transform_event("on_megatron_step_start", step) @@ -333,6 +336,7 @@ def validation_step( seq_length: Optional[int] = None, micro_batch_size: Optional[int] = None, num_microbatches: Optional[int] = None, + step_i: Optional[int] = None, **kwargs, ) -> STEP_OUTPUT: return self._step( @@ -344,6 +348,7 @@ def validation_step( seq_length=seq_length, micro_batch_size=micro_batch_size, num_microbatches=num_microbatches, + step_i=step_i, forward_only=True, **kwargs, ) @@ -357,6 +362,7 @@ def test_step( seq_length: Optional[int] = None, micro_batch_size: Optional[int] = None, num_microbatches: Optional[int] = None, + step_i: Optional[int] = None, **kwargs, ) -> STEP_OUTPUT: return self._step( @@ -368,6 +374,7 @@ def test_step( seq_length=seq_length, micro_batch_size=micro_batch_size, num_microbatches=num_microbatches, + step_i=step_i, forward_only=True, **kwargs, ) @@ -381,6 +388,7 @@ def predict_step( seq_length: Optional[int] = None, micro_batch_size: Optional[int] = None, num_microbatches: Optional[int] = None, + step_i: Optional[int] = None, **kwargs, ) -> STEP_OUTPUT: return self._step( @@ -392,6 +400,7 @@ def predict_step( seq_length=seq_length, micro_batch_size=micro_batch_size, num_microbatches=num_microbatches, + step_i=step_i, forward_only=True, **kwargs, ) @@ -407,6 +416,7 @@ def _step( micro_batch_size: Optional[int] = None, num_microbatches: Optional[int] = None, forward_only: bool = True, + step_i: Optional[int] = None, **kwargs, ) -> STEP_OUTPUT: if not hasattr(self.module, f"{step_type}_step"): @@ -425,6 +435,7 @@ def _step( micro_batch_size=micro_batch_size, num_microbatches=num_microbatches, forward_only=forward_only, + step_i=step_i, **kwargs, ) @@ -563,6 +574,15 @@ def init_ddp(self): # Mcore DistributedDataParallel has to be called with grad. Normally this call is redundant, but for # PEFT with num_sanity_val_steps > 0 this is necessary. init_ddp_context = nullcontext if all(x.requires_grad for x in module.parameters()) else torch.enable_grad + + # Turn off bucketing for model_chunk 2 onwards, since communication for these + # model chunks is overlapped with compute anyway, or if using VP and overlapping + # data parallel param gather with optimizer + overlap_param_gather_with_optimizer_step = False + if hasattr(self, "optim") and isinstance(self.optim.config, OptimizerConfig): + overlap_param_gather_with_optimizer_step = self.optim.config.overlap_param_gather_with_optimizer_step + disable_bucketing = (model_chunk_idx > 0) or overlap_param_gather_with_optimizer_step + with init_ddp_context(): ddp = DDP( module.config, @@ -570,9 +590,7 @@ def init_ddp(self): module, data_parallel_group=parallel_state.get_data_parallel_group(with_context_parallel=True), expert_data_parallel_group=parallel_state.get_data_modulo_expert_parallel_group(), - # Turn off bucketing for model_chunk 2 onwards, since communication for these - # model chunks is overlapped with compute anyway. 
- disable_bucketing=(model_chunk_idx > 0), + disable_bucketing=disable_bucketing, ) model_chunk.module = ddp @@ -1035,6 +1053,7 @@ class MegatronStep(Generic[ModelT, DataT]): micro_batch_size: Optional[int] = None seq_length: Optional[int] = None num_microbatches: Optional[int] = None + step_i: Optional[int] = None @classmethod def infer( @@ -1046,6 +1065,7 @@ def infer( micro_batch_size: Optional[int] = None, seq_length: Optional[int] = None, num_microbatches: Optional[int] = None, + step_i: Optional[int] = None, ) -> "MegatronStep[ModelT, DataT]": """ Creates a MegatronStep instance, inferring missing parameters if possible. @@ -1061,10 +1081,13 @@ def infer( micro_batch_size (Optional[int]): Size of each micro-batch. seq_length (Optional[int]): Sequence length for the current step. num_microbatches (Optional[int]): Number of micro-batches in this step. - + step_i (Optional[int]): Step index for the current step. Returns: MegatronStep[ModelT, DataT]: An instance of MegatronStep with inferred parameters. """ + if step_i is None and pipeline.trainer: + step_i = pipeline.trainer.global_step + return cls( pipeline=pipeline, data=data, @@ -1073,6 +1096,7 @@ def infer( micro_batch_size=micro_batch_size or cls.infer_micro_batch_size(data), seq_length=seq_length or cls.infer_seq_length(data), num_microbatches=num_microbatches or cls.infer_num_microbatches(data), + step_i=step_i, ) def __call__(self) -> List[Any]: @@ -1679,3 +1703,30 @@ def masked_token_loss_context_parallel(tensor: Tensor, mask: Tensor, num_valid_t torch.distributed.all_reduce(loss, group=parallel_state.get_context_parallel_group()) return loss + + +@contextmanager +def moe_loss_tracker_ctx(): + from megatron.core.transformer.moe.moe_utils import ( + clear_aux_losses_tracker, + reduce_aux_losses_tracker_across_ranks, + ) + + reduce_aux_losses_tracker_across_ranks() + try: + yield + finally: + clear_aux_losses_tracker() + + +@torch.no_grad() +def aggregate_moe_loss_stats(loss_scale=1.0): + with moe_loss_tracker_ctx(): + tracker = parallel_state.get_moe_layer_wise_logging_tracker() + aux_losses = {k: v['values'].float() * loss_scale for k, v in tracker.items()} + total_loss_dict = {} + for name, loss_list in aux_losses.items(): + if name not in total_loss_dict: + total_loss_dict[name] = 0 + total_loss_dict[name] += loss_list.mean().item() + return total_loss_dict diff --git a/nemo/lightning/nemo_logger.py b/nemo/lightning/nemo_logger.py index d611dccdcf5f..8b10f9aca50a 100644 --- a/nemo/lightning/nemo_logger.py +++ b/nemo/lightning/nemo_logger.py @@ -88,8 +88,8 @@ def setup(self, trainer: Union[pl.Trainer, fl.Fabric], resume_if_exists: bool = from nemo.constants import NEMO_ENV_VARNAME_VERSION from nemo.utils.get_rank import is_global_rank_zero - self.local_rank = int(os.environ.get("LOCAL_RANK", 0)) - self.global_rank = trainer.node_rank * trainer.world_size + self.local_rank + self.local_rank = trainer.local_rank + self.global_rank = trainer.global_rank logging.rank = self.global_rank if self.explicit_log_dir and isinstance(trainer, pl.Trainer): # If explicit log_dir was passed, short circuit diff --git a/nemo/lightning/pytorch/callbacks/debugging.py b/nemo/lightning/pytorch/callbacks/debugging.py index 421ee3056c7b..1a3c528c741f 100644 --- a/nemo/lightning/pytorch/callbacks/debugging.py +++ b/nemo/lightning/pytorch/callbacks/debugging.py @@ -2,7 +2,6 @@ import pytorch_lightning as pl import torch -from prettytable import PrettyTable from pytorch_lightning.callbacks import Callback from nemo.lightning.pytorch.optim.megatron 
import MegatronOptimizerModule @@ -141,6 +140,8 @@ def find_grad_tensor(param: torch.Tensor) -> Optional[torch.Tensor]: # create table only if there is something to print if any(param_keys) or any(grad_keys): + from prettytable import PrettyTable + debug_table = PrettyTable() debug_table.add_column("Parameter", names_col) diff --git a/nemo/lightning/pytorch/callbacks/model_checkpoint.py b/nemo/lightning/pytorch/callbacks/model_checkpoint.py index 8402edb4e594..adf890a8fb11 100644 --- a/nemo/lightning/pytorch/callbacks/model_checkpoint.py +++ b/nemo/lightning/pytorch/callbacks/model_checkpoint.py @@ -49,7 +49,7 @@ class ModelCheckpoint(PTLModelCheckpoint): ``every_n_epochs`` or ``every_n_train_steps``. save_on_train_epoch_end: Whether to run checkpointing at the end of the training epoch save_optim_on_train_end: Whether to include the optimizer states in the final checkpoint - at the end of training. Only applicable when save_weights_only is ``True``. + at the end of training. Only applicable when save_weights_only is ``False``. always_save_context: Whether to dump the artifacts needed to reinintialize the current model, trainer, and dataloader to allow for reproducibility of experiments. save_context_on_train_end: Whether to dump the artifacts on_train_end regardless of whether @@ -290,7 +290,9 @@ def on_train_end(self, trainer, pl_module): else: super()._save_last_checkpoint(trainer, monitor_candidates) if self.save_context_on_train_end and not self.always_save_context and is_global_rank_zero(): - TrainerContext.from_trainer(trainer).io_dump(ckpt_to_dir(self.last_model_path) / "context") + TrainerContext.from_trainer(trainer).io_dump( + ckpt_to_dir(self.last_model_path) / "context", yaml_attrs=["model"] + ) # Call parent on_train_end() to save the -last checkpoint super().on_train_end(trainer, pl_module) @@ -488,7 +490,7 @@ def _save_checkpoint(self, trainer: 'pytorch_lightning.Trainer', filepath: str) trainer.save_checkpoint(ckpt_filepath, save_weights_only, storage_options=storage_options) if self.always_save_context and is_global_rank_zero(): - TrainerContext.from_trainer(trainer).io_dump(ckpt_to_dir(filepath) / "context") + TrainerContext.from_trainer(trainer).io_dump(ckpt_to_dir(filepath) / "context", yaml_attrs=["model"]) if self.async_save: self._last_checkpoint_saved = filepath diff --git a/nemo/lightning/pytorch/callbacks/peft.py b/nemo/lightning/pytorch/callbacks/peft.py index 1e3cde0bbcde..f8a10802ffbd 100644 --- a/nemo/lightning/pytorch/callbacks/peft.py +++ b/nemo/lightning/pytorch/callbacks/peft.py @@ -90,12 +90,26 @@ def __call__(self, model: nn.Module) -> nn.Module: Returns: nn.Module: The transformed model with PEFT applied. """ - - model.freeze() + self.freeze_model(model) model.walk(self.transform) return model + def freeze_model(self, model: nn.Module) -> None: + """Apply a default freeze method to the model. + + This method freezes all the model parameters. This method can be overridden by subclasses to + implement custom freeze strategies (e.g. freeze only parts of the model) + + Args: + model (nn.Module): The model to be fine-tuned. + + Returns: + nn.Module: The transformed model with PEFT applied. 
+ """ + model.freeze() + model.train(mode=True) + def setup(self, trainer: pl.Trainer, pl_module: pl.LightningModule, stage: str) -> None: super().setup(trainer, pl_module, stage=stage) diff --git a/nemo/lightning/pytorch/plugins/data_sampler.py b/nemo/lightning/pytorch/plugins/data_sampler.py index 4fadae8dc722..55bafce5f71e 100644 --- a/nemo/lightning/pytorch/plugins/data_sampler.py +++ b/nemo/lightning/pytorch/plugins/data_sampler.py @@ -44,6 +44,7 @@ def __init__( init_consumed_samples: int = 0, init_global_step: int = 0, output_log: bool = True, + drop_last: bool = True, ): self.seq_len = seq_len self.output_log = output_log @@ -56,6 +57,7 @@ def __init__( self.if_first_step = 0 self.prev_global_batch_size = None self.init_global_step = init_global_step + self.drop_last = drop_last def setup(self, global_rank: int) -> None: from nemo.lightning.data import setup_microbatch_calculator @@ -73,13 +75,14 @@ def transform_dataloader(self, dataloader: DataLoader, consumed_samples: int = 0 rampup_batch_size=self.rampup_batch_size, consumed_samples=self.init_consumed_samples if mode == 'train' else 0, dataloader_type=self.dataloader_type, + drop_last=self.drop_last, ) def compute_consumed_samples(self, steps_since_resume=0) -> int: from nemo.lightning.pytorch.strategies import MegatronStrategy from nemo.utils import AppState - if not isinstance(self.trainer.strategy, MegatronStrategy): + if not hasattr(self, "trainer") or not isinstance(self.trainer.strategy, MegatronStrategy): return 0 app_state = AppState() @@ -104,6 +107,9 @@ def on_megatron_step_start(self, step: MegatronStep) -> MegatronStep: ) def on_megatron_microbatches_start(self, step: MegatronStep) -> None: + if not step.trainer: + return + # do validation and save the checkpoint when gbs is changed if ( self.rampup_batch_size is not None @@ -125,23 +131,24 @@ def on_megatron_step_end(self, step: MegatronStep) -> None: self.prev_global_batch_size = self.current_global_batch_size - consumed_samples = self.compute_consumed_samples(trainer.global_step + 1 - self.init_global_step) - if self.output_log and self.trainer.training: - # You may need to turn off logging, for example when doing trainer.predict(model, data) - pl_module.log( - 'consumed_samples', - consumed_samples, - prog_bar=True, - batch_size=1, + if step.step_i: + consumed_samples = self.compute_consumed_samples(step.step_i + 1 - self.init_global_step) + if self.output_log and trainer and getattr(trainer, "training", False): + # You may need to turn off logging, for example when doing trainer.predict(model, data) + pl_module.log( + 'consumed_samples', + consumed_samples, + prog_bar=True, + batch_size=1, + ) + + self.prev_consumed_samples = consumed_samples + + update_num_microbatches( + consumed_samples=consumed_samples, + consistency_check=False, ) - - self.prev_consumed_samples = consumed_samples - - update_num_microbatches( - consumed_samples=consumed_samples, - consistency_check=False, - ) - if self.output_log: + if self.output_log and trainer: # You may need to turn off logging, for example when doing trainer.predict(model, data) pl_module.log( "global_batch_size", diff --git a/nemo/lightning/pytorch/strategies/megatron_strategy.py b/nemo/lightning/pytorch/strategies/megatron_strategy.py index 0bf812f3a91b..c5195511c522 100644 --- a/nemo/lightning/pytorch/strategies/megatron_strategy.py +++ b/nemo/lightning/pytorch/strategies/megatron_strategy.py @@ -58,7 +58,12 @@ from nemo.core.optim.mcore_optim import McoreDistributedOptimizer from nemo.lightning import 
_strategy_lib, io from nemo.lightning.ckpt_utils import ckpt_to_weights_subdir -from nemo.lightning.megatron_parallel import CallbackConnector, MegatronParallel, _ModuleStepFunction +from nemo.lightning.megatron_parallel import ( + CallbackConnector, + MegatronParallel, + _ModuleStepFunction, + aggregate_moe_loss_stats, +) from nemo.lightning.pytorch.callbacks import ModelTransform from nemo.lightning.pytorch.strategies.utils import ( RestoreConfig, @@ -529,6 +534,10 @@ def training_step(self, dataloader_iter, *args: Any, **kwargs: Any) -> STEP_OUTP self.lightning_module.log( "reduced_train_loss", reduced_train_loss, prog_bar=True, batch_size=1, sync_dist=False ) + # Log any MoE losses. + # TODO(@akoumparouli): loss_scale depends on the GBS. + for loss_name, loss_value in aggregate_moe_loss_stats(loss_scale=1.0).items(): + self.lightning_module.log(loss_name, loss_value, prog_bar=True, rank_zero_only=True, batch_size=1) return out diff --git a/nemo/lightning/resume.py b/nemo/lightning/resume.py index 28b5451c0a86..40b4aa704575 100644 --- a/nemo/lightning/resume.py +++ b/nemo/lightning/resume.py @@ -98,7 +98,14 @@ def setup(self, trainer: Union[pl.Trainer, fl.Fabric], model=None): trainer.checkpoint_callback.last_model_path = trainer_ckpt_path # Load artifacts if getattr(self.restore_config, 'load_artifacts', False): - context_path = self.get_context_path(model) + if isinstance(trainer_ckpt_path, AdapterPath): + # load tokenizer from the base model during peft resume, in case the first peft checkpoint + # is deleted before the current peft checkpoint is saved + context_path = trainer_ckpt_path.base_model_path / "context" + if not context_path.exists(): + context_path = trainer_ckpt_path.base_model_path + else: + context_path = self.get_context_path(model) model = _try_restore_tokenizer(model, context_path) elif self.restore_config: @@ -150,9 +157,9 @@ def _resume_peft(self, adapter_meta_path, model): assert self.restore_config, "PEFT resume requires specifying restore_config" base_model_path = self._extract_path(model, self.restore_config.path) - if base_model_path != Path(metadata['model_ckpt_path']): - raise ValueError( - f"When trying to resume a PEFT training run, found mismatching values: " + if base_model_path not in [Path(metadata['model_ckpt_path']), Path(metadata['model_ckpt_path']).parent]: + logging.warning( + f"⚠️ When trying to resume a PEFT training run, found mismatching values: " f"your specified restore_path points to {base_model_path}, " f"but the PEFT checkpoint was trained with " f"model_ckpt_path={metadata['model_ckpt_path']}" diff --git a/nemo/lightning/run/plugins.py b/nemo/lightning/run/plugins.py index 61da7303d9cb..dfcc7c1650ce 100644 --- a/nemo/lightning/run/plugins.py +++ b/nemo/lightning/run/plugins.py @@ -25,6 +25,7 @@ from pytorch_lightning.loggers import WandbLogger from nemo.lightning.pytorch.callbacks import NsysCallback, PreemptionCallback +from nemo.lightning.pytorch.strategies.megatron_strategy import MegatronStrategy from nemo.utils import logging # This file contains plugins based on NeMo-Run's run.Plugin API. @@ -241,3 +242,67 @@ def setup(self, task: run.Partial | run.Script, executor: run.Executor): if isinstance(executor, run.SlurmExecutor): assert task.trainer.num_nodes == executor.nodes assert task.trainer.devices == executor.nproc_per_node() + + +@dataclass(kw_only=True) +class PerfEnvPlugin(run.Plugin): + """ + A plugin for setting up performance optimized environments. 
+ + Attributes: + enable_layernorm_sm_margin (bool): Set SM margin for TransformerEngine's Layernorm, so + in order to not block DP level communication overlap. + layernorm_sm_margin (int): The SM margin for TransformerEngine Layernorm. + enable_vboost (bool): Whether to steer more power towards tensor cores via + `sudo nvidia-smi boost-slider --vboost 1`. May not work on all systems. + """ + + enable_layernorm_sm_margin: bool = True + layernorm_sm_margin: int = 16 + enable_vboost: bool = False + + def get_vboost_srun_cmd(self, nodes, job_dir): + import shlex + + vboost_cmd = " ".join( + [ + "\n# Command 0: enable vboost\n\n", + "srun", + f"--ntasks={nodes}", + "--output", + os.path.join(job_dir, "vboost.out"), + "--error", + os.path.join(job_dir, "vboost.err"), + "bash -c ", + shlex.quote("sudo nvidia-smi boost-slider --vboost 1"), + ], + ) + + return vboost_cmd + + def setup(self, task: run.Partial | run.Script, executor: run.Executor): + + if task.trainer.strategy.__fn_or_cls__ == MegatronStrategy: + # Force program order kernel launch for TP, CP overlap + tp_size = task.trainer.strategy.tensor_model_parallel_size + cp_size = task.trainer.strategy.context_parallel_size + if tp_size > 1 and cp_size > 1: + executor.env_vars["CUDA_DEVICE_MAX_CONNECTIONS"] = "1" + + # Set LayerNorm SM margin to support the overlap with LayerNorm kernel + if self.enable_layernorm_sm_margin: + executor.env_vars["NVTE_FWD_LAYERNORM_SM_MARGIN"] = str(self.layernorm_sm_margin) + executor.env_vars["NVTE_BWD_LAYERNORM_SM_MARGIN"] = str(self.layernorm_sm_margin) + + # Force Transformer Engine to use cuDNN attention over HazyResearch's Flash Attention + executor.env_vars["NVTE_FLASH_ATTN"] = "0" + executor.env_vars["NVTE_FUSED_ATTN"] = "1" + + # Improve perf by steering power to tensor cores, may not work on all systems + if self.enable_vboost and isinstance(executor, run.SlurmExecutor): + vboost_cmd = self.get_vboost_srun_cmd(executor.nodes, executor.job_dir) + executor.setup_lines = ( + executor.setup_lines + vboost_cmd + if (executor.setup_lines and len(executor.setup_lines) > 0) + else vboost_cmd + ) diff --git a/nemo/utils/exp_manager.py b/nemo/utils/exp_manager.py index 543c7e0781d2..3d4b7189f56e 100644 --- a/nemo/utils/exp_manager.py +++ b/nemo/utils/exp_manager.py @@ -19,6 +19,7 @@ import sys import time import warnings +from collections import defaultdict from dataclasses import dataclass, field from datetime import timedelta from pathlib import Path @@ -38,7 +39,6 @@ from pytorch_lightning.strategies.ddp import DDPStrategy from pytorch_lightning.trainer.connectors.checkpoint_connector import _CheckpointConnector - from nemo.collections.common.callbacks import EMA from nemo.constants import NEMO_ENV_VARNAME_TESTING, NEMO_ENV_VARNAME_VERSION from nemo.utils import logging, timers @@ -214,6 +214,8 @@ class ExpManagerConfig: files_to_copy: Optional[List[str]] = None # logs timing of train/val/test steps log_step_timing: Optional[bool] = True + # log step time with nemo logger instead of lightning logger to avoid lightning logger overhead + log_delta_step_timing: Optional[bool] = False step_timing_kwargs: Optional[StepTimingParams] = field(default_factory=lambda: StepTimingParams()) # Configures creation of log files for different ranks log_local_rank_0_only: Optional[bool] = False @@ -294,6 +296,53 @@ def on_after_backward(self, trainer, pl_module): self._on_batch_end("train_backward_timing", pl_module) +class DeltaTimingCallback(Callback): + """ + Logs execution time of train/val/test steps using nemo logger. 
Calculates + time from previous batch end to current batch end. This ensures accuracy. + + Note: step time will only be printed in stdout. If you have initialized + loggers like TensorBoard, WandB, etc, step time will not be recorded there. + Use this callback instead of 'TimingCallback' to avoid logging overhead with + lightning logger used in the latter. + """ + + def __init__(self, timer_kwargs={}): + self._sync_cuda = timer_kwargs.get("sync_cuda", False) + self.timers = defaultdict(defaultdict) + + def _on_epoch_start(self, name, trainer, pl_module): + # synchronize pytorch cuda execution if supported + if self._sync_cuda and torch.cuda.is_initialized(): + torch.cuda.synchronize() + + self.timers[name]["step"] = 0 + self.timers[name]["start"] = time.time() + + def _on_batch_end(self, name, trainer, pl_module): + # synchronize pytorch cuda execution if supported + if self._sync_cuda and torch.cuda.is_initialized(): + torch.cuda.synchronize() + + end = time.time() + dt = end - self.timers[name]["start"] + logging.info(f'Step {self.timers[name]["step"]}: {name} in s={dt}') + self.timers[name]["step"] += 1 + self.timers[name]["start"] = end + + def on_train_epoch_start(self, trainer, pl_module): + self._on_epoch_start("train_step_timing in s", trainer, pl_module) + + def on_validation_epoch_start(self, trainer, pl_module): + self._on_epoch_start("validation_step_timing in s", trainer, pl_module) + + def on_train_batch_end(self, trainer, pl_module, outputs, batch, batch_idx): + self._on_batch_end("train_step_timing in s", trainer, pl_module) + + def on_validation_batch_end(self, trainer, pl_module, outputs, batch, batch_idx): + self._on_batch_end("validation_step_timing in s", trainer, pl_module) + + def exp_manager(trainer: 'pytorch_lightning.Trainer', cfg: Optional[Union[DictConfig, Dict]] = None) -> Optional[Path]: """ exp_manager is a helper function used to manage folders for experiments. It follows the pytorch lightning paradigm @@ -512,7 +561,10 @@ def exp_manager(trainer: 'pytorch_lightning.Trainer', cfg: Optional[Union[DictCo ) # add loggers timing callbacks - if cfg.log_step_timing: + if cfg.log_delta_step_timing: + timing_callback = DeltaTimingCallback(timer_kwargs=cfg.step_timing_kwargs or {}) + trainer.callbacks.insert(0, timing_callback) + elif cfg.log_step_timing: timing_callback = TimingCallback(timer_kwargs=cfg.step_timing_kwargs or {}) trainer.callbacks.insert(0, timing_callback) diff --git a/nemo/utils/nemo_logging.py b/nemo/utils/nemo_logging.py index 95e17e5c5f6c..bcc7ad199603 100644 --- a/nemo/utils/nemo_logging.py +++ b/nemo/utils/nemo_logging.py @@ -76,7 +76,7 @@ def __init__(self, capture_warnings=True): self.rank = 0 if is_global_rank_zero() else "UNK" def _define_logger(self, capture_warnings=True): - """ Creates the logger if not already created. Called in init""" + """Creates the logger if not already created. Called in init""" # Use double-checked locking to avoid taking lock unnecessarily. 
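A minimal usage sketch for the callback above; the trainer arguments are hypothetical, and in normal use exp_manager inserts the callback automatically when cfg.log_delta_step_timing is true:

import pytorch_lightning as pl

from nemo.utils.exp_manager import DeltaTimingCallback

# sync_cuda=True adds a torch.cuda.synchronize() before each measurement for more accurate GPU timings
timing_cb = DeltaTimingCallback(timer_kwargs={"sync_cuda": False})
trainer = pl.Trainer(max_epochs=1, callbacks=[timing_cb])
# Each train/validation batch end is then reported through the nemo logger only, e.g.:
#   Step 0: train_step_timing in s in s=0.0123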
if self._logger is not None: @@ -126,7 +126,7 @@ def record_factory(*args, **kwargs): self._logger.propagate = False def remove_stream_handlers(self): - """ Removes StreamHandler that log to stdout and stderr from the logger.""" + """Removes StreamHandler that log to stdout and stderr from the logger.""" if self._logger is None: raise RuntimeError("Impossible to set handlers if the Logger is not predefined") @@ -236,7 +236,7 @@ def set_verbosity(self, verbosity_level): @contextmanager def patch_stderr_handler(self, stream): - """ Sends messages that should log to stderr to stream instead. Useful for unittests """ + """Sends messages that should log to stderr to stream instead. Useful for unittests""" if self._logger is not None: try: old_stream = self._handlers["stream_stderr"].stream @@ -268,7 +268,7 @@ def patch_stderr_handler(self, stream): @contextmanager def patch_stdout_handler(self, stream): - """ Sends messages that should log to stdout to stream instead. Useful for unittests """ + """Sends messages that should log to stdout to stream instead. Useful for unittests""" if self._logger is not None: try: old_stream = self._handlers["stream_stdout"].stream @@ -339,6 +339,16 @@ def captureWarnings(self, capture): warnings.showwarning = self.old_warnings_showwarning self.old_warnings_showwarning = None + def _warning_is_ignored(self, category): + from warnings import filters + + # Search the filters + for action, msg, cat, mod, ln in filters: + # least-common demoninator if multiple filters for the same class. + if cat == category and action == 'ignore': + return True + return False + def _showwarning(self, message, category, filename, lineno, file=None, line=None): """ Implementation of showwarnings which redirects to logging. @@ -346,6 +356,8 @@ def _showwarning(self, message, category, filename, lineno, file=None, line=None with level logging.WARNING. 
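A short sketch of the behaviour added by _warning_is_ignored above, assuming the default NeMo logging setup in which the nemo logger captures Python warnings:

import warnings

from nemo.utils import logging  # the nemo logger captures Python warnings by default

warnings.filterwarnings("ignore", category=DeprecationWarning)
warnings.warn("noisy dependency warning", DeprecationWarning)  # matches an 'ignore' filter: not logged
warnings.warn("something actionable", UserWarning)             # still reported as a WARNING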
""" s = warnings.formatwarning(message, category, filename, lineno, line) + if self._warning_is_ignored(category): + return self.warning("%s", s) def _logged_once(self, msg, mode): diff --git a/requirements/requirements_multimodal.txt b/requirements/requirements_multimodal.txt index 8b56c3974a25..18abe82c9f96 100644 --- a/requirements/requirements_multimodal.txt +++ b/requirements/requirements_multimodal.txt @@ -1,6 +1,6 @@ addict clip -decord +decord; sys_platform == 'linux' diffusers>=0.19.3 einops_exts imageio diff --git a/requirements/requirements_nlp.txt b/requirements/requirements_nlp.txt index 3d168ad3b12a..7ef03689b9b5 100644 --- a/requirements/requirements_nlp.txt +++ b/requirements/requirements_nlp.txt @@ -9,7 +9,7 @@ gdown h5py ijson jieba -mamba-ssm==2.2.2 +mamba-ssm==2.2.2; sys_platform == 'linux' markdown2 matplotlib>=3.3.2 #megatron_core>0.6.0 # add back once mcore on pypi is compatible again diff --git a/scripts/checkpoint_converters/convert_mistral_7b_nemo_to_hf.py b/scripts/checkpoint_converters/convert_mistral_7b_nemo_to_hf.py index ba9012de01a8..796819c38ba4 100644 --- a/scripts/checkpoint_converters/convert_mistral_7b_nemo_to_hf.py +++ b/scripts/checkpoint_converters/convert_mistral_7b_nemo_to_hf.py @@ -65,6 +65,7 @@ def load_config(hf_model_name, nemo_config): logging.warning(f"Got unknown activation function {nemo_config.activation}") hf_config.rope_theta = nemo_config['rotary_base'] + hf_config.tie_word_embeddings = getattr(nemo_config, "share_embeddings_and_output_weights", False) return hf_config @@ -213,7 +214,13 @@ def convert(in_file, precision=None, cpu_only=True) -> None: output_layer_base_name = 'model.output_layer.weight' else: output_layer_base_name = 'model.language_model.output_layer.weight' - state_dict[hf_output_layer_weight_name] = param_to_weights(ckpt[output_layer_base_name]) + + if getattr(nemo_config, "share_embeddings_and_output_weights", False): + # tie_word_embeddings: True + state_dict[hf_output_layer_weight_name] = state_dict[embed_weights_base_name] + else: + # tie_word_embeddings: False + state_dict[hf_output_layer_weight_name] = param_to_weights(ckpt[output_layer_base_name]) return state_dict, nemo_config, dtype diff --git a/scripts/checkpoint_converters/convert_zarr_to_torch_dist.py b/scripts/checkpoint_converters/convert_zarr_to_torch_dist.py index 29b56aa706fa..eeaee9aba461 100644 --- a/scripts/checkpoint_converters/convert_zarr_to_torch_dist.py +++ b/scripts/checkpoint_converters/convert_zarr_to_torch_dist.py @@ -16,14 +16,13 @@ Conversion script to convert zarr checkpoints into torch distributed checkpoint. Example to run this conversion script: python -m torch.distributed.launch --nproc_per_node= * \ - megatron_zarr_ckpt_to_torch_dist.py \ + convert_zarr_to_torch_dist.py \ --model_type \ --checkpoint_folder \ --checkpoint_name \ --path_to_save \ --tensor_model_parallel_size \ --pipeline_model_parallel_size \ - --hparams_file \ --gpus_per_node """ @@ -64,12 +63,14 @@ def get_args(): "--hparams_file", type=str, default=None, - required=True, + required=False, help="Path config for restoring. It's created during training and may need to be modified during restore if restore environment is different than training. 
Ex: /raid/nemo_experiments/megatron_gpt/hparams.yaml", ) parser.add_argument("--path_to_save", type=str, default=None, required=True, help="Path to output ckpt files.") parser.add_argument( - "--save_to_nemo", action="store_true", help="If passed, output will be written as .nemo file.", + "--save_to_nemo", + action="store_true", + help="If passed, output will be written as .nemo file.", ) parser.add_argument("--gpus_per_node", type=int, required=True, default=None) parser.add_argument("--tensor_model_parallel_size", type=int, required=True, default=None) @@ -81,7 +82,7 @@ def get_args(): default=None, help="If pipeline parallel size > 1, this is the rank at which the encoder ends and the decoder begins.", ) - parser.add_argument("--local_rank", type=int, required=False, default=os.getenv('LOCAL_RANK', -1)) + parser.add_argument("--local-rank", type=int, required=False, default=os.getenv('LOCAL_RANK', -1)) parser.add_argument("--cluster_type", required=False, default=None, help="Whether on BCP platform") parser.add_argument( "--precision", @@ -93,7 +94,18 @@ def get_args(): ) parser.add_argument( - "--model_type", type=str, required=True, default="gpt", choices=["gpt", "sft", "bert"], + "--model_type", + type=str, + required=True, + default="gpt", + choices=["gpt", "sft", "bert"], + ), + parser.add_argument( + "--ckpt_format", + type=str, + required=False, + default="torch_dist", + choices=["zarr", "torch_dist"], ) args = parser.parse_args() @@ -114,7 +126,7 @@ def convert(local_rank, rank, world_size, args): 'precision': args.precision, }, 'model': { - 'native_amp_init_scale': 2 ** 32, + 'native_amp_init_scale': 2**32, 'native_amp_growth_interval': 1000, 'hysteresis': 2, 'gradient_as_bucket_view': True, @@ -167,7 +179,7 @@ def convert(local_rank, rank, world_size, args): ) with open_dict(model.cfg): - model.cfg.torch_distributed_checkpoint = True + model.cfg.dist_ckpt_format = args.ckpt_format model._save_restore_connector = NLPSaveRestoreConnector() save_file_path = args.path_to_save diff --git a/scripts/llm/llama3_generate.py b/scripts/llm/llama3_generate.py new file mode 100644 index 000000000000..85b18d811b01 --- /dev/null +++ b/scripts/llm/llama3_generate.py @@ -0,0 +1,66 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +# NOTE: This script is just an example of using NeMo checkpoints for generating outputs and is subject to change without notice. 
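The tied-embedding branch added to the Mistral exporter above boils down to one decision: a checkpoint trained with share_embeddings_and_output_weights has no separate output-layer weight, so the embedding matrix doubles as the HF LM head. A toy sketch (pick_lm_head and the tensors are hypothetical, not part of the converter):

import torch

def pick_lm_head(share_embeddings_and_output_weights: bool, embed_weights, output_layer_weights=None):
    """Sketch of the exporter's branching for tied vs. untied output weights."""
    if share_embeddings_and_output_weights:
        return embed_weights        # corresponds to tie_word_embeddings=True in the HF config
    return output_layer_weights     # untied: the separate output_layer weight is converted as before

# e.g. a toy 4-token, 8-dim model with tied weights:
embeds = torch.randn(4, 8)
assert pick_lm_head(True, embeds) is embeds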
+ +import os + +import torch +import torch.distributed +from megatron.core.inference.common_inference_params import CommonInferenceParams + +import nemo.lightning as nl +from nemo.collections.llm import api + +if __name__ == "__main__": + strategy = nl.MegatronStrategy( + tensor_model_parallel_size=2, + pipeline_model_parallel_size=1, + context_parallel_size=1, + sequence_parallel=False, + setup_optimizers=False, + store_optimizer_states=False, + ) + + trainer = nl.Trainer( + accelerator="gpu", + devices=2, + num_nodes=1, + strategy=strategy, + plugins=nl.MegatronMixedPrecision( + precision="bf16-mixed", + params_dtype=torch.bfloat16, + pipeline_dtype=torch.bfloat16, + autocast_enabled=False, + grad_reduce_in_fp32=False, + ), + ) + prompts = [ + "Hello, how are you?", + "How many r's are in the word 'strawberry'?", + "Which number is bigger? 10.119 or 10.19?", + ] + results = api.generate( + path=os.path.join(os.environ["NEMO_HOME"], "models", "meta-llama/Meta-Llama-3-8B"), + prompts=prompts, + trainer=trainer, + inference_params=CommonInferenceParams(temperature=0.1, top_k=10, num_tokens_to_generate=512), + text_only=True, + ) + if torch.distributed.get_rank() == 0: + for i, r in enumerate(results): + print(prompts[i]) + print("*" * 50) + print(r) + print("\n\n") diff --git a/tests/collections/asr/decoding/test_cuda_graph_rnnt_greedy_decoding.py b/tests/collections/asr/decoding/test_cuda_graph_rnnt_greedy_decoding.py index 31fe822573ce..4715f4826493 100644 --- a/tests/collections/asr/decoding/test_cuda_graph_rnnt_greedy_decoding.py +++ b/tests/collections/asr/decoding/test_cuda_graph_rnnt_greedy_decoding.py @@ -53,8 +53,8 @@ def stt_en_fastconformer_transducer_large(): 8, True, marks=pytest.mark.xfail( - reason="""Cannot instantiate the -body cuda graph of a conditional node with a persistent kernel (in this case, + reason="""Cannot instantiate the +body cuda graph of a conditional node with a persistent kernel (in this case, a persistent LSTM), which is triggered in cudnn by using a batch size of 8.""" ), ), diff --git a/tests/collections/asr/decoding/rnnt_alignments_check.py b/tests/collections/asr/decoding/test_rnnt_alignments.py similarity index 94% rename from tests/collections/asr/decoding/rnnt_alignments_check.py rename to tests/collections/asr/decoding/test_rnnt_alignments.py index ec0656cbce49..5c43af28b1d4 100644 --- a/tests/collections/asr/decoding/rnnt_alignments_check.py +++ b/tests/collections/asr/decoding/test_rnnt_alignments.py @@ -13,10 +13,6 @@ # limitations under the License. 
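A small sketch of the transcription pattern exercised by the renamed alignment test below; the checkpoint name matches the fixture above, while the audio path is a placeholder:

from nemo.collections.asr.models import EncDecRNNTBPEModel

model = EncDecRNNTBPEModel.from_pretrained("stt_en_fastconformer_transducer_large")
hyps = model.transcribe(audio=["sample.wav"], batch_size=1, return_hypotheses=True)
print(hyps[0].text)
# hyps[0].alignments and hyps[0].frame_confidence are only populated when the decoding config enables
# preserve_alignments and frame confidence, which is what get_rnnt_alignments() below configures.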
-# NOTE: the file name does not contain "test" on purpose to avoid executing -# these tests outside of the CI machines environment, where test data is -# stored - from pathlib import Path from typing import Union @@ -27,6 +23,7 @@ from nemo.collections.asr.models import EncDecRNNTBPEModel from nemo.collections.asr.parts.utils.manifest_utils import read_manifest, write_manifest +from nemo.collections.asr.parts.utils.rnnt_utils import Hypothesis from nemo.collections.asr.parts.utils.transcribe_utils import prepare_audio_data DEVICES = [] @@ -65,7 +62,7 @@ def get_rnnt_alignments( loop_labels: bool = True, use_cuda_graph_decoder=False, device="cuda", -): +) -> list[Hypothesis]: cfg = OmegaConf.structured(TranscriptionConfig()) cfg.rnnt_decoding.confidence_cfg.preserve_frame_confidence = True cfg.rnnt_decoding.preserve_alignments = True @@ -74,12 +71,13 @@ def get_rnnt_alignments( cfg.rnnt_decoding.greedy.loop_labels = loop_labels cfg.rnnt_decoding.greedy.use_cuda_graph_decoder = use_cuda_graph_decoder cfg.dataset_manifest = str(manifest_path) - filepaths = prepare_audio_data(cfg)[0][:10] # selecting 10 files only + filepaths = prepare_audio_data(cfg)[0][:8] # selecting 8 files only + # NB: 9th file has the same transcription but a bit different alignment for batched/non-batched decoding model = model.to(device) model.change_decoding_strategy(cfg.rnnt_decoding) - transcriptions = model.transcribe( + transcriptions: list[Hypothesis] = model.transcribe( audio=filepaths, batch_size=cfg.batch_size, num_workers=cfg.num_workers, diff --git a/tests/collections/asr/mixins/adapters/test_asr_adapter_modules.py b/tests/collections/asr/mixins/adapters/test_asr_adapter_modules.py index ffaf1e640f3e..ad33a21262f3 100644 --- a/tests/collections/asr/mixins/adapters/test_asr_adapter_modules.py +++ b/tests/collections/asr/mixins/adapters/test_asr_adapter_modules.py @@ -178,6 +178,59 @@ def test_relmha_adapter_init(self, n_head, proj_dim): assert out.sum().abs() <= 1e-8 assert out.shape == x.shape + @pytest.mark.unit + def test_relmha_adapter_with_torch_sdpa(self): + torch.random.manual_seed(0) + x = torch.randn(2, 32, 50) + lengths = torch.randint(1, x.size(1), size=(x.size(0),)) + lengths[torch.randint(0, x.size(0), size=(1,))[0]] = x.size(1) + + adapter_torch_sdpa = adapter_modules.RelPositionMultiHeadAttentionAdapter( + n_head=2, n_feat=50, dropout_rate=0.0, proj_dim=-1, use_pytorch_sdpa=True + ) + adapter = adapter_modules.RelPositionMultiHeadAttentionAdapter( + n_head=2, n_feat=50, dropout_rate=0.0, proj_dim=-1, use_pytorch_sdpa=False + ) + # to dont reset linear_out parameters to zero + adapter.linear_out = torch.nn.Linear(adapter.linear_out.in_features, adapter.linear_out.out_features) + for original_param, sdpa_param in zip(adapter.parameters(), adapter_torch_sdpa.parameters()): + sdpa_param.data.copy_(original_param.data) + relpos_enc = adapter_modules.RelPositionalEncodingAdapter(d_model=50) + + pad_mask, att_mask = get_mask(lengths) + relpos_enc.extend_pe(lengths.max(), device='cpu', dtype=torch.float32) + + with torch.no_grad(): + _, pos_emb = relpos_enc(x) + out = adapter(x, x, x, att_mask, pos_emb) + out_sdpa = adapter_torch_sdpa(x, x, x, att_mask, pos_emb) + assert torch.allclose(out_sdpa, out, atol=1e-5) + + @pytest.mark.unit + def test_mha_adapter_with_torch_sdpa(self): + torch.random.manual_seed(0) + x = torch.randn(2, 32, 50) + lengths = torch.randint(1, x.size(1), size=(x.size(0),)) + lengths[torch.randint(0, x.size(0), size=(1,))[0]] = x.size(1) + + adapter_torch_sdpa = 
adapter_modules.MultiHeadAttentionAdapter( + n_head=2, n_feat=50, dropout_rate=0.0, proj_dim=-1, use_pytorch_sdpa=True + ) + adapter = adapter_modules.MultiHeadAttentionAdapter( + n_head=2, n_feat=50, dropout_rate=0.0, proj_dim=-1, use_pytorch_sdpa=False + ) + # to dont reset linear_out parameters to zero + adapter.linear_out = torch.nn.Linear(adapter.linear_out.in_features, adapter.linear_out.out_features) + + for original_param, sdpa_param in zip(adapter.parameters(), adapter_torch_sdpa.parameters()): + sdpa_param.data.copy_(original_param.data) + + pad_mask, att_mask = get_mask(lengths) + with torch.no_grad(): + out = adapter(x, x, x, att_mask) + out_sdpa = adapter_torch_sdpa(x, x, x, att_mask) + assert torch.allclose(out_sdpa, out, atol=1e-5) + @pytest.mark.unit def test_abspos_encoding_init(self): torch.random.manual_seed(0) diff --git a/tests/collections/asr/test_conformer_encoder.py b/tests/collections/asr/test_conformer_encoder.py index a7b914120bb8..18cb902d1408 100644 --- a/tests/collections/asr/test_conformer_encoder.py +++ b/tests/collections/asr/test_conformer_encoder.py @@ -74,14 +74,18 @@ def test_stochastic_depth_model_creation(self): for start_layer in [-1, 0, 5]: with pytest.raises(ValueError, match="stochastic_depth_start_layer has to be in"): ConformerEncoder( - feat_in=10, n_layers=n_layers, d_model=4, feat_out=8, stochastic_depth_start_layer=start_layer, + feat_in=10, + n_layers=n_layers, + d_model=4, + feat_out=8, + stochastic_depth_start_layer=start_layer, ) @pytest.mark.pleasefixme def test_stochastic_depth_forward(self): """Testing that forward works and we get randomness during training, but not during eval.""" random_input = torch.rand((1, 2, 2)) - random_length = torch.tensor([2, 2], dtype=torch.int64) + random_length = torch.tensor([2], dtype=torch.int64) model = ConformerEncoder( feat_in=2, diff --git a/tests/collections/common/test_lhotse_dataloading.py b/tests/collections/common/test_lhotse_dataloading.py index 97fdca434843..ec682288cd4c 100644 --- a/tests/collections/common/test_lhotse_dataloading.py +++ b/tests/collections/common/test_lhotse_dataloading.py @@ -184,6 +184,26 @@ def nemo_tarred_manifest_path_multi(nemo_tarred_manifest_path: tuple[str, str]) return f"{json_dir}/manifest__OP_0..1_CL_.jsonl", tar_p +@pytest.fixture(scope="session") +def nemo_tarred_manifest_subset_path(nemo_tarred_manifest_path: Tuple[str, str]) -> Tuple[str, str]: + """Create a shard manifests with randomly chosen 50% percent of tarred contents.""" + from lhotse.serialization import load_jsonl + from lhotse.shar.writers import JsonlShardWriter + + json_p, tar_p = nemo_tarred_manifest_path + json_dir = json_p.parent / "shard_manifests" + json_dir.mkdir(exist_ok=True) + all_items = list(load_jsonl(json_p)) + tarr_0_data = all_items[:5] + tarr_1_data = all_items[5:] + + subset_items = tarr_0_data[-3:] + tarr_1_data[-3:] + with JsonlShardWriter(f"{json_dir}/manifest_%d.jsonl", shard_size=3) as mft_writer: + for item in subset_items: + mft_writer.write(item) + return f"{json_dir}/manifest__OP_0..1_CL_.jsonl", tar_p, subset_items + + class UnsupervisedAudioDataset(torch.utils.data.Dataset): def __getitem__(self, cuts: lhotse.CutSet) -> Dict[str, torch.Tensor]: audio, audio_lens = lhotse.dataset.collation.collate_audio(cuts) @@ -1905,3 +1925,42 @@ def test_dataloader_from_tarred_nemo_manifest_with_offset(nemo_tarred_manifest_p np.testing.assert_equal( audio, full_audio[:, compute_num_samples(4.0, cut.sampling_rate) : compute_num_samples(9.0, cut.sampling_rate)] ) + + +def 
test_dataloader_from_tarred_nemo_subset_manifest(nemo_tarred_manifest_subset_path: tuple[str, str]): + json_mft, tar_mft, subset_items = nemo_tarred_manifest_subset_path + config = OmegaConf.create( + { + "manifest_filepath": json_mft, + "tarred_audio_filepaths": tar_mft, + "sample_rate": 16000, + "shuffle": True, + "use_lhotse": True, + "num_workers": 0, + # lhotse specific + "use_bucketing": True, + "concurrent_bucketing": False, + "num_buckets": 2, + "drop_last": False, + "batch_duration": 4.0, # seconds + "quadratic_duration": 15.0, # seconds + "shuffle_buffer_size": 10, + "bucket_buffer_size": 100, + "seed": 0, + "shard_seed": 0, + "tarred_random_access": True, + "force_finite": True, + } + ) + dl = get_lhotse_dataloader_from_config( + config=config, global_rank=0, world_size=1, dataset=UnsupervisedAudioDataset() + ) + seen_ids = list() + for batch in dl: + current_ids = batch["ids"] + seen_ids += current_ids + + expected_ids = set([data['audio_filepath'] for data in subset_items]) + seen_ids_set = set(seen_ids) + assert len(seen_ids_set) == len(seen_ids), "Duplicate IDs found in the batch." + assert seen_ids_set == expected_ids, "The set of IDs in the batches does not match the input JSON manifests." diff --git a/tests/collections/llm/bitexact/mixtral/pretrain_mini_mixtral.py b/tests/collections/llm/bitexact/mixtral/pretrain_mini_mixtral.py index df6c1c0ac516..1e7de13666aa 100644 --- a/tests/collections/llm/bitexact/mixtral/pretrain_mini_mixtral.py +++ b/tests/collections/llm/bitexact/mixtral/pretrain_mini_mixtral.py @@ -29,7 +29,7 @@ def tokenizer(vocab_path, merges_path): def main(args): strategy = MegatronStrategy( tensor_model_parallel_size=args.devices, - sequence_parallel=True, + sequence_parallel=False, context_parallel_size=1, params_dtype=torch.bfloat16, pipeline_dtype=torch.bfloat16, diff --git a/tests/collections/llm/bitexact/mixtral/run.sh b/tests/collections/llm/bitexact/mixtral/run.sh index df185f7e1bc8..c32dbbc95b98 100644 --- a/tests/collections/llm/bitexact/mixtral/run.sh +++ b/tests/collections/llm/bitexact/mixtral/run.sh @@ -24,7 +24,7 @@ torchrun --nproc-per-node 1 --nnodes 1 /workspace/Megatron-LM/pretrain_gpt.py \ --init-method-std 0.008 --bf16 --use-mcore-models --transformer-impl transformer_engine \ --use-distributed-optimizer --train-iters=10 --dataloader-type single --use-dist-ckpt \ --dist-ckpt-format=torch_dist \ - --tensor-model-parallel-size 1 --pipeline-model-parallel-size 1 --sequence-parallel \ + --tensor-model-parallel-size 1 --pipeline-model-parallel-size 1 \ --no-gradient-accumulation-fusion \ --data-path "$DATA_PATH" \ --split 99,1,0 --log-interval 10 --save-interval 20000 --eval-interval 1000 --eval-iters 32 \ diff --git a/tests/collections/llm/gpt/model/megatron_ssm_finetuning.py b/tests/collections/llm/gpt/model/megatron_ssm_finetuning.py index 67174974f9a3..e0b9862f23e1 100644 --- a/tests/collections/llm/gpt/model/megatron_ssm_finetuning.py +++ b/tests/collections/llm/gpt/model/megatron_ssm_finetuning.py @@ -59,6 +59,7 @@ def get_args(): strategy=nl.MegatronStrategy( ckpt_load_optimizer=False, ckpt_save_optimizer=False, + ckpt_async_save=False, tensor_model_parallel_size=1, ), plugins=nl.MegatronMixedPrecision( diff --git a/tests/collections/llm/gpt/model/test_mistral.py b/tests/collections/llm/gpt/model/test_mistral.py index 365bb35b2725..025ea35dd6e9 100644 --- a/tests/collections/llm/gpt/model/test_mistral.py +++ b/tests/collections/llm/gpt/model/test_mistral.py @@ -1,6 +1,6 @@ import torch.nn.functional as F -from 
nemo.collections.llm.gpt.model.mistral import MistralConfig7B, MistralNeMo2407Config12B, MistralNeMo2407Config123B +from nemo.collections.llm.gpt.model.mistral import MistralConfig7B, MistralNeMoConfig12B, MistralNeMoConfig123B def test_mistral_config7b(): @@ -25,7 +25,7 @@ def test_mistral_config7b(): def test_mistral_nemo_config_12b(): - config = MistralNeMo2407Config12B() + config = MistralNeMoConfig12B() assert config.normalization == "RMSNorm" assert config.activation_func == F.silu assert config.position_embedding_type == "rope" @@ -49,7 +49,7 @@ def test_mistral_nemo_config_12b(): def test_mistral_nemo_config_123b(): - config = MistralNeMo2407Config123B() + config = MistralNeMoConfig123B() assert config.normalization == "RMSNorm" assert config.activation_func == F.silu assert config.position_embedding_type == "rope" diff --git a/tests/collections/llm/gpt_finetuning.py b/tests/collections/llm/gpt_finetuning.py index 9eca287669cd..7eaa7744729c 100644 --- a/tests/collections/llm/gpt_finetuning.py +++ b/tests/collections/llm/gpt_finetuning.py @@ -19,6 +19,7 @@ from nemo import lightning as nl from nemo.collections import llm +from nemo.collections.llm.gpt.data.packed_sequence import PackedSequenceSpecs from nemo.collections.nlp.modules.common.tokenizer_utils import get_nmt_tokenizer ## NOTE: This script is present for github-actions testing only. @@ -43,6 +44,7 @@ def get_args(): parser.add_argument('--mbs', type=int, default=1, help="micro batch size") parser.add_argument('--tp_size', type=int, default=1, help="tensor parallel size") parser.add_argument('--pp_size', type=int, default=1, help="pipeline parallel size") + parser.add_argument('--packed', action='store_true', help="use packed sequence dataset") return parser.parse_args() @@ -97,7 +99,16 @@ def get_args(): else: peft = None - squad = llm.SquadDataModule(seq_length=2048, micro_batch_size=args.mbs, global_batch_size=8, num_workers=0) + packed_sequence_specs = ( + PackedSequenceSpecs(packed_sequence_size=2048, tokenizer_model_name="dummy_tokenizer") if args.packed else None + ) + dolly = llm.DollyDataModule( + seq_length=2048, + micro_batch_size=args.mbs, + global_batch_size=8, + num_workers=0, + packed_sequence_specs=packed_sequence_specs, + ) tokenizer = get_nmt_tokenizer(tokenizer_model=os.path.join(args.restore_path, "dummy_tokenizer.model")) llama3_8b = llm.LlamaModel(Llama3ConfigCI(), tokenizer=tokenizer) @@ -109,7 +120,7 @@ def get_args(): llm.finetune( model=llama3_8b, - data=squad, + data=dolly, trainer=trainer, peft=peft, log=logger, diff --git a/tests/collections/llm/megatron_mixtral_pretraining.py b/tests/collections/llm/megatron_mixtral_pretraining.py index 4862919f0f8e..82188f75351e 100644 --- a/tests/collections/llm/megatron_mixtral_pretraining.py +++ b/tests/collections/llm/megatron_mixtral_pretraining.py @@ -65,7 +65,7 @@ def main(args): strategy = MegatronStrategy( expert_model_parallel_size=args.devices, tensor_model_parallel_size=1, - sequence_parallel=True, + sequence_parallel=False, context_parallel_size=1, params_dtype=torch.bfloat16, pipeline_dtype=torch.bfloat16, diff --git a/tests/collections/llm/megatron_t5_finetuning.py b/tests/collections/llm/megatron_t5_finetuning.py index 76a23d36975b..f54e858cfb43 100644 --- a/tests/collections/llm/megatron_t5_finetuning.py +++ b/tests/collections/llm/megatron_t5_finetuning.py @@ -21,6 +21,7 @@ def get_args(): parser = argparse.ArgumentParser(description='Train a small T5 model using NeMo 2.0') parser.add_argument('--devices', type=int, help="Number of devices to 
use for training") parser.add_argument('--max-steps', type=int, help="Number of steps to train for") + parser.add_argument('--peft', type=str, default='none', help="none | lora") parser.add_argument('--experiment-dir', type=str, help="directory to write results and checkpoints to") parser.add_argument('--experiment-name', type=str, help="name of experiment") parser.add_argument('--wandb-project', type=str, default=None, help="wandb project name") @@ -34,9 +35,12 @@ def get_args(): args = get_args() + special_tokens = {} + special_tokens['additional_special_tokens'] = [f'' for i in range(100)] tokenizer = get_nmt_tokenizer( "megatron", "BertWordPieceCase", + special_tokens=special_tokens, ) data = SquadDataModule( @@ -69,7 +73,6 @@ def get_args(): pipeline_model_parallel_size=1, pipeline_dtype=torch.float32, ckpt_load_optimizer=False, - # ckpt_load_optimizer=True, ) checkpoint_callback = ModelCheckpoint( every_n_train_steps=5000, @@ -93,6 +96,11 @@ def get_args(): config=opt_config, ) + if args.peft == 'lora': + peft = llm.peft.LoRA() + else: + peft = None + trainer = nl.Trainer( devices=args.devices, max_steps=args.max_steps, @@ -125,6 +133,7 @@ def get_args(): resume=resume, data=data, trainer=trainer, + peft=peft, log=nemo_logger, optim=opt, ) diff --git a/tests/collections/llm/megatron_t5_pretraining.py b/tests/collections/llm/megatron_t5_pretraining.py index 5d8f55a7f26f..a5460be3d154 100644 --- a/tests/collections/llm/megatron_t5_pretraining.py +++ b/tests/collections/llm/megatron_t5_pretraining.py @@ -50,10 +50,13 @@ def get_args(): args = get_args() + special_tokens = {} + special_tokens['additional_special_tokens'] = [f'' for i in range(100)] tokenizer = get_nmt_tokenizer( "megatron", "BertWordPieceCase", vocab_file=args.vocab_path, + special_tokens=special_tokens, ) data = PreTrainingDataModule( paths=args.data_path, diff --git a/tests/collections/llm/recipes/test_llama3_70b.py b/tests/collections/llm/recipes/test_llama3_70b.py index a842975846dd..d47b674b7b70 100644 --- a/tests/collections/llm/recipes/test_llama3_70b.py +++ b/tests/collections/llm/recipes/test_llama3_70b.py @@ -31,7 +31,7 @@ def test_trainer(self, recipe_module): assert trainer_config.__fn_or_cls__ == Trainer assert trainer_config.accelerator == "gpu" assert trainer_config.devices == 8 - assert trainer_config.num_nodes == 1 + assert trainer_config.num_nodes == 4 # Check strategy configuration assert isinstance(trainer_config.strategy, run.Config) @@ -79,10 +79,8 @@ def test_pretrain_recipe_with_different_configurations(self, recipe_module, num_ assert recipe.trainer.num_nodes == num_nodes assert recipe.trainer.devices == num_gpus_per_node - def test_pretrain_recipe_performance(self, recipe_module): - recipe = recipe_module.pretrain_recipe_performance( - name="test_perf", dir="/tmp", num_nodes=1, num_gpus_per_node=8 - ) + def test_pretrain_performance_optimizations(self, recipe_module): + recipe = recipe_module.pretrain_recipe(performance_mode=True) assert any( isinstance(cb, run.Config) and cb.__fn_or_cls__ == MegatronCommOverlapCallback for cb in recipe.trainer.callbacks diff --git a/tests/collections/llm/recipes/test_llama3_70b_16k.py b/tests/collections/llm/recipes/test_llama3_70b_16k.py index 60940b062a87..17f0ec5ebd99 100644 --- a/tests/collections/llm/recipes/test_llama3_70b_16k.py +++ b/tests/collections/llm/recipes/test_llama3_70b_16k.py @@ -29,15 +29,15 @@ def test_trainer(self, recipe_module): assert trainer_config.__fn_or_cls__ == Trainer assert trainer_config.accelerator == "gpu" assert 
trainer_config.devices == 8 - assert trainer_config.num_nodes == 2 + assert trainer_config.num_nodes == 4 # Check strategy configuration assert isinstance(trainer_config.strategy, run.Config) assert trainer_config.strategy.__fn_or_cls__.__name__ == "MegatronStrategy" - assert trainer_config.strategy.tensor_model_parallel_size == 2 - assert trainer_config.strategy.pipeline_model_parallel_size == 4 + assert trainer_config.strategy.tensor_model_parallel_size == 8 + assert trainer_config.strategy.pipeline_model_parallel_size == 2 assert trainer_config.strategy.pipeline_dtype == torch.bfloat16 - assert trainer_config.strategy.virtual_pipeline_model_parallel_size == 5 + assert trainer_config.strategy.virtual_pipeline_model_parallel_size is None assert trainer_config.strategy.context_parallel_size == 2 assert trainer_config.strategy.sequence_parallel is True @@ -61,14 +61,37 @@ def test_pretrain_recipe_with_different_configurations(self, recipe_module, num_ assert recipe.trainer.num_nodes == num_nodes assert recipe.trainer.devices == num_gpus_per_node - def test_trainer_parallelism_options(self, recipe_module): + def test_valid_trainer_parallelism(self, recipe_module): trainer_config = recipe_module.trainer() - assert trainer_config.strategy.tensor_model_parallel_size == 2 - assert trainer_config.strategy.pipeline_model_parallel_size == 4 - assert trainer_config.strategy.pipeline_dtype == torch.bfloat16 - assert trainer_config.strategy.virtual_pipeline_model_parallel_size == 5 - assert trainer_config.strategy.context_parallel_size == 2 - assert trainer_config.strategy.sequence_parallel is True + + assert isinstance(trainer_config.strategy, run.Config) + assert trainer_config.strategy.__fn_or_cls__.__name__ == "MegatronStrategy" + + assert trainer_config.strategy.expert_model_parallel_size == 1 + + assert ( + trainer_config.strategy.tensor_model_parallel_size + * trainer_config.strategy.pipeline_model_parallel_size + * trainer_config.strategy.context_parallel_size + * trainer_config.strategy.expert_model_parallel_size + % trainer_config.devices + == 0 + ) + assert ( + trainer_config.strategy.tensor_model_parallel_size + * trainer_config.strategy.pipeline_model_parallel_size + * trainer_config.strategy.context_parallel_size + * trainer_config.strategy.expert_model_parallel_size + / trainer_config.devices + % trainer_config.num_nodes + == 0 + ) + + if trainer_config.strategy.pipeline_model_parallel_size != 1: + assert trainer_config.strategy.pipeline_dtype is not None + + if trainer_config.strategy.tensor_model_parallel_size == 1: + assert trainer_config.strategy.sequence_parallel is False def test_model_config_parameters(self, recipe_module): model_config = recipe_module.model() diff --git a/tests/collections/llm/recipes/test_llama3_70b_64k.py b/tests/collections/llm/recipes/test_llama3_70b_64k.py index 89813162fae1..e9f496dfdd2e 100644 --- a/tests/collections/llm/recipes/test_llama3_70b_64k.py +++ b/tests/collections/llm/recipes/test_llama3_70b_64k.py @@ -38,7 +38,7 @@ def test_trainer(self, recipe_module): assert trainer_config.strategy.tensor_model_parallel_size == 8 assert trainer_config.strategy.pipeline_model_parallel_size == 4 assert trainer_config.strategy.pipeline_dtype == torch.bfloat16 - assert trainer_config.strategy.virtual_pipeline_model_parallel_size == 5 + assert trainer_config.strategy.virtual_pipeline_model_parallel_size is None assert trainer_config.strategy.context_parallel_size == 8 assert trainer_config.strategy.sequence_parallel is True @@ -67,14 +67,37 @@ def 
test_pretrain_recipe_with_different_configurations(self, recipe_module, num_ assert recipe.trainer.num_nodes == num_nodes assert recipe.trainer.devices == num_gpus_per_node - def test_trainer_parallelism_options(self, recipe_module): + def test_valid_trainer_parallelism(self, recipe_module): trainer_config = recipe_module.trainer() - assert trainer_config.strategy.tensor_model_parallel_size == 8 - assert trainer_config.strategy.pipeline_model_parallel_size == 4 - assert trainer_config.strategy.pipeline_dtype == torch.bfloat16 - assert trainer_config.strategy.virtual_pipeline_model_parallel_size == 5 - assert trainer_config.strategy.context_parallel_size == 8 - assert trainer_config.strategy.sequence_parallel is True + + assert isinstance(trainer_config.strategy, run.Config) + assert trainer_config.strategy.__fn_or_cls__.__name__ == "MegatronStrategy" + + assert trainer_config.strategy.expert_model_parallel_size == 1 + + assert ( + trainer_config.strategy.tensor_model_parallel_size + * trainer_config.strategy.pipeline_model_parallel_size + * trainer_config.strategy.context_parallel_size + * trainer_config.strategy.expert_model_parallel_size + % trainer_config.devices + == 0 + ) + assert ( + trainer_config.strategy.tensor_model_parallel_size + * trainer_config.strategy.pipeline_model_parallel_size + * trainer_config.strategy.context_parallel_size + * trainer_config.strategy.expert_model_parallel_size + / trainer_config.devices + % trainer_config.num_nodes + == 0 + ) + + if trainer_config.strategy.pipeline_model_parallel_size != 1: + assert trainer_config.strategy.pipeline_dtype is not None + + if trainer_config.strategy.tensor_model_parallel_size == 1: + assert trainer_config.strategy.sequence_parallel is False def test_model_config_parameters(self, recipe_module): model_config = recipe_module.model() diff --git a/tests/collections/llm/recipes/test_llama3_8b.py b/tests/collections/llm/recipes/test_llama3_8b.py index df4f05eec2ae..88fab6d6325a 100644 --- a/tests/collections/llm/recipes/test_llama3_8b.py +++ b/tests/collections/llm/recipes/test_llama3_8b.py @@ -90,10 +90,8 @@ def test_pretrain_recipe_with_different_configurations(self, recipe_module, num_ assert recipe.trainer.num_nodes == num_nodes assert recipe.trainer.devices == num_gpus_per_node - def test_pretrain_recipe_performance(self, recipe_module): - recipe = recipe_module.pretrain_recipe_performance( - name="test_perf", dir="/tmp", num_nodes=1, num_gpus_per_node=8 - ) + def test_pretrain_performance_optimizations(self, recipe_module): + recipe = recipe_module.pretrain_recipe(performance_mode=True) assert any(cb.__fn_or_cls__.__name__ == "MegatronCommOverlapCallback" for cb in recipe.trainer.callbacks) def test_trainer_parallelism_options(self, recipe_module): diff --git a/tests/collections/llm/recipes/test_llama3_8b_16k.py b/tests/collections/llm/recipes/test_llama3_8b_16k.py index d7f3bd40ecb7..fe75f01236ab 100644 --- a/tests/collections/llm/recipes/test_llama3_8b_16k.py +++ b/tests/collections/llm/recipes/test_llama3_8b_16k.py @@ -29,15 +29,15 @@ def test_trainer(self, recipe_module): assert trainer_config.__fn_or_cls__ == Trainer assert trainer_config.accelerator == "gpu" assert trainer_config.devices == 8 - assert trainer_config.num_nodes == 1 + assert trainer_config.num_nodes == 2 # Check strategy configuration assert isinstance(trainer_config.strategy, run.Config) assert trainer_config.strategy.__fn_or_cls__.__name__ == "MegatronStrategy" - assert trainer_config.strategy.tensor_model_parallel_size == 2 - assert 
trainer_config.strategy.pipeline_model_parallel_size == 4 + assert trainer_config.strategy.tensor_model_parallel_size == 4 + assert trainer_config.strategy.pipeline_model_parallel_size == 2 assert trainer_config.strategy.pipeline_dtype == torch.bfloat16 - assert trainer_config.strategy.virtual_pipeline_model_parallel_size == 5 + assert trainer_config.strategy.virtual_pipeline_model_parallel_size is None assert trainer_config.strategy.context_parallel_size == 2 assert trainer_config.strategy.sequence_parallel is True @@ -61,14 +61,37 @@ def test_pretrain_recipe_with_different_configurations(self, recipe_module, num_ assert recipe.trainer.num_nodes == num_nodes assert recipe.trainer.devices == num_gpus_per_node - def test_trainer_parallelism_options(self, recipe_module): + def test_valid_trainer_parallelism(self, recipe_module): trainer_config = recipe_module.trainer() - assert trainer_config.strategy.tensor_model_parallel_size == 2 - assert trainer_config.strategy.pipeline_model_parallel_size == 4 - assert trainer_config.strategy.pipeline_dtype == torch.bfloat16 - assert trainer_config.strategy.virtual_pipeline_model_parallel_size == 5 - assert trainer_config.strategy.context_parallel_size == 2 - assert trainer_config.strategy.sequence_parallel is True + + assert isinstance(trainer_config.strategy, run.Config) + assert trainer_config.strategy.__fn_or_cls__.__name__ == "MegatronStrategy" + + assert trainer_config.strategy.expert_model_parallel_size == 1 + + assert ( + trainer_config.strategy.tensor_model_parallel_size + * trainer_config.strategy.pipeline_model_parallel_size + * trainer_config.strategy.context_parallel_size + * trainer_config.strategy.expert_model_parallel_size + % trainer_config.devices + == 0 + ) + assert ( + trainer_config.strategy.tensor_model_parallel_size + * trainer_config.strategy.pipeline_model_parallel_size + * trainer_config.strategy.context_parallel_size + * trainer_config.strategy.expert_model_parallel_size + / trainer_config.devices + % trainer_config.num_nodes + == 0 + ) + + if trainer_config.strategy.pipeline_model_parallel_size != 1: + assert trainer_config.strategy.pipeline_dtype is not None + + if trainer_config.strategy.tensor_model_parallel_size == 1: + assert trainer_config.strategy.sequence_parallel is False def test_model_config_parameters(self, recipe_module): model_config = recipe_module.model() diff --git a/tests/collections/llm/recipes/test_llama3_8b_64k.py b/tests/collections/llm/recipes/test_llama3_8b_64k.py index f489e12dc55f..0316b736341a 100644 --- a/tests/collections/llm/recipes/test_llama3_8b_64k.py +++ b/tests/collections/llm/recipes/test_llama3_8b_64k.py @@ -29,15 +29,15 @@ def test_trainer(self, recipe_module): assert trainer_config.__fn_or_cls__ == Trainer assert trainer_config.accelerator == "gpu" assert trainer_config.devices == 8 - assert trainer_config.num_nodes == 1 + assert trainer_config.num_nodes == 4 # Check strategy configuration assert isinstance(trainer_config.strategy, run.Config) assert trainer_config.strategy.__fn_or_cls__.__name__ == "MegatronStrategy" - assert trainer_config.strategy.tensor_model_parallel_size == 2 - assert trainer_config.strategy.pipeline_model_parallel_size == 4 + assert trainer_config.strategy.tensor_model_parallel_size == 4 + assert trainer_config.strategy.pipeline_model_parallel_size == 2 assert trainer_config.strategy.pipeline_dtype == torch.bfloat16 - assert trainer_config.strategy.virtual_pipeline_model_parallel_size == 5 + assert trainer_config.strategy.virtual_pipeline_model_parallel_size is 
None assert trainer_config.strategy.context_parallel_size == 4 assert trainer_config.strategy.sequence_parallel is True @@ -61,14 +61,37 @@ def test_pretrain_recipe_with_different_configurations(self, recipe_module, num_ assert recipe.trainer.num_nodes == num_nodes assert recipe.trainer.devices == num_gpus_per_node - def test_trainer_parallelism_options(self, recipe_module): + def test_valid_trainer_parallelism(self, recipe_module): trainer_config = recipe_module.trainer() - assert trainer_config.strategy.tensor_model_parallel_size == 2 - assert trainer_config.strategy.pipeline_model_parallel_size == 4 - assert trainer_config.strategy.pipeline_dtype == torch.bfloat16 - assert trainer_config.strategy.virtual_pipeline_model_parallel_size == 5 - assert trainer_config.strategy.context_parallel_size == 4 - assert trainer_config.strategy.sequence_parallel is True + + assert isinstance(trainer_config.strategy, run.Config) + assert trainer_config.strategy.__fn_or_cls__.__name__ == "MegatronStrategy" + + assert trainer_config.strategy.expert_model_parallel_size == 1 + + assert ( + trainer_config.strategy.tensor_model_parallel_size + * trainer_config.strategy.pipeline_model_parallel_size + * trainer_config.strategy.context_parallel_size + * trainer_config.strategy.expert_model_parallel_size + % trainer_config.devices + == 0 + ) + assert ( + trainer_config.strategy.tensor_model_parallel_size + * trainer_config.strategy.pipeline_model_parallel_size + * trainer_config.strategy.context_parallel_size + * trainer_config.strategy.expert_model_parallel_size + / trainer_config.devices + % trainer_config.num_nodes + == 0 + ) + + if trainer_config.strategy.pipeline_model_parallel_size != 1: + assert trainer_config.strategy.pipeline_dtype is not None + + if trainer_config.strategy.tensor_model_parallel_size == 1: + assert trainer_config.strategy.sequence_parallel is False def test_model_config_parameters(self, recipe_module): model_config = recipe_module.model() diff --git a/tests/collections/llm/recipes/test_mistral.py b/tests/collections/llm/recipes/test_mistral.py index 490f26a363fc..a7d83edcc370 100644 --- a/tests/collections/llm/recipes/test_mistral.py +++ b/tests/collections/llm/recipes/test_mistral.py @@ -6,7 +6,7 @@ from nemo.collections.llm.gpt.data.squad import SquadDataModule from nemo.collections.llm.gpt.model.mistral import MistralConfig7B, MistralModel from nemo.collections.llm.peft.lora import LoRA -from nemo.collections.llm.recipes import mistral +from nemo.collections.llm.recipes import mistral_7b as mistral from nemo.lightning import AutoResume, Trainer diff --git a/tests/collections/llm/recipes/test_mixtral_8x7b_16k.py b/tests/collections/llm/recipes/test_mixtral_8x7b_16k.py index 9f52b7117e82..62d6e0e31917 100644 --- a/tests/collections/llm/recipes/test_mixtral_8x7b_16k.py +++ b/tests/collections/llm/recipes/test_mixtral_8x7b_16k.py @@ -31,15 +31,15 @@ def test_trainer(self, recipe_module): assert trainer_config.__fn_or_cls__ == Trainer assert trainer_config.accelerator == "gpu" assert trainer_config.devices == 8 - assert trainer_config.num_nodes == 2 + assert trainer_config.num_nodes == 4 # Check strategy configuration assert isinstance(trainer_config.strategy, run.Config) assert trainer_config.strategy.__fn_or_cls__.__name__ == "MegatronStrategy" - assert trainer_config.strategy.tensor_model_parallel_size == 2 - assert trainer_config.strategy.pipeline_model_parallel_size == 4 + assert trainer_config.strategy.tensor_model_parallel_size == 4 + assert 
trainer_config.strategy.pipeline_model_parallel_size == 2 assert trainer_config.strategy.pipeline_dtype == torch.bfloat16 - assert trainer_config.strategy.virtual_pipeline_model_parallel_size == 8 + assert trainer_config.strategy.virtual_pipeline_model_parallel_size is None assert trainer_config.strategy.context_parallel_size == 4 assert trainer_config.strategy.sequence_parallel is True assert trainer_config.strategy.expert_model_parallel_size == 1 @@ -69,15 +69,35 @@ def test_pretrain_recipe_with_different_configurations(self, recipe_module, num_ assert recipe.trainer.num_nodes == num_nodes assert recipe.trainer.devices == num_gpus_per_node - def test_trainer_parallelism_options(self, recipe_module): + def test_valid_trainer_parallelism(self, recipe_module): trainer_config = recipe_module.trainer() - assert trainer_config.strategy.tensor_model_parallel_size == 2 - assert trainer_config.strategy.pipeline_model_parallel_size == 4 - assert trainer_config.strategy.pipeline_dtype == torch.bfloat16 - assert trainer_config.strategy.virtual_pipeline_model_parallel_size == 8 - assert trainer_config.strategy.context_parallel_size == 4 - assert trainer_config.strategy.sequence_parallel is True - assert trainer_config.strategy.expert_model_parallel_size == 1 + + assert isinstance(trainer_config.strategy, run.Config) + assert trainer_config.strategy.__fn_or_cls__.__name__ == "MegatronStrategy" + + assert ( + trainer_config.strategy.tensor_model_parallel_size + * trainer_config.strategy.pipeline_model_parallel_size + * trainer_config.strategy.context_parallel_size + * trainer_config.strategy.expert_model_parallel_size + % trainer_config.devices + == 0 + ) + assert ( + trainer_config.strategy.tensor_model_parallel_size + * trainer_config.strategy.pipeline_model_parallel_size + * trainer_config.strategy.context_parallel_size + * trainer_config.strategy.expert_model_parallel_size + / trainer_config.devices + % trainer_config.num_nodes + == 0 + ) + + if trainer_config.strategy.pipeline_model_parallel_size != 1: + assert trainer_config.strategy.pipeline_dtype is not None + + if trainer_config.strategy.tensor_model_parallel_size == 1: + assert trainer_config.strategy.sequence_parallel is False def test_model_config_parameters(self, recipe_module): model_config = recipe_module.model() diff --git a/tests/collections/llm/recipes/test_mixtral_8x7b_64k.py b/tests/collections/llm/recipes/test_mixtral_8x7b_64k.py index f508e6dfd585..9ff93a89f438 100644 --- a/tests/collections/llm/recipes/test_mixtral_8x7b_64k.py +++ b/tests/collections/llm/recipes/test_mixtral_8x7b_64k.py @@ -35,11 +35,11 @@ def test_trainer(self, recipe_module): # Check strategy configuration assert isinstance(trainer_config.strategy, run.Config) assert trainer_config.strategy.__fn_or_cls__.__name__ == "MegatronStrategy" - assert trainer_config.strategy.tensor_model_parallel_size == 4 + assert trainer_config.strategy.tensor_model_parallel_size == 8 assert trainer_config.strategy.pipeline_model_parallel_size == 4 assert trainer_config.strategy.pipeline_dtype == torch.bfloat16 - assert trainer_config.strategy.virtual_pipeline_model_parallel_size == 4 - assert trainer_config.strategy.context_parallel_size == 8 + assert trainer_config.strategy.virtual_pipeline_model_parallel_size is None + assert trainer_config.strategy.context_parallel_size == 4 assert trainer_config.strategy.sequence_parallel is True assert trainer_config.strategy.expert_model_parallel_size == 1 @@ -63,15 +63,35 @@ def test_pretrain_recipe_with_different_configurations(self, 
recipe_module, num_ assert recipe.trainer.num_nodes == num_nodes assert recipe.trainer.devices == num_gpus_per_node - def test_trainer_parallelism_options(self, recipe_module): + def test_valid_trainer_parallelism(self, recipe_module): trainer_config = recipe_module.trainer() - assert trainer_config.strategy.tensor_model_parallel_size == 4 - assert trainer_config.strategy.pipeline_model_parallel_size == 4 - assert trainer_config.strategy.pipeline_dtype == torch.bfloat16 - assert trainer_config.strategy.virtual_pipeline_model_parallel_size == 4 - assert trainer_config.strategy.context_parallel_size == 8 - assert trainer_config.strategy.sequence_parallel is True - assert trainer_config.strategy.expert_model_parallel_size == 1 + + assert isinstance(trainer_config.strategy, run.Config) + assert trainer_config.strategy.__fn_or_cls__.__name__ == "MegatronStrategy" + + assert ( + trainer_config.strategy.tensor_model_parallel_size + * trainer_config.strategy.pipeline_model_parallel_size + * trainer_config.strategy.context_parallel_size + * trainer_config.strategy.expert_model_parallel_size + % trainer_config.devices + == 0 + ) + assert ( + trainer_config.strategy.tensor_model_parallel_size + * trainer_config.strategy.pipeline_model_parallel_size + * trainer_config.strategy.context_parallel_size + * trainer_config.strategy.expert_model_parallel_size + / trainer_config.devices + % trainer_config.num_nodes + == 0 + ) + + if trainer_config.strategy.pipeline_model_parallel_size != 1: + assert trainer_config.strategy.pipeline_dtype is not None + + if trainer_config.strategy.tensor_model_parallel_size == 1: + assert trainer_config.strategy.sequence_parallel is False def test_model_config_parameters(self, recipe_module): model_config = recipe_module.model() diff --git a/tests/collections/llm/recipes/test_nemotron4_15b_16k.py b/tests/collections/llm/recipes/test_nemotron4_15b_16k.py index e0b4e1f56eb8..6c1f5d90e160 100644 --- a/tests/collections/llm/recipes/test_nemotron4_15b_16k.py +++ b/tests/collections/llm/recipes/test_nemotron4_15b_16k.py @@ -47,3 +47,35 @@ def test_pretrain_recipe_with_different_configurations(self, recipe_module, num_ recipe = recipe_module.pretrain_recipe(num_nodes=num_nodes, num_gpus_per_node=num_gpus_per_node) assert recipe.trainer.num_nodes == num_nodes assert recipe.trainer.devices == num_gpus_per_node + + def test_valid_trainer_parallelism(self, recipe_module): + trainer_config = recipe_module.pretrain_recipe().trainer + + assert isinstance(trainer_config.strategy, run.Config) + assert trainer_config.strategy.__fn_or_cls__.__name__ == "MegatronStrategy" + + assert trainer_config.strategy.expert_model_parallel_size == 1 + + assert ( + trainer_config.strategy.tensor_model_parallel_size + * trainer_config.strategy.pipeline_model_parallel_size + * trainer_config.strategy.context_parallel_size + * trainer_config.strategy.expert_model_parallel_size + % trainer_config.devices + == 0 + ) + assert ( + trainer_config.strategy.tensor_model_parallel_size + * trainer_config.strategy.pipeline_model_parallel_size + * trainer_config.strategy.context_parallel_size + * trainer_config.strategy.expert_model_parallel_size + / trainer_config.devices + % trainer_config.num_nodes + == 0 + ) + + if trainer_config.strategy.pipeline_model_parallel_size != 1: + assert trainer_config.strategy.pipeline_dtype is not None + + if trainer_config.strategy.tensor_model_parallel_size == 1: + assert trainer_config.strategy.sequence_parallel is False diff --git 
a/tests/collections/llm/recipes/test_nemotron4_15b_64k.py b/tests/collections/llm/recipes/test_nemotron4_15b_64k.py index 9525039eb90e..8ed35fb81893 100644 --- a/tests/collections/llm/recipes/test_nemotron4_15b_64k.py +++ b/tests/collections/llm/recipes/test_nemotron4_15b_64k.py @@ -47,3 +47,35 @@ def test_pretrain_recipe_with_different_configurations(self, recipe_module, num_ recipe = recipe_module.pretrain_recipe(num_nodes=num_nodes, num_gpus_per_node=num_gpus_per_node) assert recipe.trainer.num_nodes == num_nodes assert recipe.trainer.devices == num_gpus_per_node + + def test_valid_trainer_parallelism(self, recipe_module): + trainer_config = recipe_module.pretrain_recipe().trainer + + assert isinstance(trainer_config.strategy, run.Config) + assert trainer_config.strategy.__fn_or_cls__.__name__ == "MegatronStrategy" + + assert trainer_config.strategy.expert_model_parallel_size == 1 + + assert ( + trainer_config.strategy.tensor_model_parallel_size + * trainer_config.strategy.pipeline_model_parallel_size + * trainer_config.strategy.context_parallel_size + * trainer_config.strategy.expert_model_parallel_size + % trainer_config.devices + == 0 + ) + assert ( + trainer_config.strategy.tensor_model_parallel_size + * trainer_config.strategy.pipeline_model_parallel_size + * trainer_config.strategy.context_parallel_size + * trainer_config.strategy.expert_model_parallel_size + / trainer_config.devices + % trainer_config.num_nodes + == 0 + ) + + if trainer_config.strategy.pipeline_model_parallel_size != 1: + assert trainer_config.strategy.pipeline_dtype is not None + + if trainer_config.strategy.tensor_model_parallel_size == 1: + assert trainer_config.strategy.sequence_parallel is False diff --git a/tests/collections/llm/recipes/test_nemotron4_22b_16k.py b/tests/collections/llm/recipes/test_nemotron4_22b_16k.py index 1e501b447d45..6b4a581348e0 100644 --- a/tests/collections/llm/recipes/test_nemotron4_22b_16k.py +++ b/tests/collections/llm/recipes/test_nemotron4_22b_16k.py @@ -47,3 +47,35 @@ def test_pretrain_recipe_with_different_configurations(self, recipe_module, num_ recipe = recipe_module.pretrain_recipe(num_nodes=num_nodes, num_gpus_per_node=num_gpus_per_node) assert recipe.trainer.num_nodes == num_nodes assert recipe.trainer.devices == num_gpus_per_node + + def test_valid_trainer_parallelism(self, recipe_module): + trainer_config = recipe_module.pretrain_recipe().trainer + + assert isinstance(trainer_config.strategy, run.Config) + assert trainer_config.strategy.__fn_or_cls__.__name__ == "MegatronStrategy" + + assert trainer_config.strategy.expert_model_parallel_size == 1 + + assert ( + trainer_config.strategy.tensor_model_parallel_size + * trainer_config.strategy.pipeline_model_parallel_size + * trainer_config.strategy.context_parallel_size + * trainer_config.strategy.expert_model_parallel_size + % trainer_config.devices + == 0 + ) + assert ( + trainer_config.strategy.tensor_model_parallel_size + * trainer_config.strategy.pipeline_model_parallel_size + * trainer_config.strategy.context_parallel_size + * trainer_config.strategy.expert_model_parallel_size + / trainer_config.devices + % trainer_config.num_nodes + == 0 + ) + + if trainer_config.strategy.pipeline_model_parallel_size != 1: + assert trainer_config.strategy.pipeline_dtype is not None + + if trainer_config.strategy.tensor_model_parallel_size == 1: + assert trainer_config.strategy.sequence_parallel is False diff --git a/tests/collections/llm/recipes/test_nemotron4_22b_64k.py b/tests/collections/llm/recipes/test_nemotron4_22b_64k.py 
index c37a45793aff..68a238a93338 100644 --- a/tests/collections/llm/recipes/test_nemotron4_22b_64k.py +++ b/tests/collections/llm/recipes/test_nemotron4_22b_64k.py @@ -47,3 +47,35 @@ def test_pretrain_recipe_with_different_configurations(self, recipe_module, num_ recipe = recipe_module.pretrain_recipe(num_nodes=num_nodes, num_gpus_per_node=num_gpus_per_node) assert recipe.trainer.num_nodes == num_nodes assert recipe.trainer.devices == num_gpus_per_node + + def test_valid_trainer_parallelism(self, recipe_module): + trainer_config = recipe_module.pretrain_recipe().trainer + + assert isinstance(trainer_config.strategy, run.Config) + assert trainer_config.strategy.__fn_or_cls__.__name__ == "MegatronStrategy" + + assert trainer_config.strategy.expert_model_parallel_size == 1 + + assert ( + trainer_config.strategy.tensor_model_parallel_size + * trainer_config.strategy.pipeline_model_parallel_size + * trainer_config.strategy.context_parallel_size + * trainer_config.strategy.expert_model_parallel_size + % trainer_config.devices + == 0 + ) + assert ( + trainer_config.strategy.tensor_model_parallel_size + * trainer_config.strategy.pipeline_model_parallel_size + * trainer_config.strategy.context_parallel_size + * trainer_config.strategy.expert_model_parallel_size + / trainer_config.devices + % trainer_config.num_nodes + == 0 + ) + + if trainer_config.strategy.pipeline_model_parallel_size != 1: + assert trainer_config.strategy.pipeline_dtype is not None + + if trainer_config.strategy.tensor_model_parallel_size == 1: + assert trainer_config.strategy.sequence_parallel is False diff --git a/tests/lightning/_io/artifacts/model.yaml b/tests/lightning/_io/artifacts/model.yaml new file mode 100644 index 000000000000..2dff485f9be1 --- /dev/null +++ b/tests/lightning/_io/artifacts/model.yaml @@ -0,0 +1,200 @@ +_target_: nemo.collections.llm.gpt.model.base.GPTModel +config: + _cpu_offloading_context: null + _target_: nemo.collections.llm.gpt.model.base.GPTConfig + activation_func: + _call_: false + _target_: torch._C._nn.gelu + activation_func_fp8_input_store: false + add_bias_linear: true + add_qkv_bias: false + apply_query_key_layer_scaling: false + apply_residual_connection_post_layernorm: false + apply_rope_fusion: false + async_tensor_model_parallel_allreduce: false + attention_dropout: 0.1 + attention_softmax_in_fp32: false + autocast_dtype: null + barrier_with_L1_time: true + batch_p2p_comm: true + batch_p2p_sync: true + bf16: false + bias_activation_fusion: false + bias_dropout_fusion: false + calculate_per_token_loss: false + clone_scatter_output_in_embedding: true + config_logger_dir: '' + context_parallel_size: 1 + cpu_offloading: false + cpu_offloading_activations: true + cpu_offloading_num_layers: 0 + cpu_offloading_weights: true + cross_entropy_loss_fusion: true + data_step_fn: + _call_: false + _target_: nemo.collections.llm.gpt.model.base.gpt_data_step + deallocate_pipeline_outputs: false + defer_embedding_wgrad_compute: false + deterministic_mode: false + disable_parameter_transpose_cache: false + distribute_saved_activations: null + enable_autocast: false + enable_cuda_graph: false + expert_model_parallel_size: 1 + external_cuda_graph: false + ffn_hidden_size: 4096 + finalize_model_grads_func: null + first_pipeline_num_layers: null + forward_step_fn: + _call_: false + _target_: nemo.collections.llm.gpt.model.base.gpt_forward_step + fp16: false + fp16_lm_cross_entropy: false + fp32_residual_connection: false + fp8: null + fp8_amax_compute_algo: most_recent + fp8_amax_history_len: 1 + 
fp8_dot_product_attention: false + fp8_interval: 1 + fp8_margin: 0 + fp8_multi_head_attention: false + fp8_wgrad: true + gated_linear_unit: false + grad_scale_func: null + grad_sync_func: null + gradient_accumulation_fusion: true + hidden_dropout: 0.1 + hidden_size: 1024 + init_method: null + init_method_std: 0.02 + kv_channels: null + last_pipeline_num_layers: null + layernorm_epsilon: 1.0e-05 + layernorm_zero_centered_gamma: false + make_vocab_size_divisible_by: 128 + masked_softmax_fusion: true + memory_efficient_layer_norm: false + moe_aux_loss_coeff: 0 + moe_expert_capacity_factor: null + moe_extended_tp: false + moe_grouped_gemm: false + moe_input_jitter_eps: null + moe_layer_recompute: false + moe_pad_expert_input_to_capacity: false + moe_per_layer_logging: false + moe_router_load_balancing_type: aux_loss + moe_router_pre_softmax: false + moe_router_topk: 2 + moe_shared_expert_intermediate_size: null + moe_shared_expert_overlap: false + moe_token_dispatcher_type: allgather + moe_token_drop_policy: probs + moe_token_dropping: false + moe_z_loss_coeff: null + no_sync_func: null + normalization: LayerNorm + num_attention_heads: 8 + num_layers: 2 + num_microbatches_with_partial_activation_checkpoints: null + num_moe_experts: null + num_query_groups: null + output_layer_init_method: null + overlap_p2p_comm: false + parallel_output: true + param_sync_func: null + params_dtype: + _call_: false + _target_: torch.float32 + perform_initialization: true + persist_layer_norm: false + pipeline_dtype: null + pipeline_model_parallel_size: 1 + pipeline_model_parallel_split_rank: null + position_embedding_type: learned_absolute + qk_layernorm: false + recompute_granularity: null + recompute_method: null + recompute_num_layers: null + rotary_base: 10000 + rotary_interleaved: false + rotary_percent: 1.0 + seq_len_interpolation_factor: null + seq_length: 1024 + sequence_parallel: false + share_embeddings_and_output_weights: true + tensor_model_parallel_size: 1 + test_mode: false + timers: null + tp_comm_atomic_ag: false + tp_comm_atomic_rs: false + tp_comm_bulk_dgrad: true + tp_comm_bulk_wgrad: true + tp_comm_overlap: false + tp_comm_overlap_ag: true + tp_comm_overlap_disable_fc1: false + tp_comm_overlap_disable_qkv: false + tp_comm_overlap_rs: true + tp_comm_overlap_rs_dgrad: false + tp_comm_split_ag: true + tp_comm_split_rs: true + tp_only_amax_red: false + transformer_layer_spec: + _call_: false + _target_: nemo.collections.llm.gpt.model.base.default_layer_spec + use_cpu_initialization: false + use_ring_exchange_p2p: false + use_te_rng_tracker: false + variable_seq_lengths: false + virtual_pipeline_model_parallel_size: null + wgrad_deferral_limit: 0 + window_size: null +model_transform: null +optim: + _target_: nemo.lightning.pytorch.optim.megatron.MegatronOptimizerModule + config: + _target_: megatron.core.optimizer.optimizer_config.OptimizerConfig + adam_beta1: 0.9 + adam_beta2: 0.999 + adam_eps: 1.0e-08 + barrier_with_L1_time: false + bf16: false + clip_grad: 1.0 + config_logger_dir: '' + decoupled_lr: null + decoupled_min_lr: null + fp16: false + hysteresis: 2 + initial_loss_scale: 4294967296 + log_num_zeros_in_grad: false + loss_scale: null + loss_scale_window: 1000 + lr: 0.0001 + min_loss_scale: 1.0 + min_lr: null + optimizer: adam + overlap_param_gather_with_optimizer_step: false + params_dtype: + _call_: false + _target_: torch.float32 + sgd_momentum: 0.9 + timers: null + use_distributed_optimizer: true + weight_decay: 0.01 + lr_mult: 1.0 + lr_scheduler: null + no_weight_decay_cond: null + 
scale_lr_cond: null +tokenizer: + _target_: nemo.collections.common.tokenizers.huggingface.auto_tokenizer.AutoTokenizer + bos_token: null + cls_token: null + eos_token: null + mask_token: null + merges_file: megatron-gpt-345m_merges + pad_token: null + pretrained_model_name: gpt2 + sep_token: null + trust_remote_code: false + unk_token: null + use_fast: false + vocab_file: megatron-gpt-345m_vocab diff --git a/tests/lightning/_io/test_api.py b/tests/lightning/_io/test_api.py index 83f77390ec6e..386bd5b5fdab 100644 --- a/tests/lightning/_io/test_api.py +++ b/tests/lightning/_io/test_api.py @@ -12,9 +12,12 @@ # See the License for the specific language governing permissions and # limitations under the License. +import os from functools import partial +from pathlib import Path import pytest +import yaml from pytorch_lightning.loggers import TensorBoardLogger from nemo import lightning as nl @@ -24,6 +27,7 @@ from nemo.utils.import_utils import safe_import te, HAVE_TE = safe_import("transformer_engine") +ARTIFACTS_DIR = os.path.join(os.path.dirname(os.path.realpath(__file__)), "artifacts") def dummy_extra(a, b, c=5): @@ -55,7 +59,7 @@ def test_reload_ckpt(self, tmpdir, partial_function_with_pos_and_key_args): ) ckpt = io.TrainerContext(model, trainer, extra={"dummy": partial_function_with_pos_and_key_args}) - ckpt.io_dump(tmpdir) + ckpt.io_dump(tmpdir, yaml_attrs=["model"]) loaded = io.load_context(tmpdir) assert loaded.model.config.seq_length == ckpt.model.config.seq_length @@ -64,3 +68,10 @@ def test_reload_ckpt(self, tmpdir, partial_function_with_pos_and_key_args): loaded_func = loaded.extra["dummy"] assert loaded_func(b=2) == partial_function_with_pos_and_key_args(b=2) + + model_yaml = Path(tmpdir) / "model.yaml" + assert model_yaml.exists() + + observed = yaml.safe_load(model_yaml.read_text()) + expected = yaml.safe_load((Path(ARTIFACTS_DIR) / "model.yaml").read_text()) + assert observed.keys() == expected.keys() diff --git a/tests/lightning/test_nemo_run.py b/tests/lightning/test_nemo_run.py index 8d7814bfe530..947930c84847 100644 --- a/tests/lightning/test_nemo_run.py +++ b/tests/lightning/test_nemo_run.py @@ -17,8 +17,8 @@ ("llama3_70b_16k", "pretrain_recipe", "llama3_70b_16k_pretrain"), ("llama3_70b_64k", "pretrain_recipe", "llama3_70b_64k_pretrain"), ("llama31_405b", "pretrain_recipe", "llama31_405b_pretrain"), - ("mistral", "pretrain_recipe", "mistral_pretrain"), - ("mistral", "finetune_recipe", "mistral_finetune"), + ("mistral_7b", "pretrain_recipe", "mistral_pretrain"), + ("mistral_7b", "finetune_recipe", "mistral_finetune"), ("mixtral_8x7b", "pretrain_recipe", "mixtral_8x7b_pretrain"), ("mixtral_8x7b", "finetune_recipe", "mixtral_8x7b_finetune"), ("mixtral_8x7b_16k", "pretrain_recipe", "mixtral_8x7b_16k_pretrain"), diff --git a/tests/lightning/test_strategy_lib.py b/tests/lightning/test_strategy_lib.py index 36143cedb8c4..5a2a5a152185 100644 --- a/tests/lightning/test_strategy_lib.py +++ b/tests/lightning/test_strategy_lib.py @@ -41,7 +41,7 @@ def test_set_model_parallel_attributes() -> None: class DummyModel: def __init__(self): - self.config = TransformerConfig(hidden_size=128, num_attention_heads=2, num_layers=2) + self.config = TransformerConfig(hidden_size=128, num_attention_heads=2, num_layers=2, num_moe_experts=2) def configure_model(self): pass @@ -80,7 +80,7 @@ def test_init_parallel_ranks(mock_initialize_model_parallel) -> None: mock_parallel_config.pipeline_model_parallel_split_rank = None _strategy_lib.init_parallel_ranks( - world_size=2, + world_size=3, global_rank=1, 
local_rank=0, parallel_config=mock_parallel_config, @@ -88,7 +88,7 @@ def test_init_parallel_ranks(mock_initialize_model_parallel) -> None: fp8=False, ) mock_initialize_model_parallel.assert_called_once_with( - world_size=2, + world_size=3, global_rank=1, local_rank=0, tensor_model_parallel_size=2, diff --git a/tutorials/nlp/Joint_Intent_and_Slot_Classification.ipynb b/tutorials/nlp/Joint_Intent_and_Slot_Classification.ipynb deleted file mode 100644 index 608685254a0d..000000000000 --- a/tutorials/nlp/Joint_Intent_and_Slot_Classification.ipynb +++ /dev/null @@ -1,827 +0,0 @@ -{ - "cells": [ - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "pycharm": { - "name": "#%%\n" - } - }, - "outputs": [], - "source": [ - "\"\"\"\n", - "You can run either this notebook locally (if you have all the dependencies and a GPU) or on Google Colab.\n", - "\n", - "Instructions for setting up Colab are as follows:\n", - "1. Open a new Python 3 notebook.\n", - "2. Import this notebook from GitHub (File -> Upload Notebook -> \"GITHUB\" tab -> copy/paste GitHub URL)\n", - "3. Connect to an instance with a GPU (Runtime -> Change runtime type -> select \"GPU\" for hardware accelerator)\n", - "4. Run this cell to set up dependencies.\n", - "\"\"\"\n", - "# If you're using Google Colab and not running locally, run this cell\n", - "\n", - "# install NeMo\n", - "BRANCH = 'main'\n", - "!python -m pip install git+https://github.com/NVIDIA/NeMo.git@$BRANCH#egg=nemo_toolkit[nlp]" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "pycharm": { - "name": "#%%\n" - } - }, - "outputs": [], - "source": [ - "from nemo.collections import nlp as nemo_nlp\n", - "from nemo.utils.exp_manager import exp_manager\n", - "from nemo.utils import logging\n", - "\n", - "import os\n", - "import wget\n", - "import torch\n", - "import pytorch_lightning as pl\n", - "from omegaconf import OmegaConf" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "# Task Description\n", - "**Joint Intent and Slot classification** - is a task of classifying an Intent and detecting all relevant Slots (Entities)\n", - "for this Intent in a query.\n", - "For example, in the query: `What is the weather in Santa Clara tomorrow morning?`, we would like to classify the query\n", - "as a `weather` Intent, and detect `Santa Clara` as a `location` slot and `tomorrow morning` as a `date_time` slot.\n", - "Intents and Slots names are usually task specific and defined as labels in the training data.\n", - "This is a fundamental step that is executed in any task-driven Conversational Assistant.\n", - "\n", - "Our Bert based model implementation enables to train and then detect both of these tasks together.\n", - "\n", - "**Multi Label Joint Intent and Slot classification** - is very similar to the task above, but instead of only classifying a single Intent, the task can predict multiple different intents for each query. For example, for the query `Yes, please tell me the weather`, we might want the intents for this utterance to be `yes` and `weather`. You can skip to that tutorial [here](#multi-label)\n" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "# Dataset and NeMo data format\n", - "\n", - "In this tutorial we are going to use a virtual assistant interaction data set that can be downloaded from here: https://github.com/xliuhw/NLU-Evaluation-Data.\n", - "There are about 10K training and 1K testing queries which cover 64 various Intents and 55 Slots. 
\n", - "\n", - "To work with NeMo NLP classification model, this dataset should be first converted to the NeMo format, which requires next files:\n", - "- **dict.intents.csv** - list of all intent names in the data. One line per an intent name.\n", - "- **dict.slots.csv** - list of all slot names in the data. One line per a slot name. It is possible to use both: B- I- notations, for separating between first and intermediate tokens for multi token slots. Or just use one slot type for each token of multi token slot. Our recommendation is to use later one, since it is simpler and there is no visible degradation in performance.\n", - "- **train.tsv/test.tsv** - contain original queries, one per line, and intent number separated by tab. For example: `what alarms do i have set right now\t0`. Intent numbers are according to the intent line in the intent dictionary file (dict.intents.csv) starting from 0. First line of these files contains a header line: `sentence \\tab label`.\n", - "- **train_slot.tvs/test_slot.tsv** - contain one line per a query, where instead each token there is a number of the token from the slots dictionary file (dict.slots.csv), starting from 0. Last 'out-of scope' token is usually located in the last line of the dictionary. Example: `54 0 0 54 54 12 12` (numbers separated by space). No header line in these files.\n", - "\n", - "NeMo provides **import_dataset.py** converter for few reference datasets (Assistant / Atis / Snips) which converts them to the NeMo data format for the Intent and Slot classification model. If you have your own annotated dataset in a different format, you will need to write a data converter. Possible recommended format for your own annotation, is to have one text file per all examples of one intent. With one line per query in a form like: `did i set an alarm to [alarm_type : wake up] in the [timeofday : morning]`, using brackets to define slot names. This is very similar to the assistant format from this example and you can use its converter to NeMo format with small changes. \n", - "\n", - "You can run this utility as follows:\n", - "\n", - "**python examples/nlp/intent_slot_classification/data/import_datasets.py --dataset_name=assistant --source_data_dir=source_dir_name --target_data_dir=target_dir_name**\n" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "# Download, preprocess and explore the dataset\n", - "## Download the dataset and convert it to the NeMo format" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "# you can replace DATA_DIR and NEMO_DIR with your own locations\n", - "DATA_DIR = \".\"\n", - "NEMO_DIR = '.'\n", - "\n", - "# download the converter files from github for the purpose of this tutorial\n", - "wget.download(f'https://raw.githubusercontent.com/NVIDIA/NeMo/{BRANCH}/scripts/dataset_processing/nlp/intent_and_slot/import_datasets.py', NEMO_DIR)\n", - "wget.download(f'https://raw.githubusercontent.com/NVIDIA/NeMo/{BRANCH}/scripts/dataset_processing/nlp/intent_and_slot/assistant_utils.py', NEMO_DIR)" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "pycharm": { - "name": "#%%\n" - } - }, - "outputs": [], - "source": [ - "# download and unzip the example dataset from github\n", - "print('Downloading dataset...')\n", - "wget.download('https://github.com/xliuhw/NLU-Evaluation-Data/archive/master.zip', DATA_DIR)\n", - "! 
unzip {DATA_DIR}/NLU-Evaluation-Data-master.zip -d {DATA_DIR}" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "# convert the dataset to the NeMo format\n", - "!python {NEMO_DIR}/import_datasets.py --dataset_name=assistant --source_data_dir={DATA_DIR}/NLU-Evaluation-Data-master --target_data_dir={DATA_DIR}/nemo_format\n" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## Data exploration\n", - "You can see the dataset in both the original and NeMo's formats. We have here 65 different Intents and 55 Slots, which could be typical commands for virtual assistants. Out of scope slot has the name 'O' and is the last in the dictionary of Slots. And we can see examples of queries and also format of training intent and slot files. " - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "# list of queries divided by intent files in the original training dataset\n", - "! ls -l {DATA_DIR}/NLU-Evaluation-Data-master/dataset/trainset" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "# print all intents from the NeMo format intent dictionary\n", - "!echo 'Intents: ' $(wc -l < {DATA_DIR}/nemo_format/dict.intents.csv)\n", - "! cat {DATA_DIR}/nemo_format/dict.intents.csv" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "# print all slots from the NeMo format slot dictionary\n", - "!echo 'Slots: ' $(wc -l < {DATA_DIR}/nemo_format/dict.slots.csv)\n", - "! cat {DATA_DIR}/nemo_format/dict.slots.csv" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "# examples from the intent training file\n", - "! head -n 10 {DATA_DIR}/nemo_format/train.tsv" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "# examples from the slot training file\n", - "! head -n 10 {DATA_DIR}/nemo_format/train_slots.tsv" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "# Training model" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## Model configuration\n", - "\n", - "Our Joint Intent and Slot classification model is comprised of the pretrained [BERT](https://arxiv.org/pdf/1810.04805.pdf) model with an Intent and Slot Classification layer on top of it.\n", - "\n", - "All model and training parameters are defined in the **intent_slot_classification_config.yaml** config file. This file is located in the folder **examples/nlp/intent_slot_classification/conf/**. It contains 2 main sections:\n", - "- **model**: All arguments that are related to the Model - language model, token classifier, optimizer and schedulers, datasets and any other related information\n", - "\n", - "- **trainer**: Any argument to be passed to PyTorch Lightning\n", - "\n", - "We will download the config file from repository for the purpose of the tutorial. If you have a version of NeMo installed locally, you can use it from the above folder." 
- ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "# download the model config file from repository for the purpose of this example\n", - "wget.download(f'https://raw.githubusercontent.com/NVIDIA/NeMo/{BRANCH}/examples/nlp/intent_slot_classification/conf/intent_slot_classification_config.yaml', NEMO_DIR)\n", - "\n", - "# print content of the config file\n", - "config_file = \"intent_slot_classification_config.yaml\"\n", - "print(config_file)\n", - "config = OmegaConf.load(config_file)\n", - "print(OmegaConf.to_yaml(config))" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## Setting up Data within the config\n", - "\n", - "Among other things, the config file contains dictionaries called train_ds and validation_ds. These are configurations used to setup the Dataset and DataLoaders of the corresponding config.\n", - "\n", - "The converter utility creates both training and evaluation files in the same directory, so we need to specify `model.data_dir` parameter to this directory. Also notice that some config lines, including `model.data_dir`, have `???` in place of paths, this means that values for these fields are required to be specified by the user.\n", - "\n", - "`config.model.intent_loss_weight` parameter - is a balance of training loss between Intent and Slot losses, a number between 0 to 1. Its default value is 0.6 which gives slightly higher priority to the Intent loss and it empirically works quite well. You can experiment with this value if you like.\n", - "Also you can try to change `config.model.class_balancing` parameter to `weighted_loss` and see if you get better accuracy.\n", - "\n", - "Let's now add the data directory path to the config." - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "config.model.data_dir = f'{DATA_DIR}/nemo_format'" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## Building the PyTorch Lightning Trainer\n", - "\n", - "NeMo models are primarily PyTorch Lightning modules - and therefore are entirely compatible with the PyTorch Lightning ecosystem. `config.trainer.max_epochs` - param defines number of training epochs. Usually 50-100 epochs or less should be enough to train on your data. Let's instantiate the Trainer object." - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "# lets modify some trainer configs\n", - "# checks if we have GPU available and uses it\n", - "accelerator = 'gpu' if torch.cuda.is_available() else 'cpu'\n", - "config.trainer.devices = 1\n", - "config.trainer.accelerator = accelerator\n", - "\n", - "config.trainer.precision = 16 if torch.cuda.is_available() else 32\n", - "\n", - "# for mixed precision training, uncomment the line below (precision should be set to 16 and amp_level to O1):\n", - "# config.trainer.amp_level = O1\n", - "\n", - "# remove distributed training flags\n", - "config.trainer.strategy = 'auto'\n", - "\n", - "# setup a small number of epochs for demonstration purposes of this tutorial\n", - "config.trainer.max_epochs = 5\n", - "\n", - "trainer = pl.Trainer(**config.trainer)" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## Setting up a NeMo Experiment\n", - "\n", - "NeMo has an experiment manager that handles logging and checkpointing for us, so let's use it. Model check points during training will be saved in this directory. 
" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "exp_dir = exp_manager(trainer, config.get(\"exp_manager\", None))\n", - "# the exp_dir provides a path to the current experiment for easy access\n", - "print(str(exp_dir))" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## Initializing the model and Training\n", - "\n", - "Initial statistics of the dataset will be displayed at the beginning of the training and then Intent and Slot classification report will be displayed after each training epoch." - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "# initialize the model\n", - "model = nemo_nlp.models.IntentSlotClassificationModel(config.model, trainer=trainer)\n", - "\n", - "# train\n", - "trainer.fit(model)" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "After training for 5 epochs, which should take no more than few minutes, you can expect training precision for this data set to be around these numbers (the accuracy will gradually continue to improve for this dataset up to about 50 epochs of training): \n", - "```\n", - "Intents:\n", - " label precision recall f1 support \n", - " alarm_query (label_id: 0) 94.74 94.74 94.74 19\n", - " alarm_remove (label_id: 1) 100.00 100.00 100.00 11\n", - " alarm_set (label_id: 2) 85.71 94.74 90.00 19\n", - " audio_volume_down (label_id: 3) 0.00 0.00 0.00 8\n", - " audio_volume_mute (label_id: 4) 100.00 86.67 92.86 15\n", - " audio_volume_up (label_id: 5) 56.52 100.00 72.22 13\n", - " calendar_query (label_id: 6) 55.00 57.89 56.41 19\n", - " calendar_remove (label_id: 7) 88.89 84.21 86.49 19\n", - " calendar_set (label_id: 8) 81.25 68.42 74.29 19\n", - " cooking_recipe (label_id: 9) 86.36 100.00 92.68 19\n", - " datetime_convert (label_id: 10) 0.00 0.00 0.00 8\n", - " datetime_query (label_id: 11) 65.52 100.00 79.17 19\n", - " email_addcontact (label_id: 12) 100.00 12.50 22.22 8\n", - " email_query (label_id: 13) 83.33 78.95 81.08 19\n", - " email_querycontact (label_id: 14) 62.50 78.95 69.77 19\n", - " email_sendemail (label_id: 15) 70.83 89.47 79.07 19\n", - " general_affirm (label_id: 16) 95.00 100.00 97.44 19\n", - " general_commandstop (label_id: 17) 100.00 100.00 100.00 19\n", - " general_confirm (label_id: 18) 100.00 100.00 100.00 19\n", - " general_dontcare (label_id: 19) 100.00 100.00 100.00 19\n", - " general_explain (label_id: 20) 100.00 94.74 97.30 19\n", - " general_joke (label_id: 21) 100.00 100.00 100.00 12\n", - " general_negate (label_id: 22) 95.00 100.00 97.44 19\n", - " general_praise (label_id: 23) 100.00 94.74 97.30 19\n", - " general_quirky (label_id: 24) 40.00 10.53 16.67 19\n", - " general_repeat (label_id: 25) 100.00 100.00 100.00 19\n", - " iot_cleaning (label_id: 26) 84.21 100.00 91.43 16\n", - " iot_coffee (label_id: 27) 94.74 94.74 94.74 19\n", - " iot_hue_lightchange (label_id: 28) 94.44 89.47 91.89 19\n", - " iot_hue_lightdim (label_id: 29) 100.00 83.33 90.91 12\n", - " iot_hue_lightoff (label_id: 30) 89.47 89.47 89.47 19\n", - " iot_hue_lighton (label_id: 31) 0.00 0.00 0.00 3\n", - " iot_hue_lightup (label_id: 32) 81.25 92.86 86.67 14\n", - " iot_wemo_off (label_id: 33) 60.00 100.00 75.00 9\n", - " iot_wemo_on (label_id: 34) 100.00 14.29 25.00 7\n", - " lists_createoradd (label_id: 35) 78.95 78.95 78.95 19\n", - " lists_query (label_id: 36) 78.95 78.95 78.95 19\n", - " lists_remove (label_id: 37) 90.00 94.74 92.31 19\n", - " music_likeness 
(label_id: 38) 70.59 66.67 68.57 18\n", - " music_query (label_id: 39) 77.78 73.68 75.68 19\n", - " music_settings (label_id: 40) 0.00 0.00 0.00 7\n", - " news_query (label_id: 41) 77.78 73.68 75.68 19\n", - " play_audiobook (label_id: 42) 90.00 94.74 92.31 19\n", - " play_game (label_id: 43) 80.00 84.21 82.05 19\n", - " play_music (label_id: 44) 53.85 73.68 62.22 19\n", - " play_podcasts (label_id: 45) 89.47 89.47 89.47 19\n", - " play_radio (label_id: 46) 93.75 78.95 85.71 19\n", - " qa_currency (label_id: 47) 95.00 100.00 97.44 19\n", - " qa_definition (label_id: 48) 85.00 89.47 87.18 19\n", - " qa_factoid (label_id: 49) 45.16 73.68 56.00 19\n", - " qa_maths (label_id: 50) 100.00 100.00 100.00 14\n", - " qa_stock (label_id: 51) 95.00 100.00 97.44 19\n", - " recommendation_events (label_id: 52) 94.44 89.47 91.89 19\n", - " recommendation_locations (label_id: 53) 94.74 94.74 94.74 19\n", - " recommendation_movies (label_id: 54) 100.00 100.00 100.00 10\n", - " social_post (label_id: 55) 90.00 94.74 92.31 19\n", - " social_query (label_id: 56) 94.74 100.00 97.30 18\n", - " takeaway_order (label_id: 57) 93.75 78.95 85.71 19\n", - " takeaway_query (label_id: 58) 85.71 94.74 90.00 19\n", - " transport_query (label_id: 59) 83.33 78.95 81.08 19\n", - " transport_taxi (label_id: 60) 100.00 100.00 100.00 18\n", - " transport_ticket (label_id: 61) 89.47 89.47 89.47 19\n", - " transport_traffic (label_id: 62) 100.00 100.00 100.00 19\n", - " weather_query (label_id: 63) 100.00 89.47 94.44 19\n", - " -------------------\n", - " micro avg 85.04 85.04 85.04 1076\n", - " macro avg 81.13 80.81 79.36 1076\n", - " weighted avg 84.10 85.04 83.54 1076\n", - " \n", - "Slots:\n", - " label precision recall f1 support \n", - " alarm_type (label_id: 0) 0.00 0.00 0.00 0\n", - " app_name (label_id: 1) 0.00 0.00 0.00 6\n", - " artist_name (label_id: 2) 0.00 0.00 0.00 21\n", - " audiobook_author (label_id: 3) 0.00 0.00 0.00 1\n", - " audiobook_name (label_id: 4) 0.00 0.00 0.00 18\n", - " business_name (label_id: 5) 60.00 56.60 58.25 53\n", - " business_type (label_id: 6) 0.00 0.00 0.00 24\n", - " change_amount (label_id: 7) 0.00 0.00 0.00 25\n", - " coffee_type (label_id: 8) 0.00 0.00 0.00 4\n", - " color_type (label_id: 9) 0.00 0.00 0.00 12\n", - " cooking_type (label_id: 10) 0.00 0.00 0.00 0\n", - " currency_name (label_id: 11) 84.09 75.51 79.57 49\n", - " date (label_id: 12) 57.95 91.07 70.83 112\n", - " definition_word (label_id: 13) 0.00 0.00 0.00 20\n", - " device_type (label_id: 14) 74.55 51.25 60.74 80\n", - " drink_type (label_id: 15) 0.00 0.00 0.00 0\n", - " email_address (label_id: 16) 0.00 0.00 0.00 14\n", - " email_folder (label_id: 17) 0.00 0.00 0.00 1\n", - " event_name (label_id: 18) 100.00 13.24 23.38 68\n", - " food_type (label_id: 19) 51.72 69.77 59.41 43\n", - " game_name (label_id: 20) 60.00 14.29 23.08 21\n", - " game_type (label_id: 21) 0.00 0.00 0.00 0\n", - " general_frequency (label_id: 22) 0.00 0.00 0.00 9\n", - " house_place (label_id: 23) 93.33 42.42 58.33 33\n", - " ingredient (label_id: 24) 0.00 0.00 0.00 6\n", - " joke_type (label_id: 25) 0.00 0.00 0.00 4\n", - " list_name (label_id: 26) 0.00 0.00 0.00 21\n", - " meal_type (label_id: 27) 0.00 0.00 0.00 0\n", - " media_type (label_id: 28) 0.00 0.00 0.00 37\n", - " movie_name (label_id: 29) 0.00 0.00 0.00 0\n", - " movie_type (label_id: 30) 0.00 0.00 0.00 0\n", - " music_album (label_id: 31) 0.00 0.00 0.00 0\n", - " music_descriptor (label_id: 32) 0.00 0.00 0.00 3\n", - " music_genre (label_id: 33) 0.00 0.00 0.00 9\n", - " news_topic 
(label_id: 34) 0.00 0.00 0.00 17\n", - " order_type (label_id: 35) 0.00 0.00 0.00 17\n", - " person (label_id: 36) 44.86 92.31 60.38 52\n", - " personal_info (label_id: 37) 0.00 0.00 0.00 20\n", - " place_name (label_id: 38) 71.25 77.03 74.03 148\n", - " player_setting (label_id: 39) 0.00 0.00 0.00 1\n", - " playlist_name (label_id: 40) 0.00 0.00 0.00 1\n", - " podcast_descriptor (label_id: 41) 0.00 0.00 0.00 13\n", - " podcast_name (label_id: 42) 0.00 0.00 0.00 4\n", - " radio_name (label_id: 43) 66.67 10.53 18.18 38\n", - " relation (label_id: 44) 0.00 0.00 0.00 17\n", - " song_name (label_id: 45) 0.00 0.00 0.00 22\n", - " time (label_id: 46) 70.27 78.20 74.02 133\n", - " time_zone (label_id: 47) 0.00 0.00 0.00 9\n", - " timeofday (label_id: 48) 0.00 0.00 0.00 28\n", - " transport_agency (label_id: 49) 0.00 0.00 0.00 9\n", - " transport_descriptor (label_id: 50) 0.00 0.00 0.00 0\n", - " transport_name (label_id: 51) 0.00 0.00 0.00 4\n", - " transport_type (label_id: 52) 78.38 82.86 80.56 35\n", - " weather_descriptor (label_id: 53) 0.00 0.00 0.00 17\n", - " O (label_id: 54) 92.42 98.80 95.50 5920\n", - " -------------------\n", - " micro avg 89.10 89.10 89.10 7199\n", - " macro avg 21.86 18.56 18.18 7199\n", - " weighted avg 84.42 89.10 86.01 7199\n", - "```" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## Evaluation\n", - "To see how the model performs, we can evaluate the performance of the trained model on a test data file. Here we will reload the model from the `.nemo` file saved during training. By default, the `.nemo` file contains the final checkpoint. We will use the same trainer for testing." - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "# extract the path of the best checkpoint from the training, you may update it to any other saved checkpoint file\n", - "checkpoint_path = trainer.checkpoint_callback.best_model_path\n", - "\n", - "# load the model from this checkpoint\n", - "eval_model = nemo_nlp.models.IntentSlotClassificationModel.load_from_checkpoint(checkpoint_path=checkpoint_path)" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "# we will setup testing data reusing the same config (test section)\n", - "eval_model.setup_test_data(test_data_config=config.model.test_ds)\n", - "\n", - "# run the evaluation on the test dataset\n", - "trainer.test(model=eval_model, ckpt_path=None, verbose=False)" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## Inference from Examples\n", - "Next step to see how the trained model will classify Intents and Slots for given queries from this domain. 
To improve the predictions you may need to train the model for more than 5 epochs.\n" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "queries = [\n", - " 'set alarm for seven thirty am',\n", - " 'lower volume by fifty percent',\n", - " 'what is my schedule for tomorrow',\n", - "]\n", - "\n", - "pred_intents, pred_slots = eval_model.predict_from_examples(queries, config.model.test_ds)\n", - "\n", - "logging.info('The prediction results of some sample queries with the trained model:')\n", - "for query, intent, slots in zip(queries, pred_intents, pred_slots):\n", - " logging.info(f'Query : {query}')\n", - " logging.info(f'Predicted Intent: {intent}')\n", - " logging.info(f'Predicted Slots: {slots}')" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## Training Script\n", - "\n", - "If you have NeMo installed locally (eg. cloned from the Github), you can also train the model with the example script: `examples/nlp/intent_slot_classification/intent_slot_classification.py.`\n", - "This script contains an example on how to train, evaluate and perform inference with the IntentSlotClassificationModel.\n", - "\n", - "To run a training script, use:\n", - "\n", - "`cd examples/nlp/intent_slot_classification`\n", - "\n", - "`python intent_slot_classification.py model.data_dir=PATH_TO_DATA_DIR`\n", - "\n", - "By default, this script uses examples/nlp/intent_slot_classification/conf/intent_slot_classification_config.py config file, and you may update all the params inside of this config file or alternatively providing them in the command line." - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "\n", - "# Multi-Label Intent Classification\n", - "---\n", - "\n", - "As mentioned above, our multi-label model will be very similar the single intent classification model, with the added functionality of predicting multiple different intents for a single query. For example, the query `show all flights and fares from denver to san francisco` would have intents `atis_airfare` and `atis_flight`. 
From our list of intents found in `dict.intents.csv`, the model checks whether each individual intent is suitable for the given query.\n", - "\n", - "For this tutorial, we will be using the ATIS (Airline Travel Information System) dataset, converting it to a multi-label data format, and then using the new data to train our model.\n", - "\n" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## Download the dataset and convert it to the NeMo format" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "# download the converter files from github for the purpose of this tutorial\n", - "DATA_DIR = './multiatis'\n", - "NEMO_DIR = './atis'\n", - "\n", - "!mkdir {DATA_DIR}\n", - "!mkdir {NEMO_DIR}\n", - "\n", - "\n", - "files = [f'https://raw.githubusercontent.com/howl-anderson/ATIS_dataset/master/data/raw_data/ms-cntk-atis/atis.dict.intent.csv', \n", - " f'https://raw.githubusercontent.com/howl-anderson/ATIS_dataset/master/data/raw_data/ms-cntk-atis/atis.dict.slots.csv',\n", - " f'https://raw.githubusercontent.com/howl-anderson/ATIS_dataset/master/data/raw_data/ms-cntk-atis/atis.dict.vocab.csv',\n", - " f'https://raw.githubusercontent.com/howl-anderson/ATIS_dataset/master/data/raw_data/ms-cntk-atis/atis.test.intent.csv',\n", - " f'https://raw.githubusercontent.com/howl-anderson/ATIS_dataset/master/data/raw_data/ms-cntk-atis/atis.test.pkl', \n", - " f'https://raw.githubusercontent.com/howl-anderson/ATIS_dataset/master/data/raw_data/ms-cntk-atis/atis.test.query.csv',\n", - " f'https://raw.githubusercontent.com/howl-anderson/ATIS_dataset/master/data/raw_data/ms-cntk-atis/atis.test.slots.csv', \n", - " f'https://raw.githubusercontent.com/howl-anderson/ATIS_dataset/master/data/raw_data/ms-cntk-atis/atis.train.intent.csv',\n", - " f'https://raw.githubusercontent.com/howl-anderson/ATIS_dataset/master/data/raw_data/ms-cntk-atis/atis.train.pkl',\n", - " f'https://raw.githubusercontent.com/howl-anderson/ATIS_dataset/master/data/raw_data/ms-cntk-atis/atis.train.query.csv',\n", - " f'https://raw.githubusercontent.com/howl-anderson/ATIS_dataset/master/data/raw_data/ms-cntk-atis/atis.train.slots.csv']\n", - "\n", - " \n", - "for file in files:\n", - " wget.download(file, DATA_DIR)\n", - "\n", - "\n", - "# download the converter files from github for the purpose of this tutorial\n", - "wget.download(f'https://raw.githubusercontent.com/NVIDIA/NeMo/{BRANCH}/scripts/dataset_processing/nlp/intent_and_slot/import_datasets.py', NEMO_DIR)\n", - "wget.download(f'https://raw.githubusercontent.com/NVIDIA/NeMo/{BRANCH}/scripts/dataset_processing/nlp/intent_and_slot/assistant_utils.py', NEMO_DIR)\n", - "wget.download(f'https://raw.githubusercontent.com/NVIDIA/NeMo/{BRANCH}/scripts/dataset_processing/nlp/intent_and_slot/convert_datasets.py', NEMO_DIR)\n", - "\n", - "# Get original atis dataset\n", - "!python {NEMO_DIR}/import_datasets.py --dataset_name=atis --source_data_dir={DATA_DIR} --target_data_dir={DATA_DIR}/nemo_format\n", - "# Script will create new files at {DATA_DIR}/new_format\n", - "!mkdir {DATA_DIR}/new_format\n", - "!python {NEMO_DIR}/convert_datasets.py --source_data_dir={DATA_DIR}/nemo_format --target_data_dir={DATA_DIR}/new_format" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## Data Augmentation (Optional)\n", - "---\n", - "\n", - "In scenarios when we don't have many training examples with multiple intent labels, data augmentation can be very useful. 
This can be done by concatenating utterances together, and adding it to our training data. Some ways of concatenating include adding a period or \\\"and\\\" between the two utterances. A script has been provided below to help with augmentation, but it can be changed depending on your use case." - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "# download the data augmentation script\n", - "wget.download(f'https://raw.githubusercontent.com/NVIDIA/NeMo/{BRANCH}/scripts/dataset_processing/nlp/intent_and_slot/augment_training_data.py', NEMO_DIR)" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "The script augment_training_data.py allows for four command line arguments to be passed in: \n", - "\n", - "source_data_dir: directory that contains the original multi-label data
\n", - "target_data_dir: directory to store the new data directory
\n", - "num_mixed: number of new utterances to add to dataset per class pair (utterances with labels 1 and 2)
\n", - "link_string: string that is in between the two utterances (\".\", \"\", \"and\", \"with\")
" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "!python {NEMO_DIR}/augment_training_data.py --source_data_dir={DATA_DIR}/new_format --target_data_dir={DATA_DIR}/augmented_data --num_mixed=10" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## Training the Model" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "# download the model config file from repository for the purpose of this example\n", - "wget.download(f'https://raw.githubusercontent.com/NVIDIA/NeMo/{BRANCH}/examples/nlp/intent_slot_classification/conf/multi_label_intent_slot_classification_config.yaml', NEMO_DIR)\n", - "\n", - "# print content of the config file\n", - "config_file = f\"{NEMO_DIR}/multi_label_intent_slot_classification_config.yaml\"\n", - "print(config_file)\n", - "config = OmegaConf.load(config_file)\n", - "print(OmegaConf.to_yaml(config))" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "config.model.data_dir = f\"{DATA_DIR}/new_format\"\n", - "config.model.validation_ds.prefix = \"dev\"\n", - "config.model.test_ds.prefix = \"dev\"\n", - "config.model.class_balancing = \"weighted_loss\"\n", - "config.trainer.max_epochs = 5\n", - "run_name = \"test\"\n", - "\n", - "# checks if we have GPU available and uses it\n", - "accelerator = 'gpu' if torch.cuda.is_available() else 'cpu'\n", - "config.trainer.devices = 1\n", - "config.trainer.accelerator = accelerator\n", - "\n", - "# remove distributed training flags\n", - "config.trainer.strategy = 'auto'\n", - "\n", - "trainer = pl.Trainer(**config.trainer)\n", - "config.exp_manager.exp_dir = os.path.join(DATA_DIR, \"output/\" + run_name)\n", - "config.exp_manager.create_checkpoint_callback = True\n", - "\n", - "exp_dir = exp_manager(trainer, config.get(\"exp_manager\", None))\n", - "model = nemo_nlp.models.MultiLabelIntentSlotClassificationModel(config.model, trainer=trainer)\n", - "trainer.fit(model)" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## Evaluation" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "To see how the model performs, we can evaluate the performance of the trained model on a test data file. Here we will reload the model from the `.nemo` file saved during training. By default, the `.nemo` file contains the final checkpoint. We will use the same trainer for testing." - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "# specify checkpoint path with .nemo file\n", - "checkpoint_path = os.path.join(exp_dir, \"checkpoints\", \"MultiLabelIntentSlot.nemo\")\n", - "\n", - "# load the model from this checkpoint\n", - "eval_model = nemo_nlp.models.MultiLabelIntentSlotClassificationModel.restore_from(checkpoint_path)" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "### Optimizing Threshold\n", - "\n", - "As mentioned above, when classifying a given query such as `show all flights and fares from denver to san francisco`, our model checks whether each individual intent would be suitable. Before assigning the final labels for a query, the model assigns a probability an intent matches the query. For example, if our `dict.intents.csv` had 5 different intents, then the model could output for a given query \\[0.52, 0.38, 0.21, 0.67. 
0.80\\] where each value represents the probability that query matches that particular intent. \n", - "\n", - "We need to use these probabilities to generate final label predictions of 0 or 1 for each label. While we can use 0.5 as the probability threshold, it is usually the case that there is a better threshold to use depending on the metric we want to optimize. For this tutorial, we will be finding the threshold that gives us the best micro-F1 score on the validation set. After running the `optimize_threshold` method, the threshold attribute for our model will be updated." - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "eval_model.optimize_threshold(config.model.test_ds, 'dev')" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "eval_model.threshold" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "### Inference from Examples\n", - "Similar to the previous example we can run inference to see how the trained model will classify Intents and Slots for given queries from this domain. To improve the predictions you may need to train the model for more than 10 epochs.\n" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "queries = [\n", - " 'i would like to find a flight from charlotte to las vegas that makes a stop in st. louis',\n", - " 'on april first i need a ticket from tacoma to san jose departing before 7 am',\n", - " 'how much is the limousine service in boston',\n", - "]\n", - "\n", - "# We use the optimized threshold for predictions\n", - "pred_intents, pred_slots, pred_list = eval_model.predict_from_examples(queries, config.model.test_ds)\n", - "logging.info('The prediction results of some sample queries with the trained model:')\n", - " \n", - "for query, intent, slots in zip(queries, pred_intents, pred_slots):\n", - " logging.info(f'Query : {query}')\n", - " logging.info(f'Predicted Intents: {intent}')\n", - " logging.info(f'Predicted Slots: {slots}')" - ] - } - ], - "metadata": { - "kernelspec": { - "display_name": "Python 3 (ipykernel)", - "language": "python", - "name": "python3" - }, - "language_info": { - "codemirror_mode": { - "name": "ipython", - "version": 3 - }, - "file_extension": ".py", - "mimetype": "text/x-python", - "name": "python", - "nbconvert_exporter": "python", - "pygments_lexer": "ipython3", - "version": "3.8.12" - } - }, - "nbformat": 4, - "nbformat_minor": 1 -}
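The three `test_nemotron4_*` additions above introduce an identical `test_valid_trainer_parallelism` check. The invariant it encodes is that the product of tensor, pipeline, context, and expert model parallel sizes must tile evenly onto the configured devices and nodes, that pipeline parallelism requires an explicit `pipeline_dtype`, and that sequence parallelism is only enabled together with tensor parallelism. The sketch below restates those assertions as a standalone helper; `ParallelismSpec` and `check_parallelism` are illustrative names only (the real tests read the same fields from the recipe's `MegatronStrategy` config via `run.Config`), a minimal sketch rather than NeMo code.

```python
from dataclasses import dataclass
from typing import Optional


@dataclass
class ParallelismSpec:
    # Mirrors the MegatronStrategy fields the recipe tests inspect.
    tensor_model_parallel_size: int = 1
    pipeline_model_parallel_size: int = 1
    context_parallel_size: int = 1
    expert_model_parallel_size: int = 1
    pipeline_dtype: Optional[object] = None
    sequence_parallel: bool = False


def check_parallelism(spec: ParallelismSpec, devices: int, num_nodes: int) -> None:
    model_parallel = (
        spec.tensor_model_parallel_size
        * spec.pipeline_model_parallel_size
        * spec.context_parallel_size
        * spec.expert_model_parallel_size
    )
    # The model-parallel product must divide evenly over the GPUs of one node ...
    assert model_parallel % devices == 0
    # ... and the remaining factor must divide evenly over the nodes
    # (the modulo direction here mirrors the assertions in the tests verbatim).
    assert (model_parallel / devices) % num_nodes == 0
    # Pipeline parallelism requires an explicit pipeline dtype.
    if spec.pipeline_model_parallel_size != 1:
        assert spec.pipeline_dtype is not None
    # Sequence parallelism only makes sense with tensor parallelism enabled.
    if spec.tensor_model_parallel_size == 1:
        assert spec.sequence_parallel is False


# Example with made-up sizes: TP=2, PP=4, CP=2 on 2 nodes of 8 GPUs passes all checks.
check_parallelism(
    ParallelismSpec(
        tensor_model_parallel_size=2,
        pipeline_model_parallel_size=4,
        context_parallel_size=2,
        pipeline_dtype="bf16",  # placeholder; the recipes use a torch dtype here
    ),
    devices=8,
    num_nodes=2,
)
```

Separately, the "Optimizing Threshold" cell of the deleted multi-label tutorial describes picking the probability cutoff that maximizes micro-F1 on the validation set before per-intent probabilities are turned into 0/1 labels. The snippet below is a minimal sketch of that idea only, not NeMo's `optimize_threshold` implementation; it assumes `numpy` and `scikit-learn` are available, and the function name is hypothetical.

```python
import numpy as np
from sklearn.metrics import f1_score


def find_best_threshold(probs: np.ndarray, labels: np.ndarray) -> float:
    """Return the cutoff in (0, 1) that maximizes micro-F1 on a validation set.

    probs:  (num_examples, num_intents) predicted probabilities
    labels: (num_examples, num_intents) 0/1 ground-truth intent matrix
    """
    candidates = np.arange(0.05, 1.0, 0.05)
    scores = [
        f1_score(labels, (probs >= t).astype(int), average="micro") for t in candidates
    ]
    return float(candidates[int(np.argmax(scores))])
```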