From bc119d021e01f1e1b12b929ba686038af799accf Mon Sep 17 00:00:00 2001 From: Vladimir Bataev Date: Thu, 17 Oct 2024 21:22:40 +0400 Subject: [PATCH 01/37] Fix ASR tests (#10794) * Make tests required Signed-off-by: Vladimir Bataev * Debug torch.load issue Signed-off-by: Vladimir Bataev * Apply isort and black reformatting Signed-off-by: artbataev * Run only necessary tests Signed-off-by: Vladimir Bataev * Try fix loading Signed-off-by: Vladimir Bataev * Apply isort and black reformatting Signed-off-by: artbataev * Avoid caching fixture Signed-off-by: Vladimir Bataev * Try restore model several times Signed-off-by: Vladimir Bataev * Try customize temporary directory Signed-off-by: Vladimir Bataev * Apply isort and black reformatting Signed-off-by: artbataev * Reorder tests Signed-off-by: Vladimir Bataev * Disable one test Signed-off-by: Vladimir Bataev * Apply isort and black reformatting Signed-off-by: artbataev * Avoid xxlarge model Signed-off-by: Vladimir Bataev * Disable test Signed-off-by: Vladimir Bataev * Revert changes Signed-off-by: Vladimir Bataev * Apply isort and black reformatting Signed-off-by: artbataev * Magic fix Signed-off-by: Vladimir Bataev * Revert unnecessary changes Signed-off-by: Vladimir Bataev * Clean up Signed-off-by: Vladimir Bataev * Disable all jobs except L0 Signed-off-by: Vladimir Bataev * RNNT alignments - merge with unit tests Signed-off-by: Vladimir Bataev * Fix CUDA graph frame-looping decoder to handle non-CUDA inputs Signed-off-by: Vladimir Bataev * Fix config Signed-off-by: Vladimir Bataev * Log test results Signed-off-by: Vladimir Bataev * Apply isort and black reformatting Signed-off-by: artbataev * Use less audio files for tests Signed-off-by: Vladimir Bataev --------- Signed-off-by: Vladimir Bataev Signed-off-by: artbataev Co-authored-by: artbataev --- .github/workflows/cicd-main.yml | 23 ++++--------------- .../cuda_graph_rnnt_greedy_decoding.py | 7 ++++++ .../test_cuda_graph_rnnt_greedy_decoding.py | 4 ++-- ...ments_check.py => test_rnnt_alignments.py} | 12 ++++------ 4 files changed, 19 insertions(+), 27 deletions(-) rename tests/collections/asr/decoding/{rnnt_alignments_check.py => test_rnnt_alignments.py} (94%) diff --git a/.github/workflows/cicd-main.yml b/.github/workflows/cicd-main.yml index 800d91acb7ed..2021c7d93136 100644 --- a/.github/workflows/cicd-main.yml +++ b/.github/workflows/cicd-main.yml @@ -131,16 +131,16 @@ jobs: ### \'\' # L0: GPU unit tests - OPTIONAL_L0_Unit_Tests_GPU_ASR: + L0_Unit_Tests_GPU_ASR: needs: [cicd-test-container-setup] uses: ./.github/workflows/_test_template.yml - if: contains(fromJSON(needs.cicd-test-container-setup.outputs.test_to_run), 'OPTIONAL_L0_Unit_Tests_GPU_ASR') || needs.cicd-test-container-setup.outputs.all == 'true' + if: contains(fromJSON(needs.cicd-test-container-setup.outputs.test_to_run), 'L0_Unit_Tests_GPU_ASR') || needs.cicd-test-container-setup.outputs.all == 'true' with: RUNNER: self-hosted-azure TIMEOUT: 20 + # TODO: remove this hack SCRIPT: | - NEMO_NUMBA_MINVER=0.53 pytest tests/collections/asr -m "not pleasefixme" --with_downloads - IS_OPTIONAL: true + python -c "from nemo.collections.asr.models import ASRModel" && NEMO_NUMBA_MINVER=0.53 pytest tests/collections/asr -m "not pleasefixme" --with_downloads L0_Unit_Tests_GPU_Audio: needs: [cicd-test-container-setup] @@ -1212,18 +1212,6 @@ jobs: matmul_precision=medium AFTER_SCRIPT: | rm -rf preds.json - - - # L2: Transducer alignment - OPTIONAL_L2_Transducer_alignment_Running_pytest: - needs: [cicd-test-container-setup] - uses: 
./.github/workflows/_test_template.yml - if: contains(fromJSON(needs.cicd-test-container-setup.outputs.test_to_run), 'OPTIONAL_L2_Transducer_alignment_Running_pytest') || needs.cicd-test-container-setup.outputs.all == 'true' - with: - RUNNER: self-hosted-azure - SCRIPT: | - pytest tests/collections/asr/decoding/rnnt_alignments_check.py --durations=-1 --with_downloads - IS_OPTIONAL: true # L2: Segmentation Tool L2_Segmentation_Tool_Parallel_ctc_segmentation_test_L2_Eng_CitriNet_with_wav: @@ -5456,7 +5444,7 @@ jobs: - gpu-test - cicd-test-container-setup - #- OPTIONAL_L0_Unit_Tests_GPU_ASR + - L0_Unit_Tests_GPU_ASR - L0_Unit_Tests_GPU_Audio - L0_Unit_Tests_GPU_Common - L0_Unit_Tests_GPU_LLM @@ -5507,7 +5495,6 @@ jobs: - L2_ASR_Adapters_Linear_Adapters - L2_ASR_Adapters_RelPos_MHA_Adapters - L2_Speech_Transcription_Speech_to_Text_Transcribe - #- OPTIONAL_L2_Transducer_alignment_Running_pytest - L2_Segmentation_Tool_Parallel_ctc_segmentation_test_L2_Eng_CitriNet_with_wav - L2_Segmentation_Tool_Parallel_ctc_segmentation_test_L2_Ru_QN_with_mp3 - L2_G2P_Models_G2P_Conformer_training_evaluation_and_inference diff --git a/nemo/collections/asr/parts/submodules/cuda_graph_rnnt_greedy_decoding.py b/nemo/collections/asr/parts/submodules/cuda_graph_rnnt_greedy_decoding.py index aa49435ded16..fc501b3d00de 100644 --- a/nemo/collections/asr/parts/submodules/cuda_graph_rnnt_greedy_decoding.py +++ b/nemo/collections/asr/parts/submodules/cuda_graph_rnnt_greedy_decoding.py @@ -293,6 +293,13 @@ def __call__( device: torch.device, partial_hypotheses: Optional[List[rnnt_utils.Hypothesis]] = None, ): + if x.device.type != "cuda": + # If CUDA graphs are enabled and "frame-looping" algorithm is requested, current class + # is not suitable to handle non-CUDA inputs; thus we are passing them to original caller + return self.caller._greedy_decode_blank_as_pad_loop_frames( + x=x, out_len=out_len, device=device, partial_hypotheses=partial_hypotheses + ) + if partial_hypotheses is not None: raise NotImplementedError( "`partial_hypotheses` support is not available " diff --git a/tests/collections/asr/decoding/test_cuda_graph_rnnt_greedy_decoding.py b/tests/collections/asr/decoding/test_cuda_graph_rnnt_greedy_decoding.py index 31fe822573ce..4715f4826493 100644 --- a/tests/collections/asr/decoding/test_cuda_graph_rnnt_greedy_decoding.py +++ b/tests/collections/asr/decoding/test_cuda_graph_rnnt_greedy_decoding.py @@ -53,8 +53,8 @@ def stt_en_fastconformer_transducer_large(): 8, True, marks=pytest.mark.xfail( - reason="""Cannot instantiate the -body cuda graph of a conditional node with a persistent kernel (in this case, + reason="""Cannot instantiate the +body cuda graph of a conditional node with a persistent kernel (in this case, a persistent LSTM), which is triggered in cudnn by using a batch size of 8.""" ), ), diff --git a/tests/collections/asr/decoding/rnnt_alignments_check.py b/tests/collections/asr/decoding/test_rnnt_alignments.py similarity index 94% rename from tests/collections/asr/decoding/rnnt_alignments_check.py rename to tests/collections/asr/decoding/test_rnnt_alignments.py index ec0656cbce49..5c43af28b1d4 100644 --- a/tests/collections/asr/decoding/rnnt_alignments_check.py +++ b/tests/collections/asr/decoding/test_rnnt_alignments.py @@ -13,10 +13,6 @@ # limitations under the License. 
-# NOTE: the file name does not contain "test" on purpose to avoid executing -# these tests outside of the CI machines environment, where test data is -# stored - from pathlib import Path from typing import Union @@ -27,6 +23,7 @@ from nemo.collections.asr.models import EncDecRNNTBPEModel from nemo.collections.asr.parts.utils.manifest_utils import read_manifest, write_manifest +from nemo.collections.asr.parts.utils.rnnt_utils import Hypothesis from nemo.collections.asr.parts.utils.transcribe_utils import prepare_audio_data DEVICES = [] @@ -65,7 +62,7 @@ def get_rnnt_alignments( loop_labels: bool = True, use_cuda_graph_decoder=False, device="cuda", -): +) -> list[Hypothesis]: cfg = OmegaConf.structured(TranscriptionConfig()) cfg.rnnt_decoding.confidence_cfg.preserve_frame_confidence = True cfg.rnnt_decoding.preserve_alignments = True @@ -74,12 +71,13 @@ def get_rnnt_alignments( cfg.rnnt_decoding.greedy.loop_labels = loop_labels cfg.rnnt_decoding.greedy.use_cuda_graph_decoder = use_cuda_graph_decoder cfg.dataset_manifest = str(manifest_path) - filepaths = prepare_audio_data(cfg)[0][:10] # selecting 10 files only + filepaths = prepare_audio_data(cfg)[0][:8] # selecting 8 files only + # NB: 9th file has the same transcription but a bit different alignment for batched/non-batched decoding model = model.to(device) model.change_decoding_strategy(cfg.rnnt_decoding) - transcriptions = model.transcribe( + transcriptions: list[Hypothesis] = model.transcribe( audio=filepaths, batch_size=cfg.batch_size, num_workers=cfg.num_workers, From 1757ff9ed10272bf5ee7332d64fccd4bd9676f1b Mon Sep 17 00:00:00 2001 From: Shanmugam Ramasamy <111910568+shanmugamr1992@users.noreply.github.com> Date: Thu, 17 Oct 2024 12:02:52 -0700 Subject: [PATCH 02/37] Integrating mcore export (#10238) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit * Integrating mcore export * Integrating mcore export * Apply isort and black reformatting Signed-off-by: shanmugamr1992 * Move trt imports in nemo.collections.llm inside respective functions (#10234) Signed-off-by: Hemil Desai * Add tests for LazyNeMoIterator and fix case with metadata_only=True and offsets in manifest (#10198) * Add tests for LazyNeMoIterator and fix case with manifest_only=True and offsets in manifest Signed-off-by: Piotr Żelasko * Address code review Signed-off-by: Piotr Żelasko * fix tests Signed-off-by: Piotr Żelasko * fix tests Signed-off-by: Piotr Żelasko --------- Signed-off-by: Piotr Żelasko * [NeMo-UX] Fix a serialization bug that prevents users from moving checkpoints (#9939) * perfor serialization using relative paths to allow users to move checkpoints after they're saved Signed-off-by: ashors1 * Apply isort and black reformatting Signed-off-by: ashors1 * remove unused import Signed-off-by: ashors1 * fix artifact load Signed-off-by: ashors1 * fix path artifact Signed-off-by: ashors1 * remove unused import Signed-off-by: ashors1 --------- Signed-off-by: ashors1 Signed-off-by: ashors1 Co-authored-by: ashors1 * Add MemoryProfileCallback (#10166) * Add MemoryProfileCallback Signed-off-by: Shriya Palsamudram * Apply isort and black reformatting Signed-off-by: ShriyaPalsamudram * Remove reference cycles, save snapshot on specific ranks Signed-off-by: Shriya Palsamudram * Remove unnecessary imports Signed-off-by: Shriya Palsamudram * Apply isort and black reformatting Signed-off-by: ShriyaPalsamudram * Update docstring Signed-off-by: Shriya Palsamudram --------- Signed-off-by: Shriya Palsamudram Signed-off-by: 
ShriyaPalsamudram Signed-off-by: Shriya Rishab <69161273+ShriyaPalsamudram@users.noreply.github.com> Co-authored-by: ShriyaPalsamudram * Lower bound transformers to support nemotron (#10240) Signed-off-by: Dong Hyuk Chang Co-authored-by: Dong Hyuk Chang * [Audio] SSL Pretraining framework for flow-matching model for audio processing (#10052) Flow matching generative model with SSL pretraining framework Signed-off-by: Pin-Jui Ku Co-authored-by: Kuray107 * Revert torchrun fix for model import (#10251) Signed-off-by: Alexandros Koumparoulis * [NeMo-UX[ Move nemotron imports inline (#10255) * Move nemotron transformers + tokenizer imports inline to reduce number of required deps Signed-off-by: Marc Romeyn * Apply isort and black reformatting Signed-off-by: marcromeyn --------- Signed-off-by: Marc Romeyn Signed-off-by: marcromeyn Co-authored-by: marcromeyn * Wrap CPU model init with megatron_lazy_init_context (#10219) * Wrap CPU model init with megatron_lazy_init_context Signed-off-by: Alexandros Koumparoulis * Cleanup checkpoint-dir if saving fails Signed-off-by: Alexandros Koumparoulis * Apply isort and black reformatting Signed-off-by: akoumpa --------- Signed-off-by: Alexandros Koumparoulis Signed-off-by: akoumpa Co-authored-by: akoumpa * Bump `Dockerfile.ci` (2024-08-22) (#10227) * [🤠]: Howdy folks, let's bump `Dockerfile.ci` to 124bcff ! Signed-off-by: github-actions[bot] <41898282+github-actions[bot]@users.noreply.github.com> * fix bert flags Signed-off-by: Oliver Koenig --------- Signed-off-by: github-actions[bot] <41898282+github-actions[bot]@users.noreply.github.com> Signed-off-by: Oliver Koenig Co-authored-by: pablo-garay <7166088+pablo-garay@users.noreply.github.com> * salm export trtllm (#10245) Signed-off-by: slyne deng Co-authored-by: slyne deng * [🤠]: Howdy folks, let's bump `Dockerfile.ci` to ef85bc9 ! (#10250) Signed-off-by: github-actions[bot] <41898282+github-actions[bot]@users.noreply.github.com> Co-authored-by: pablo-garay <7166088+pablo-garay@users.noreply.github.com> * [🤠]: Howdy folks, let's bump `Dockerfile.ci` to 01ca03f ! 
(#10266) Signed-off-by: github-actions[bot] <41898282+github-actions[bot]@users.noreply.github.com> Signed-off-by: oliver könig Co-authored-by: pablo-garay <7166088+pablo-garay@users.noreply.github.com> * Load model in the target export precision by default in PTQ (#10267) * Load model in the target export precision by default Signed-off-by: Jan Lasek * Enable megatron_amp_O2=true to actually use half-precision Signed-off-by: Jan Lasek --------- Signed-off-by: Jan Lasek Signed-off-by: Jan Lasek * Add WandbPlugin, NsysPlugin and PreemptionPlugin to nemo.lightning.run.plugins (#10223) * Add WandbPlugin, NsysPlugin and PreemptionPlugin to nemo.lightning.run.plugins Signed-off-by: Hemil Desai * Apply isort and black reformatting Signed-off-by: hemildesai * Remove duplicate Signed-off-by: Hemil Desai * Add entity to wandb logger Signed-off-by: Hemil Desai * Add documentation Signed-off-by: Hemil Desai * Apply isort and black reformatting Signed-off-by: hemildesai * Add warning Signed-off-by: Hemil Desai * Apply isort and black reformatting Signed-off-by: hemildesai * PR feedback Signed-off-by: Hemil Desai * Apply isort and black reformatting Signed-off-by: hemildesai * Add comments Signed-off-by: Hemil Desai * Apply isort and black reformatting Signed-off-by: hemildesai --------- Signed-off-by: Hemil Desai Signed-off-by: hemildesai Co-authored-by: hemildesai * [NeMo-UX] Handle absolute logger directories in nemo_logger (#10259) * handle absolute and relative logger directories Signed-off-by: Anna Shors * merge lines Signed-off-by: ashors1 --------- Signed-off-by: Anna Shors Signed-off-by: ashors1 * Add sdxl notebook (#10139) * Add sdxl notebook Signed-off-by: mingyuanm * Rename Signed-off-by: mingyuanm * final Update SDXL notebook Signed-off-by: mingyuanm --------- Signed-off-by: mingyuanm * Updating some coments * Apply isort and black reformatting Signed-off-by: shanmugamr1992 * Updating some coments * Apply isort and black reformatting Signed-off-by: shanmugamr1992 * Updating some coments * Small change * Apply isort and black reformatting Signed-off-by: shanmugamr1992 * Apply isort and black reformatting Signed-off-by: shanmugamr1992 * ADD support for layernorm1p * Apply isort and black reformatting Signed-off-by: shanmugamr1992 * Update Dockerfile.ci Signed-off-by: Shanmugam Ramasamy <111910568+shanmugamr1992@users.noreply.github.com> * Update Dockerfile.ci Signed-off-by: Shanmugam Ramasamy <111910568+shanmugamr1992@users.noreply.github.com> * Update Dockerfile.ci Signed-off-by: Shanmugam Ramasamy <111910568+shanmugamr1992@users.noreply.github.com> --------- Signed-off-by: shanmugamr1992 Signed-off-by: Hemil Desai Signed-off-by: Piotr Żelasko Signed-off-by: ashors1 Signed-off-by: ashors1 Signed-off-by: Shriya Palsamudram Signed-off-by: ShriyaPalsamudram Signed-off-by: Shriya Rishab <69161273+ShriyaPalsamudram@users.noreply.github.com> Signed-off-by: Dong Hyuk Chang Signed-off-by: Pin-Jui Ku Signed-off-by: Alexandros Koumparoulis Signed-off-by: Marc Romeyn Signed-off-by: marcromeyn Signed-off-by: akoumpa Signed-off-by: github-actions[bot] <41898282+github-actions[bot]@users.noreply.github.com> Signed-off-by: Oliver Koenig Signed-off-by: slyne deng Signed-off-by: oliver könig Signed-off-by: Jan Lasek Signed-off-by: Jan Lasek Signed-off-by: hemildesai Signed-off-by: Anna Shors Signed-off-by: mingyuanm Signed-off-by: Shanmugam Ramasamy <111910568+shanmugamr1992@users.noreply.github.com> Co-authored-by: Shanmugam Ramasamy Co-authored-by: shanmugamr1992 Co-authored-by: Hemil Desai 
Co-authored-by: Piotr Żelasko Co-authored-by: Anna Shors <71393111+ashors1@users.noreply.github.com> Co-authored-by: ashors1 Co-authored-by: Shriya Rishab <69161273+ShriyaPalsamudram@users.noreply.github.com> Co-authored-by: ShriyaPalsamudram Co-authored-by: Dong Hyuk Chang Co-authored-by: Dong Hyuk Chang Co-authored-by: Kuray107 Co-authored-by: Kuray107 Co-authored-by: Alexandros Koumparoulis <153118171+akoumpa@users.noreply.github.com> Co-authored-by: Marc Romeyn Co-authored-by: marcromeyn Co-authored-by: akoumpa Co-authored-by: oliver könig Co-authored-by: pablo-garay <7166088+pablo-garay@users.noreply.github.com> Co-authored-by: Slyne Deng Co-authored-by: slyne deng Co-authored-by: Jan Lasek Co-authored-by: hemildesai Co-authored-by: Ming <111467530+Victor49152@users.noreply.github.com> Co-authored-by: Shanmugam Ramasamy --- nemo/export/tensorrt_llm.py | 204 ++++++++++++++++++++++++++++-------- 1 file changed, 160 insertions(+), 44 deletions(-) diff --git a/nemo/export/tensorrt_llm.py b/nemo/export/tensorrt_llm.py index a7107974fbaa..fb43224d59a9 100644 --- a/nemo/export/tensorrt_llm.py +++ b/nemo/export/tensorrt_llm.py @@ -176,6 +176,7 @@ def export( multiple_profiles: bool = False, gpt_attention_plugin: str = "auto", gemm_plugin: str = "auto", + use_mcore_path: bool = False, reduce_fusion: bool = True, fp8_quantized: Optional[bool] = None, fp8_kvcache: Optional[bool] = None, @@ -213,11 +214,11 @@ def export( multiple_profiles: (bool): enables multiple profiles feature of TRT-LLM. Default = False gpt_attention_plugin (str): enable the gpt attention plugin. Default = "auto" gemm_plugin (str): enable the gpt plugin. Default = "auto" + use_mcore_path (bool) : Use the more recent mcore path for export reduce_fusion (bool): enables fusing extra kernels after custom TRT-LLM allReduce fp8_quantized (Optional[bool]): enables exporting to FP8 TRT-LLM checkpoints. If not set, autodetects the type. fp8_kvcache (Optional[bool]): enables FP8 KV-cache quantization. If not set, autodetects the type. """ - if n_gpus is not None: warnings.warn( "Parameter n_gpus is deprecated and will be removed in the next release. 
" @@ -326,53 +327,169 @@ def export( "Supported model types are: {1}.".format(model_type, self.get_supported_models_list) ) - if model_type == "gpt" or model_type == "starcoder": - model_type = "gptnext" + model, model_configs, self.tokenizer = load_nemo_model(nemo_checkpoint_path, nemo_export_dir) + if use_mcore_path: + from megatron.core.export.data_type import DataType + from megatron.core.export.export_config import ExportConfig + from megatron.core.export.model_type import ModelType + from megatron.core.export.trtllm.model_to_trllm_mapping.default_conversion_dict import ( + DEFAULT_CONVERSION_DICT, + ) + from megatron.core.export.trtllm.trtllm_helper import TRTLLMHelper + from megatron.core.transformer.transformer_config import TransformerConfig + from tensorrt_llm.layers import MoeConfig + + def get_transformer_config(nemo_model_config): + normalization = nemo_model_config.get('normalization', 'layernorm') + transformer_config_normalization = 'LayerNorm' + layernorm_zero_centered_gamma = False + if normalization == 'layernorm1p': + layernorm_zero_centered_gamma = True + elif normalization == 'rmsnorm': + transformer_config_normalization = 'RMSNorm' + + conf = TransformerConfig( + num_layers=nemo_model_config.get('num_layers'), + moe_router_topk=nemo_model_config.get('moe_router_topk', 0), + num_attention_heads=nemo_model_config.get('num_attention_heads'), + num_query_groups=nemo_model_config.get( + 'num_query_groups', nemo_model_config['num_attention_heads'] + ), + kv_channels=nemo_model_config.get("kv_channels", None), + hidden_size=nemo_model_config.get('hidden_size'), + ffn_hidden_size=nemo_model_config.get('ffn_hidden_size'), + layernorm_epsilon=nemo_model_config.get('layernorm_epsilon'), + add_bias_linear=nemo_model_config.get('bias'), + num_moe_experts=nemo_model_config.get('num_moe_experts', None), + normalization=transformer_config_normalization, + layernorm_zero_centered_gamma=layernorm_zero_centered_gamma, + ) - if model_type == "mixtral": - model_type = "llama" + return conf + + # We build the transformer config using the nemo model config. + transformer_config = get_transformer_config(model_configs) + input_model_type = getattr(ModelType, model_type) + + # MCore export supports some default conversion dictionaries + mcore_model_conversion_dict = DEFAULT_CONVERSION_DICT[input_model_type] + # All Mcore conversion dicts start with "decoder.layers.4.blah.blah" , while nemo models start with "model.decoder.layers.4.blahblah". so we append model. 
to the keys + nemo_model_conversion_dict = { + f'model.{key}': value for key, value in mcore_model_conversion_dict.items() + } + + trtllm_helper = TRTLLMHelper( + transformer_config=transformer_config, + model_type=input_model_type, + trtllm_conversion_dict=nemo_model_conversion_dict, + position_embedding_type=model_configs.get('position_embedding_type'), + max_position_embeddings=model_configs.get('max_position_embeddings'), + rotary_percentage=model_configs.get('rotary_percentage', 1.0), + rotary_base=model_configs.get('rotary_base', 10000), + moe_tp_mode=model_configs.get('moe_tp_mode', 2), + multi_query_mode=model_configs.get("multi_query_mode", False), + activation=model_configs.get('activation', "gelu"), + seq_len_interpolation_factor=model_configs.get("seq_len_interpolation_factor"), + moe_renorm_mode=model_configs.get( + 'moe_renorm_mode', MoeConfig.ExpertScaleNormalizationMode.RENORMALIZE + ), + share_embeddings_and_output_weights=model_configs.get( + "share_embeddings_and_output_weights", False + ), + ) - model, model_configs, self.tokenizer = load_nemo_model(nemo_checkpoint_path, nemo_export_dir) - weights_dicts, model_configs = model_to_trtllm_ckpt( - model=model, - nemo_model_config=model_configs, - nemo_export_dir=nemo_export_dir, - decoder_type=model_type, - dtype=dtype, - tensor_parallel_size=tensor_parallelism_size, - pipeline_parallel_size=pipeline_parallelism_size, - gpus_per_node=gpus_per_node, - use_parallel_embedding=use_parallel_embedding, - use_embedding_sharing=use_embedding_sharing, - fp8_quantized=fp8_quantized, - fp8_kvcache=fp8_kvcache, - ) + input_dtype = getattr(DataType, dtype) + export_config = ExportConfig( + tensor_parallelism_size, + pipeline_parallelism_size, + use_parallel_embedding, + use_embedding_sharing, + ) - for weight_dict, model_config in zip(weights_dicts, model_configs): - build_and_save_engine( - max_input_len=max_input_len, - max_output_len=max_output_len, - max_batch_size=max_batch_size, - model_config=model_config, - model_weights=weight_dict, - model_dir=self.model_dir, - model_type=model_type, - lora_ckpt_list=self.lora_ckpt_list, - use_lora_plugin=use_lora_plugin, - max_lora_rank=max_lora_rank, - lora_target_modules=lora_target_modules, - max_prompt_embedding_table_size=max_prompt_embedding_table_size, - paged_kv_cache=paged_kv_cache, - remove_input_padding=remove_input_padding, - paged_context_fmha=paged_context_fmha, - max_num_tokens=max_num_tokens, - opt_num_tokens=opt_num_tokens, - max_seq_len=max_seq_len, - multiple_profiles=multiple_profiles, - gpt_attention_plugin=gpt_attention_plugin, - gemm_plugin=gemm_plugin, + trtllm_model_weights_list, trtllm_model_config_list = ( + trtllm_helper.get_trtllm_pretrained_config_and_model_weights( + model_state_dict=model, + export_config=export_config, + dtype=input_dtype, + state_dict_split_by_layer_numbers=False, + ) + ) + + for trtllm_model_weights, trtllm_model_config in zip( + trtllm_model_weights_list, trtllm_model_config_list + ): + trtllm_helper.build_and_save_engine( + max_input_len=max_input_len, + max_output_len=max_output_len, + max_batch_size=max_batch_size, + engine_dir=self.model_dir, + trtllm_model_weights=trtllm_model_weights, + trtllm_model_config=trtllm_model_config, + lora_ckpt_list=self.lora_ckpt_list, + use_lora_plugin=use_lora_plugin, + max_lora_rank=max_lora_rank, + lora_target_modules=lora_target_modules, + max_prompt_embedding_table_size=max_prompt_embedding_table_size, + paged_kv_cache=paged_kv_cache, + remove_input_padding=remove_input_padding, + 
paged_context_fmha=paged_context_fmha, + use_refit=False, + max_num_tokens=max_num_tokens, + max_seq_len=max_seq_len, + opt_num_tokens=opt_num_tokens, + max_beam_width=1, + tokens_per_block=128, + multiple_profiles=multiple_profiles, + gpt_attention_plugin=gpt_attention_plugin, + gemm_plugin=gemm_plugin, + ) + else: + if model_type == "gpt" or model_type == "starcoder": + model_type = "gptnext" + + if model_type == "mixtral": + model_type = "llama" + + weights_dicts, model_configs = model_to_trtllm_ckpt( + model=model, + nemo_model_config=model_configs, + nemo_export_dir=nemo_export_dir, + decoder_type=model_type, + dtype=dtype, + tensor_parallel_size=tensor_parallelism_size, + pipeline_parallel_size=pipeline_parallelism_size, + gpus_per_node=gpus_per_node, + use_parallel_embedding=use_parallel_embedding, + use_embedding_sharing=use_embedding_sharing, + fp8_quantized=fp8_quantized, + fp8_kvcache=fp8_kvcache, ) + for weight_dict, model_config in zip(weights_dicts, model_configs): + build_and_save_engine( + max_input_len=max_input_len, + max_output_len=max_output_len, + max_batch_size=max_batch_size, + model_config=model_config, + model_weights=weight_dict, + model_dir=self.model_dir, + model_type=model_type, + lora_ckpt_list=self.lora_ckpt_list, + use_lora_plugin=use_lora_plugin, + max_lora_rank=max_lora_rank, + lora_target_modules=lora_target_modules, + max_prompt_embedding_table_size=max_prompt_embedding_table_size, + paged_kv_cache=paged_kv_cache, + remove_input_padding=remove_input_padding, + paged_context_fmha=paged_context_fmha, + max_num_tokens=max_num_tokens, + opt_num_tokens=opt_num_tokens, + max_seq_len=max_seq_len, + multiple_profiles=multiple_profiles, + gpt_attention_plugin=gpt_attention_plugin, + gemm_plugin=gemm_plugin, + ) + tokenizer_path = os.path.join(nemo_export_dir, "tokenizer.model") tokenizer_path_nemo2 = os.path.join(nemo_export_dir, "nemo_context") if os.path.exists(tokenizer_path): @@ -454,7 +571,6 @@ def convert_to_safe_tensors( weight_dict[k] = numpy_to_torch(v) safetensors.torch.save_file(weight_dict, os.path.join(self.model_dir, f'rank{rank}.safetensors')) - model_configs[0].to_json_file(os.path.join(self.model_dir, 'config.json')) tokenizer_path = os.path.join(nemo_export_dir, "tokenizer.model") From e21d777ca7d6876bbc9f8360691e8ded47a3d052 Mon Sep 17 00:00:00 2001 From: Hemil Desai Date: Thu, 17 Oct 2024 13:39:41 -0700 Subject: [PATCH 03/37] Fix artifact saving (#10914) Signed-off-by: Hemil Desai --- nemo/lightning/io/mixin.py | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/nemo/lightning/io/mixin.py b/nemo/lightning/io/mixin.py index aa74e2cf174c..463c18065494 100644 --- a/nemo/lightning/io/mixin.py +++ b/nemo/lightning/io/mixin.py @@ -584,8 +584,12 @@ def _io_path_elements_fn(x): def _artifact_transform_save(cfg: fdl.Config, output_path: Path, relative_dir: Path = "."): for artifact in getattr(cfg.__fn_or_cls__, "__io_artifacts__", []): # Allow optional artifacts - if artifact.skip: + if artifact.skip or (not hasattr(cfg, artifact.attr) and not artifact.required): continue + + if not hasattr(cfg, artifact.attr) and artifact.required: + raise ValueError(f"Artifact '{artifact.attr}' is required but not provided") + current_val = getattr(cfg, artifact.attr) if current_val is None: if artifact.required: From ec6677865346dcb6c98a0a619df003c8f39e3977 Mon Sep 17 00:00:00 2001 From: Chen Cui Date: Thu, 17 Oct 2024 17:50:12 -0400 Subject: [PATCH 04/37] Lora improvement (#10918) * pull out freeze model Signed-off-by: Chen Cui * add wildcard 
match to lora target modules Signed-off-by: Chen Cui --------- Signed-off-by: Chen Cui --- nemo/collections/llm/peft/lora.py | 66 ++++++++++++++---------- nemo/lightning/pytorch/callbacks/peft.py | 18 ++++++- 2 files changed, 55 insertions(+), 29 deletions(-) diff --git a/nemo/collections/llm/peft/lora.py b/nemo/collections/llm/peft/lora.py index db4861e9e987..e7a0d70d0603 100644 --- a/nemo/collections/llm/peft/lora.py +++ b/nemo/collections/llm/peft/lora.py @@ -12,10 +12,12 @@ # See the License for the specific language governing permissions and # limitations under the License. +import re from dataclasses import dataclass, field from typing import List, Literal from megatron.core import parallel_state +from megatron.core.tensor_parallel import ColumnParallelLinear, RowParallelLinear from torch import nn from nemo.lightning.pytorch.callbacks.peft import PEFT, AdapterWrapper @@ -23,15 +25,16 @@ from nemo.utils.import_utils import safe_import_from TEColumnParallelLinear, HAVE_TE_COL_LINEAR = safe_import_from( - "megatron.core.transformer.custom_layers.transformer_engine", "TEColumnParallelLinear" + "megatron.core.extensions.transformer_engine", "TEColumnParallelLinear" ) -TELayerNormColumnParallelLinear, HAVE_TE_COL_LINEAR = safe_import_from( - "megatron.core.transformer.custom_layers.transformer_engine", +TELayerNormColumnParallelLinear, HAVE_TE_LN_COL_LINEAR = safe_import_from( + "megatron.core.extensions.transformer_engine", "TELayerNormColumnParallelLinear", ) TERowParallelLinear, HAVE_TE_ROW_LINEAR = safe_import_from( - "megatron.core.transformer.custom_layers.transformer_engine", "TERowParallelLinear" + "megatron.core.extensions.transformer_engine", "TERowParallelLinear" ) +HAVE_TE = all((HAVE_TE_COL_LINEAR, HAVE_TE_LN_COL_LINEAR, HAVE_TE_ROW_LINEAR)) class AdapterParallelAdd(AdapterWrapper): @@ -82,6 +85,9 @@ class LoRA(PEFT): - 'linear_proj': Apply LoRA to the linear layer used for projecting the output of self-attention modules. - 'linear_fc1': Apply LoRA to the first fully-connected layer in MLP. - 'linear_fc2': Apply LoRA to the second fully-connected layer in MLP. + Target modules can also contain wildcards. For example, you can specify + target_modules=['*.layers.0.*.linear_qkv', '*.layers.1.*.linear_qkv'] to add LoRA to only linear_qkv + on the first two layers. dim (int): Dimension of the low-rank projection space. Defaults to 32. alpha (int): Weighting factor for the low-rank projection. Defaults to 32. dropout (float): Dropout rate for the low-rank projection. Defaults to 0.0. 
@@ -129,37 +135,43 @@ def transform(self, m: nn.Module, name=None, prefix=None): """ from nemo.collections.nlp.modules.common.megatron.adapters.parallel_adapters import ParallelLinearAdapter + def wildcard_match(pattern, key): + if key is None: + return None + regex_pattern = re.compile("^" + pattern.replace("*", "(.*)") + "$") + match = regex_pattern.match(key) + return match is not None + tp_size = parallel_state.get_tensor_model_parallel_world_size() - if name in self.target_modules: - if name in ['linear_qkv', 'linear_fc1']: - # Column Parallel Linear + full_name = f"{prefix}.{name}" if prefix else name + if name in self.target_modules or any(wildcard_match(pattern, full_name) for pattern in self.target_modules): + if HAVE_TE and isinstance(m, TEColumnParallelLinear) or isinstance(m, TELayerNormColumnParallelLinear): input_is_parallel = False - if HAVE_TE_COL_LINEAR and ( - isinstance(m, TEColumnParallelLinear) or isinstance(m, TELayerNormColumnParallelLinear) - ): - # m.in_features and m.out_features are divided by tp_size already, - # but in_features and out_features passed to ParallelLinearAdapter are not. - in_features = m.in_features - out_features = m.out_features * tp_size - else: - in_features = m.input_size - out_features = m.output_size + # m.in_features and m.out_features are divided by tp_size already, + # but in_features and out_features passed to ParallelLinearAdapter are not. + in_features = m.in_features + out_features = m.out_features * tp_size # LoRA is applied after layernorm, so layernorm output must be returned m.return_layernorm_output = True # perf optimization for LoRA + SP if m.config.sequence_parallel and not m.ub_overlap_ag: m.return_layernorm_output_gathered = True - else: # name in ['linear_proj', 'linear_fc2'] - # Row Parallel Linear + elif HAVE_TE and isinstance(m, TERowParallelLinear): + input_is_parallel = True + in_features = m.in_features * tp_size + out_features = m.out_features + elif isinstance(m, ColumnParallelLinear): + input_is_parallel = False + in_features = m.input_size + out_features = m.output_size + elif isinstance(m, RowParallelLinear): input_is_parallel = True - if HAVE_TE_ROW_LINEAR and isinstance(m, TERowParallelLinear): - in_features = m.in_features * tp_size - out_features = m.out_features - else: - in_features = m.input_size - out_features = m.output_size - - logging.info(f"Adding lora to: {prefix}.{name}") + in_features = m.input_size + out_features = m.output_size + else: + raise NotImplementedError(f"Layer type is unrecognized for LoRA: {type(m)}") + + logging.info(f"Adding lora to: {full_name}") adapter = ParallelLinearAdapter( in_features, out_features, diff --git a/nemo/lightning/pytorch/callbacks/peft.py b/nemo/lightning/pytorch/callbacks/peft.py index 1e3cde0bbcde..f8a10802ffbd 100644 --- a/nemo/lightning/pytorch/callbacks/peft.py +++ b/nemo/lightning/pytorch/callbacks/peft.py @@ -90,12 +90,26 @@ def __call__(self, model: nn.Module) -> nn.Module: Returns: nn.Module: The transformed model with PEFT applied. """ - - model.freeze() + self.freeze_model(model) model.walk(self.transform) return model + def freeze_model(self, model: nn.Module) -> None: + """Apply a default freeze method to the model. + + This method freezes all the model parameters. This method can be overridden by subclasses to + implement custom freeze strategies (e.g. freeze only parts of the model) + + Args: + model (nn.Module): The model to be fine-tuned. + + Returns: + nn.Module: The transformed model with PEFT applied. 
+ """ + model.freeze() + model.train(mode=True) + def setup(self, trainer: pl.Trainer, pl_module: pl.LightningModule, stage: str) -> None: super().setup(trainer, pl_module, stage=stage) From ccd4a9f162d96380a118261333779d921cad7570 Mon Sep 17 00:00:00 2001 From: Huy Vu <86480512+huvunvidia@users.noreply.github.com> Date: Thu, 17 Oct 2024 19:53:50 -0400 Subject: [PATCH 05/37] Huvu/t5 nemo2.0 peft (#10916) * adding peft test and cicd * add setting mcore model to train in peft.py * adding test for T5 lora * fix follow Chen's fix * restore cicd-main.yml --------- Co-authored-by: Huy Vu2 --- .github/workflows/cicd-main.yml | 17 +++++++++++++++++ tests/collections/llm/megatron_t5_finetuning.py | 8 +++++++- 2 files changed, 24 insertions(+), 1 deletion(-) diff --git a/.github/workflows/cicd-main.yml b/.github/workflows/cicd-main.yml index 2021c7d93136..7fc6cc708c31 100644 --- a/.github/workflows/cicd-main.yml +++ b/.github/workflows/cicd-main.yml @@ -5170,6 +5170,22 @@ jobs: AFTER_SCRIPT: | rm -rf tests/collections/llm/t5_finetune_results/${{ github.run_id }} + L2_NeMo_2_T5_LoRA: + needs: [cicd-test-container-setup] + uses: ./.github/workflows/_test_template.yml + if: contains(fromJSON(needs.cicd-test-container-setup.outputs.test_to_run), 'L2_NeMo_2_T5_LoRA') || needs.cicd-test-container-setup.outputs.all == 'true' + with: + RUNNER: self-hosted-azure + SCRIPT: | + NVTE_FUSED_ATTN=0 NVTE_FLASH_ATTN=0 python tests/collections/llm/megatron_t5_finetuning.py \ + --devices=2 \ + --max-steps=250 \ + --peft=lora \ + --experiment-dir=tests/collections/llm/t5_peft_results/${{ github.run_id }} \ + --checkpoint-path=/home/TestData/nlp/megatron_t5/220m/nemo2.0_t5_220m_150steps + AFTER_SCRIPT: | + rm -rf tests/collections/llm/t5_peft_results/${{ github.run_id }} + L2_NeMo_2_Mixtral_Pretraining: needs: [cicd-test-container-setup] uses: ./.github/workflows/_test_template.yml @@ -5576,6 +5592,7 @@ jobs: - L2_NeMo_2_SSM_Finetuning - L2_NeMo_2_T5_Pretraining - L2_NeMo_2_T5_Finetuning + - L2_NeMo_2_T5_LoRA - L2_NeMo_2_GPT_SFT_TP1PP1_MBS1 - L2_NeMo_2_GPT_SFT_TP1PP1_MBS2 - L2_NeMo_2_GPT_SFT_TP1PP2_MBS2 diff --git a/tests/collections/llm/megatron_t5_finetuning.py b/tests/collections/llm/megatron_t5_finetuning.py index 76a23d36975b..a204e6797926 100644 --- a/tests/collections/llm/megatron_t5_finetuning.py +++ b/tests/collections/llm/megatron_t5_finetuning.py @@ -21,6 +21,7 @@ def get_args(): parser = argparse.ArgumentParser(description='Train a small T5 model using NeMo 2.0') parser.add_argument('--devices', type=int, help="Number of devices to use for training") parser.add_argument('--max-steps', type=int, help="Number of steps to train for") + parser.add_argument('--peft', type=str, default='none', help="none | lora") parser.add_argument('--experiment-dir', type=str, help="directory to write results and checkpoints to") parser.add_argument('--experiment-name', type=str, help="name of experiment") parser.add_argument('--wandb-project', type=str, default=None, help="wandb project name") @@ -69,7 +70,6 @@ def get_args(): pipeline_model_parallel_size=1, pipeline_dtype=torch.float32, ckpt_load_optimizer=False, - # ckpt_load_optimizer=True, ) checkpoint_callback = ModelCheckpoint( every_n_train_steps=5000, @@ -93,6 +93,11 @@ def get_args(): config=opt_config, ) + if args.peft == 'lora': + peft = llm.peft.LoRA() + else: + peft = None + trainer = nl.Trainer( devices=args.devices, max_steps=args.max_steps, @@ -125,6 +130,7 @@ def get_args(): resume=resume, data=data, trainer=trainer, + peft=peft, log=nemo_logger, optim=opt, ) 
From 1c98d01ec20e2d1eb97c6f77afe4577e81dbd1ce Mon Sep 17 00:00:00 2001 From: Yoshi Suhara Date: Thu, 17 Oct 2024 23:25:13 -0700 Subject: [PATCH 06/37] Add tie_word_embeddings=True (#10710) Signed-off-by: Yoshi Suhara --- .../convert_mistral_7b_nemo_to_hf.py | 9 ++++++++- 1 file changed, 8 insertions(+), 1 deletion(-) diff --git a/scripts/checkpoint_converters/convert_mistral_7b_nemo_to_hf.py b/scripts/checkpoint_converters/convert_mistral_7b_nemo_to_hf.py index ba9012de01a8..796819c38ba4 100644 --- a/scripts/checkpoint_converters/convert_mistral_7b_nemo_to_hf.py +++ b/scripts/checkpoint_converters/convert_mistral_7b_nemo_to_hf.py @@ -65,6 +65,7 @@ def load_config(hf_model_name, nemo_config): logging.warning(f"Got unknown activation function {nemo_config.activation}") hf_config.rope_theta = nemo_config['rotary_base'] + hf_config.tie_word_embeddings = getattr(nemo_config, "share_embeddings_and_output_weights", False) return hf_config @@ -213,7 +214,13 @@ def convert(in_file, precision=None, cpu_only=True) -> None: output_layer_base_name = 'model.output_layer.weight' else: output_layer_base_name = 'model.language_model.output_layer.weight' - state_dict[hf_output_layer_weight_name] = param_to_weights(ckpt[output_layer_base_name]) + + if getattr(nemo_config, "share_embeddings_and_output_weights", False): + # tie_word_embeddings: True + state_dict[hf_output_layer_weight_name] = state_dict[embed_weights_base_name] + else: + # tie_word_embeddings: False + state_dict[hf_output_layer_weight_name] = param_to_weights(ckpt[output_layer_base_name]) return state_dict, nemo_config, dtype From ce3b28ea6b61848c2a8f07819111a8547ebd65ae Mon Sep 17 00:00:00 2001 From: Alexandros Koumparoulis <153118171+akoumpa@users.noreply.github.com> Date: Fri, 18 Oct 2024 00:38:16 -0700 Subject: [PATCH 07/37] Use a context-manager when opening files (#10895) * Use a context-manager when opening files Signed-off-by: Alexandros Koumparoulis * Apply isort and black reformatting Signed-off-by: akoumpa * Apply isort and black reformatting Signed-off-by: artbataev --------- Signed-off-by: Alexandros Koumparoulis Signed-off-by: akoumpa Signed-off-by: artbataev Co-authored-by: akoumpa Co-authored-by: artbataev --- .../language_modeling/text_memmap_dataset.py | 20 +++++++++++++------ 1 file changed, 14 insertions(+), 6 deletions(-) diff --git a/nemo/collections/nlp/data/language_modeling/text_memmap_dataset.py b/nemo/collections/nlp/data/language_modeling/text_memmap_dataset.py index 4882708f698f..dc4fb8ececc5 100644 --- a/nemo/collections/nlp/data/language_modeling/text_memmap_dataset.py +++ b/nemo/collections/nlp/data/language_modeling/text_memmap_dataset.py @@ -260,7 +260,8 @@ def load_file(self, fn, index_mapping_dir: Optional[str] = None): raise RuntimeError(f"Missing header, expected {self._header_lines} header lines") # load meta info - idx_info_dict = pickle.load(open(idx_fn + ".info", "rb")) + with open(idx_fn + ".info", "rb") as fp: + idx_info_dict = pickle.load(fp) # test for mismatch in expected newline_int if "newline_int" in idx_info_dict: newline_int = idx_info_dict["newline_int"] @@ -378,9 +379,7 @@ def __init__( self._data_sep = data_sep def _build_data_from_text(self, text: str): - """ - - """ + """ """ _build_data_from_text = super()._build_data_from_text data = {} text_fields = text.split(self._data_sep) @@ -513,7 +512,11 @@ def _build_memmap_index_files(newline_int, build_index_fn, fn, index_mapping_dir def build_index_files( - dataset_paths, newline_int, workers=None, build_index_fn=_build_index_from_memdata, 
index_mapping_dir: str = None, + dataset_paths, + newline_int, + workers=None, + build_index_fn=_build_index_from_memdata, + index_mapping_dir: str = None, ): """Auxiliary method to build multiple index files""" if len(dataset_paths) < 1: @@ -528,7 +531,12 @@ def build_index_files( ctx = mp.get_context("fork") with ctx.Pool(workers) as p: build_status = p.map( - partial(_build_memmap_index_files, newline_int, build_index_fn, index_mapping_dir=index_mapping_dir,), + partial( + _build_memmap_index_files, + newline_int, + build_index_fn, + index_mapping_dir=index_mapping_dir, + ), dataset_paths, ) From 5b47a94c110e0ce35a652242e752d8977754c8f0 Mon Sep 17 00:00:00 2001 From: Youngeun Kwon Date: Fri, 18 Oct 2024 03:07:20 -0700 Subject: [PATCH 08/37] long context performance numbers in doc (#10784) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit * long context perf Signed-off-by: Youngeun Kwon * update the long context perf Signed-off-by: Youngeun Kwon * Akoumparouli/mcore microbatch calculator fix (#10780) * move tests/lightning/{,_}io Signed-off-by: Alexandros Koumparoulis * add microbatch calculator context manager Signed-off-by: Alexandros Koumparoulis * use microbatch calculator context manager Signed-off-by: Alexandros Koumparoulis * add on_load_checkpoint test to ValidateModelRestoration; use ctx manager to reconfigure microbatch calculator; update save/restore path; add cleanup step at the end Signed-off-by: Alexandros Koumparoulis * remove unused var Signed-off-by: Alexandros Koumparoulis * fix Signed-off-by: Alexandros Koumparoulis * Apply isort and black reformatting Signed-off-by: akoumpa --------- Signed-off-by: Alexandros Koumparoulis Signed-off-by: akoumpa Co-authored-by: akoumpa Signed-off-by: Youngeun Kwon * remove 8x3b recipes (#10764) * remove 8x3b recipes Signed-off-by: Alexandros Koumparoulis * remove 8x3b from test_nemo_run Signed-off-by: Alexandros Koumparoulis * rm from __init__ Signed-off-by: Alexandros Koumparoulis --------- Signed-off-by: Alexandros Koumparoulis Signed-off-by: Youngeun Kwon * change the figure file name Signed-off-by: Youngeun Kwon * Accommodating the reviewer's comment Signed-off-by: Youngeun Kwon * update the y-axis title Signed-off-by: Youngeun Kwon * [🤠]: Howdy folks, let's bump `Dockerfile.ci` to 3f90b98 ! 
(#10789) Signed-off-by: github-actions[bot] <41898282+github-actions[bot]@users.noreply.github.com> Co-authored-by: pablo-garay <7166088+pablo-garay@users.noreply.github.com> Signed-off-by: Youngeun Kwon * Add ModelOpt transformer model pruning example for Llama models, default to llama3.1-8b-base (#10294) * Add ModelOpt transformer model pruning example for Llama3 model Signed-off-by: Shengliang Xu * Apply isort and black reformatting Signed-off-by: shengliangxu Signed-off-by: Shengliang Xu * examples code is at wrong dir, move them Signed-off-by: Shengliang Xu * changes as suggested in comment remove some logging and unused config code, update example model to llama3.1 Signed-off-by: Shengliang Xu * Add pruning of hidden_size into example Signed-off-by: Shengliang Xu * Apply isort and black reformatting Signed-off-by: shengliangxu Signed-off-by: Shengliang Xu * Update examples/nlp/language_modeling/conf/megatron_gpt_prune.yaml Signed-off-by: Keval Morabia <28916987+kevalmorabia97@users.noreply.github.com> * Add pruning test to cicd-main.yml Signed-off-by: Keval Morabia <28916987+kevalmorabia97@users.noreply.github.com> * Update cicd-main.yml Signed-off-by: Keval Morabia <28916987+kevalmorabia97@users.noreply.github.com> * Update cicd-main.yml Signed-off-by: Keval Morabia <28916987+kevalmorabia97@users.noreply.github.com> * Update cicd-main.yml Signed-off-by: Keval Morabia <28916987+kevalmorabia97@users.noreply.github.com> * Update cicd-main.yml Signed-off-by: Keval Morabia <28916987+kevalmorabia97@users.noreply.github.com> * Update cicd-main.yml Signed-off-by: Keval Morabia <28916987+kevalmorabia97@users.noreply.github.com> --------- Signed-off-by: Shengliang Xu Signed-off-by: shengliangxu Signed-off-by: Keval Morabia <28916987+kevalmorabia97@users.noreply.github.com> Co-authored-by: shengliangxu Co-authored-by: Keval Morabia <28916987+kevalmorabia97@users.noreply.github.com> Signed-off-by: Youngeun Kwon * Update mamba.rst after dist ckpt addition (#10800) Signed-off-by: Ali Taghibakhshi <71892896+JRD971000@users.noreply.github.com> Signed-off-by: Youngeun Kwon * fix chunked infer (#10581) Signed-off-by: stevehuang52 Signed-off-by: Youngeun Kwon * fix state transform (#10728) Signed-off-by: Chen Cui Signed-off-by: Youngeun Kwon * use ckpt_to_weights_subdir in restore (#10786) * use ckpt_to_weights_subdir in restore Signed-off-by: Alexandros Koumparoulis * make ckpt_to_{weight,context}_subdir idempotent Signed-off-by: Alexandros Koumparoulis * Apply isort and black reformatting Signed-off-by: akoumpa --------- Signed-off-by: Alexandros Koumparoulis Signed-off-by: akoumpa Co-authored-by: akoumpa Signed-off-by: Youngeun Kwon * Mixtral set seq_length=4k (#10704) * enable SP & set seq_lenght=4k Signed-off-by: Alexandros Koumparoulis * update test expected values Signed-off-by: Alexandros Koumparoulis * 8x22b 4k Signed-off-by: Alexandros Koumparoulis --------- Signed-off-by: Alexandros Koumparoulis Signed-off-by: Youngeun Kwon * Fix for crashes with tensorboard_logger=false and VP + LoRA (#10792) * Fix for crashes with tensorboard_logger=false and virtual pipeline parallel + LoRA Signed-off-by: Valerie Sarge * Apply isort and black reformatting Signed-off-by: vysarge --------- Signed-off-by: Valerie Sarge Signed-off-by: vysarge Co-authored-by: vysarge Signed-off-by: Youngeun Kwon * Disable checkpoint conversion inside AutoResume (#10645) * Disable checkpoint conversion inside AutoResume Signed-off-by: Hemil Desai * Apply isort and black reformatting Signed-off-by: hemildesai * Update resume 
docstrings Signed-off-by: Hemil Desai * fix Signed-off-by: Hemil Desai * add default finetuning recipe and refactor llama3 8b recipe Signed-off-by: Chen Cui * Apply isort and black reformatting Signed-off-by: cuichenx * address comment Signed-off-by: Chen Cui * refactor other recipes Signed-off-by: Chen Cui * Apply isort and black reformatting Signed-off-by: cuichenx * remove 8x3b finetuning recipe for now because HF version not available Signed-off-by: Chen Cui * add copyright header Signed-off-by: Chen Cui * adjust unit tests based on recipe fixes Signed-off-by: Chen Cui * fix failed unit test Signed-off-by: Chen Cui --------- Signed-off-by: Hemil Desai Signed-off-by: hemildesai Signed-off-by: Chen Cui Signed-off-by: cuichenx Co-authored-by: hemildesai Co-authored-by: Chen Cui Co-authored-by: cuichenx Signed-off-by: Youngeun Kwon * replace png file to github assets Signed-off-by: Youngeun Kwon * change image url to github release Signed-off-by: Youngeun Kwon --------- Signed-off-by: Youngeun Kwon Signed-off-by: Alexandros Koumparoulis Signed-off-by: akoumpa Signed-off-by: github-actions[bot] <41898282+github-actions[bot]@users.noreply.github.com> Signed-off-by: Shengliang Xu Signed-off-by: shengliangxu Signed-off-by: Keval Morabia <28916987+kevalmorabia97@users.noreply.github.com> Signed-off-by: Ali Taghibakhshi <71892896+JRD971000@users.noreply.github.com> Signed-off-by: stevehuang52 Signed-off-by: Chen Cui Signed-off-by: Valerie Sarge Signed-off-by: vysarge Signed-off-by: Hemil Desai Signed-off-by: hemildesai Signed-off-by: cuichenx Co-authored-by: Alexandros Koumparoulis <153118171+akoumpa@users.noreply.github.com> Co-authored-by: akoumpa Co-authored-by: oliver könig Co-authored-by: pablo-garay <7166088+pablo-garay@users.noreply.github.com> Co-authored-by: Shengliang Xu <106840466+shengliangxu@users.noreply.github.com> Co-authored-by: shengliangxu Co-authored-by: Keval Morabia <28916987+kevalmorabia97@users.noreply.github.com> Co-authored-by: Ali Taghibakhshi <71892896+JRD971000@users.noreply.github.com> Co-authored-by: He Huang (Steve) <105218074+stevehuang52@users.noreply.github.com> Co-authored-by: Chen Cui Co-authored-by: Valerie Sarge Co-authored-by: vysarge Co-authored-by: Hemil Desai Co-authored-by: hemildesai Co-authored-by: cuichenx --- .../performance/performance_long_sequence.md | 155 ++++++++++++++++++ 1 file changed, 155 insertions(+) create mode 100644 docs/source/performance/performance_long_sequence.md diff --git a/docs/source/performance/performance_long_sequence.md b/docs/source/performance/performance_long_sequence.md new file mode 100644 index 000000000000..9dc9c6c52be3 --- /dev/null +++ b/docs/source/performance/performance_long_sequence.md @@ -0,0 +1,155 @@ +# Long Sequence Performance + +## LLAMA2-7B (FP8) + +- The table below shows the pre-training performance of the LLAMA2-7B with CP (context parallelism) and compares it against the results without CP at various input sequence lengths. The detailed model-parallel configurations and the achieved performance are shown in the training results with CP. In non-CP training runs, we use the most performant model- and data-parallel configurations without CP given the memory capacity constraint of the H100 GPU system. 
+
+  - Container: [NeMo24.03.01.framework](https://catalog.ngc.nvidia.com/orgs/nvidia/containers/nemo/tags)
+  - System: DGX-H100
+
+| SeqLen (K) | # of GPUs | Without CP: TFLOPS / GPU | With CP: TP | PP | DP | CP | TFLOPS / GPU | Speedup with CP / without CP |
+|---|---|---|---|---|---|---|---|---|
+| 4 | 4 | 768 | 1 | 1 | 4 | 1 | 768 | 1.00 |
+| 8 | 8 | 730 | 1 | 2 | 4 | 1 | 730 | 1.00 |
+| 16 | 16 | 660 | 2 | 1 | 8 | 1 | 660 | 1.00 |
+| 32 | 32 | 595 | 2 | 1 | 8 | 2 | 610 | 1.03 |
+| 64 | 64 | 534 | 4 | 1 | 8 | 2 | 574 | 1.07 |
+| 128 | 128 | 424 | 4 | 1 | 8 | 4 | 555 | 1.31 |
+| 256 | 256 | 392 | 4 | 1 | 8 | 8 | 549 | 1.40 |
+| 512 | 512 | 104 | 8 | 1 | 4 | 16 | 549 | 5.28 |
+| 1024 | 1024 | 26.5 | 8 | 1 | 4 | 32 | 536 | 20.23 |
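+
+The model-parallel settings in the table map onto NeMo's strategy configuration. As a minimal, illustrative sketch only (the benchmark itself was run with the NeMo 24.03 container, so the exact launch configuration may differ), context parallelism can be enabled through the NeMo 2.0 `MegatronStrategy` API used by the recipes elsewhere in this patch series; the values below correspond to the 512K-token row (TP=8, PP=1, CP=16):
+
+```python
+from nemo import lightning as nl
+
+# Sketch only: enable context parallelism (CP) alongside TP/PP.
+# Data parallelism follows from the total GPU count:
+# 512 GPUs / (TP 8 * PP 1 * CP 16) = DP 4, matching the 512K-sequence row.
+strategy = nl.MegatronStrategy(
+    tensor_model_parallel_size=8,
+    pipeline_model_parallel_size=1,
+    context_parallel_size=16,
+    sequence_parallel=True,  # typically enabled together with TP > 1
+)
+```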
+ + +### Speedup of LLAMA2 7B training with CP over without CP +![cp_speedup_figure](https://github.com/NVIDIA/NeMo/releases/download/r2.0.0rc1/tutorial_cp_speedup_figure.png) \ No newline at end of file From 3ad16b485739de124ab6da62c120e029ff177fa2 Mon Sep 17 00:00:00 2001 From: malay-nagda <164242706+malay-nagda@users.noreply.github.com> Date: Fri, 18 Oct 2024 21:33:03 +0530 Subject: [PATCH 09/37] perf recipes and Mcore DistOpt params (#10883) * 175b gpt3 recipe Signed-off-by: Malay Nagda * Apply isort and black reformatting Signed-off-by: malay-nagda * dist opt params Signed-off-by: Malay Nagda * 405b dist opt params Signed-off-by: Malay Nagda * perf recipes and dist opt params Signed-off-by: Malay Nagda * Apply isort and black reformatting Signed-off-by: malay-nagda * MoE dist opt params Signed-off-by: Malay Nagda * Apply isort and black reformatting Signed-off-by: malay-nagda * gpt bias fusion params Signed-off-by: Malay Nagda * 175b recipe Signed-off-by: Malay Nagda * Apply isort and black reformatting Signed-off-by: malay-nagda * perf params comments Signed-off-by: Malay Nagda * Apply isort and black reformatting Signed-off-by: malay-nagda * MoE perf params comments Signed-off-by: Malay Nagda * Apply isort and black reformatting Signed-off-by: malay-nagda * perf recipes suffix Signed-off-by: Malay Nagda * specific models fusion params Signed-off-by: Malay Nagda * Apply isort and black reformatting Signed-off-by: malay-nagda --------- Signed-off-by: Malay Nagda Signed-off-by: malay-nagda Co-authored-by: malay-nagda --- nemo/collections/llm/gpt/model/base.py | 9 + nemo/collections/llm/recipes/gpt3_175b.py | 245 ++++++++++++++++++ nemo/collections/llm/recipes/llama31_405b.py | 73 +++++- nemo/collections/llm/recipes/llama3_70b.py | 8 + nemo/collections/llm/recipes/llama3_8b.py | 3 +- nemo/collections/llm/recipes/mixtral_8x22b.py | 17 +- nemo/collections/llm/recipes/mixtral_8x7b.py | 13 +- nemo/collections/llm/recipes/nemotron.py | 9 + nemo/collections/llm/recipes/nemotron3_8b.py | 49 +++- nemo/collections/llm/recipes/nemotron4_15b.py | 49 +++- nemo/collections/llm/recipes/nemotron4_22b.py | 58 ++++- .../collections/llm/recipes/nemotron4_340b.py | 58 ++++- 12 files changed, 582 insertions(+), 9 deletions(-) create mode 100644 nemo/collections/llm/recipes/gpt3_175b.py diff --git a/nemo/collections/llm/gpt/model/base.py b/nemo/collections/llm/gpt/model/base.py index f48f4a15d327..c7a6e01c673e 100644 --- a/nemo/collections/llm/gpt/model/base.py +++ b/nemo/collections/llm/gpt/model/base.py @@ -204,6 +204,9 @@ class GPTConfig5B(GPTConfig): ffn_hidden_size: int = 16384 num_attention_heads: int = 32 + bias_activation_fusion: bool = True + bias_dropout_add_fusion: bool = True + @dataclass class GPTConfig7B(GPTConfig): @@ -222,6 +225,9 @@ class GPTConfig20B(GPTConfig): ffn_hidden_size: int = 24576 num_attention_heads: int = 48 + bias_activation_fusion: bool = True + bias_dropout_add_fusion: bool = True + @dataclass class GPTConfig40B(GPTConfig): @@ -240,6 +246,9 @@ class GPTConfig175B(GPTConfig): ffn_hidden_size: int = 49152 num_attention_heads: int = 96 + bias_activation_fusion: bool = True + bias_dropout_add_fusion: bool = True + class GPTModel(L.LightningModule, io.IOMixin, io.ConnectorMixin, fn.FNMixin): def __init__( diff --git a/nemo/collections/llm/recipes/gpt3_175b.py b/nemo/collections/llm/recipes/gpt3_175b.py new file mode 100644 index 000000000000..7e016154aa3e --- /dev/null +++ b/nemo/collections/llm/recipes/gpt3_175b.py @@ -0,0 +1,245 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. 
All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + + +from typing import Callable, Optional + +import nemo_run as run +import pytorch_lightning as pl +import torch +from megatron.core.distributed import DistributedDataParallelConfig +from pytorch_lightning.callbacks.callback import Callback + +from nemo import lightning as nl +from nemo.collections.llm.api import pretrain +from nemo.collections.llm.gpt.data.mock import MockDataModule +from nemo.collections.llm.gpt.model import GPTConfig175B, GPTModel +from nemo.collections.llm.recipes.log.default import default_log, default_resume, tensorboard_logger +from nemo.collections.llm.recipes.optim.adam import distributed_fused_adam_with_cosine_annealing +from nemo.collections.llm.recipes.precision.mixed_precision import bf16_mixed +from nemo.collections.llm.recipes.tp_overlap_configs.userbuffers import ( + userbuffers_bf16_h100_h12288_tp4_mbs1_seqlen2048, +) +from nemo.lightning.pytorch.callbacks.megatron_comm_overlap import MegatronCommOverlapCallback +from nemo.utils.exp_manager import TimingCallback + +NAME = "gpt3_175b" + + +@run.cli.factory(name=NAME) +def model() -> run.Config[pl.LightningModule]: + """ + Factory function to create a GPT3 175B model configuration. + + Returns: + run.Config[pl.LightningModule]: Configuration for the GPT3 175B model. + + Examples: + CLI usage: + $ nemo llm pretrain model=gpt3_175b ... + + Python API usage: + >>> model_config = model() + >>> print(model_config) + """ + return run.Config(GPTModel, config=run.Config(GPTConfig175B)) + + +def trainer( + tensor_parallelism: int = 4, + pipeline_parallelism: int = 8, + pipeline_parallelism_type: Optional[torch.dtype] = torch.bfloat16, + virtual_pipeline_parallelism: Optional[int] = 6, + context_parallelism: int = 1, + sequence_parallelism: bool = True, + num_nodes: int = 64, + num_gpus_per_node: int = 8, + max_steps: int = 1168251, + callbacks: Optional[list[run.Config[Callback]]] = None, +) -> run.Config[nl.Trainer]: + """ + Configure the NeMo Lightning Trainer for GPT3 175B model. + + This function sets up the distributed training strategy optimized for the large 175B model. + + Args: + tensor_parallelism (int): Degree of tensor model parallelism. + pipeline_parallelism (int): Degree of pipeline model parallelism. + pipeline_parallelism_type (Optional[torch.dtype]): Data type for pipeline parallelism. + virtual_pipeline_parallelism (Optional[int]): Size of virtual pipeline parallelism. + context_parallelism (int): Degree of context parallelism. + sequence_parallelism (bool): Whether to use sequence parallelism. + num_nodes (int): Number of compute nodes to use. + num_gpus_per_node (int): Number of GPUs per node. + max_steps (int): Maximum number of training steps. + callbacks (Optional[list[run.Config[Callback]]]): List of callback configurations. + + Returns: + run.Config[nl.Trainer]: Configuration for the NeMo Lightning Trainer. + + Examples: + CLI usage: + $ nemo llm pretrain trainer=gpt3_175b ... 
+ + Python API usage: + >>> trainer_config = trainer(num_nodes=64, num_gpus_per_node=8) + >>> print(trainer_config) + + Note: + This configuration uses extensive parallelism to handle the large model size efficiently. + """ + strategy = run.Config( + nl.MegatronStrategy, + tensor_model_parallel_size=tensor_parallelism, + pipeline_model_parallel_size=pipeline_parallelism, + pipeline_dtype=pipeline_parallelism_type, + virtual_pipeline_model_parallel_size=virtual_pipeline_parallelism, + context_parallel_size=context_parallelism, + sequence_parallel=sequence_parallelism, + gradient_as_bucket_view=True, + ckpt_async_save=True, + ckpt_parallel_load=True, + ddp=run.Config( + DistributedDataParallelConfig, + check_for_nan_in_grad=True, + grad_reduce_in_fp32=True, + overlap_grad_reduce=True, + overlap_param_gather=True, + average_in_collective=True, + ), + ) + + trainer = run.Config( + nl.Trainer, + accelerator="gpu", + accumulate_grad_batches=1, + callbacks=callbacks, + devices=num_gpus_per_node, + limit_test_batches=50, + limit_val_batches=32, + log_every_n_steps=10, + max_steps=max_steps, + num_nodes=num_nodes, + plugins=bf16_mixed(), + strategy=strategy, + use_distributed_sampler=False, + val_check_interval=2000, + ) + + return trainer + + +@run.cli.factory(target=pretrain, name=NAME) +def pretrain_recipe( + dir: Optional[str] = None, name: str = "default", num_nodes: int = 1, num_gpus_per_node: int = 8, fn=pretrain +) -> run.Partial: + """ + Create a pre-training recipe for GPT3 175B model. + + This function sets up a complete configuration for pre-training, including + model, trainer, data, logging, optimization, and resumption settings. + + Args: + dir (Optional[str]): Directory for saving logs and checkpoints. + name (str): Name of the pre-training run. + num_nodes (int): Number of compute nodes to use. + num_gpus_per_node (int): Number of GPUs per node. + fn (Callable): The pre-training function to use. + + Returns: + run.Partial: Partial configuration for pre-training. + + Examples: + CLI usage: + $ nemo llm pretrain --factory gpt3_175b + $ nemo llm pretrain --factory "gpt3_175b(num_nodes=64, name='my_175b_pretrain')" + + Python API usage: + >>> recipe = pretrain_recipe(name="gpt3_175b_pretrain", num_nodes=64) + >>> print(recipe) + + Note: + This recipe is optimized for the large 175B model and requires significant computational resources. + """ + return run.Partial( + fn, + model=model(), + trainer=trainer( + num_nodes=num_nodes, + num_gpus_per_node=num_gpus_per_node, + callbacks=[run.Config(TimingCallback)], + ), + data=run.Config(MockDataModule, seq_length=2048, global_batch_size=2048, micro_batch_size=2), + log=default_log(dir=dir, name=name, tensorboard_logger=tensorboard_logger(name=name)), + optim=distributed_fused_adam_with_cosine_annealing(max_lr=0.9e-4), + resume=default_resume(), + ) + + +@run.cli.factory(target=pretrain, name=NAME + "_performance") +def pretrain_recipe_performance( + dir: Optional[str] = None, + name: str = "default", + num_nodes: int = 1, + num_gpus_per_node: int = 8, + fn: Callable = pretrain, +) -> run.Partial: + """ + Create a performance-optimized pre-training recipe for GPT3 175B model. + + This recipe enables performance optimizations that may not be suitable for all use cases. + It builds upon the standard pre-training recipe and adds additional performance enhancements. + + Args: + dir (Optional[str]): Directory for saving logs and checkpoints. + name (str): Name of the pre-training run. + num_nodes (int): Number of compute nodes to use. 
+ num_gpus_per_node (int): Number of GPUs per node. + fn (Callable): The pre-training function to use. + + Returns: + run.Partial: Partial configuration for performance-optimized pre-training. + + Examples: + CLI usage: + $ nemo llm pretrain --factory "gpt3_175b.pretrain_recipe_performance(num_nodes=64, name='perf_pretrain')" + + Python API usage: + >>> recipe = pretrain_recipe_performance(name="gpt3_175b_perf", num_nodes=64) + >>> print(recipe) + + Note: + Use this recipe with caution and only when you need maximum performance. + It may not be suitable for all hardware configurations or use cases. + """ + recipe = pretrain_recipe(name=name, dir=dir, num_nodes=num_nodes, num_gpus_per_node=num_gpus_per_node, fn=fn) + + # 'overlap_param_gather_with_optimizer_step' and 'align_param_gather' params are set automatically by MegatronCommOverlapCallback + # They are added here for user's knowledge + # overlap_param_gather_with_optimizer_step- If true, overlap param all-gather of first bucket with optimizer step. + # align_param_gather- If true, all PP stages launch param all-gathers simultaneously, else each PP stage launches independently as needed + + recipe.trainer.callbacks.append( + run.Config( + MegatronCommOverlapCallback, + tp_comm_overlap=True, + tp_comm_overlap_cfg=userbuffers_bf16_h100_h12288_tp4_mbs1_seqlen2048, + defer_embedding_wgrad_compute=True, + wgrad_deferral_limit=50, + overlap_param_gather_with_optimizer_step=True, + align_param_gather=True, + ) + ) + + return recipe diff --git a/nemo/collections/llm/recipes/llama31_405b.py b/nemo/collections/llm/recipes/llama31_405b.py index f36773551ea0..45efedc3cbd6 100644 --- a/nemo/collections/llm/recipes/llama31_405b.py +++ b/nemo/collections/llm/recipes/llama31_405b.py @@ -13,11 +13,12 @@ # limitations under the License. -from typing import Optional +from typing import Callable, Optional import nemo_run as run import pytorch_lightning as pl import torch +from megatron.core.distributed import DistributedDataParallelConfig from pytorch_lightning.callbacks.callback import Callback from nemo import lightning as nl @@ -27,6 +28,10 @@ from nemo.collections.llm.recipes.log.default import default_log, default_resume, tensorboard_logger from nemo.collections.llm.recipes.optim.adam import distributed_fused_adam_with_cosine_annealing from nemo.collections.llm.recipes.precision.mixed_precision import bf16_mixed +from nemo.collections.llm.recipes.tp_overlap_configs.userbuffers import ( + userbuffers_bf16_h100_h16384_tp8_cp2_mbs1_seqlen8192, +) +from nemo.lightning.pytorch.callbacks.megatron_comm_overlap import MegatronCommOverlapCallback from nemo.utils.exp_manager import TimingCallback NAME = "llama31_405b" @@ -107,6 +112,14 @@ def trainer( gradient_as_bucket_view=True, ckpt_async_save=True, ckpt_parallel_load=True, + ddp=run.Config( + DistributedDataParallelConfig, + check_for_nan_in_grad=True, + grad_reduce_in_fp32=True, + overlap_grad_reduce=True, + overlap_param_gather=True, + average_in_collective=True, + ), ) trainer = run.Config( @@ -174,3 +187,61 @@ def pretrain_recipe( optim=distributed_fused_adam_with_cosine_annealing(max_lr=3e-4), resume=default_resume(), ) + + +@run.cli.factory(target=pretrain, name=NAME + "_performance") +def pretrain_recipe_performance( + dir: Optional[str] = None, + name: str = "default", + num_nodes: int = 1, + num_gpus_per_node: int = 8, + fn: Callable = pretrain, +) -> run.Partial: + """ + Create a performance-optimized pre-training recipe for Llama3.1 405B model. 
+ + This recipe enables performance optimizations that may not be suitable for all use cases. + It builds upon the standard pre-training recipe and adds additional performance enhancements. + + Args: + dir (Optional[str]): Directory for saving logs and checkpoints. + name (str): Name of the pre-training run. + num_nodes (int): Number of compute nodes to use. + num_gpus_per_node (int): Number of GPUs per node. + fn (Callable): The pre-training function to use. + + Returns: + run.Partial: Partial configuration for performance-optimized pre-training. + + Examples: + CLI usage: + $ nemo llm pretrain --factory "llama31_405b.pretrain_recipe_performance(num_nodes=4, name='perf_pretrain')" + + Python API usage: + >>> recipe = pretrain_recipe_performance(name="llama31_405b_perf", num_nodes=4) + >>> print(recipe) + + Note: + Use this recipe with caution and only when you need maximum performance. + It may not be suitable for all hardware configurations or use cases. + """ + recipe = pretrain_recipe(name=name, dir=dir, num_nodes=num_nodes, num_gpus_per_node=num_gpus_per_node, fn=fn) + + # 'overlap_param_gather_with_optimizer_step' and 'align_param_gather' params are set automatically by MegatronCommOverlapCallback + # They are added here for user's knowledge + # overlap_param_gather_with_optimizer_step- If true, overlap param all-gather of first bucket with optimizer step. + # align_param_gather- If true, all PP stages launch param all-gathers simultaneously, else each PP stage launches independently as needed + + recipe.trainer.callbacks.append( + run.Config( + MegatronCommOverlapCallback, + tp_comm_overlap=True, + tp_comm_overlap_cfg=userbuffers_bf16_h100_h16384_tp8_cp2_mbs1_seqlen8192, + defer_embedding_wgrad_compute=True, + wgrad_deferral_limit=50, + overlap_param_gather_with_optimizer_step=True, + align_param_gather=True, + ) + ) + + return recipe diff --git a/nemo/collections/llm/recipes/llama3_70b.py b/nemo/collections/llm/recipes/llama3_70b.py index 9cfc198038f2..ea7864bfbfad 100644 --- a/nemo/collections/llm/recipes/llama3_70b.py +++ b/nemo/collections/llm/recipes/llama3_70b.py @@ -117,6 +117,7 @@ def trainer( grad_reduce_in_fp32=True, overlap_grad_reduce=True, overlap_param_gather=True, + average_in_collective=True, ), ) @@ -221,6 +222,11 @@ def pretrain_recipe_performance( """ recipe = pretrain_recipe(name=name, dir=dir, num_nodes=num_nodes, num_gpus_per_node=num_gpus_per_node, fn=fn) + # 'overlap_param_gather_with_optimizer_step' and 'align_param_gather' params are set automatically by MegatronCommOverlapCallback + # They are added here for user's knowledge + # overlap_param_gather_with_optimizer_step- If true, overlap param all-gather of first bucket with optimizer step. 
+ # align_param_gather- If true, all PP stages launch param all-gathers simultaneously, else each PP stage launches independently as needed + recipe.trainer.callbacks.append( run.Config( MegatronCommOverlapCallback, @@ -228,6 +234,8 @@ def pretrain_recipe_performance( tp_comm_overlap_cfg=userbuffers_bf16_h100_h8192_tp4_mbs1_seqlen8192, defer_embedding_wgrad_compute=True, wgrad_deferral_limit=22, + overlap_param_gather_with_optimizer_step=True, + align_param_gather=True, ) ) diff --git a/nemo/collections/llm/recipes/llama3_8b.py b/nemo/collections/llm/recipes/llama3_8b.py index 4b2934739529..dd162ed29914 100644 --- a/nemo/collections/llm/recipes/llama3_8b.py +++ b/nemo/collections/llm/recipes/llama3_8b.py @@ -117,6 +117,7 @@ def trainer( grad_reduce_in_fp32=True, overlap_grad_reduce=True, overlap_param_gather=True, + average_in_collective=True, ), ) @@ -188,7 +189,7 @@ def pretrain_recipe( ) -@run.cli.factory(target=pretrain, name=NAME + "_optimized") +@run.cli.factory(target=pretrain, name=NAME + "_performance") def pretrain_recipe_performance( dir: Optional[str] = None, name: str = "default", diff --git a/nemo/collections/llm/recipes/mixtral_8x22b.py b/nemo/collections/llm/recipes/mixtral_8x22b.py index 222a37d7a0c5..f023eae01440 100644 --- a/nemo/collections/llm/recipes/mixtral_8x22b.py +++ b/nemo/collections/llm/recipes/mixtral_8x22b.py @@ -117,6 +117,9 @@ def trainer( DistributedDataParallelConfig, check_for_nan_in_grad=True, grad_reduce_in_fp32=True, + overlap_grad_reduce=True, + overlap_param_gather=True, + average_in_collective=True, ), ) @@ -215,10 +218,20 @@ def pretrain_recipe_performance( It may not be suitable for all hardware configurations or use cases. """ recipe = pretrain_recipe(name=name, dir=dir, num_nodes=num_nodes, num_gpus_per_node=num_gpus_per_node, fn=fn) + + # 'overlap_param_gather_with_optimizer_step' and 'align_param_gather' params are set automatically by MegatronCommOverlapCallback + # They are added here for user's knowledge + # overlap_param_gather_with_optimizer_step- If true, overlap param all-gather of first bucket with optimizer step. + # align_param_gather- If true, all PP stages launch param all-gathers simultaneously, else each PP stage launches independently as needed + recipe.trainer.callbacks.extend( [ - run.Config(MegatronTokenDropCallback), - run.Config(MegatronCommOverlapCallback), + run.Config( + MegatronTokenDropCallback, + ), + run.Config( + MegatronCommOverlapCallback, overlap_param_gather_with_optimizer_step=True, align_param_gather=True + ), ] ) diff --git a/nemo/collections/llm/recipes/mixtral_8x7b.py b/nemo/collections/llm/recipes/mixtral_8x7b.py index d0609761feea..e80be03e3217 100644 --- a/nemo/collections/llm/recipes/mixtral_8x7b.py +++ b/nemo/collections/llm/recipes/mixtral_8x7b.py @@ -116,6 +116,7 @@ def trainer( grad_reduce_in_fp32=True, overlap_grad_reduce=True, overlap_param_gather=True, + average_in_collective=True, ), ) @@ -214,10 +215,20 @@ def pretrain_recipe_performance( It may not be suitable for all hardware configurations or use cases. """ recipe = pretrain_recipe(name=name, dir=dir, num_nodes=num_nodes, num_gpus_per_node=num_gpus_per_node, fn=fn) + + # 'overlap_param_gather_with_optimizer_step' and 'align_param_gather' params are set automatically by MegatronCommOverlapCallback + # They are added here for user's knowledge + # overlap_param_gather_with_optimizer_step- If true, overlap param all-gather of first bucket with optimizer step. 
+ # align_param_gather- If true, all PP stages launch param all-gathers simultaneously, else each PP stage launches independently as needed + recipe.trainer.callbacks.extend( [ run.Config(MegatronTokenDropCallback), - run.Config(MegatronCommOverlapCallback), + run.Config( + MegatronCommOverlapCallback, + overlap_param_gather_with_optimizer_step=True, + align_param_gather=True, + ), ] ) diff --git a/nemo/collections/llm/recipes/nemotron.py b/nemo/collections/llm/recipes/nemotron.py index 1dd1ef2f83bc..aedf3fcf2954 100644 --- a/nemo/collections/llm/recipes/nemotron.py +++ b/nemo/collections/llm/recipes/nemotron.py @@ -17,6 +17,7 @@ import nemo_run as run import pytorch_lightning as pl import torch +from megatron.core.distributed import DistributedDataParallelConfig from pytorch_lightning.callbacks.callback import Callback from nemo import lightning as nl @@ -124,6 +125,14 @@ def nemotron_trainer( ckpt_include_optimizer=True, ckpt_async_save=True, ckpt_parallel_load=True, + ddp=run.Config( + DistributedDataParallelConfig, + check_for_nan_in_grad=True, + grad_reduce_in_fp32=True, + overlap_grad_reduce=True, + overlap_param_gather=True, + average_in_collective=True, + ), ) precision_plugin = None diff --git a/nemo/collections/llm/recipes/nemotron3_8b.py b/nemo/collections/llm/recipes/nemotron3_8b.py index 3cdb647b5f84..928f0d177947 100644 --- a/nemo/collections/llm/recipes/nemotron3_8b.py +++ b/nemo/collections/llm/recipes/nemotron3_8b.py @@ -12,7 +12,7 @@ # See the License for the specific language governing permissions and # limitations under the License. -from typing import Optional +from typing import Callable, Optional import nemo_run as run import pytorch_lightning as pl @@ -26,6 +26,7 @@ from nemo.collections.llm.recipes.log.default import default_log, default_resume, tensorboard_logger from nemo.collections.llm.recipes.nemotron import nemotron_model, nemotron_trainer from nemo.collections.llm.recipes.optim.adam import distributed_fused_adam_with_cosine_annealing +from nemo.lightning.pytorch.callbacks.megatron_comm_overlap import MegatronCommOverlapCallback from nemo.utils.exp_manager import TimingCallback NAME = "nemotron3_8b" @@ -174,6 +175,52 @@ def pretrain_recipe( ) +@run.cli.factory(target=pretrain, name=NAME + "_performance") +def pretrain_recipe_performance( + dir: Optional[str] = None, + name: str = "default", + num_nodes: int = 1, + num_gpus_per_node: int = 8, + fn: Callable = pretrain, +) -> run.Partial: + """ + Create a performance-optimized pre-training recipe for Nemotron3 8B model. + + This recipe enables performance optimizations that may not be suitable for all use cases. + It builds upon the standard pre-training recipe and adds additional performance enhancements. + + Args: + dir (Optional[str]): Directory for saving logs and checkpoints. + name (str): Name of the pre-training run. + num_nodes (int): Number of compute nodes to use. + num_gpus_per_node (int): Number of GPUs per node. + fn (Callable): The pre-training function to use. + + Returns: + run.Partial: Partial configuration for performance-optimized pre-training. + + Examples: + $ nemo llm pretrain --factory nemotron3_8b_optimized + + Python API usage: + >>> recipe = pretrain_recipe_performance(name="nemotron3_8b_perf", num_nodes=4) + >>> print(recipe) + + Note: + Use this recipe with caution and only when you need maximum performance. + It may not be suitable for all hardware configurations or use cases. 
+ """ + recipe = pretrain_recipe(name=name, dir=dir, num_nodes=num_nodes, num_gpus_per_node=num_gpus_per_node, fn=fn) + + recipe.trainer.callbacks.append( + run.Config( + MegatronCommOverlapCallback, + tp_comm_overlap=True, + ) + ) + return recipe + + @run.cli.factory(name=NAME + "_nemo") def nemo_resume() -> run.Config[nl.AutoResume]: """ diff --git a/nemo/collections/llm/recipes/nemotron4_15b.py b/nemo/collections/llm/recipes/nemotron4_15b.py index c0acae6b13f0..9f184a92d94b 100644 --- a/nemo/collections/llm/recipes/nemotron4_15b.py +++ b/nemo/collections/llm/recipes/nemotron4_15b.py @@ -12,7 +12,7 @@ # See the License for the specific language governing permissions and # limitations under the License. -from typing import Optional +from typing import Callable, Optional import nemo_run as run import pytorch_lightning as pl @@ -23,6 +23,7 @@ from nemo.collections.llm.recipes.log.default import default_log, default_resume, tensorboard_logger from nemo.collections.llm.recipes.nemotron import nemotron_model, nemotron_trainer from nemo.collections.llm.recipes.optim.adam import distributed_fused_adam_with_cosine_annealing +from nemo.lightning.pytorch.callbacks.megatron_comm_overlap import MegatronCommOverlapCallback from nemo.utils.exp_manager import TimingCallback NAME = "nemotron4_15b" @@ -169,3 +170,49 @@ def pretrain_recipe( ), resume=default_resume(), ) + + +@run.cli.factory(target=pretrain, name=NAME + "_performance") +def pretrain_recipe_performance( + dir: Optional[str] = None, + name: str = "default", + num_nodes: int = 8, + num_gpus_per_node: int = 8, + fn: Callable = pretrain, +) -> run.Partial: + """ + Create a performance-optimized pre-training recipe for Nemotron4 15B model. + + This recipe enables performance optimizations that may not be suitable for all use cases. + It builds upon the standard pre-training recipe and adds additional performance enhancements. + + Args: + dir (Optional[str]): Directory for saving logs and checkpoints. + name (str): Name of the pre-training run. + num_nodes (int): Number of compute nodes to use. + num_gpus_per_node (int): Number of GPUs per node. + fn (Callable): The pre-training function to use. + + Returns: + run.Partial: Partial configuration for performance-optimized pre-training. + + Examples: + $ nemo llm pretrain --factory nemotron4_15b_optimized + + Python API usage: + >>> recipe = pretrain_recipe_performance(name="nemotron4_15b_perf", num_nodes=4) + >>> print(recipe) + + Note: + Use this recipe with caution and only when you need maximum performance. + It may not be suitable for all hardware configurations or use cases. + """ + recipe = pretrain_recipe(name=name, dir=dir, num_nodes=num_nodes, num_gpus_per_node=num_gpus_per_node, fn=fn) + + recipe.trainer.callbacks.append( + run.Config( + MegatronCommOverlapCallback, + tp_comm_overlap=True, + ) + ) + return recipe diff --git a/nemo/collections/llm/recipes/nemotron4_22b.py b/nemo/collections/llm/recipes/nemotron4_22b.py index ba07bae241d8..8ddbdbdf9814 100644 --- a/nemo/collections/llm/recipes/nemotron4_22b.py +++ b/nemo/collections/llm/recipes/nemotron4_22b.py @@ -12,7 +12,7 @@ # See the License for the specific language governing permissions and # limitations under the License. 
-from typing import Optional +from typing import Callable, Optional import nemo_run as run import pytorch_lightning as pl @@ -23,6 +23,7 @@ from nemo.collections.llm.recipes.log.default import default_log, default_resume, tensorboard_logger from nemo.collections.llm.recipes.nemotron import nemotron_model, nemotron_trainer from nemo.collections.llm.recipes.optim.adam import distributed_fused_adam_with_cosine_annealing +from nemo.lightning.pytorch.callbacks.megatron_comm_overlap import MegatronCommOverlapCallback from nemo.utils.exp_manager import TimingCallback NAME = "nemotron4_22b" @@ -169,3 +170,58 @@ def pretrain_recipe( ), resume=default_resume(), ) + + +@run.cli.factory(target=pretrain, name=NAME + "_performance") +def pretrain_recipe_performance( + dir: Optional[str] = None, + name: str = "default", + num_nodes: int = 8, + num_gpus_per_node: int = 8, + fn: Callable = pretrain, +) -> run.Partial: + """ + Create a performance-optimized pre-training recipe for Nemotron4 22B model. + + This recipe enables performance optimizations that may not be suitable for all use cases. + It builds upon the standard pre-training recipe and adds additional performance enhancements. + + Args: + dir (Optional[str]): Directory for saving logs and checkpoints. + name (str): Name of the pre-training run. + num_nodes (int): Number of compute nodes to use. + num_gpus_per_node (int): Number of GPUs per node. + fn (Callable): The pre-training function to use. + + Returns: + run.Partial: Partial configuration for performance-optimized pre-training. + + Examples: + $ nemo llm pretrain --factory nemotron4_22b_optimized + + Python API usage: + >>> recipe = pretrain_recipe_performance(name="nemotron4_22b_perf", num_nodes=4) + >>> print(recipe) + + Note: + Use this recipe with caution and only when you need maximum performance. + It may not be suitable for all hardware configurations or use cases. + """ + recipe = pretrain_recipe(name=name, dir=dir, num_nodes=num_nodes, num_gpus_per_node=num_gpus_per_node, fn=fn) + + # 'overlap_param_gather_with_optimizer_step' and 'align_param_gather' params are set automatically by MegatronCommOverlapCallback + # They are added here for user's knowledge + # overlap_param_gather_with_optimizer_step- If true, overlap param all-gather of first bucket with optimizer step. + # align_param_gather- If true, all PP stages launch param all-gathers simultaneously, else each PP stage launches independently as needed + + recipe.trainer.callbacks.append( + run.Config( + MegatronCommOverlapCallback, + tp_comm_overlap=True, + defer_embedding_wgrad_compute=True, + wgrad_deferral_limit=22, + overlap_param_gather_with_optimizer_step=True, + align_param_gather=True, + ) + ) + return recipe diff --git a/nemo/collections/llm/recipes/nemotron4_340b.py b/nemo/collections/llm/recipes/nemotron4_340b.py index 238acb0dac3c..d05a374b0ed2 100644 --- a/nemo/collections/llm/recipes/nemotron4_340b.py +++ b/nemo/collections/llm/recipes/nemotron4_340b.py @@ -12,7 +12,7 @@ # See the License for the specific language governing permissions and # limitations under the License. 
-from typing import Optional +from typing import Callable, Optional import nemo_run as run import pytorch_lightning as pl @@ -26,6 +26,7 @@ from nemo.collections.llm.recipes.log.default import default_log, default_resume, tensorboard_logger from nemo.collections.llm.recipes.nemotron import nemotron_model, nemotron_trainer from nemo.collections.llm.recipes.optim.adam import distributed_fused_adam_with_cosine_annealing +from nemo.lightning.pytorch.callbacks.megatron_comm_overlap import MegatronCommOverlapCallback from nemo.utils.exp_manager import TimingCallback NAME = "nemotron4_340b" @@ -174,6 +175,61 @@ def pretrain_recipe( ) +@run.cli.factory(target=pretrain, name=NAME + "_performance") +def pretrain_recipe_performance( + dir: Optional[str] = None, + name: str = "default", + num_nodes: int = 16, + num_gpus_per_node: int = 8, + fn: Callable = pretrain, +) -> run.Partial: + """ + Create a performance-optimized pre-training recipe for Nemotron4 340B model. + + This recipe enables performance optimizations that may not be suitable for all use cases. + It builds upon the standard pre-training recipe and adds additional performance enhancements. + + Args: + dir (Optional[str]): Directory for saving logs and checkpoints. + name (str): Name of the pre-training run. + num_nodes (int): Number of compute nodes to use. + num_gpus_per_node (int): Number of GPUs per node. + fn (Callable): The pre-training function to use. + + Returns: + run.Partial: Partial configuration for performance-optimized pre-training. + + Examples: + $ nemo llm pretrain --factory nemotron4_340b_optimized + + Python API usage: + >>> recipe = pretrain_recipe_performance(name="nemotron4_340b_perf", num_nodes=16) + >>> print(recipe) + + Note: + Use this recipe with caution and only when you need maximum performance. + It may not be suitable for all hardware configurations or use cases. + """ + recipe = pretrain_recipe(name=name, dir=dir, num_nodes=num_nodes, num_gpus_per_node=num_gpus_per_node, fn=fn) + + # 'overlap_param_gather_with_optimizer_step' and 'align_param_gather' params are set automatically by MegatronCommOverlapCallback + # They are added here for user's knowledge + # overlap_param_gather_with_optimizer_step- If true, overlap param all-gather of first bucket with optimizer step. 
+ # align_param_gather- If true, all PP stages launch param all-gathers simultaneously, else each PP stage launches independently as needed + + recipe.trainer.callbacks.append( + run.Config( + MegatronCommOverlapCallback, + tp_comm_overlap=True, + defer_embedding_wgrad_compute=True, + wgrad_deferral_limit=22, + overlap_param_gather_with_optimizer_step=True, + align_param_gather=True, + ) + ) + return recipe + + @run.cli.factory(name=NAME + "_nemo") def nemo_resume() -> run.Config[nl.AutoResume]: """ From a1fdf07e6a3f746c9428054ec61c290171f92b66 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?oliver=20k=C3=B6nig?= Date: Fri, 18 Oct 2024 18:27:02 +0200 Subject: [PATCH 10/37] ci: Fix cherry pick team (#10945) Signed-off-by: Oliver Koenig --- .github/workflows/cherry-pick-release-commit.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/cherry-pick-release-commit.yml b/.github/workflows/cherry-pick-release-commit.yml index 5f29832f0c0f..67bc69b1f8a5 100644 --- a/.github/workflows/cherry-pick-release-commit.yml +++ b/.github/workflows/cherry-pick-release-commit.yml @@ -120,7 +120,7 @@ jobs: "type": "section", "text": { "type": "mrkdwn", - "text": ":alert: Cherrypick bot 🤖: Hey <@'$USERNAME'>: Cherry-pick of <'$URL'|#'$PR_ID'> failed (3-way merge impossible). Please resolve manually and create a PR.\n\ncc: " + "text": ":alert: Cherrypick bot 🤖: Hey <@'$USERNAME'>: Cherry-pick of <'$URL'|#'$PR_ID'> failed (3-way merge impossible). Please resolve manually and create a PR.\n\ncc: " } } ] From 76352fb5fd739f491ae0b00c4899b8d68e82b7ca Mon Sep 17 00:00:00 2001 From: Chen Cui Date: Fri, 18 Oct 2024 12:33:40 -0400 Subject: [PATCH 11/37] Packed sequence bug fixes (#10898) * save prepared dataset to different folders according to tokenizer name Signed-off-by: Chen Cui * fix hang Signed-off-by: Chen Cui * Apply isort and black reformatting Signed-off-by: cuichenx * Apply isort and black reformatting Signed-off-by: artbataev * fix hang Signed-off-by: Chen Cui * raise mbs>1 error and provide suggestion to user instead of automatically changing config Signed-off-by: Chen Cui * Apply isort and black reformatting Signed-off-by: cuichenx * add ci for packed seq Signed-off-by: Chen Cui * Apply isort and black reformatting Signed-off-by: cuichenx * fix bug Signed-off-by: Chen Cui --------- Signed-off-by: Chen Cui Signed-off-by: cuichenx Signed-off-by: artbataev Co-authored-by: cuichenx Co-authored-by: artbataev --- .github/workflows/cicd-main.yml | 74 +++++++++++++++---- nemo/collections/llm/gpt/data/dolly.py | 5 +- nemo/collections/llm/gpt/data/fine_tuning.py | 52 ++++++++----- .../llm/gpt/data/packed_sequence.py | 31 +++++++- nemo/collections/llm/gpt/data/squad.py | 5 +- .../language_modeling/text_memmap_dataset.py | 20 ++++- tests/collections/llm/gpt_finetuning.py | 15 +++- 7 files changed, 160 insertions(+), 42 deletions(-) diff --git a/.github/workflows/cicd-main.yml b/.github/workflows/cicd-main.yml index 7fc6cc708c31..4cd7edde2e3d 100644 --- a/.github/workflows/cicd-main.yml +++ b/.github/workflows/cicd-main.yml @@ -5225,8 +5225,6 @@ jobs: --pp_size 1 \ --mbs 1 - AFTER_SCRIPT: | - rm -rf /tmp/nemo2_gpt_finetune/${{ github.run_id }} L2_NeMo_2_GPT_SFT_TP1PP1_MBS2: needs: [cicd-test-container-setup] @@ -5256,8 +5254,6 @@ jobs: --pp_size 1 \ --mbs 2 - AFTER_SCRIPT: | - rm -rf /tmp/nemo2_gpt_finetune/${{ github.run_id }} L2_NeMo_2_GPT_SFT_TP1PP2_MBS2: needs: [cicd-test-container-setup] @@ -5287,8 +5283,6 @@ jobs: --pp_size 2 \ --mbs 2 - AFTER_SCRIPT: | - rm -rf 
/tmp/nemo2_gpt_finetune/${{ github.run_id }} L2_NeMo_2_GPT_SFT_TP2PP1_MBS2: needs: [cicd-test-container-setup] @@ -5318,8 +5312,35 @@ jobs: --pp_size 1 \ --mbs 2 - AFTER_SCRIPT: | - rm -rf /tmp/nemo2_gpt_finetune/${{ github.run_id }} + + L2_NeMo_2_GPT_SFT_TP1PP1_MBS1_PACKED: + needs: [cicd-test-container-setup] + uses: ./.github/workflows/_test_template.yml + if: contains(fromJSON(needs.cicd-test-container-setup.outputs.test_to_run), 'L2_NeMo_2_GPT_SFT_TP1PP1_MBS1_PACKED') || needs.cicd-test-container-setup.outputs.all == 'true' + with: + RUNNER: self-hosted-azure + SCRIPT: | + + python tests/collections/llm/gpt_finetuning.py \ + --restore_path /home/TestData/nemo2_ckpt/llama_68M \ + --devices 2 \ + --max_steps 3 \ + --experiment_dir /tmp/nemo2_gpt_finetune/${{ github.run_id }} \ + --peft none \ + --tp_size 1 \ + --pp_size 1 \ + --mbs 1 --packed + + python tests/collections/llm/gpt_finetuning.py \ + --restore_path /home/TestData/nemo2_ckpt/llama_68M \ + --devices 2 \ + --max_steps 6 \ + --experiment_dir /tmp/nemo2_gpt_finetune/${{ github.run_id }} \ + --peft none \ + --tp_size 1 \ + --pp_size 1 \ + --mbs 1 --packed + L2_NeMo_2_GPT_LoRA_TP1PP1_MBS1: needs: [cicd-test-container-setup] @@ -5349,8 +5370,6 @@ jobs: --pp_size 1 \ --mbs 1 - AFTER_SCRIPT: | - rm -rf /tmp/nemo2_gpt_finetune/${{ github.run_id }} L2_NeMo_2_GPT_LoRA_TP1PP1_MBS2: needs: [cicd-test-container-setup] @@ -5380,8 +5399,6 @@ jobs: --pp_size 1 \ --mbs 2 - AFTER_SCRIPT: | - rm -rf /tmp/nemo2_gpt_finetune/${{ github.run_id }} L2_NeMo_2_GPT_LoRA_TP1PP2_MBS2: needs: [cicd-test-container-setup] @@ -5411,8 +5428,6 @@ jobs: --pp_size 2 \ --mbs 2 - AFTER_SCRIPT: | - rm -rf /tmp/nemo2_gpt_finetune/${{ github.run_id }} L2_NeMo_2_GPT_LoRA_TP2PP1_MBS2: needs: [cicd-test-container-setup] @@ -5442,8 +5457,33 @@ jobs: --pp_size 1 \ --mbs 2 - AFTER_SCRIPT: | - rm -rf /tmp/nemo2_gpt_finetune/${{ github.run_id }} + L2_NeMo_2_GPT_LoRA_TP1PP1_MBS1_PACKED: + needs: [cicd-test-container-setup] + uses: ./.github/workflows/_test_template.yml + if: contains(fromJSON(needs.cicd-test-container-setup.outputs.test_to_run), 'L2_NeMo_2_GPT_LoRA_TP1PP1_MBS1_PACKED') || needs.cicd-test-container-setup.outputs.all == 'true' + with: + RUNNER: self-hosted-azure + SCRIPT: | + + python tests/collections/llm/gpt_finetuning.py \ + --restore_path /home/TestData/nemo2_ckpt/llama_68M \ + --devices 2 \ + --max_steps 3 \ + --experiment_dir /tmp/nemo2_gpt_finetune/${{ github.run_id }} \ + --peft lora \ + --tp_size 1 \ + --pp_size 1 \ + --mbs 1 --packed + + python tests/collections/llm/gpt_finetuning.py \ + --restore_path /home/TestData/nemo2_ckpt/llama_68M \ + --devices 2 \ + --max_steps 6 \ + --experiment_dir /tmp/nemo2_gpt_finetune/${{ github.run_id }} \ + --peft lora \ + --tp_size 1 \ + --pp_size 1 \ + --mbs 1 --packed L2_NeMo_2_NeMo_Mcore_Mixtral_bitexact: needs: [cicd-test-container-setup] @@ -5597,10 +5637,12 @@ jobs: - L2_NeMo_2_GPT_SFT_TP1PP1_MBS2 - L2_NeMo_2_GPT_SFT_TP1PP2_MBS2 - L2_NeMo_2_GPT_SFT_TP2PP1_MBS2 + - L2_NeMo_2_GPT_SFT_TP1PP1_MBS1_PACKED - L2_NeMo_2_GPT_LoRA_TP1PP1_MBS1 - L2_NeMo_2_GPT_LoRA_TP1PP1_MBS2 - L2_NeMo_2_GPT_LoRA_TP1PP2_MBS2 - L2_NeMo_2_GPT_LoRA_TP2PP1_MBS2 + - L2_NeMo_2_GPT_LoRA_TP1PP1_MBS1_PACKED - L2_NeMo_2_Mixtral_Pretraining - L2_PTQ_Llama2_INT8_SQ - L2_PTQ_Llama2_FP8 diff --git a/nemo/collections/llm/gpt/data/dolly.py b/nemo/collections/llm/gpt/data/dolly.py index 78751d60cdb0..fb8cf9fd5da0 100644 --- a/nemo/collections/llm/gpt/data/dolly.py +++ b/nemo/collections/llm/gpt/data/dolly.py @@ -26,6 +26,7 @@ if TYPE_CHECKING: from 
nemo.collections.common.tokenizers import TokenizerSpec + from nemo.collections.llm.gpt.data.packed_sequence import PackedSequenceSpecs class DollyDataModule(FineTuningDataModule, IOMixin): @@ -56,7 +57,7 @@ def __init__( pin_memory: bool = True, persistent_workers: bool = False, pad_to_max_length: bool = False, - packed_sequence_size: int = -1, + packed_sequence_specs: Optional["PackedSequenceSpecs"] = None, ): self.force_redownload = force_redownload self.delete_raw = delete_raw @@ -74,7 +75,7 @@ def __init__( pin_memory=pin_memory, persistent_workers=persistent_workers, pad_to_max_length=pad_to_max_length, - packed_sequence_size=packed_sequence_size, + packed_sequence_specs=packed_sequence_specs, ) def prepare_data(self) -> None: diff --git a/nemo/collections/llm/gpt/data/fine_tuning.py b/nemo/collections/llm/gpt/data/fine_tuning.py index 3e4dba7ec89c..01cf617a094d 100644 --- a/nemo/collections/llm/gpt/data/fine_tuning.py +++ b/nemo/collections/llm/gpt/data/fine_tuning.py @@ -20,12 +20,14 @@ import pytorch_lightning as pl from torch.utils.data import DataLoader +from nemo.collections.common.tokenizers import AutoTokenizer from nemo.collections.llm.gpt.data.core import create_sft_dataset from nemo.lightning.pytorch.plugins import MegatronDataSampler from nemo.utils import logging if TYPE_CHECKING: from nemo.collections.common.tokenizers import TokenizerSpec + from nemo.collections.llm.gpt.data.packed_sequence import PackedSequenceSpecs class FineTuningDataModule(pl.LightningDataModule): @@ -50,10 +52,7 @@ class FineTuningDataModule(pl.LightningDataModule): persistent_workers (bool, optional): Whether to keep data loading workers persistent across epochs. Defaults to False. max_train_steps (int, optional): Maximum number of steps to train. Used to calculate samples mapping for the mmap dataset pad_to_max_length (bool, optional): Whether to pad the input to the max sequence length. If False, will pad to the max length of the current batch. - packed_sequence_size (int, optional): If a positive integer, this arg enables training with sequence packing and specifies the pack size - If less than or equal to 0, sequence packing is disabled. Defaults to -1. - Note: This arg is distinct from `seq_length` because `seq_length` specifies the maximum length of the original sequence - (i.e. the length to truncate long sequences in the input data). + packed_sequence_specs (PackedSequenceSpecs, optional): See PackedSequenceSpecs for details """ def __init__( @@ -70,7 +69,7 @@ def __init__( pin_memory: bool = True, persistent_workers: bool = False, pad_to_max_length: bool = False, - packed_sequence_size: int = -1, + packed_sequence_specs: Optional["PackedSequenceSpecs"] = None, ): super().__init__() self.seq_length = seq_length @@ -87,22 +86,21 @@ def __init__( self.data_sampler = None self.max_train_samples = None self.pad_to_max_length = pad_to_max_length - self.packed_sequence_size = packed_sequence_size - self._adjust_batch_sizes_for_packed_sequence() + self.packed_sequence_specs = packed_sequence_specs + self.packed_sequence_size = -1 if not packed_sequence_specs else packed_sequence_specs.packed_sequence_size + self.validate_batch_size_for_packed_sequence() - def _adjust_batch_sizes_for_packed_sequence(self): + def validate_batch_size_for_packed_sequence(self): if self.packed_sequence_size > 0 and self.micro_batch_size > 1: - logging.warning( + raise ValueError( "Micro batch size should be 1 when training with packed sequence, but your micro batch size " - f"is {self.micro_batch_size}. 
Your config will be automatically updated to the following: " - f"MBS will be set to 1 (from {self.micro_batch_size}), " - f"GBS will be set to {self.global_batch_size // self.micro_batch_size} (from {self.global_batch_size}), " - f"packed sequence length will be set to {self.packed_sequence_size*self.micro_batch_size} (from {self.packed_sequence_size}). " + f"is {self.micro_batch_size}. \nThe following config is equivalent to your current setting for " + f"a packed dataset. Please update your config to the following: \n" + f"Set micro batch size to 1 (currently {self.micro_batch_size})\n" + f"Set global batch size to {self.global_batch_size // self.micro_batch_size} (currently {self.global_batch_size}) \n" + f"Set packed sequence length to {self.packed_sequence_size*self.micro_batch_size} (currently {self.packed_sequence_size}) \n" f"For details please visit https://docs.nvidia.com/nemo-framework/user-guide/latest/nemotoolkit/features/optimizations/sequence_packing.html" ) - self.global_batch_size //= self.micro_batch_size - self.packed_sequence_size *= self.micro_batch_size - self.micro_batch_size = 1 def prepare_data(self) -> None: if self.packed_sequence_size > 0 and not self.train_path_packed.is_file(): @@ -187,7 +185,12 @@ def train_path(self) -> Path: @property def train_path_packed(self) -> Path: if self.packed_sequence_size > 0: - return self.dataset_root / f"training_packed{self.packed_sequence_size}.npy" + if self.packed_sequence_specs.packed_data_path is not None: + return self.packed_sequence_specs.packed_data_path + tokenizer_model_name = self._extract_tokenizer_model_name() + folder_name = self.dataset_root / "packed" / tokenizer_model_name + folder_name.mkdir(parents=True, exist_ok=True) + return folder_name / f"training_{self.packed_sequence_size}.npy" else: raise ValueError("`train_path_packed` invalid since packed sequence size is not specified.") @@ -198,3 +201,18 @@ def validation_path(self) -> Path: @property def test_path(self) -> Path: return self.dataset_root / "test.jsonl" + + def _extract_tokenizer_model_name(self) -> str: + if self.packed_sequence_specs.tokenizer_model_name is not None: + tokenizer_model_name = self.packed_sequence_specs.tokenizer_model_name + elif isinstance(self.tokenizer, AutoTokenizer): + name = self.tokenizer.tokenizer.name_or_path + if name.endswith("nemo_tokenizer"): + # NEMO_HOME/hf_org/hf_model/nemo_tokenizer => hf_org--hf_model + tokenizer_model_name = '--'.join(name.split("/")[-3:-1]) + else: + # hf_org/hf_model => hf_org--hf_model + tokenizer_model_name = name.replace("/", "--") + else: + tokenizer_model_name = f"unknown_tokenizer_{hash(self.tokenizer)}" + return tokenizer_model_name diff --git a/nemo/collections/llm/gpt/data/packed_sequence.py b/nemo/collections/llm/gpt/data/packed_sequence.py index 4675b3fbb398..372e851da7cd 100644 --- a/nemo/collections/llm/gpt/data/packed_sequence.py +++ b/nemo/collections/llm/gpt/data/packed_sequence.py @@ -11,7 +11,7 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. 
- +from dataclasses import dataclass from pathlib import Path from typing import Optional @@ -83,3 +83,32 @@ def prepare_packed_sequence_data( # save output data np.save(output_path, output_data) logging.info(f"Packed sequence is prepared and saved to {output_path}") + + +@dataclass +class PackedSequenceSpecs: + packed_sequence_size: int = -1 + """ + If a positive integer, this arg enables training with sequence packing and specifies the pack size + If less than or equal to 0, sequence packing is disabled. Defaults to -1. + Note: This arg is distinct from `seq_length` because `seq_length` specifies the maximum length of the original sequence + (i.e. the length to truncate long sequences in the input data). + """ + + tokenizer_model_name: str = None + """ + Keep track of tokenizer model name, since each tokenizer produces a different packed sequence dataset file. + This field is set by llm.finetune api. + """ + + packed_data_path: Path = None + """ + If specified, use the packed dataset from this file instead of the default path. + """ + + def __post_init__(self): + if self.packed_data_path is not None: + assert ( + self.packed_data_path.suffix == ".npy" + ), f"packed data file must be a .npy file: {self.packed_data_path}" + assert self.packed_data_path.exists(), f"packed data file does not exist: {self.packed_data_path}" diff --git a/nemo/collections/llm/gpt/data/squad.py b/nemo/collections/llm/gpt/data/squad.py index ec0fc1aad02c..f872db94077d 100644 --- a/nemo/collections/llm/gpt/data/squad.py +++ b/nemo/collections/llm/gpt/data/squad.py @@ -24,6 +24,7 @@ if TYPE_CHECKING: from nemo.collections.common.tokenizers import TokenizerSpec + from nemo.collections.llm.gpt.data.packed_sequence import PackedSequenceSpecs class SquadDataModule(FineTuningDataModule, IOMixin): @@ -54,7 +55,7 @@ def __init__( pin_memory: bool = True, persistent_workers: bool = False, pad_to_max_length: bool = False, - packed_sequence_size: int = -1, + packed_sequence_specs: Optional["PackedSequenceSpecs"] = None, ): self.force_redownload = force_redownload self.delete_raw = delete_raw @@ -72,7 +73,7 @@ def __init__( pin_memory=pin_memory, persistent_workers=persistent_workers, pad_to_max_length=pad_to_max_length, - packed_sequence_size=packed_sequence_size, + packed_sequence_specs=packed_sequence_specs, ) def prepare_data(self) -> None: diff --git a/nemo/collections/nlp/data/language_modeling/text_memmap_dataset.py b/nemo/collections/nlp/data/language_modeling/text_memmap_dataset.py index dc4fb8ececc5..f62613db891b 100644 --- a/nemo/collections/nlp/data/language_modeling/text_memmap_dataset.py +++ b/nemo/collections/nlp/data/language_modeling/text_memmap_dataset.py @@ -127,7 +127,7 @@ def __init__( index_mapping_dir=index_mapping_dir, ) - if is_distributed: + if is_distributed and not _lightning_prepare_data(): torch.distributed.barrier() if is_distributed and AppState().local_rank == 0: @@ -152,7 +152,7 @@ def __init__( index_mapping_dir=index_mapping_dir, ) - if is_distributed: + if is_distributed and not _lightning_prepare_data(): torch.distributed.barrier() logging.info(f"Loading data files") @@ -749,3 +749,19 @@ def get_sample_block(self, block_idx: int) -> np.ndarray: sample_block = sample_block % self.dataset_size return sample_block + + +def _lightning_prepare_data(): + """ + This function checks whether it is invoked in lightning's hook "prepare_data", which is run only on rank 0. 
+ TextMemMapDataset contains a torch.distributed.barrier operation, so when run inside the single-process hook + prepare_data, the barrier operation would hang forever. + """ + import inspect + + return any( + [ + frame.function == 'prepare_data' and 'prepare_packed_sequence_data' in frame.code_context[0] + for frame in inspect.stack() + ] + ) diff --git a/tests/collections/llm/gpt_finetuning.py b/tests/collections/llm/gpt_finetuning.py index 9eca287669cd..7eaa7744729c 100644 --- a/tests/collections/llm/gpt_finetuning.py +++ b/tests/collections/llm/gpt_finetuning.py @@ -19,6 +19,7 @@ from nemo import lightning as nl from nemo.collections import llm +from nemo.collections.llm.gpt.data.packed_sequence import PackedSequenceSpecs from nemo.collections.nlp.modules.common.tokenizer_utils import get_nmt_tokenizer ## NOTE: This script is present for github-actions testing only. @@ -43,6 +44,7 @@ def get_args(): parser.add_argument('--mbs', type=int, default=1, help="micro batch size") parser.add_argument('--tp_size', type=int, default=1, help="tensor parallel size") parser.add_argument('--pp_size', type=int, default=1, help="pipeline parallel size") + parser.add_argument('--packed', action='store_true', help="use packed sequence dataset") return parser.parse_args() @@ -97,7 +99,16 @@ def get_args(): else: peft = None - squad = llm.SquadDataModule(seq_length=2048, micro_batch_size=args.mbs, global_batch_size=8, num_workers=0) + packed_sequence_specs = ( + PackedSequenceSpecs(packed_sequence_size=2048, tokenizer_model_name="dummy_tokenizer") if args.packed else None + ) + dolly = llm.DollyDataModule( + seq_length=2048, + micro_batch_size=args.mbs, + global_batch_size=8, + num_workers=0, + packed_sequence_specs=packed_sequence_specs, + ) tokenizer = get_nmt_tokenizer(tokenizer_model=os.path.join(args.restore_path, "dummy_tokenizer.model")) llama3_8b = llm.LlamaModel(Llama3ConfigCI(), tokenizer=tokenizer) @@ -109,7 +120,7 @@ def get_args(): llm.finetune( model=llama3_8b, - data=squad, + data=dolly, trainer=trainer, peft=peft, log=logger, From b6c3c0d5d41d2c695e6a1b23aa573e867b0ec729 Mon Sep 17 00:00:00 2001 From: Vladimir Bataev Date: Fri, 18 Oct 2024 22:02:18 +0400 Subject: [PATCH 12/37] Fix requirements for MacOS (#10930) Signed-off-by: Vladimir Bataev --- requirements/requirements_multimodal.txt | 2 +- requirements/requirements_nlp.txt | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/requirements/requirements_multimodal.txt b/requirements/requirements_multimodal.txt index 8b56c3974a25..18abe82c9f96 100644 --- a/requirements/requirements_multimodal.txt +++ b/requirements/requirements_multimodal.txt @@ -1,6 +1,6 @@ addict clip -decord +decord; sys_platform == 'linux' diffusers>=0.19.3 einops_exts imageio diff --git a/requirements/requirements_nlp.txt b/requirements/requirements_nlp.txt index 3d168ad3b12a..7ef03689b9b5 100644 --- a/requirements/requirements_nlp.txt +++ b/requirements/requirements_nlp.txt @@ -9,7 +9,7 @@ gdown h5py ijson jieba -mamba-ssm==2.2.2 +mamba-ssm==2.2.2; sys_platform == 'linux' markdown2 matplotlib>=3.3.2 #megatron_core>0.6.0 # add back once mcore on pypi is compatible again From f4d1c5d44123d312f72110d1073247f40b67f3a5 Mon Sep 17 00:00:00 2001 From: BoxiangW <45734921+BoxiangW@users.noreply.github.com> Date: Fri, 18 Oct 2024 11:28:24 -0700 Subject: [PATCH 13/37] Fix nemo 2.0 recipes (#10915) * Fix recipe num_nodes and long context docstring * Fix typo * Fix PP issue * Fix unit test * Change recipes * fix test * Fix unit tests * Fix recipes * Add general 
legal test on parallelization settings * Rename test * Apply isort and black reformatting Signed-off-by: BoxiangW --------- Signed-off-by: BoxiangW Co-authored-by: BoxiangW --- nemo/collections/llm/recipes/llama3_70b.py | 7 ++- .../collections/llm/recipes/llama3_70b_16k.py | 18 ++++---- .../collections/llm/recipes/llama3_70b_64k.py | 11 +++-- nemo/collections/llm/recipes/llama3_8b_16k.py | 18 ++++---- nemo/collections/llm/recipes/llama3_8b_64k.py | 24 +++++----- .../llm/recipes/mixtral_8x7b_16k.py | 24 +++++----- .../llm/recipes/mixtral_8x7b_64k.py | 15 +++---- .../llm/recipes/nemotron4_15b_16k.py | 2 +- .../llm/recipes/nemotron4_15b_64k.py | 2 +- nemo/collections/llm/recipes/nemotron4_22b.py | 2 +- .../llm/recipes/nemotron4_22b_16k.py | 2 +- .../llm/recipes/nemotron4_22b_64k.py | 10 ++--- .../collections/llm/recipes/nemotron4_340b.py | 14 +++--- .../llm/recipes/test_llama3_70b.py | 4 +- .../llm/recipes/test_llama3_70b_16k.py | 45 ++++++++++++++----- .../llm/recipes/test_llama3_70b_64k.py | 39 ++++++++++++---- .../llm/recipes/test_llama3_8b_16k.py | 45 ++++++++++++++----- .../llm/recipes/test_llama3_8b_64k.py | 45 ++++++++++++++----- .../llm/recipes/test_mixtral_8x7b_16k.py | 44 +++++++++++++----- .../llm/recipes/test_mixtral_8x7b_64k.py | 42 ++++++++++++----- .../llm/recipes/test_nemotron4_15b_16k.py | 32 +++++++++++++ .../llm/recipes/test_nemotron4_15b_64k.py | 32 +++++++++++++ .../llm/recipes/test_nemotron4_22b_16k.py | 32 +++++++++++++ .../llm/recipes/test_nemotron4_22b_64k.py | 32 +++++++++++++ 24 files changed, 399 insertions(+), 142 deletions(-) diff --git a/nemo/collections/llm/recipes/llama3_70b.py b/nemo/collections/llm/recipes/llama3_70b.py index ea7864bfbfad..ffd4a833885e 100644 --- a/nemo/collections/llm/recipes/llama3_70b.py +++ b/nemo/collections/llm/recipes/llama3_70b.py @@ -24,7 +24,6 @@ from nemo import lightning as nl from nemo.collections.llm.api import finetune, pretrain from nemo.collections.llm.gpt.data.mock import MockDataModule -from nemo.collections.llm.gpt.data.squad import SquadDataModule from nemo.collections.llm.gpt.model.llama import Llama3Config70B, LlamaModel from nemo.collections.llm.peft.lora import LoRA from nemo.collections.llm.recipes.finetune_default import default_finetune_recipe @@ -64,7 +63,7 @@ def trainer( virtual_pipeline_parallelism: Optional[int] = 5, context_parallelism: int = 2, sequence_parallelism: bool = True, - num_nodes: int = 1, + num_nodes: int = 4, num_gpus_per_node: int = 8, max_steps: int = 1168251, callbacks: Optional[list[run.Config[Callback]]] = None, @@ -143,7 +142,7 @@ def trainer( @run.cli.factory(target=pretrain, name=NAME) def pretrain_recipe( - dir: Optional[str] = None, name: str = "default", num_nodes: int = 1, num_gpus_per_node: int = 8, fn=pretrain + dir: Optional[str] = None, name: str = "default", num_nodes: int = 4, num_gpus_per_node: int = 8, fn=pretrain ) -> run.Partial: """ Create a pre-training recipe for Llama3 70B model. @@ -190,7 +189,7 @@ def pretrain_recipe( @run.cli.factory(target=pretrain, name=NAME + "_performance") def pretrain_recipe_performance( - dir: Optional[str] = None, name: str = "default", num_nodes: int = 1, num_gpus_per_node: int = 8, fn=pretrain + dir: Optional[str] = None, name: str = "default", num_nodes: int = 4, num_gpus_per_node: int = 8, fn=pretrain ) -> run.Partial: """ Create a performance-optimized pre-training recipe for Llama3 70B model. 
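The node-count changes in this patch keep the default parallel layout consistent with the default world size: tensor, pipeline and context parallelism have to multiply out to a divisor of num_nodes * num_gpus_per_node. A minimal sketch of that check (illustrative only; the helper name below is made up, and the parallelization test added under tests/ may be implemented differently):

def assert_valid_parallelism(tp: int, pp: int, cp: int, num_nodes: int, gpus_per_node: int = 8) -> int:
    # World size must be divisible by the model-parallel size (TP * PP * CP);
    # whatever remains becomes the data-parallel size.
    world_size = num_nodes * gpus_per_node
    model_parallel = tp * pp * cp
    assert world_size % model_parallel == 0, (
        f"TP*PP*CP={model_parallel} does not divide world size {world_size}"
    )
    return world_size // model_parallel

# Updated llama3_70b defaults above: TP=4, PP=4, CP=2 on 4 nodes x 8 GPUs -> data-parallel size 1.
assert_valid_parallelism(4, 4, 2, num_nodes=4)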
diff --git a/nemo/collections/llm/recipes/llama3_70b_16k.py b/nemo/collections/llm/recipes/llama3_70b_16k.py index c8c1957d7bdc..928f961f7cf3 100644 --- a/nemo/collections/llm/recipes/llama3_70b_16k.py +++ b/nemo/collections/llm/recipes/llama3_70b_16k.py @@ -49,7 +49,7 @@ def model() -> run.Config[pl.LightningModule]: def trainer( - num_nodes: int = 2, + num_nodes: int = 4, num_gpus_per_node: int = 8, ) -> run.Config: """ @@ -58,8 +58,8 @@ def trainer( This function sets up the distributed training strategy optimized for the large 70B model with longer sequences. Args: - num_nodes (int): Number of compute nodes to use. - num_gpus_per_node (int): Number of GPUs per node. + num_nodes (int, optional): Number of compute nodes to use. Defaults to 4. + num_gpus_per_node (int, optional): Number of GPUs per node. Defaults to 8. Returns: run.Config: Configuration for the NeMo Lightning Trainer. @@ -76,10 +76,10 @@ def trainer( This configuration uses extensive parallelism to handle the large model size and longer sequence length efficiently. """ return llama3_70b.trainer( - tensor_parallelism=2, - pipeline_parallelism=4, + tensor_parallelism=8, + pipeline_parallelism=2, pipeline_parallelism_type=torch.bfloat16, - virtual_pipeline_parallelism=5, + virtual_pipeline_parallelism=None, context_parallelism=2, sequence_parallelism=True, num_nodes=num_nodes, @@ -91,7 +91,7 @@ def trainer( def pretrain_recipe( dir: Optional[str] = None, name: str = "default", - num_nodes: int = 2, + num_nodes: int = 4, num_gpus_per_node: int = 8, ) -> run.Partial: """ @@ -103,8 +103,8 @@ def pretrain_recipe( Args: dir (Optional[str]): Directory for saving logs and checkpoints. name (str): Name of the pre-training run. - num_nodes (int): Number of compute nodes to use. - num_gpus_per_node (int): Number of GPUs per node. + num_nodes (int, optional): Number of compute nodes to use. Defaults to 4. + num_gpus_per_node (int, optional): Number of GPUs per node. Defaults to 8. Returns: run.Partial: Partial configuration for pre-training. diff --git a/nemo/collections/llm/recipes/llama3_70b_64k.py b/nemo/collections/llm/recipes/llama3_70b_64k.py index 5d9845d9aaa7..ffadf5ca8084 100644 --- a/nemo/collections/llm/recipes/llama3_70b_64k.py +++ b/nemo/collections/llm/recipes/llama3_70b_64k.py @@ -21,7 +21,6 @@ from nemo.collections.llm.api import finetune, pretrain from nemo.collections.llm.gpt.data.mock import MockDataModule -from nemo.collections.llm.gpt.data.squad import SquadDataModule from nemo.collections.llm.recipes import llama3_70b from nemo.utils.exp_manager import TimingCallback @@ -59,8 +58,8 @@ def trainer( This function sets up the distributed training strategy optimized for the large 70B model with long sequences. Args: - num_nodes (int): Number of compute nodes to use. - num_gpus_per_node (int): Number of GPUs per node. + num_nodes (int, optional): Number of compute nodes to use. Defaults to 32. + num_gpus_per_node (int, optional): Number of GPUs per node. Defaults to 8. Returns: run.Config: Configuration for the NeMo Lightning Trainer. @@ -81,7 +80,7 @@ def trainer( tensor_parallelism=8, pipeline_parallelism=4, pipeline_parallelism_type=torch.bfloat16, - virtual_pipeline_parallelism=5, + virtual_pipeline_parallelism=None, context_parallelism=8, sequence_parallelism=True, num_nodes=num_nodes, @@ -106,8 +105,8 @@ def pretrain_recipe( Args: dir (Optional[str]): Directory for saving logs and checkpoints. name (str): Name of the pre-training run. - num_nodes (int): Number of compute nodes to use. 
- num_gpus_per_node (int): Number of GPUs per node. + num_nodes (int, optional): Number of compute nodes to use. Defaults to 32. + num_gpus_per_node (int, optional): Number of GPUs per node. Defaults to 8. Returns: run.Partial: Partial configuration for pre-training. diff --git a/nemo/collections/llm/recipes/llama3_8b_16k.py b/nemo/collections/llm/recipes/llama3_8b_16k.py index 0b42b392827a..d6c1677a3b4b 100644 --- a/nemo/collections/llm/recipes/llama3_8b_16k.py +++ b/nemo/collections/llm/recipes/llama3_8b_16k.py @@ -49,7 +49,7 @@ def model() -> run.Config[pl.LightningModule]: def trainer( - num_nodes: int = 1, + num_nodes: int = 2, num_gpus_per_node: int = 8, ) -> run.Config: """ @@ -58,8 +58,8 @@ def trainer( This function sets up the distributed training strategy optimized for longer sequences. Args: - num_nodes (int): Number of compute nodes to use. - num_gpus_per_node (int): Number of GPUs per node. + num_nodes (int, optional): Number of compute nodes to use. Defaults to 2. + num_gpus_per_node (int, optional): Number of GPUs per node. Defaults to 8. Returns: run.Config: Configuration for the NeMo Lightning Trainer. @@ -76,10 +76,10 @@ def trainer( This configuration uses increased parallelism to handle the longer sequence length efficiently. """ return llama3_8b.trainer( - tensor_parallelism=2, - pipeline_parallelism=4, + tensor_parallelism=4, + pipeline_parallelism=2, pipeline_parallelism_type=torch.bfloat16, - virtual_pipeline_parallelism=5, + virtual_pipeline_parallelism=None, context_parallelism=2, sequence_parallelism=True, num_nodes=num_nodes, @@ -91,7 +91,7 @@ def trainer( def pretrain_recipe( dir: Optional[str] = None, name: str = "default", - num_nodes: int = 1, + num_nodes: int = 2, num_gpus_per_node: int = 8, ) -> run.Partial: """ @@ -103,8 +103,8 @@ def pretrain_recipe( Args: dir (Optional[str]): Directory for saving logs and checkpoints. name (str): Name of the pre-training run. - num_nodes (int): Number of compute nodes to use. - num_gpus_per_node (int): Number of GPUs per node. + num_nodes (int, optional): Number of compute nodes to use. Defaults to 2. + num_gpus_per_node (int, optional): Number of GPUs per node. Defaults to 8. Returns: run.Partial: Partial configuration for pre-training. diff --git a/nemo/collections/llm/recipes/llama3_8b_64k.py b/nemo/collections/llm/recipes/llama3_8b_64k.py index 38f787113bf5..692347ea8dd0 100644 --- a/nemo/collections/llm/recipes/llama3_8b_64k.py +++ b/nemo/collections/llm/recipes/llama3_8b_64k.py @@ -49,7 +49,7 @@ def model() -> run.Config[pl.LightningModule]: def trainer( - num_nodes: int = 1, + num_nodes: int = 4, num_gpus_per_node: int = 8, ) -> run.Config: """ @@ -58,8 +58,8 @@ def trainer( This function sets up the distributed training strategy optimized for long sequences. Args: - num_nodes (int): Number of compute nodes to use. - num_gpus_per_node (int): Number of GPUs per node. + num_nodes (int, optional): Number of compute nodes to use. Defaults to 4. + num_gpus_per_node (int, optional): Number of GPUs per node. Defaults to 8. Returns: run.Config: Configuration for the NeMo Lightning Trainer. @@ -69,17 +69,17 @@ def trainer( $ nemo llm pretrain trainer=llama3_8b_64k ... Python API usage: - >>> trainer_config = trainer(num_nodes=2, num_gpus_per_node=8) + >>> trainer_config = trainer(num_nodes=4, num_gpus_per_node=8) >>> print(trainer_config) Note: This configuration uses significantly increased parallelism to handle the long sequence length efficiently. 
""" return llama3_8b.trainer( - tensor_parallelism=2, - pipeline_parallelism=4, + tensor_parallelism=4, + pipeline_parallelism=2, pipeline_parallelism_type=torch.bfloat16, - virtual_pipeline_parallelism=5, + virtual_pipeline_parallelism=None, context_parallelism=4, sequence_parallelism=True, num_nodes=num_nodes, @@ -91,7 +91,7 @@ def trainer( def pretrain_recipe( dir: Optional[str] = None, name: str = "default", - num_nodes: int = 1, + num_nodes: int = 4, num_gpus_per_node: int = 8, ) -> run.Partial: """ @@ -103,8 +103,8 @@ def pretrain_recipe( Args: dir (Optional[str]): Directory for saving logs and checkpoints. name (str): Name of the pre-training run. - num_nodes (int): Number of compute nodes to use. - num_gpus_per_node (int): Number of GPUs per node. + num_nodes (int, optional): Number of compute nodes to use. Defaults to 4. + num_gpus_per_node (int, optional): Number of GPUs per node. Defaults to 8. Returns: run.Partial: Partial configuration for pre-training. @@ -112,10 +112,10 @@ def pretrain_recipe( Examples: CLI usage: $ nemo llm pretrain --factory llama3_8b_64k - $ nemo llm pretrain --factory "llama3_8b_64k(num_nodes=2, name='my_64k_pretrain')" + $ nemo llm pretrain --factory "llama3_8b_64k(num_nodes=4, name='my_64k_pretrain')" Python API usage: - >>> recipe = pretrain_recipe(name="llama3_8b_64k_pretrain", num_nodes=2) + >>> recipe = pretrain_recipe(name="llama3_8b_64k_pretrain", num_nodes=4) >>> print(recipe) Note: diff --git a/nemo/collections/llm/recipes/mixtral_8x7b_16k.py b/nemo/collections/llm/recipes/mixtral_8x7b_16k.py index 8b26a8c7c3e3..7cbfaf723544 100644 --- a/nemo/collections/llm/recipes/mixtral_8x7b_16k.py +++ b/nemo/collections/llm/recipes/mixtral_8x7b_16k.py @@ -51,7 +51,7 @@ def model() -> run.Config[pl.LightningModule]: def trainer( - num_nodes: int = 2, + num_nodes: int = 4, num_gpus_per_node: int = 8, ) -> run.Config: """ @@ -60,8 +60,8 @@ def trainer( This function sets up the distributed training strategy optimized for longer sequences. Args: - num_nodes (int): Number of compute nodes to use. - num_gpus_per_node (int): Number of GPUs per node. + num_nodes (int, optional): Number of compute nodes to use. Defaults to 4. + num_gpus_per_node (int, optional): Number of GPUs per node. Defaults to 8. Returns: run.Config: Configuration for the NeMo Lightning Trainer. @@ -71,17 +71,17 @@ def trainer( $ nemo llm pretrain trainer=mixtral_8x7b_16k ... Python API usage: - >>> trainer_config = trainer(num_nodes=2, num_gpus_per_node=8) + >>> trainer_config = trainer(num_nodes=4, num_gpus_per_node=8) >>> print(trainer_config) Note: This configuration uses increased parallelism to handle the longer sequence length efficiently. """ return mixtral_8x7b.trainer( - tensor_parallelism=2, - pipeline_parallelism=4, + tensor_parallelism=4, + pipeline_parallelism=2, pipeline_parallelism_type=torch.bfloat16, - virtual_pipeline_parallelism=8, + virtual_pipeline_parallelism=None, context_parallelism=4, sequence_parallelism=True, expert_parallelism=1, @@ -95,7 +95,7 @@ def trainer( def pretrain_recipe( dir: Optional[str] = None, name: str = "default", - num_nodes: int = 2, + num_nodes: int = 4, num_gpus_per_node: int = 8, ) -> run.Partial: """ @@ -107,8 +107,8 @@ def pretrain_recipe( Args: dir (Optional[str]): Directory for saving logs and checkpoints. name (str): Name of the pre-training run. - num_nodes (int): Number of compute nodes to use. - num_gpus_per_node (int): Number of GPUs per node. + num_nodes (int, optional): Number of compute nodes to use. Defaults to 4. 
+ num_gpus_per_node (int, optional): Number of GPUs per node. Defaults to 8. Returns: run.Partial: Partial configuration for pre-training. @@ -116,10 +116,10 @@ def pretrain_recipe( Examples: CLI usage: $ nemo llm pretrain --factory mixtral_8x7b_16k - $ nemo llm pretrain --factory "mixtral_8x7b_16k(num_nodes=2, name='my_16k_pretrain')" + $ nemo llm pretrain --factory "mixtral_8x7b_16k(num_nodes=4, name='my_16k_pretrain')" Python API usage: - >>> recipe = pretrain_recipe(name="mixtral_8x7b_16k_pretrain", num_nodes=2) + >>> recipe = pretrain_recipe(name="mixtral_8x7b_16k_pretrain", num_nodes=4) >>> print(recipe) """ recipe = mixtral_8x7b.pretrain_recipe(name=name, dir=dir, num_nodes=num_nodes, num_gpus_per_node=num_gpus_per_node) diff --git a/nemo/collections/llm/recipes/mixtral_8x7b_64k.py b/nemo/collections/llm/recipes/mixtral_8x7b_64k.py index 6c8f7077fba3..3606be5ec12b 100644 --- a/nemo/collections/llm/recipes/mixtral_8x7b_64k.py +++ b/nemo/collections/llm/recipes/mixtral_8x7b_64k.py @@ -21,7 +21,6 @@ from nemo.collections.llm.api import finetune, pretrain from nemo.collections.llm.gpt.data.mock import MockDataModule -from nemo.collections.llm.gpt.data.squad import SquadDataModule from nemo.collections.llm.recipes import mixtral_8x7b from nemo.utils.exp_manager import TimingCallback @@ -59,8 +58,8 @@ def trainer( This function sets up the distributed training strategy optimized for very long sequences. Args: - num_nodes (int): Number of compute nodes to use. - num_gpus_per_node (int): Number of GPUs per node. + num_nodes (int, optional): Number of compute nodes to use. Defaults to 8. + num_gpus_per_node (int, optional): Number of GPUs per node. Defaults to 8. Returns: run.Config: Configuration for the NeMo Lightning Trainer. @@ -78,11 +77,11 @@ def trainer( It requires a substantial amount of computational resources. """ return mixtral_8x7b.trainer( - tensor_parallelism=4, + tensor_parallelism=8, pipeline_parallelism=4, pipeline_parallelism_type=torch.bfloat16, - virtual_pipeline_parallelism=4, - context_parallelism=8, + virtual_pipeline_parallelism=None, + context_parallelism=4, sequence_parallelism=True, expert_parallelism=1, num_nodes=num_nodes, @@ -107,8 +106,8 @@ def pretrain_recipe( Args: dir (Optional[str]): Directory for saving logs and checkpoints. name (str): Name of the pre-training run. - num_nodes (int): Number of compute nodes to use. - num_gpus_per_node (int): Number of GPUs per node. + num_nodes (int, optional): Number of compute nodes to use. Defaults to 16. + num_gpus_per_node (int, optional): Number of GPUs per node. Defaults to 8. Returns: run.Partial: Partial configuration for pre-training. 
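The default changes above all follow the same sizing rule: the product of tensor, pipeline, and context parallelism (times expert parallelism, which is 1 in every recipe here) has to divide evenly into the total GPU count, num_nodes * num_gpus_per_node, and the remaining factor becomes the data-parallel size. A minimal sketch of that relationship is shown below, assuming a hypothetical helper named check_parallelism; this is an illustration only, not the exact validation NeMo or these recipes perform.

def check_parallelism(
    num_nodes: int,
    num_gpus_per_node: int,
    tensor_parallelism: int,
    pipeline_parallelism: int,
    context_parallelism: int = 1,
    expert_parallelism: int = 1,
) -> int:
    """Return the implied data-parallel size, or raise if the layout is inconsistent."""
    world_size = num_nodes * num_gpus_per_node
    # GPUs needed to hold a single replica of the model.
    model_parallel = (
        tensor_parallelism * pipeline_parallelism * context_parallelism * expert_parallelism
    )
    if world_size % model_parallel != 0:
        raise ValueError(
            f"world size {world_size} is not divisible by model-parallel size {model_parallel}"
        )
    return world_size // model_parallel

# Example: the new mixtral_8x7b_16k defaults (TP=4, PP=2, CP=4, EP=1 on 4 nodes of 8 GPUs)
# use exactly 32 GPUs per model replica, so the implied data-parallel size is 1.
assert check_parallelism(4, 8, 4, 2, context_parallelism=4, expert_parallelism=1) == 1

Under this assumption, raising num_nodes alongside the larger tensor/context parallel products keeps each recipe aligned with whole nodes; the test_valid_trainer_parallelism tests in the test files below check a similar consistency between the parallelism product, devices, and num_nodes.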
diff --git a/nemo/collections/llm/recipes/nemotron4_15b_16k.py b/nemo/collections/llm/recipes/nemotron4_15b_16k.py index d0e9d939d8e7..75eced72761f 100644 --- a/nemo/collections/llm/recipes/nemotron4_15b_16k.py +++ b/nemo/collections/llm/recipes/nemotron4_15b_16k.py @@ -56,7 +56,7 @@ def pretrain_recipe( # Trainer tensor_parallelism: int = 2, pipeline_parallelism: int = 2, - pipeline_parallelism_type: Optional[torch.dtype] = None, + pipeline_parallelism_type: Optional[torch.dtype] = torch.bfloat16, virtual_pipeline_parallelism: Optional[int] = None, context_parallelism: int = 2, sequence_parallelism: bool = True, diff --git a/nemo/collections/llm/recipes/nemotron4_15b_64k.py b/nemo/collections/llm/recipes/nemotron4_15b_64k.py index c3f4575a1fd6..8286778aa7ba 100644 --- a/nemo/collections/llm/recipes/nemotron4_15b_64k.py +++ b/nemo/collections/llm/recipes/nemotron4_15b_64k.py @@ -56,7 +56,7 @@ def pretrain_recipe( # Trainer tensor_parallelism: int = 4, pipeline_parallelism: int = 2, - pipeline_parallelism_type: Optional[torch.dtype] = None, + pipeline_parallelism_type: Optional[torch.dtype] = torch.bfloat16, virtual_pipeline_parallelism: Optional[int] = None, context_parallelism: int = 4, sequence_parallelism: bool = True, diff --git a/nemo/collections/llm/recipes/nemotron4_22b.py b/nemo/collections/llm/recipes/nemotron4_22b.py index 8ddbdbdf9814..4fb697c006fc 100644 --- a/nemo/collections/llm/recipes/nemotron4_22b.py +++ b/nemo/collections/llm/recipes/nemotron4_22b.py @@ -57,7 +57,7 @@ def pretrain_recipe( # Trainer tensor_parallelism: int = 2, pipeline_parallelism: int = 4, - pipeline_parallelism_type: Optional[torch.dtype] = None, + pipeline_parallelism_type: Optional[torch.dtype] = torch.bfloat16, virtual_pipeline_parallelism: Optional[int] = 10, context_parallelism: int = 1, sequence_parallelism: bool = False, diff --git a/nemo/collections/llm/recipes/nemotron4_22b_16k.py b/nemo/collections/llm/recipes/nemotron4_22b_16k.py index 614004d12aa3..42f258c6057d 100644 --- a/nemo/collections/llm/recipes/nemotron4_22b_16k.py +++ b/nemo/collections/llm/recipes/nemotron4_22b_16k.py @@ -57,7 +57,7 @@ def pretrain_recipe( tensor_parallelism: int = 4, pipeline_parallelism: int = 1, pipeline_parallelism_type: Optional[torch.dtype] = None, - virtual_pipeline_parallelism: Optional[int] = 10, + virtual_pipeline_parallelism: Optional[int] = None, context_parallelism: int = 2, sequence_parallelism: bool = True, num_nodes: int = 1, diff --git a/nemo/collections/llm/recipes/nemotron4_22b_64k.py b/nemo/collections/llm/recipes/nemotron4_22b_64k.py index 57211e5dddc1..67d60a6e1c90 100644 --- a/nemo/collections/llm/recipes/nemotron4_22b_64k.py +++ b/nemo/collections/llm/recipes/nemotron4_22b_64k.py @@ -56,9 +56,9 @@ def pretrain_recipe( # Trainer tensor_parallelism: int = 4, pipeline_parallelism: int = 2, - pipeline_parallelism_type: Optional[torch.dtype] = None, - virtual_pipeline_parallelism: Optional[int] = 10, - context_parallelism: int = 2, + pipeline_parallelism_type: Optional[torch.dtype] = torch.bfloat16, + virtual_pipeline_parallelism: Optional[int] = None, + context_parallelism: int = 4, sequence_parallelism: bool = True, num_nodes: int = 4, num_gpus_per_node: int = 8, @@ -122,10 +122,10 @@ def pretrain_recipe( Examples: CLI usage: $ nemo llm pretrain --factory nemotron4_22b_64k - $ nemo llm pretrain --factory "nemotron4_22b_64k(num_nodes=1, name='my_nemotron_pretrain')" + $ nemo llm pretrain --factory "nemotron4_22b_64k(num_nodes=2, name='my_nemotron_pretrain')" Python API usage: - >>> recipe = 
pretrain_recipe(name="nemotron_pretrain", num_nodes=1) + >>> recipe = pretrain_recipe(name="nemotron_pretrain", num_nodes=2) >>> print(recipe) Note: diff --git a/nemo/collections/llm/recipes/nemotron4_340b.py b/nemo/collections/llm/recipes/nemotron4_340b.py index d05a374b0ed2..cc9c7995c9e4 100644 --- a/nemo/collections/llm/recipes/nemotron4_340b.py +++ b/nemo/collections/llm/recipes/nemotron4_340b.py @@ -42,7 +42,7 @@ def model() -> run.Config[pl.LightningModule]: Examples: CLI usage: - $ nemo llm pretrain model=nemotron4_340 ... + $ nemo llm pretrain model=nemotron4_340b ... Python API usage: >>> model_config = model() @@ -60,7 +60,7 @@ def pretrain_recipe( # Trainer tensor_parallelism: int = 8, pipeline_parallelism: int = 12, - pipeline_parallelism_type: Optional[torch.dtype] = None, + pipeline_parallelism_type: Optional[torch.dtype] = torch.bfloat16, virtual_pipeline_parallelism: Optional[int] = 8, context_parallelism: int = 1, sequence_parallelism: bool = False, @@ -125,8 +125,8 @@ def pretrain_recipe( Examples: CLI usage: - $ nemo llm pretrain --factory nemotron4_340 - $ nemo llm pretrain --factory "nemotron4_340(num_nodes=1, name='my_nemotron_pretrain')" + $ nemo llm pretrain --factory nemotron4_340b + $ nemo llm pretrain --factory "nemotron4_340b(num_nodes=1, name='my_nemotron_pretrain')" Python API usage: >>> recipe = pretrain_recipe(name="nemotron_pretrain", num_nodes=1) @@ -263,7 +263,7 @@ def finetune_recipe( # Trainer tensor_parallelism: int = 8, pipeline_parallelism: int = 12, - pipeline_parallelism_type: Optional[torch.dtype] = None, + pipeline_parallelism_type: Optional[torch.dtype] = torch.bfloat16, virtual_pipeline_parallelism: Optional[int] = 8, context_parallelism: int = 1, sequence_parallelism: bool = False, @@ -328,8 +328,8 @@ def finetune_recipe( Examples: CLI usage: - $ nemo llm finetune --factory nemotron4_340 - $ nemo llm finetune --factory "nemotron4_340(name='my_nemotron4_340_finetune', num_nodes=4)" + $ nemo llm finetune --factory nemotron4_340b + $ nemo llm finetune --factory "nemotron4_340b(name='my_nemotron4_340_finetune', num_nodes=4)" Python API usage: >>> recipe = finetune_recipe(name="my_nemotron4_340_finetune", num_nodes=4) diff --git a/tests/collections/llm/recipes/test_llama3_70b.py b/tests/collections/llm/recipes/test_llama3_70b.py index a842975846dd..cc77ec921de7 100644 --- a/tests/collections/llm/recipes/test_llama3_70b.py +++ b/tests/collections/llm/recipes/test_llama3_70b.py @@ -31,7 +31,7 @@ def test_trainer(self, recipe_module): assert trainer_config.__fn_or_cls__ == Trainer assert trainer_config.accelerator == "gpu" assert trainer_config.devices == 8 - assert trainer_config.num_nodes == 1 + assert trainer_config.num_nodes == 4 # Check strategy configuration assert isinstance(trainer_config.strategy, run.Config) @@ -81,7 +81,7 @@ def test_pretrain_recipe_with_different_configurations(self, recipe_module, num_ def test_pretrain_recipe_performance(self, recipe_module): recipe = recipe_module.pretrain_recipe_performance( - name="test_perf", dir="/tmp", num_nodes=1, num_gpus_per_node=8 + name="test_perf", dir="/tmp", num_nodes=4, num_gpus_per_node=8 ) assert any( isinstance(cb, run.Config) and cb.__fn_or_cls__ == MegatronCommOverlapCallback diff --git a/tests/collections/llm/recipes/test_llama3_70b_16k.py b/tests/collections/llm/recipes/test_llama3_70b_16k.py index 60940b062a87..17f0ec5ebd99 100644 --- a/tests/collections/llm/recipes/test_llama3_70b_16k.py +++ b/tests/collections/llm/recipes/test_llama3_70b_16k.py @@ -29,15 +29,15 @@ def 
test_trainer(self, recipe_module): assert trainer_config.__fn_or_cls__ == Trainer assert trainer_config.accelerator == "gpu" assert trainer_config.devices == 8 - assert trainer_config.num_nodes == 2 + assert trainer_config.num_nodes == 4 # Check strategy configuration assert isinstance(trainer_config.strategy, run.Config) assert trainer_config.strategy.__fn_or_cls__.__name__ == "MegatronStrategy" - assert trainer_config.strategy.tensor_model_parallel_size == 2 - assert trainer_config.strategy.pipeline_model_parallel_size == 4 + assert trainer_config.strategy.tensor_model_parallel_size == 8 + assert trainer_config.strategy.pipeline_model_parallel_size == 2 assert trainer_config.strategy.pipeline_dtype == torch.bfloat16 - assert trainer_config.strategy.virtual_pipeline_model_parallel_size == 5 + assert trainer_config.strategy.virtual_pipeline_model_parallel_size is None assert trainer_config.strategy.context_parallel_size == 2 assert trainer_config.strategy.sequence_parallel is True @@ -61,14 +61,37 @@ def test_pretrain_recipe_with_different_configurations(self, recipe_module, num_ assert recipe.trainer.num_nodes == num_nodes assert recipe.trainer.devices == num_gpus_per_node - def test_trainer_parallelism_options(self, recipe_module): + def test_valid_trainer_parallelism(self, recipe_module): trainer_config = recipe_module.trainer() - assert trainer_config.strategy.tensor_model_parallel_size == 2 - assert trainer_config.strategy.pipeline_model_parallel_size == 4 - assert trainer_config.strategy.pipeline_dtype == torch.bfloat16 - assert trainer_config.strategy.virtual_pipeline_model_parallel_size == 5 - assert trainer_config.strategy.context_parallel_size == 2 - assert trainer_config.strategy.sequence_parallel is True + + assert isinstance(trainer_config.strategy, run.Config) + assert trainer_config.strategy.__fn_or_cls__.__name__ == "MegatronStrategy" + + assert trainer_config.strategy.expert_model_parallel_size == 1 + + assert ( + trainer_config.strategy.tensor_model_parallel_size + * trainer_config.strategy.pipeline_model_parallel_size + * trainer_config.strategy.context_parallel_size + * trainer_config.strategy.expert_model_parallel_size + % trainer_config.devices + == 0 + ) + assert ( + trainer_config.strategy.tensor_model_parallel_size + * trainer_config.strategy.pipeline_model_parallel_size + * trainer_config.strategy.context_parallel_size + * trainer_config.strategy.expert_model_parallel_size + / trainer_config.devices + % trainer_config.num_nodes + == 0 + ) + + if trainer_config.strategy.pipeline_model_parallel_size != 1: + assert trainer_config.strategy.pipeline_dtype is not None + + if trainer_config.strategy.tensor_model_parallel_size == 1: + assert trainer_config.strategy.sequence_parallel is False def test_model_config_parameters(self, recipe_module): model_config = recipe_module.model() diff --git a/tests/collections/llm/recipes/test_llama3_70b_64k.py b/tests/collections/llm/recipes/test_llama3_70b_64k.py index 89813162fae1..e9f496dfdd2e 100644 --- a/tests/collections/llm/recipes/test_llama3_70b_64k.py +++ b/tests/collections/llm/recipes/test_llama3_70b_64k.py @@ -38,7 +38,7 @@ def test_trainer(self, recipe_module): assert trainer_config.strategy.tensor_model_parallel_size == 8 assert trainer_config.strategy.pipeline_model_parallel_size == 4 assert trainer_config.strategy.pipeline_dtype == torch.bfloat16 - assert trainer_config.strategy.virtual_pipeline_model_parallel_size == 5 + assert trainer_config.strategy.virtual_pipeline_model_parallel_size is None assert 
trainer_config.strategy.context_parallel_size == 8 assert trainer_config.strategy.sequence_parallel is True @@ -67,14 +67,37 @@ def test_pretrain_recipe_with_different_configurations(self, recipe_module, num_ assert recipe.trainer.num_nodes == num_nodes assert recipe.trainer.devices == num_gpus_per_node - def test_trainer_parallelism_options(self, recipe_module): + def test_valid_trainer_parallelism(self, recipe_module): trainer_config = recipe_module.trainer() - assert trainer_config.strategy.tensor_model_parallel_size == 8 - assert trainer_config.strategy.pipeline_model_parallel_size == 4 - assert trainer_config.strategy.pipeline_dtype == torch.bfloat16 - assert trainer_config.strategy.virtual_pipeline_model_parallel_size == 5 - assert trainer_config.strategy.context_parallel_size == 8 - assert trainer_config.strategy.sequence_parallel is True + + assert isinstance(trainer_config.strategy, run.Config) + assert trainer_config.strategy.__fn_or_cls__.__name__ == "MegatronStrategy" + + assert trainer_config.strategy.expert_model_parallel_size == 1 + + assert ( + trainer_config.strategy.tensor_model_parallel_size + * trainer_config.strategy.pipeline_model_parallel_size + * trainer_config.strategy.context_parallel_size + * trainer_config.strategy.expert_model_parallel_size + % trainer_config.devices + == 0 + ) + assert ( + trainer_config.strategy.tensor_model_parallel_size + * trainer_config.strategy.pipeline_model_parallel_size + * trainer_config.strategy.context_parallel_size + * trainer_config.strategy.expert_model_parallel_size + / trainer_config.devices + % trainer_config.num_nodes + == 0 + ) + + if trainer_config.strategy.pipeline_model_parallel_size != 1: + assert trainer_config.strategy.pipeline_dtype is not None + + if trainer_config.strategy.tensor_model_parallel_size == 1: + assert trainer_config.strategy.sequence_parallel is False def test_model_config_parameters(self, recipe_module): model_config = recipe_module.model() diff --git a/tests/collections/llm/recipes/test_llama3_8b_16k.py b/tests/collections/llm/recipes/test_llama3_8b_16k.py index d7f3bd40ecb7..fe75f01236ab 100644 --- a/tests/collections/llm/recipes/test_llama3_8b_16k.py +++ b/tests/collections/llm/recipes/test_llama3_8b_16k.py @@ -29,15 +29,15 @@ def test_trainer(self, recipe_module): assert trainer_config.__fn_or_cls__ == Trainer assert trainer_config.accelerator == "gpu" assert trainer_config.devices == 8 - assert trainer_config.num_nodes == 1 + assert trainer_config.num_nodes == 2 # Check strategy configuration assert isinstance(trainer_config.strategy, run.Config) assert trainer_config.strategy.__fn_or_cls__.__name__ == "MegatronStrategy" - assert trainer_config.strategy.tensor_model_parallel_size == 2 - assert trainer_config.strategy.pipeline_model_parallel_size == 4 + assert trainer_config.strategy.tensor_model_parallel_size == 4 + assert trainer_config.strategy.pipeline_model_parallel_size == 2 assert trainer_config.strategy.pipeline_dtype == torch.bfloat16 - assert trainer_config.strategy.virtual_pipeline_model_parallel_size == 5 + assert trainer_config.strategy.virtual_pipeline_model_parallel_size is None assert trainer_config.strategy.context_parallel_size == 2 assert trainer_config.strategy.sequence_parallel is True @@ -61,14 +61,37 @@ def test_pretrain_recipe_with_different_configurations(self, recipe_module, num_ assert recipe.trainer.num_nodes == num_nodes assert recipe.trainer.devices == num_gpus_per_node - def test_trainer_parallelism_options(self, recipe_module): + def 
test_valid_trainer_parallelism(self, recipe_module): trainer_config = recipe_module.trainer() - assert trainer_config.strategy.tensor_model_parallel_size == 2 - assert trainer_config.strategy.pipeline_model_parallel_size == 4 - assert trainer_config.strategy.pipeline_dtype == torch.bfloat16 - assert trainer_config.strategy.virtual_pipeline_model_parallel_size == 5 - assert trainer_config.strategy.context_parallel_size == 2 - assert trainer_config.strategy.sequence_parallel is True + + assert isinstance(trainer_config.strategy, run.Config) + assert trainer_config.strategy.__fn_or_cls__.__name__ == "MegatronStrategy" + + assert trainer_config.strategy.expert_model_parallel_size == 1 + + assert ( + trainer_config.strategy.tensor_model_parallel_size + * trainer_config.strategy.pipeline_model_parallel_size + * trainer_config.strategy.context_parallel_size + * trainer_config.strategy.expert_model_parallel_size + % trainer_config.devices + == 0 + ) + assert ( + trainer_config.strategy.tensor_model_parallel_size + * trainer_config.strategy.pipeline_model_parallel_size + * trainer_config.strategy.context_parallel_size + * trainer_config.strategy.expert_model_parallel_size + / trainer_config.devices + % trainer_config.num_nodes + == 0 + ) + + if trainer_config.strategy.pipeline_model_parallel_size != 1: + assert trainer_config.strategy.pipeline_dtype is not None + + if trainer_config.strategy.tensor_model_parallel_size == 1: + assert trainer_config.strategy.sequence_parallel is False def test_model_config_parameters(self, recipe_module): model_config = recipe_module.model() diff --git a/tests/collections/llm/recipes/test_llama3_8b_64k.py b/tests/collections/llm/recipes/test_llama3_8b_64k.py index f489e12dc55f..0316b736341a 100644 --- a/tests/collections/llm/recipes/test_llama3_8b_64k.py +++ b/tests/collections/llm/recipes/test_llama3_8b_64k.py @@ -29,15 +29,15 @@ def test_trainer(self, recipe_module): assert trainer_config.__fn_or_cls__ == Trainer assert trainer_config.accelerator == "gpu" assert trainer_config.devices == 8 - assert trainer_config.num_nodes == 1 + assert trainer_config.num_nodes == 4 # Check strategy configuration assert isinstance(trainer_config.strategy, run.Config) assert trainer_config.strategy.__fn_or_cls__.__name__ == "MegatronStrategy" - assert trainer_config.strategy.tensor_model_parallel_size == 2 - assert trainer_config.strategy.pipeline_model_parallel_size == 4 + assert trainer_config.strategy.tensor_model_parallel_size == 4 + assert trainer_config.strategy.pipeline_model_parallel_size == 2 assert trainer_config.strategy.pipeline_dtype == torch.bfloat16 - assert trainer_config.strategy.virtual_pipeline_model_parallel_size == 5 + assert trainer_config.strategy.virtual_pipeline_model_parallel_size is None assert trainer_config.strategy.context_parallel_size == 4 assert trainer_config.strategy.sequence_parallel is True @@ -61,14 +61,37 @@ def test_pretrain_recipe_with_different_configurations(self, recipe_module, num_ assert recipe.trainer.num_nodes == num_nodes assert recipe.trainer.devices == num_gpus_per_node - def test_trainer_parallelism_options(self, recipe_module): + def test_valid_trainer_parallelism(self, recipe_module): trainer_config = recipe_module.trainer() - assert trainer_config.strategy.tensor_model_parallel_size == 2 - assert trainer_config.strategy.pipeline_model_parallel_size == 4 - assert trainer_config.strategy.pipeline_dtype == torch.bfloat16 - assert trainer_config.strategy.virtual_pipeline_model_parallel_size == 5 - assert 
trainer_config.strategy.context_parallel_size == 4 - assert trainer_config.strategy.sequence_parallel is True + + assert isinstance(trainer_config.strategy, run.Config) + assert trainer_config.strategy.__fn_or_cls__.__name__ == "MegatronStrategy" + + assert trainer_config.strategy.expert_model_parallel_size == 1 + + assert ( + trainer_config.strategy.tensor_model_parallel_size + * trainer_config.strategy.pipeline_model_parallel_size + * trainer_config.strategy.context_parallel_size + * trainer_config.strategy.expert_model_parallel_size + % trainer_config.devices + == 0 + ) + assert ( + trainer_config.strategy.tensor_model_parallel_size + * trainer_config.strategy.pipeline_model_parallel_size + * trainer_config.strategy.context_parallel_size + * trainer_config.strategy.expert_model_parallel_size + / trainer_config.devices + % trainer_config.num_nodes + == 0 + ) + + if trainer_config.strategy.pipeline_model_parallel_size != 1: + assert trainer_config.strategy.pipeline_dtype is not None + + if trainer_config.strategy.tensor_model_parallel_size == 1: + assert trainer_config.strategy.sequence_parallel is False def test_model_config_parameters(self, recipe_module): model_config = recipe_module.model() diff --git a/tests/collections/llm/recipes/test_mixtral_8x7b_16k.py b/tests/collections/llm/recipes/test_mixtral_8x7b_16k.py index 9f52b7117e82..62d6e0e31917 100644 --- a/tests/collections/llm/recipes/test_mixtral_8x7b_16k.py +++ b/tests/collections/llm/recipes/test_mixtral_8x7b_16k.py @@ -31,15 +31,15 @@ def test_trainer(self, recipe_module): assert trainer_config.__fn_or_cls__ == Trainer assert trainer_config.accelerator == "gpu" assert trainer_config.devices == 8 - assert trainer_config.num_nodes == 2 + assert trainer_config.num_nodes == 4 # Check strategy configuration assert isinstance(trainer_config.strategy, run.Config) assert trainer_config.strategy.__fn_or_cls__.__name__ == "MegatronStrategy" - assert trainer_config.strategy.tensor_model_parallel_size == 2 - assert trainer_config.strategy.pipeline_model_parallel_size == 4 + assert trainer_config.strategy.tensor_model_parallel_size == 4 + assert trainer_config.strategy.pipeline_model_parallel_size == 2 assert trainer_config.strategy.pipeline_dtype == torch.bfloat16 - assert trainer_config.strategy.virtual_pipeline_model_parallel_size == 8 + assert trainer_config.strategy.virtual_pipeline_model_parallel_size is None assert trainer_config.strategy.context_parallel_size == 4 assert trainer_config.strategy.sequence_parallel is True assert trainer_config.strategy.expert_model_parallel_size == 1 @@ -69,15 +69,35 @@ def test_pretrain_recipe_with_different_configurations(self, recipe_module, num_ assert recipe.trainer.num_nodes == num_nodes assert recipe.trainer.devices == num_gpus_per_node - def test_trainer_parallelism_options(self, recipe_module): + def test_valid_trainer_parallelism(self, recipe_module): trainer_config = recipe_module.trainer() - assert trainer_config.strategy.tensor_model_parallel_size == 2 - assert trainer_config.strategy.pipeline_model_parallel_size == 4 - assert trainer_config.strategy.pipeline_dtype == torch.bfloat16 - assert trainer_config.strategy.virtual_pipeline_model_parallel_size == 8 - assert trainer_config.strategy.context_parallel_size == 4 - assert trainer_config.strategy.sequence_parallel is True - assert trainer_config.strategy.expert_model_parallel_size == 1 + + assert isinstance(trainer_config.strategy, run.Config) + assert trainer_config.strategy.__fn_or_cls__.__name__ == "MegatronStrategy" + + assert ( + 
trainer_config.strategy.tensor_model_parallel_size + * trainer_config.strategy.pipeline_model_parallel_size + * trainer_config.strategy.context_parallel_size + * trainer_config.strategy.expert_model_parallel_size + % trainer_config.devices + == 0 + ) + assert ( + trainer_config.strategy.tensor_model_parallel_size + * trainer_config.strategy.pipeline_model_parallel_size + * trainer_config.strategy.context_parallel_size + * trainer_config.strategy.expert_model_parallel_size + / trainer_config.devices + % trainer_config.num_nodes + == 0 + ) + + if trainer_config.strategy.pipeline_model_parallel_size != 1: + assert trainer_config.strategy.pipeline_dtype is not None + + if trainer_config.strategy.tensor_model_parallel_size == 1: + assert trainer_config.strategy.sequence_parallel is False def test_model_config_parameters(self, recipe_module): model_config = recipe_module.model() diff --git a/tests/collections/llm/recipes/test_mixtral_8x7b_64k.py b/tests/collections/llm/recipes/test_mixtral_8x7b_64k.py index f508e6dfd585..9ff93a89f438 100644 --- a/tests/collections/llm/recipes/test_mixtral_8x7b_64k.py +++ b/tests/collections/llm/recipes/test_mixtral_8x7b_64k.py @@ -35,11 +35,11 @@ def test_trainer(self, recipe_module): # Check strategy configuration assert isinstance(trainer_config.strategy, run.Config) assert trainer_config.strategy.__fn_or_cls__.__name__ == "MegatronStrategy" - assert trainer_config.strategy.tensor_model_parallel_size == 4 + assert trainer_config.strategy.tensor_model_parallel_size == 8 assert trainer_config.strategy.pipeline_model_parallel_size == 4 assert trainer_config.strategy.pipeline_dtype == torch.bfloat16 - assert trainer_config.strategy.virtual_pipeline_model_parallel_size == 4 - assert trainer_config.strategy.context_parallel_size == 8 + assert trainer_config.strategy.virtual_pipeline_model_parallel_size is None + assert trainer_config.strategy.context_parallel_size == 4 assert trainer_config.strategy.sequence_parallel is True assert trainer_config.strategy.expert_model_parallel_size == 1 @@ -63,15 +63,35 @@ def test_pretrain_recipe_with_different_configurations(self, recipe_module, num_ assert recipe.trainer.num_nodes == num_nodes assert recipe.trainer.devices == num_gpus_per_node - def test_trainer_parallelism_options(self, recipe_module): + def test_valid_trainer_parallelism(self, recipe_module): trainer_config = recipe_module.trainer() - assert trainer_config.strategy.tensor_model_parallel_size == 4 - assert trainer_config.strategy.pipeline_model_parallel_size == 4 - assert trainer_config.strategy.pipeline_dtype == torch.bfloat16 - assert trainer_config.strategy.virtual_pipeline_model_parallel_size == 4 - assert trainer_config.strategy.context_parallel_size == 8 - assert trainer_config.strategy.sequence_parallel is True - assert trainer_config.strategy.expert_model_parallel_size == 1 + + assert isinstance(trainer_config.strategy, run.Config) + assert trainer_config.strategy.__fn_or_cls__.__name__ == "MegatronStrategy" + + assert ( + trainer_config.strategy.tensor_model_parallel_size + * trainer_config.strategy.pipeline_model_parallel_size + * trainer_config.strategy.context_parallel_size + * trainer_config.strategy.expert_model_parallel_size + % trainer_config.devices + == 0 + ) + assert ( + trainer_config.strategy.tensor_model_parallel_size + * trainer_config.strategy.pipeline_model_parallel_size + * trainer_config.strategy.context_parallel_size + * trainer_config.strategy.expert_model_parallel_size + / trainer_config.devices + % trainer_config.num_nodes + == 0 + 
) + + if trainer_config.strategy.pipeline_model_parallel_size != 1: + assert trainer_config.strategy.pipeline_dtype is not None + + if trainer_config.strategy.tensor_model_parallel_size == 1: + assert trainer_config.strategy.sequence_parallel is False def test_model_config_parameters(self, recipe_module): model_config = recipe_module.model() diff --git a/tests/collections/llm/recipes/test_nemotron4_15b_16k.py b/tests/collections/llm/recipes/test_nemotron4_15b_16k.py index e0b4e1f56eb8..6c1f5d90e160 100644 --- a/tests/collections/llm/recipes/test_nemotron4_15b_16k.py +++ b/tests/collections/llm/recipes/test_nemotron4_15b_16k.py @@ -47,3 +47,35 @@ def test_pretrain_recipe_with_different_configurations(self, recipe_module, num_ recipe = recipe_module.pretrain_recipe(num_nodes=num_nodes, num_gpus_per_node=num_gpus_per_node) assert recipe.trainer.num_nodes == num_nodes assert recipe.trainer.devices == num_gpus_per_node + + def test_valid_trainer_parallelism(self, recipe_module): + trainer_config = recipe_module.pretrain_recipe().trainer + + assert isinstance(trainer_config.strategy, run.Config) + assert trainer_config.strategy.__fn_or_cls__.__name__ == "MegatronStrategy" + + assert trainer_config.strategy.expert_model_parallel_size == 1 + + assert ( + trainer_config.strategy.tensor_model_parallel_size + * trainer_config.strategy.pipeline_model_parallel_size + * trainer_config.strategy.context_parallel_size + * trainer_config.strategy.expert_model_parallel_size + % trainer_config.devices + == 0 + ) + assert ( + trainer_config.strategy.tensor_model_parallel_size + * trainer_config.strategy.pipeline_model_parallel_size + * trainer_config.strategy.context_parallel_size + * trainer_config.strategy.expert_model_parallel_size + / trainer_config.devices + % trainer_config.num_nodes + == 0 + ) + + if trainer_config.strategy.pipeline_model_parallel_size != 1: + assert trainer_config.strategy.pipeline_dtype is not None + + if trainer_config.strategy.tensor_model_parallel_size == 1: + assert trainer_config.strategy.sequence_parallel is False diff --git a/tests/collections/llm/recipes/test_nemotron4_15b_64k.py b/tests/collections/llm/recipes/test_nemotron4_15b_64k.py index 9525039eb90e..8ed35fb81893 100644 --- a/tests/collections/llm/recipes/test_nemotron4_15b_64k.py +++ b/tests/collections/llm/recipes/test_nemotron4_15b_64k.py @@ -47,3 +47,35 @@ def test_pretrain_recipe_with_different_configurations(self, recipe_module, num_ recipe = recipe_module.pretrain_recipe(num_nodes=num_nodes, num_gpus_per_node=num_gpus_per_node) assert recipe.trainer.num_nodes == num_nodes assert recipe.trainer.devices == num_gpus_per_node + + def test_valid_trainer_parallelism(self, recipe_module): + trainer_config = recipe_module.pretrain_recipe().trainer + + assert isinstance(trainer_config.strategy, run.Config) + assert trainer_config.strategy.__fn_or_cls__.__name__ == "MegatronStrategy" + + assert trainer_config.strategy.expert_model_parallel_size == 1 + + assert ( + trainer_config.strategy.tensor_model_parallel_size + * trainer_config.strategy.pipeline_model_parallel_size + * trainer_config.strategy.context_parallel_size + * trainer_config.strategy.expert_model_parallel_size + % trainer_config.devices + == 0 + ) + assert ( + trainer_config.strategy.tensor_model_parallel_size + * trainer_config.strategy.pipeline_model_parallel_size + * trainer_config.strategy.context_parallel_size + * trainer_config.strategy.expert_model_parallel_size + / trainer_config.devices + % trainer_config.num_nodes + == 0 + ) + + if 
trainer_config.strategy.pipeline_model_parallel_size != 1: + assert trainer_config.strategy.pipeline_dtype is not None + + if trainer_config.strategy.tensor_model_parallel_size == 1: + assert trainer_config.strategy.sequence_parallel is False diff --git a/tests/collections/llm/recipes/test_nemotron4_22b_16k.py b/tests/collections/llm/recipes/test_nemotron4_22b_16k.py index 1e501b447d45..6b4a581348e0 100644 --- a/tests/collections/llm/recipes/test_nemotron4_22b_16k.py +++ b/tests/collections/llm/recipes/test_nemotron4_22b_16k.py @@ -47,3 +47,35 @@ def test_pretrain_recipe_with_different_configurations(self, recipe_module, num_ recipe = recipe_module.pretrain_recipe(num_nodes=num_nodes, num_gpus_per_node=num_gpus_per_node) assert recipe.trainer.num_nodes == num_nodes assert recipe.trainer.devices == num_gpus_per_node + + def test_valid_trainer_parallelism(self, recipe_module): + trainer_config = recipe_module.pretrain_recipe().trainer + + assert isinstance(trainer_config.strategy, run.Config) + assert trainer_config.strategy.__fn_or_cls__.__name__ == "MegatronStrategy" + + assert trainer_config.strategy.expert_model_parallel_size == 1 + + assert ( + trainer_config.strategy.tensor_model_parallel_size + * trainer_config.strategy.pipeline_model_parallel_size + * trainer_config.strategy.context_parallel_size + * trainer_config.strategy.expert_model_parallel_size + % trainer_config.devices + == 0 + ) + assert ( + trainer_config.strategy.tensor_model_parallel_size + * trainer_config.strategy.pipeline_model_parallel_size + * trainer_config.strategy.context_parallel_size + * trainer_config.strategy.expert_model_parallel_size + / trainer_config.devices + % trainer_config.num_nodes + == 0 + ) + + if trainer_config.strategy.pipeline_model_parallel_size != 1: + assert trainer_config.strategy.pipeline_dtype is not None + + if trainer_config.strategy.tensor_model_parallel_size == 1: + assert trainer_config.strategy.sequence_parallel is False diff --git a/tests/collections/llm/recipes/test_nemotron4_22b_64k.py b/tests/collections/llm/recipes/test_nemotron4_22b_64k.py index c37a45793aff..68a238a93338 100644 --- a/tests/collections/llm/recipes/test_nemotron4_22b_64k.py +++ b/tests/collections/llm/recipes/test_nemotron4_22b_64k.py @@ -47,3 +47,35 @@ def test_pretrain_recipe_with_different_configurations(self, recipe_module, num_ recipe = recipe_module.pretrain_recipe(num_nodes=num_nodes, num_gpus_per_node=num_gpus_per_node) assert recipe.trainer.num_nodes == num_nodes assert recipe.trainer.devices == num_gpus_per_node + + def test_valid_trainer_parallelism(self, recipe_module): + trainer_config = recipe_module.pretrain_recipe().trainer + + assert isinstance(trainer_config.strategy, run.Config) + assert trainer_config.strategy.__fn_or_cls__.__name__ == "MegatronStrategy" + + assert trainer_config.strategy.expert_model_parallel_size == 1 + + assert ( + trainer_config.strategy.tensor_model_parallel_size + * trainer_config.strategy.pipeline_model_parallel_size + * trainer_config.strategy.context_parallel_size + * trainer_config.strategy.expert_model_parallel_size + % trainer_config.devices + == 0 + ) + assert ( + trainer_config.strategy.tensor_model_parallel_size + * trainer_config.strategy.pipeline_model_parallel_size + * trainer_config.strategy.context_parallel_size + * trainer_config.strategy.expert_model_parallel_size + / trainer_config.devices + % trainer_config.num_nodes + == 0 + ) + + if trainer_config.strategy.pipeline_model_parallel_size != 1: + assert trainer_config.strategy.pipeline_dtype is not None + 
+ if trainer_config.strategy.tensor_model_parallel_size == 1: + assert trainer_config.strategy.sequence_parallel is False From c82a5976404a3bd458ff2d4577d7e1c5aeab8f24 Mon Sep 17 00:00:00 2001 From: Alexandros Koumparoulis <153118171+akoumpa@users.noreply.github.com> Date: Fri, 18 Oct 2024 11:43:15 -0700 Subject: [PATCH 14/37] Akoumparouli/nemo ux fix dir or string artifact (#10936) * Add __repr__ to Artifact Signed-off-by: Alexandros Koumparoulis * nemo.lightning.io.artifact: represent strings as fdl.Config to avoid path adjustment during restoration Signed-off-by: Alexandros Koumparoulis * t5 test minification Signed-off-by: Alexandros Koumparoulis * Apply isort and black reformatting Signed-off-by: akoumpa --------- Signed-off-by: Alexandros Koumparoulis Signed-off-by: akoumpa Co-authored-by: akoumpa --- nemo/lightning/io/artifact/base.py | 7 +++++-- nemo/lightning/io/artifact/file.py | 7 +++---- nemo/lightning/io/mixin.py | 9 +++++++++ tests/collections/llm/megatron_t5_pretraining.py | 8 ++++---- 4 files changed, 21 insertions(+), 10 deletions(-) diff --git a/nemo/lightning/io/artifact/base.py b/nemo/lightning/io/artifact/base.py index ec451de9753b..7d2d608c4149 100644 --- a/nemo/lightning/io/artifact/base.py +++ b/nemo/lightning/io/artifact/base.py @@ -6,10 +6,10 @@ class Artifact(ABC, Generic[ValueT]): - def __init__(self, attr: str, required: bool = True): + def __init__(self, attr: str, required: bool = True, skip: bool = False): self.attr = attr self.required = required - self.skip = False + self.skip = skip @abstractmethod def dump(self, value: ValueT, absolute_dir: Path, relative_dir: Path) -> ValueT: @@ -18,3 +18,6 @@ def dump(self, value: ValueT, absolute_dir: Path, relative_dir: Path) -> ValueT: @abstractmethod def load(self, path: Path) -> ValueT: pass + + def __repr__(self): + return f"{type(self).__name__}(skip= {self.skip}, attr= {self.attr}, required= {self.required})" diff --git a/nemo/lightning/io/artifact/file.py b/nemo/lightning/io/artifact/file.py index 1364468cde0a..1cd63b706c9a 100644 --- a/nemo/lightning/io/artifact/file.py +++ b/nemo/lightning/io/artifact/file.py @@ -2,6 +2,7 @@ import shutil from pathlib import Path from typing import Union +import fiddle as fdl from nemo.lightning.io.artifact.base import Artifact @@ -19,8 +20,7 @@ class FileArtifact(Artifact[str]): def dump(self, value: str, absolute_dir: Path, relative_dir: Path) -> str: if not pathize(value).exists(): # This is Artifact is just a string. - self.skip = True - return value + return fdl.Config(FileArtifact, attr=value, skip=True) new_value = copy_file(value, absolute_dir, relative_dir) return str(new_value) @@ -65,8 +65,7 @@ class DirOrStringArtifact(DirArtifact): def dump(self, value: str, absolute_dir: Path, relative_dir: Path) -> str: if not pathize(value).exists(): # This is Artifact is just a string. - self.skip = True - return value + return fdl.Config(DirOrStringArtifact, attr=value, skip=True) return super().dump(value, absolute_dir, relative_dir) def load(self, path: str) -> str: diff --git a/nemo/lightning/io/mixin.py b/nemo/lightning/io/mixin.py index 463c18065494..27cb3b18b55b 100644 --- a/nemo/lightning/io/mixin.py +++ b/nemo/lightning/io/mixin.py @@ -609,6 +609,15 @@ def _artifact_transform_save(cfg: fdl.Config, output_path: Path, relative_dir: P def _artifact_transform_load(cfg: fdl.Config, path: Path): for artifact in getattr(cfg.__fn_or_cls__, "__io_artifacts__", []): + # We expect an artifact.attr to be a string or a fdl.Config. 
+ # Some parameteres can be a string or a filepath. When those parameters are just strings, + # we will represent it with a fdl.Config, and will skip the rest of the loop (base-dir adjustment). + current_val = getattr(cfg, artifact.attr) + if isinstance(current_val, fdl.Config): + # artifact.attr is a string not a path. + setattr(cfg, artifact.attr, fdl.build(current_val).attr) + continue + if artifact.skip: continue current_val = getattr(cfg, artifact.attr) diff --git a/tests/collections/llm/megatron_t5_pretraining.py b/tests/collections/llm/megatron_t5_pretraining.py index 5d8f55a7f26f..29d7eb2ebf2b 100644 --- a/tests/collections/llm/megatron_t5_pretraining.py +++ b/tests/collections/llm/megatron_t5_pretraining.py @@ -59,16 +59,16 @@ def get_args(): paths=args.data_path, seq_length=512, seq_length_dec=128, - micro_batch_size=64, - global_batch_size=512, + micro_batch_size=args.devices, + global_batch_size=2 * args.devices, seed=1234, tokenizer=tokenizer, split="99982,9,9", index_mapping_dir=args.index_mapping_dir, ) t5_config = llm.t5.model.t5.T5Config( - num_layers=12, - encoder_num_layers=12, + num_layers=args.devices, + encoder_num_layers=args.devices, hidden_size=768, ffn_hidden_size=3072, num_attention_heads=12, From 448ff8cd74ff375350ce54c2427984619431c846 Mon Sep 17 00:00:00 2001 From: Dmytro Pykhtar <37850217+dimapihtar@users.noreply.github.com> Date: Fri, 18 Oct 2024 22:55:23 +0300 Subject: [PATCH 15/37] ckpt convert bug fixes (#10878) * Mistral-NeMo-12B recipe Signed-off-by: Alexandros Koumparoulis * rename mistral to mistral_7b Signed-off-by: Alexandros Koumparoulis * include mistral_nemo_12b in __init__ Signed-off-by: Alexandros Koumparoulis * Apply isort and black reformatting Signed-off-by: akoumpa * add to __init__ Signed-off-by: Alexandros Koumparoulis * Apply isort and black reformatting Signed-off-by: akoumpa * Remove stale imports Signed-off-by: Alexandros Koumparoulis * TP=2 Signed-off-by: Alexandros Koumparoulis * remove finetune_reci[e Signed-off-by: Alexandros Koumparoulis * Rename MistralNeMo2407Config12B to MistralNeMoConfig12B per review's suggestion Signed-off-by: Alexandros Koumparoulis * update config names in tests Signed-off-by: Alexandros Koumparoulis * mistral-nemo-12b from llama_8b Signed-off-by: Alexandros Koumparoulis * TP=2; SP=True Signed-off-by: Alexandros Koumparoulis * fix overlap value Signed-off-by: Alexandros Koumparoulis * Apply isort and black reformatting Signed-off-by: akoumpa * update mistral-nemo-base-12b finetune recipe Signed-off-by: Alexandros Koumparoulis * Apply isort and black reformatting Signed-off-by: akoumpa * bug fix Signed-off-by: dimapihtar * Apply isort and black reformatting Signed-off-by: dimapihtar * remove extra file Signed-off-by: dimapihtar * remove extra changes Signed-off-by: dimapihtar * revert changes Signed-off-by: dimapihtar * add ckpt_format configurable Signed-off-by: dimapihtar * Apply isort and black reformatting Signed-off-by: dimapihtar * Apply isort and black reformatting Signed-off-by: artbataev * revert changes Signed-off-by: dimapihtar * Apply isort and black reformatting Signed-off-by: dimapihtar --------- Signed-off-by: Alexandros Koumparoulis Signed-off-by: akoumpa Signed-off-by: dimapihtar Signed-off-by: dimapihtar Signed-off-by: artbataev Co-authored-by: Alexandros Koumparoulis Co-authored-by: akoumpa Co-authored-by: dimapihtar Co-authored-by: artbataev --- .../megatron_ckpt_to_nemo.py | 10 ++++--- .../convert_zarr_to_torch_dist.py | 28 +++++++++++++------ 2 files changed, 26 insertions(+), 12 
deletions(-) diff --git a/examples/nlp/language_modeling/megatron_ckpt_to_nemo.py b/examples/nlp/language_modeling/megatron_ckpt_to_nemo.py index 40ba35f819ef..c81119489582 100644 --- a/examples/nlp/language_modeling/megatron_ckpt_to_nemo.py +++ b/examples/nlp/language_modeling/megatron_ckpt_to_nemo.py @@ -21,7 +21,9 @@ --checkpoint_name \ --nemo_file_path \ --tensor_model_parallel_size \ - --pipeline_model_parallel_size + --pipeline_model_parallel_size \ + --gpus_per_node \ + --model_type """ import dis @@ -100,7 +102,7 @@ def get_args(): default="gpt", choices=["gpt", "sft", "t5", "bert", "nmt", "bart", "retro"], ) - parser.add_argument("--local_rank", type=int, required=False, default=os.getenv('LOCAL_RANK', -1)) + parser.add_argument("--local-rank", type=int, required=False, default=os.getenv('LOCAL_RANK', -1)) parser.add_argument("--bcp", action="store_true", help="Whether on BCP platform") parser.add_argument( "--precision", @@ -134,7 +136,7 @@ def convert(local_rank, rank, world_size, args): 'accelerator': 'gpu', 'precision': args.precision, }, - 'model': {'native_amp_init_scale': 2 ** 32, 'native_amp_growth_interval': 1000, 'hysteresis': 2}, + 'model': {'native_amp_init_scale': 2**32, 'native_amp_growth_interval': 1000, 'hysteresis': 2}, } cfg = OmegaConf.create(cfg) @@ -142,7 +144,7 @@ def convert(local_rank, rank, world_size, args): # If FP16 create a GradScaler as the build_model_parallel_config of MegatronBaseModel expects it if cfg.trainer.precision == '16-mixed': scaler = GradScaler( - init_scale=cfg.model.get('native_amp_init_scale', 2 ** 32), + init_scale=cfg.model.get('native_amp_init_scale', 2**32), growth_interval=cfg.model.get('native_amp_growth_interval', 1000), hysteresis=cfg.model.get('hysteresis', 2), ) diff --git a/scripts/checkpoint_converters/convert_zarr_to_torch_dist.py b/scripts/checkpoint_converters/convert_zarr_to_torch_dist.py index 29b56aa706fa..eeaee9aba461 100644 --- a/scripts/checkpoint_converters/convert_zarr_to_torch_dist.py +++ b/scripts/checkpoint_converters/convert_zarr_to_torch_dist.py @@ -16,14 +16,13 @@ Conversion script to convert zarr checkpoints into torch distributed checkpoint. Example to run this conversion script: python -m torch.distributed.launch --nproc_per_node= * \ - megatron_zarr_ckpt_to_torch_dist.py \ + convert_zarr_to_torch_dist.py \ --model_type \ --checkpoint_folder \ --checkpoint_name \ --path_to_save \ --tensor_model_parallel_size \ --pipeline_model_parallel_size \ - --hparams_file \ --gpus_per_node """ @@ -64,12 +63,14 @@ def get_args(): "--hparams_file", type=str, default=None, - required=True, + required=False, help="Path config for restoring. It's created during training and may need to be modified during restore if restore environment is different than training. 
Ex: /raid/nemo_experiments/megatron_gpt/hparams.yaml", ) parser.add_argument("--path_to_save", type=str, default=None, required=True, help="Path to output ckpt files.") parser.add_argument( - "--save_to_nemo", action="store_true", help="If passed, output will be written as .nemo file.", + "--save_to_nemo", + action="store_true", + help="If passed, output will be written as .nemo file.", ) parser.add_argument("--gpus_per_node", type=int, required=True, default=None) parser.add_argument("--tensor_model_parallel_size", type=int, required=True, default=None) @@ -81,7 +82,7 @@ def get_args(): default=None, help="If pipeline parallel size > 1, this is the rank at which the encoder ends and the decoder begins.", ) - parser.add_argument("--local_rank", type=int, required=False, default=os.getenv('LOCAL_RANK', -1)) + parser.add_argument("--local-rank", type=int, required=False, default=os.getenv('LOCAL_RANK', -1)) parser.add_argument("--cluster_type", required=False, default=None, help="Whether on BCP platform") parser.add_argument( "--precision", @@ -93,7 +94,18 @@ def get_args(): ) parser.add_argument( - "--model_type", type=str, required=True, default="gpt", choices=["gpt", "sft", "bert"], + "--model_type", + type=str, + required=True, + default="gpt", + choices=["gpt", "sft", "bert"], + ), + parser.add_argument( + "--ckpt_format", + type=str, + required=False, + default="torch_dist", + choices=["zarr", "torch_dist"], ) args = parser.parse_args() @@ -114,7 +126,7 @@ def convert(local_rank, rank, world_size, args): 'precision': args.precision, }, 'model': { - 'native_amp_init_scale': 2 ** 32, + 'native_amp_init_scale': 2**32, 'native_amp_growth_interval': 1000, 'hysteresis': 2, 'gradient_as_bucket_view': True, @@ -167,7 +179,7 @@ def convert(local_rank, rank, world_size, args): ) with open_dict(model.cfg): - model.cfg.torch_distributed_checkpoint = True + model.cfg.dist_ckpt_format = args.ckpt_format model._save_restore_connector = NLPSaveRestoreConnector() save_file_path = args.path_to_save From 4d53061900597b7f28fec216f9adc250c85d2a16 Mon Sep 17 00:00:00 2001 From: Anna Shors <71393111+ashors1@users.noreply.github.com> Date: Fri, 18 Oct 2024 22:13:10 -0700 Subject: [PATCH 16/37] fix typo in docstring (#10955) Signed-off-by: ashors1 --- nemo/lightning/pytorch/callbacks/model_checkpoint.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/nemo/lightning/pytorch/callbacks/model_checkpoint.py b/nemo/lightning/pytorch/callbacks/model_checkpoint.py index 5244939eb5fb..adf890a8fb11 100644 --- a/nemo/lightning/pytorch/callbacks/model_checkpoint.py +++ b/nemo/lightning/pytorch/callbacks/model_checkpoint.py @@ -49,7 +49,7 @@ class ModelCheckpoint(PTLModelCheckpoint): ``every_n_epochs`` or ``every_n_train_steps``. save_on_train_epoch_end: Whether to run checkpointing at the end of the training epoch save_optim_on_train_end: Whether to include the optimizer states in the final checkpoint - at the end of training. Only applicable when save_weights_only is ``True``. + at the end of training. Only applicable when save_weights_only is ``False``. always_save_context: Whether to dump the artifacts needed to reinintialize the current model, trainer, and dataloader to allow for reproducibility of experiments. 
save_context_on_train_end: Whether to dump the artifacts on_train_end regardless of whether From 32454c7dec33ac93ff5cb410c561f4beec53afdb Mon Sep 17 00:00:00 2001 From: Dmytro Pykhtar <37850217+dimapihtar@users.noreply.github.com> Date: Sat, 19 Oct 2024 18:21:23 +0300 Subject: [PATCH 17/37] remove deprecated ci tests (#10922) * remove deprecated tutorial Signed-off-by: dimapihtar * remove deprecated ci tests Signed-off-by: dimapihtar * add deprecation note Signed-off-by: dimapihtar * add deprecation note Signed-off-by: dimapihtar * remove bart tests Signed-off-by: dimapihtar --------- Signed-off-by: dimapihtar --- .github/workflows/cicd-main.yml | 1211 +---------------- .../nlp/duplex_text_normalization/README.md | 2 + examples/nlp/token_classification/README.md | 2 + ...Joint_Intent_and_Slot_Classification.ipynb | 827 ----------- 4 files changed, 9 insertions(+), 2033 deletions(-) create mode 100644 examples/nlp/duplex_text_normalization/README.md create mode 100644 examples/nlp/token_classification/README.md delete mode 100644 tutorials/nlp/Joint_Intent_and_Slot_Classification.ipynb diff --git a/.github/workflows/cicd-main.yml b/.github/workflows/cicd-main.yml index 4cd7edde2e3d..b576ddfd4d50 100644 --- a/.github/workflows/cicd-main.yml +++ b/.github/workflows/cicd-main.yml @@ -1333,275 +1333,6 @@ jobs: pretrained_model=${OUTPUT_DIR}/HeteronymClassification/test/checkpoints/HeteronymClassification.nemo \ output_manifest=preds.json - # L2: Duplex Text Normalization - L2_Duplex_Text_Normalization_with_Tarred_dataset: - needs: [cicd-test-container-setup] - uses: ./.github/workflows/_test_template.yml - if: contains(fromJSON(needs.cicd-test-container-setup.outputs.test_to_run), 'L2_Duplex_Text_Normalization_with_Tarred_dataset') || needs.cicd-test-container-setup.outputs.all == 'true' - with: - RUNNER: self-hosted-azure - SCRIPT: | - cd examples/nlp/duplex_text_normalization && \ - python duplex_text_normalization_train.py \ - data.validation_ds.data_path=/home/TestData/nlp/duplex_text_norm/small_test.tsv \ - mode=tn \ - lang=en \ - tagger_model.do_training=false \ - decoder_model.transformer=t5-small \ - data.validation_ds.batch_size=2 \ - data.train_ds.use_cache=false \ - data.validation_ds.use_cache=false \ - data.test_ds.batch_size=2 \ - data.train_ds.decoder_data_augmentation=false \ - data.train_ds.num_workers=2 \ - decoder_trainer.devices=[0,1] \ - decoder_trainer.accelerator="gpu" \ - data.train_ds.use_tarred_dataset=true \ - +decoder_trainer.fast_dev_run=true \ - decoder_exp_manager.create_checkpoint_callback=false \ - data.train_ds.tar_metadata_file=/home/TestData/nlp/duplex_text_norm/tarred_small/metadata.json \ - data.test_ds.use_cache=false \ - data.test_ds.data_path=/home/TestData/nlp/duplex_text_norm/small_test.tsv - - # L2: Intent and Slot Classification Tasks - L2_Intent_and_Slot_Classification_Tasks_Intent_and_Slot_Classification: - needs: [cicd-test-container-setup] - uses: ./.github/workflows/_test_template.yml - if: contains(fromJSON(needs.cicd-test-container-setup.outputs.test_to_run), 'L2_Intent_and_Slot_Classification_Tasks_Intent_and_Slot_Classification') || needs.cicd-test-container-setup.outputs.all == 'true' - with: - RUNNER: self-hosted-azure-gpus-1 - SCRIPT: | - cd examples/nlp/intent_slot_classification && \ - python intent_slot_classification.py \ - model.data_dir=/home/TestData/nlp/retail \ - model.validation_ds.prefix=dev \ - model.test_ds.prefix=dev \ - trainer.devices=1 \ - trainer.accelerator="gpu" \ - +trainer.fast_dev_run=true \ - 
exp_manager.exp_dir=checkpoints - AFTER_SCRIPT: | - rm -rf checkpoints - - L2_Intent_and_Slot_Classification_Tasks_Multi-Label_Intent_and_Slot_Classification: - needs: [cicd-test-container-setup] - uses: ./.github/workflows/_test_template.yml - if: contains(fromJSON(needs.cicd-test-container-setup.outputs.test_to_run), 'L2_Intent_and_Slot_Classification_Tasks_Multi-Label_Intent_and_Slot_Classification') || needs.cicd-test-container-setup.outputs.all == 'true' - with: - RUNNER: self-hosted-azure-gpus-1 - SCRIPT: | - cd examples/nlp/intent_slot_classification && \ - python multi_label_intent_slot_classification.py \ - model.data_dir=/home/TestData/nlp/new_multiatis \ - model.validation_ds.prefix=dev \ - model.test_ds.prefix=dev \ - trainer.devices=1 \ - +trainer.fast_dev_run=true \ - exp_manager.exp_dir=checkpoints2 - AFTER_SCRIPT: | - rm -rf checkpoints2 - - # TODO: add when megatron-bert is supported again - # stage("L2: Model Parallel Size 2 Megatron Text Classification") { - # when { - # anyOf{ - # branch "main" - # changeRequest target: "main" - # } - # } - # failFast true - # steps{ - # cd examples/nlp/text_classification && \ - # python text_classification_with_bert.py \ - # trainer.devices=[0,1] \ - # trainer.accelerator="gpu" \ - # trainer.num_nodes=1 \ - # trainer.precision=16 \ - # trainer.gradient_clip_val=1.0 \ - # +trainer.fast_dev_run=true \ - # model.dataset.num_classes=6 \ - # model.train_ds.file_path=/home/TestData/nlp/retail_text_classification/train.tsv \ - # model.train_ds.batch_size=4 \ - # model.language_model.pretrained_model_name=megatron-bert-uncased \ - # model.language_model.config_file=/home/TestData/nlp/mp_2_bert_toy/config.json \ - # model.language_model.lm_checkpoint=/home/TestData/nlp/mp_2_bert_toy/iter_2000000 \ - # model.nemo_path=null \ - # ~model.infer_samples \ - # exp_manager=null - # } - # } - - # stage("L2: Model Parallel Size 2 Megatron Autoresume") { - # when { - # anyOf{ - # branch "main" - # changeRequest target: "main" - # } - # } - # failFast true - # steps{ - # cd examples/nlp/text_classification && \ - # python text_classification_with_bert.py \ - # trainer.devices=[0,1] \ - # trainer.accelerator="gpu" \ - # trainer.num_nodes=1 \ - # trainer.precision=16 \ - # trainer.gradient_clip_val=1.0 \ - # trainer.max_epochs=1 \ - # +trainer.fast_dev_run=true \ - # model.dataset.num_classes=6 \ - # model.train_ds.file_path=/home/TestData/nlp/retail_text_classification/train.tsv \ - # model.train_ds.batch_size=4 \ - # model.language_model.pretrained_model_name=megatron-bert-uncased \ - # model.language_model.config_file=/home/TestData/nlp/mp_2_bert_toy/config.json \ - # model.language_model.lm_checkpoint=/home/TestData/nlp/mp_2_bert_toy/iter_2000000 \ - # model.nemo_path=null \ - # ~model.infer_samples \ - # +exp_manager.explicit_log_dir=/home/TestData/nlp/mp_autoresume \ - # +exp_manager.resume_if_exists=true - # } - # } - - # stage("L2: Model Parallel Size 2 Megatron Evaluation from .nemo") { - # when { - # anyOf{ - # branch "main" - # changeRequest target: "main" - # } - # } - # failFast true - # steps{ - # cd examples/nlp/text_classification && \ - # python model_parallel_text_classification_evaluation.py \ - # trainer.devices=[0,1] \ - # trainer.accelerator="gpu" \ - # trainer.num_nodes=1 \ - # model.dataset.num_classes=6 \ - # model.test_ds.file_path=/home/TestData/nlp/retail_text_classification/dev.tsv \ - # model.nemo_path=/home/TestData/nlp/mp_2_nemo/retail_text_class_350M.nemo \ - # exp_manager=null - # } - # } - - # stage("L2: Model Parallel 
Size 2 Megatron Train from .nemo") { - # when { - # anyOf{ - # branch "main" - # changeRequest target: "main" - # } - # } - # failFast true - # steps{ - # cd examples/nlp/token_classification && \ - # python token_classification_train.py \ - # pretrained_model=/home/TestData/nlp/mp_2_nemo/ner_350M.nemo \ - # model.dataset.data_dir=/home/TestData/nlp/ner/ \ - # model.train_ds.batch_size=2 \ - # model.dataset.use_cache=false \ - # trainer.devices=[0,1] \ - # trainer.accelerator="gpu" \ - # +trainer.fast_dev_run=true \ - # model.dataset.class_balancing="weighted_loss" \ - # exp_manager=null - # } - # } - - - # L2: Parallel NLP Examples 2 - L2_Parallel_NLP_Examples2_NER_finetuning_from_pretrained_Test: - needs: [cicd-test-container-setup] - uses: ./.github/workflows/_test_template.yml - if: contains(fromJSON(needs.cicd-test-container-setup.outputs.test_to_run), 'L2_Parallel_NLP_Examples2_NER_finetuning_from_pretrained_Test') || needs.cicd-test-container-setup.outputs.all == 'true' - with: - RUNNER: self-hosted-azure-gpus-1 - SCRIPT: | - cd examples/nlp/token_classification && \ - python token_classification_train.py \ - pretrained_model=ner_en_bert \ - model.dataset.data_dir=/home/TestData/nlp/ner/ \ - model.train_ds.batch_size=2 \ - model.dataset.use_cache=false \ - trainer.devices=1 \ - trainer.accelerator="gpu" \ - +trainer.fast_dev_run=true \ - model.dataset.class_balancing="weighted_loss" \ - exp_manager.exp_dir=null - - L2_Parallel_NLP_Examples2_Punctuation_and_capitalization_finetuning_from_pretrained_test: - needs: [cicd-test-container-setup] - uses: ./.github/workflows/_test_template.yml - if: contains(fromJSON(needs.cicd-test-container-setup.outputs.test_to_run), 'L2_Parallel_NLP_Examples2_Punctuation_and_capitalization_finetuning_from_pretrained_test') || needs.cicd-test-container-setup.outputs.all == 'true' - with: - RUNNER: self-hosted-azure-gpus-1 - SCRIPT: | - cd examples/nlp/token_classification && \ - data_dir="$(mktemp -d -p "$(pwd)")" && \ - cp /home/TestData/nlp/token_classification_punctuation/*.txt "${data_dir}"/ && \ - python punctuation_capitalization_train_evaluate.py \ - pretrained_model=punctuation_en_bert \ - model.train_ds.ds_item="${data_dir}" \ - model.validation_ds.ds_item="${data_dir}" \ - model.test_ds.ds_item="${data_dir}" \ - +model.train_ds.use_cache=false \ - +model.validation_ds.use_cache=false \ - +model.test_ds.use_cache=false \ - trainer.devices=1 \ - trainer.accelerator="gpu" \ - +trainer.fast_dev_run=true \ - exp_manager.exp_dir=null; - - rm -rf "${data_dir}" - - L2_Parallel_NLP_Examples2_NER_with_TurkuNLP__bert-base-finnish-cased-v1: - needs: [cicd-test-container-setup] - uses: ./.github/workflows/_test_template.yml - if: contains(fromJSON(needs.cicd-test-container-setup.outputs.test_to_run), 'L2_Parallel_NLP_Examples2_NER_with_TurkuNLP__bert-base-finnish-cased-v1') || needs.cicd-test-container-setup.outputs.all == 'true' - with: - RUNNER: self-hosted-azure-gpus-1 - SCRIPT: | - cd examples/nlp/token_classification && \ - python token_classification_train.py \ - model.dataset.data_dir=/home/TestData/nlp/token_classification_punctuation/ \ - trainer.devices=1 \ - trainer.accelerator="gpu" \ - +trainer.fast_dev_run=true \ - model.dataset.use_cache=false \ - model.language_model.pretrained_model_name="TurkuNLP/bert-base-finnish-cased-v1" \ - exp_manager.exp_dir=null - - L2_Parallel_NLP_Examples2_Evaluation_script_for_Token_Classification: - needs: [cicd-test-container-setup] - uses: ./.github/workflows/_test_template.yml - if: 
contains(fromJSON(needs.cicd-test-container-setup.outputs.test_to_run), 'L2_Parallel_NLP_Examples2_Evaluation_script_for_Token_Classification') || needs.cicd-test-container-setup.outputs.all == 'true' - with: - RUNNER: self-hosted-azure - SCRIPT: | - python examples/nlp/token_classification/token_classification_evaluate.py \ - model.dataset.data_dir=/home/TestData/nlp/ner/ \ - model.dataset.use_cache=false \ - pretrained_model=/home/TestData/nlp/pretrained_models/NER_Model_with_BERT_base_uncased.nemo - - L2_Parallel_NLP_Examples2_Evaluation_script_for_Punctuation: - needs: [cicd-test-container-setup] - uses: ./.github/workflows/_test_template.yml - if: contains(fromJSON(needs.cicd-test-container-setup.outputs.test_to_run), 'L2_Parallel_NLP_Examples2_Evaluation_script_for_Punctuation') || needs.cicd-test-container-setup.outputs.all == 'true' - with: - RUNNER: self-hosted-azure - SCRIPT: | - data_dir="$(mktemp -d -p "$(pwd)")" && \ - cp /home/TestData/nlp/token_classification_punctuation/*.txt "${data_dir}"/ && \ - python examples/nlp/token_classification/punctuation_capitalization_train_evaluate.py \ - +do_training=false \ - +do_testing=true \ - model.test_ds.ds_item="${data_dir}" \ - ~model.train_ds \ - ~model.validation_ds \ - +model.test_ds.use_cache=false \ - pretrained_model=/home/TestData/nlp/pretrained_models/Punctuation_Capitalization_with_DistilBERT_base_uncased.nemo; - - rm -rf "${data_dir}" - - # L2: Parallel Pretraining BERT pretraining from Text/Preprocessed L2_Pretraining_BERT_pretraining_from_Text: needs: [cicd-test-container-setup] @@ -1978,313 +1709,6 @@ jobs: model.decoder_tokenizer.model=/home/TestData/nlp/nmt/toy_data/spm_64k_all_langs_plus_en.model AFTER_SCRIPT: | rm -rf examples/nlp/machine_translation/megatron_nmt_results - - L2_Megatron_BART_Perceiver_MIM_Training_TP2: - needs: [cicd-test-container-setup] - uses: ./.github/workflows/_test_template.yml - if: contains(fromJSON(needs.cicd-test-container-setup.outputs.test_to_run), 'L2_Megatron_BART_Perceiver_MIM_Training_TP2') || needs.cicd-test-container-setup.outputs.all == 'true' - with: - RUNNER: self-hosted-azure - SCRIPT: | - python examples/nlp/language_modeling/megatron_bart_pretraining.py \ - trainer.devices=2 \ - trainer.accelerator=gpu \ - trainer.log_every_n_steps=1 \ - trainer.val_check_interval=10 \ - trainer.limit_val_batches=2 \ - trainer.accumulate_grad_batches=1 \ - trainer.max_steps=10 \ - trainer.precision=16 \ - trainer.gradient_clip_val=1.0 \ - exp_manager.exp_dir=examples/nlp/language_modeling/megatron_mim_results \ - model.tensor_model_parallel_size=2 \ - model.seq_length=128 \ - model.encoder.num_layers=4 \ - model.encoder.hidden_size=64 \ - model.encoder.arch=perceiver \ - model.encoder.num_attention_heads=8 \ - model.encoder.activation="swiglu" \ - model.encoder.masked_softmax_fusion=False \ - model.encoder.bias_activation_fusion=False \ - model.encoder.activations_checkpoint_method="block" \ - model.encoder.activations_checkpoint_num_layers=1 \ - model.decoder.num_layers=2 \ - model.decoder.hidden_size=64 \ - model.decoder.num_attention_heads=8 \ - model.decoder.activation="swiglu" \ - model.decoder.masked_softmax_fusion=False \ - model.decoder.bias_activation_fusion=False \ - model.decoder.activations_checkpoint_method="block" \ - model.decoder.activations_checkpoint_num_layers=1 \ - model.micro_batch_size=2 \ - model.global_batch_size=4 \ - model.data.data_impl=text_mmap \ - model.data.data_prefix=[1.0,/home/TestData/nlp/nmt/toy_data/wmt14-de-en.src] \ - 
model.data.splits_string="\"800,100,100\"" \ - model.data.whole_word_masking=False \ - model.tokenizer.library=sentencepiece \ - model.tokenizer.model=/home/TestData/nlp/nmt/toy_data/spm_64k_all_langs_plus_en.model \ - ++model.hiddens.enc_output_name=z \ - ++model.hiddens.transform.q_z_given_x.cls_name=cond_gaussian \ - ++model.hiddens.transform.q_z_given_x.hidden_size=64 \ - ++model.hiddens.loss.mim.cls_name=a_mim \ - ++model.hiddens.loss.mim.loss_weight=0.5 - # Change val_check_interval to 1 for resume as the len(dataloder) is 1 due to max_steps being the same as that of training and Lightning 2.0 raises an error - # if val_check_interval > len(dataloder: https://github.com/Lightning-AI/lightning/blob/2.0.6/src/lightning/pytorch/loops/fit_loop.py#L259 at the beginning of fit_loop.run() - python examples/nlp/language_modeling/megatron_bart_pretraining.py \ - trainer.devices=2 \ - trainer.accelerator=gpu \ - trainer.log_every_n_steps=1 \ - trainer.val_check_interval=1 \ - trainer.limit_val_batches=2 \ - trainer.accumulate_grad_batches=1 \ - trainer.max_steps=10 \ - trainer.precision=16 \ - trainer.gradient_clip_val=1.0 \ - exp_manager.exp_dir=examples/nlp/language_modeling/megatron_mim_results \ - model.tensor_model_parallel_size=2 \ - model.seq_length=128 \ - model.encoder.num_layers=4 \ - model.encoder.hidden_size=64 \ - model.encoder.arch=perceiver \ - model.encoder.num_attention_heads=8 \ - model.encoder.activation="swiglu" \ - model.encoder.masked_softmax_fusion=False \ - model.encoder.bias_activation_fusion=False \ - model.encoder.activations_checkpoint_method="block" \ - model.encoder.activations_checkpoint_num_layers=1 \ - model.decoder.num_layers=2 \ - model.decoder.hidden_size=64 \ - model.decoder.num_attention_heads=8 \ - model.decoder.activation="swiglu" \ - model.decoder.masked_softmax_fusion=False \ - model.decoder.bias_activation_fusion=False \ - model.decoder.activations_checkpoint_method="block" \ - model.decoder.activations_checkpoint_num_layers=1 \ - model.micro_batch_size=2 \ - model.global_batch_size=4 \ - model.data.data_impl=text_mmap \ - model.data.data_prefix=[1.0,/home/TestData/nlp/nmt/toy_data/wmt14-de-en.src] \ - model.data.splits_string="\"800,100,100\"" \ - model.data.whole_word_masking=False \ - model.tokenizer.library=sentencepiece \ - model.tokenizer.model=/home/TestData/nlp/nmt/toy_data/spm_64k_all_langs_plus_en.model \ - ++model.hiddens.enc_output_name=z \ - ++model.hiddens.transform.q_z_given_x.cls_name=cond_gaussian \ - ++model.hiddens.transform.q_z_given_x.hidden_size=64 \ - ++model.hiddens.loss.mim.cls_name=a_mim \ - ++model.hiddens.loss.mim.loss_weight=0.5 - AFTER_SCRIPT: | - rm -rf examples/nlp/language_modeling/megatron_mim_results - - # stage("L2: NMT Bottleneck Fallback") { - # when { - # anyOf { - # branch "main" - # changeRequest target: "main" - # } - # } - # failFast true - # parallel { - # stage("L2: seq2seq (no bottleneck)") { - # steps { - # cd examples/nlp/machine_translation && \ - # enc_dec_nmt-bottleneck.py \ - # --config-path=conf \ - # --config-name=aayn_bottleneck \ - # do_testing=true \ - # model.model_type=nll \ - # model.encoder.arch=seq2seq \ - # model.encoder.hidden_steps=1 \ - # model.encoder.hidden_blocks=1 \ - # model.encoder.hidden_init_method=params \ - # model.encoder.hidden_size=64 \ - # model.encoder.inner_size=128 \ - # model.encoder.num_attention_heads=2 \ - # model.encoder.num_layers=2 \ - # model.decoder.hidden_size=64 \ - # model.decoder.inner_size=128 \ - # model.decoder.num_attention_heads=2 \ - # 
model.decoder.num_layers=2 \ - # model.train_ds.src_file_name=/home/TestData/nlp/nmt/toy_data/wmt14-en-de.src \ - # model.train_ds.tgt_file_name=/home/TestData/nlp/nmt/toy_data/wmt14-en-de.ref \ - # model.validation_ds.src_file_name=[/home/TestData/nlp/nmt/toy_data/wmt13-en-de.src,/home/TestData/nlp/nmt/toy_data/wmt14-en-de.src] \ - # model.validation_ds.tgt_file_name=[/home/TestData/nlp/nmt/toy_data/wmt13-en-de.ref,/home/TestData/nlp/nmt/toy_data/wmt14-en-de.ref] \ - # model.test_ds.src_file_name=/home/TestData/nlp/nmt/toy_data/wmt13-en-de.src \ - # model.test_ds.tgt_file_name=/home/TestData/nlp/nmt/toy_data/wmt13-en-de.ref \ - # model.encoder_tokenizer.tokenizer_model=/home/TestData/nlp/nmt/toy_data/tt_tokenizer.BPE.4096.model \ - # model.decoder_tokenizer.tokenizer_model=/home/TestData/nlp/nmt/toy_data/tt_tokenizer.BPE.4096.model \ - # trainer.devices=1 \ - # trainer.accelerator="gpu" \ - # +trainer.fast_dev_run=true \ - # +trainer.limit_test_batches=2 \ - # exp_manager=null \ - # } - # } - # } - # } - # stage("L2: NMT Bottleneck Architecture") { - # when { - # anyOf { - # branch "main" - # changeRequest target: "main" - # } - # } - # failFast true - # parallel { - # stage("Bridge Encoder (identity)") { - # steps { - # cd examples/nlp/machine_translation && \ - # enc_dec_nmt-bottleneck.py \ - # --config-path=conf \ - # --config-name=aayn_bottleneck \ - # do_testing=true \ - # model.model_type=nll \ - # model.encoder.arch=bridge \ - # model.encoder.hidden_steps=1 \ - # model.encoder.hidden_blocks=1 \ - # model.encoder.hidden_init_method=identity \ - # model.encoder.hidden_size=64 \ - # model.encoder.inner_size=128 \ - # model.encoder.num_attention_heads=2 \ - # model.encoder.num_layers=2 \ - # model.decoder.hidden_size=64 \ - # model.decoder.inner_size=128 \ - # model.decoder.num_attention_heads=2 \ - # model.decoder.num_layers=2 \ - # model.train_ds.src_file_name=/home/TestData/nlp/nmt/toy_data/wmt14-de-en.src \ - # model.train_ds.tgt_file_name=/home/TestData/nlp/nmt/toy_data/wmt14-de-en.ref \ - # model.validation_ds.src_file_name=/home/TestData/nlp/nmt/toy_data/wmt14-de-en.src \ - # model.validation_ds.tgt_file_name=/home/TestData/nlp/nmt/toy_data/wmt14-de-en.src \ - # model.test_ds.src_file_name=/home/TestData/nlp/nmt/toy_data/wmt14-de-en.src \ - # model.test_ds.tgt_file_name=/home/TestData/nlp/nmt/toy_data/wmt14-de-en.src \ - # model.encoder_tokenizer.tokenizer_model=/home/TestData/nlp/nmt/toy_data/tt_tokenizer.BPE.4096.model \ - # model.decoder_tokenizer.tokenizer_model=/home/TestData/nlp/nmt/toy_data/tt_tokenizer.BPE.4096.model \ - # trainer.devices=1 \ - # trainer.accelerator="gpu" \ - # +trainer.fast_dev_run=true \ - # +trainer.limit_test_batches=2 \ - # exp_manager=null - # } - # } - # stage("Perceiver Encoder (params)") { - # steps { - # cd examples/nlp/machine_translation && \ - # enc_dec_nmt-bottleneck.py \ - # --config-path=conf \ - # --config-name=aayn_bottleneck \ - # do_testing=true \ - # model.model_type=nll \ - # model.encoder.arch=perceiver \ - # model.encoder.hidden_steps=1 \ - # model.encoder.hidden_blocks=1 \ - # model.encoder.hidden_init_method=params \ - # model.encoder.hidden_size=64 \ - # model.encoder.inner_size=128 \ - # model.encoder.num_attention_heads=2 \ - # model.encoder.num_layers=2 \ - # model.decoder.hidden_size=64 \ - # model.decoder.inner_size=128 \ - # model.decoder.num_attention_heads=2 \ - # model.decoder.num_layers=2 \ - # model.train_ds.src_file_name=/home/TestData/nlp/nmt/toy_data/wmt14-de-en.src \ - # 
model.train_ds.tgt_file_name=/home/TestData/nlp/nmt/toy_data/wmt14-de-en.ref \ - # model.validation_ds.src_file_name=/home/TestData/nlp/nmt/toy_data/wmt14-de-en.src \ - # model.validation_ds.tgt_file_name=/home/TestData/nlp/nmt/toy_data/wmt14-de-en.src \ - # model.test_ds.src_file_name=/home/TestData/nlp/nmt/toy_data/wmt14-de-en.src \ - # model.test_ds.tgt_file_name=/home/TestData/nlp/nmt/toy_data/wmt14-de-en.src \ - # model.encoder_tokenizer.tokenizer_model=/home/TestData/nlp/nmt/toy_data/tt_tokenizer.BPE.4096.model \ - # model.decoder_tokenizer.tokenizer_model=/home/TestData/nlp/nmt/toy_data/tt_tokenizer.BPE.4096.model \ - # trainer.devices=1 \ - # trainer.accelerator="gpu" \ - # +trainer.fast_dev_run=true \ - # +trainer.limit_test_batches=2 \ - # exp_manager=null - # } - # } - # } - # } - # stage("L2: NMT Bottleneck LVM") { - # when { - # anyOf { - # branch "main" - # changeRequest target: "main" - # } - # } - # failFast true - # parallel { - # stage("VAE") { - # steps { - # cd examples/nlp/machine_translation && \ - # enc_dec_nmt-bottleneck.py \ - # --config-path=conf \ - # --config-name=aayn_bottleneck \ - # do_testing=true \ - # model.model_type=vae \ - # model.encoder.arch=perceiver \ - # model.encoder.hidden_steps=1 \ - # model.encoder.hidden_blocks=1 \ - # model.encoder.hidden_init_method=params \ - # model.encoder.hidden_size=64 \ - # model.encoder.inner_size=128 \ - # model.encoder.num_attention_heads=2 \ - # model.encoder.num_layers=2 \ - # model.decoder.hidden_size=64 \ - # model.decoder.inner_size=128 \ - # model.decoder.num_attention_heads=2 \ - # model.decoder.num_layers=2 \ - # model.train_ds.src_file_name=/home/TestData/nlp/nmt/toy_data/wmt14-de-en.src \ - # model.train_ds.tgt_file_name=/home/TestData/nlp/nmt/toy_data/wmt14-de-en.ref \ - # model.validation_ds.src_file_name=/home/TestData/nlp/nmt/toy_data/wmt14-de-en.src \ - # model.validation_ds.tgt_file_name=/home/TestData/nlp/nmt/toy_data/wmt14-de-en.src \ - # model.test_ds.src_file_name=/home/TestData/nlp/nmt/toy_data/wmt14-de-en.src \ - # model.test_ds.tgt_file_name=/home/TestData/nlp/nmt/toy_data/wmt14-de-en.src \ - # model.encoder_tokenizer.tokenizer_model=/home/TestData/nlp/nmt/toy_data/tt_tokenizer.BPE.4096.model \ - # model.decoder_tokenizer.tokenizer_model=/home/TestData/nlp/nmt/toy_data/tt_tokenizer.BPE.4096.model \ - # trainer.devices=1 \ - # trainer.accelerator="gpu" \ - # +trainer.fast_dev_run=true \ - # +trainer.limit_test_batches=2 \ - # exp_manager=null - # } - # } - # stage("MIM") { - # steps { - # cd examples/nlp/machine_translation && \ - # enc_dec_nmt-bottleneck.py \ - # --config-path=conf \ - # --config-name=aayn_bottleneck \ - # do_testing=true \ - # model.model_type=mim \ - # model.encoder.arch=perceiver \ - # model.encoder.hidden_steps=1 \ - # model.encoder.hidden_blocks=1 \ - # model.encoder.hidden_init_method=params \ - # model.encoder.hidden_size=64 \ - # model.encoder.inner_size=128 \ - # model.encoder.num_attention_heads=2 \ - # model.encoder.num_layers=2 \ - # model.decoder.hidden_size=64 \ - # model.decoder.inner_size=128 \ - # model.decoder.num_attention_heads=2 \ - # model.decoder.num_layers=2 \ - # model.train_ds.src_file_name=/home/TestData/nlp/nmt/toy_data/wmt14-de-en.src \ - # model.train_ds.tgt_file_name=/home/TestData/nlp/nmt/toy_data/wmt14-de-en.ref \ - # model.validation_ds.src_file_name=/home/TestData/nlp/nmt/toy_data/wmt14-de-en.src \ - # model.validation_ds.tgt_file_name=/home/TestData/nlp/nmt/toy_data/wmt14-de-en.src \ - # 
model.test_ds.src_file_name=/home/TestData/nlp/nmt/toy_data/wmt14-de-en.src \ - # model.test_ds.tgt_file_name=/home/TestData/nlp/nmt/toy_data/wmt14-de-en.src \ - # model.encoder_tokenizer.tokenizer_model=/home/TestData/nlp/nmt/toy_data/tt_tokenizer.BPE.4096.model \ - # model.decoder_tokenizer.tokenizer_model=/home/TestData/nlp/nmt/toy_data/tt_tokenizer.BPE.4096.model \ - # trainer.devices=1 \ - # trainer.accelerator="gpu" \ - # +trainer.fast_dev_run=true \ - # +trainer.limit_test_batches=2 \ - # exp_manager=null - # } - # } - # } - # } L2_Megatron_Bert_Pretraining_and_Resume_Training_with_Pipeline_Parallelism: needs: [cicd-test-container-setup] @@ -2354,78 +1778,6 @@ jobs: model.data.data_prefix=[.5,/home/TestData/nlp/megatron_bert/data/bert/simple_wiki_bert_preproc_text_sentence,.5,/home/TestData/nlp/megatron_bert/data/bert/simple_wiki_bert_preproc_text_sentence] \ model.data.index_mapping_dir=examples/nlp/language_modeling/bert_index_mappings - L2_Megatron_Bert_Pretraining_and_Resume_Training: - needs: [cicd-test-container-setup] - uses: ./.github/workflows/_test_template.yml - if: contains(fromJSON(needs.cicd-test-container-setup.outputs.test_to_run), 'L2_Megatron_Bert_Pretraining_and_Resume_Training') || needs.cicd-test-container-setup.outputs.all == 'true' - with: - RUNNER: self-hosted-azure - SCRIPT: | - python examples/nlp/language_modeling/megatron_bert_pretraining.py \ - trainer.devices=2 \ - trainer.accelerator=gpu \ - trainer.log_every_n_steps=1 \ - trainer.val_check_interval=10 \ - trainer.limit_val_batches=2 \ - trainer.accumulate_grad_batches=1 \ - trainer.max_steps=10 \ - trainer.precision=bf16 \ - model.megatron_amp_O2=True \ - trainer.gradient_clip_val=1.0 \ - exp_manager.exp_dir=examples/nlp/language_modeling/bert_pretrain_results \ - model.tensor_model_parallel_size=2 \ - model.optim.name=fused_adam \ - model.optim.lr=2e-4 \ - model.sequence_parallel=True \ - model.optim.sched.warmup_steps=2 \ - model.optim.sched.constant_steps=2 \ - model.optim.sched.min_lr=8e-5 \ - model.max_position_embeddings=128 \ - model.encoder_seq_length=128 \ - model.data.seq_length=128 \ - model.tokenizer.vocab_file=/home/TestData/nlp/megatron_bert/data/bert/vocab.txt \ - model.num_layers=8 \ - model.hidden_size=256 \ - model.num_attention_heads=8 \ - model.activations_checkpoint_method=block \ - model.activations_checkpoint_num_layers=1 \ - model.data.data_prefix=[.5,/home/TestData/nlp/megatron_bert/data/bert/simple_wiki_bert_preproc_text_sentence,.5,/home/TestData/nlp/megatron_bert/data/bert/simple_wiki_bert_preproc_text_sentence] \ - model.data.index_mapping_dir=examples/nlp/language_modeling/bert_index_mappings - - python examples/nlp/language_modeling/megatron_bert_pretraining.py \ - trainer.devices=2 \ - trainer.accelerator=gpu \ - trainer.log_every_n_steps=1 \ - trainer.val_check_interval=10 \ - trainer.limit_val_batches=2 \ - trainer.accumulate_grad_batches=1 \ - trainer.max_steps=20 \ - trainer.precision=bf16 \ - model.megatron_amp_O2=True \ - trainer.gradient_clip_val=1.0 \ - exp_manager.exp_dir=examples/nlp/language_modeling/bert_pretrain_results \ - exp_manager.resume_if_exists=True \ - model.tensor_model_parallel_size=2 \ - model.optim.name=fused_adam \ - model.optim.lr=2e-4 \ - model.optim.sched.warmup_steps=2 \ - model.optim.sched.constant_steps=2 \ - model.optim.sched.min_lr=8e-5 \ - model.max_position_embeddings=128 \ - model.encoder_seq_length=128 \ - model.data.seq_length=128 \ - model.tokenizer.vocab_file=/home/TestData/nlp/megatron_bert/data/bert/vocab.txt \ - 
model.num_layers=8 \ - model.hidden_size=256 \ - model.num_attention_heads=8 \ - model.activations_checkpoint_method=block \ - model.activations_checkpoint_num_layers=1 \ - model.data.data_prefix=[.5,/home/TestData/nlp/megatron_bert/data/bert/simple_wiki_bert_preproc_text_sentence,.5,/home/TestData/nlp/megatron_bert/data/bert/simple_wiki_bert_preproc_text_sentence] \ - model.data.index_mapping_dir=examples/nlp/language_modeling/bert_index_mappings - AFTER_SCRIPT: | - rm -rf examples/nlp/language_modeling/bert_pretrain_results - rm -rf examples/nlp/language_modeling/bert_index_mappings - L2_Megatron_Core_Bert_Pretraining_and_Resume_Training: needs: [cicd-test-container-setup] uses: ./.github/workflows/_test_template.yml @@ -2496,228 +1848,6 @@ jobs: rm -rf examples/nlp/language_modeling/bert_pretrain_results rm -rf examples/nlp/language_modeling/bert_index_mappings - L2_Megatron_RETRO_Pretraining_and_Resume_Training: - needs: [cicd-test-container-setup] - uses: ./.github/workflows/_test_template.yml - if: contains(fromJSON(needs.cicd-test-container-setup.outputs.test_to_run), 'L2_Megatron_RETRO_Pretraining_and_Resume_Training') || needs.cicd-test-container-setup.outputs.all == 'true' - with: - RUNNER: self-hosted-azure - SCRIPT: | - python examples/nlp/language_modeling/megatron_retro_pretraining.py \ - trainer.num_nodes=1 \ - trainer.devices=2 \ - trainer.precision=bf16 \ - trainer.accelerator=gpu \ - model.data.data_prefix=["none"] \ - exp_manager.exp_dir=examples/nlp/language_modeling/mcore_retro_results \ - model.mcore_gpt=True \ - model.tensor_model_parallel_size=1 \ - model.pipeline_model_parallel_size=1 \ - model.optim.name=distributed_fused_adam \ - model.retro.retro_project_dir=/home/TestData/nlp/megatron_retro/mcore_retro/micro-wiki-core \ - model.data.num_workers=4 \ - model.micro_batch_size=1 \ - model.data.shuffle_documents=False \ - trainer.val_check_interval=30 \ - +trainer.num_sanity_val_steps=0 \ - model.init_method_std=0.023 \ - model.optim.lr=6.0e-4 \ - model.megatron_amp_O2=True \ - model.data.splits_string="\"98,2,0\"" \ - model.data.dataloader_type=cyclic \ - trainer.max_steps=10 - - python examples/nlp/language_modeling/megatron_retro_pretraining.py \ - trainer.num_nodes=1 \ - trainer.devices=2 \ - trainer.precision=bf16 \ - trainer.accelerator=gpu \ - model.data.data_prefix=["none"] \ - exp_manager.exp_dir=examples/nlp/language_modeling/mcore_retro_results \ - model.mcore_gpt=True \ - model.tensor_model_parallel_size=1 \ - model.pipeline_model_parallel_size=1 \ - model.optim.name=distributed_fused_adam \ - model.retro.retro_project_dir=/home/TestData/nlp/megatron_retro/mcore_retro/micro-wiki-core \ - model.data.num_workers=4 \ - model.micro_batch_size=1 \ - model.data.shuffle_documents=False \ - trainer.val_check_interval=30 \ - +trainer.num_sanity_val_steps=0 \ - model.init_method_std=0.023 \ - model.optim.lr=6.0e-4 \ - model.megatron_amp_O2=True \ - model.data.splits_string="\"98,2,0\"" \ - model.data.dataloader_type=cyclic \ - trainer.max_steps=20 - AFTER_SCRIPT: | - rm -rf examples/nlp/language_modeling/mcore_retro_results - - L2_Legacy_Megatron_RETRO_Pretraining_and_Resume_Training: - needs: [cicd-test-container-setup] - uses: ./.github/workflows/_test_template.yml - if: contains(fromJSON(needs.cicd-test-container-setup.outputs.test_to_run), 'L2_Legacy_Megatron_RETRO_Pretraining_and_Resume_Training') || needs.cicd-test-container-setup.outputs.all == 'true' - with: - RUNNER: self-hosted-azure - SCRIPT: | - python 
examples/nlp/language_modeling/megatron_retro_pretraining_legacy.py \ - trainer.devices=2 \ - trainer.num_nodes=1 \ - trainer.accelerator=gpu \ - trainer.accumulate_grad_batches=1 \ - trainer.limit_val_batches=2 \ - exp_manager.resume_if_exists=True \ - trainer.max_steps=10 \ - trainer.precision=16 \ - trainer.gradient_clip_val=1.0 \ - trainer.val_check_interval=10 \ - exp_manager.exp_dir=examples/nlp/language_modeling/retro_legacy_results \ - model.data.data_prefix= \ - model.data.knn_index= \ - model.data.retrieval_prefix= \ - model.tensor_model_parallel_size=2 \ - model.micro_batch_size=4 \ - model.optim.name=fused_adam \ - model.optim.lr=2e-4 \ - model.optim.sched.warmup_steps=2 \ - model.optim.sched.constant_steps=2 \ - model.optim.sched.min_lr=8e-5 \ - model.max_position_embeddings=128 \ - model.encoder_seq_length=128 \ - model.chunk_size=32 \ - model.enc_num_layers=2 \ - model.dec_num_layers=2 \ - model.enc_cross_attention=[1] \ - model.dec_cross_attention=[1] \ - +model.data.mock=True - - python examples/nlp/language_modeling/megatron_retro_pretraining_legacy.py \ - trainer.devices=2 \ - trainer.num_nodes=1 \ - trainer.accelerator=gpu \ - trainer.accumulate_grad_batches=1 \ - trainer.limit_val_batches=2 \ - exp_manager.resume_if_exists=True \ - trainer.max_steps=20 \ - trainer.precision=16 \ - trainer.gradient_clip_val=1.0 \ - trainer.val_check_interval=10 \ - exp_manager.exp_dir=examples/nlp/language_modeling/retro_legacy_results \ - model.data.data_prefix= \ - model.data.knn_index= \ - model.data.retrieval_prefix= \ - model.tensor_model_parallel_size=2 \ - model.micro_batch_size=4 \ - model.optim.name=fused_adam \ - model.optim.lr=2e-4 \ - model.optim.sched.warmup_steps=2 \ - model.optim.sched.constant_steps=2 \ - model.optim.sched.min_lr=8e-5 \ - model.max_position_embeddings=128 \ - model.encoder_seq_length=128 \ - model.chunk_size=32 \ - model.enc_num_layers=2 \ - model.dec_num_layers=2 \ - model.enc_cross_attention=[1] \ - model.dec_cross_attention=[1] \ - +model.data.mock=True - AFTER_SCRIPT: | - rm -rf examples/nlp/language_modeling/retro_legacy_results - - # L2_Megatron_RETRO_muTransfer_Pretraining_Performance: - # needs: [cicd-test-container-setup] - # runs-on: self-hosted-azure - # container: - # image: nemoci.azurecr.io/nemo_container_${{ github.run_id }} - # options: - # # --user 0:128 - # --device=/dev/nvidia0 - # --gpus all - # --shm-size=8g - # --env TRANSFORMERS_OFFLINE=0 - # --env HYDRA_FULL_ERROR=1 - # --volume /mnt/datadrive/TestData:/home/TestData - # steps: - # - name: Checkout repository - # uses: actions/checkout@v4 - # - run: | - # python examples/nlp/language_modeling/megatron_retro_mutransfer_pretrain.py \ - # trainer.devices=2 \ - # trainer.num_nodes=1 \ - # trainer.accelerator=gpu \ - # trainer.accumulate_grad_batches=1 \ - # trainer.max_steps=100 \ - # trainer.log_every_n_steps=1 \ - # trainer.precision=16 \ - # trainer.val_check_interval=100 \ - # trainer.limit_val_batches=0 \ - # trainer.gradient_clip_val=1.0 \ - # +trainer.num_sanity_val_steps=0 \ - # exp_manager.exp_dir=examples/nlp/language_modeling/retro_results/ \ - # +exp_manager.version=smalltest \ - # model.data.neighbors=2 \ - # model.megatron_amp_O2=False \ - # model.apply_query_key_layer_scaling=False \ - # model.tensor_model_parallel_size=1 \ - # model.optim.name=muadamw \ - # model.optim.weight_decay=0.1 \ - # model.optim.betas=[0.9,0.95] \ - # model.optim.lr=6e-4 \ - # model.optim.sched.warmup_steps=1000 \ - # model.optim.sched.constant_steps=0 \ - # model.optim.sched.min_lr=6e-5 \ - # 
model.add_position_embedding=False \ - # model.enc_num_layers=2 \ - # model.dec_num_layers=6 \ - # model.enc_cross_attention=[0] \ - # model.dec_cross_attention=[3,5] \ - # model.hidden_size=96 \ - # model.ffn_hidden_size=384 \ - # model.init_method_std=0.023 \ - # model.num_attention_heads=12 \ - # model.max_position_embeddings=1024 \ - # model.encoder_seq_length=1024 \ - # model.tokenizer.library=megatron \ - # model.tokenizer.type=GPT2BPETokenizer \ - # model.tokenizer.merge_file=/home/TestData/nlp/megatron_retro/gpt2-merges.txt \ - # model.tokenizer.vocab_file=/home/TestData/nlp/megatron_retro/gpt2-vocab.json \ - # model.data.data_prefix=[/home/TestData/nlp/megatron_retro/retro_wiki_test_text_document] \ - # model.data.knn_index=[/home/TestData/nlp/megatron_retro/knn2_map_wiki_test.idx] \ - # model.data.retrieval_prefix=/home/TestData/nlp/megatron_retro/retro_wiki_test_text_document \ - # model.data.index_mapping_dir=/home/TestData/nlp/megatron_retro \ - # model.data.num_workers=8 \ - # model.micro_batch_size=8 \ - # model.normalization=rmsnorm \ - # model.transformer_block_type=pre_ln \ - # model.bias_activation_fusion=True \ - # model.bias_dropout_add_fusion=False \ - # model.masked_softmax_fusion=True \ - # model.hidden_dropout=0 \ - # model.attention_dropout=0 \ - # model.fp32_residual_connection=True \ - # model.shape_file=/home/TestData/nlp/megatron_retro/o1_rel_shape_info_tiny.yaml - - # python -c "import pandas as pd - # import pathlib - # from pandas.testing import assert_frame_equal - # from tensorboard.backend.event_processing.event_accumulator import EventAccumulator - # import torch - # if not (torch.cuda.is_available() and "A100" in torch.cuda.get_device_name()): - # import sys - # sys.exit(0) - # event_file = list(pathlib.Path("examples/nlp/language_modeling/retro_results/megatron_retro/smalltest").glob("events.out.tfevents*"))[0] - # ea = EventAccumulator(str(event_file)).Reload() - # vals = [] - # for i in ea.Scalars("reduced_train_loss"): - # vals.append(i.value) - # training_curve = pd.DataFrame({"loss": vals}) - # gt_curve = pd.read_csv("/home/TestData/nlp/megatron_retro/expected_learning_curve.csv") - # assert_frame_equal(training_curve, gt_curve, rtol=1e-3, atol=1e-3)" - - # rm -rf examples/nlp/language_modeling/retro_results - # - uses: "NVIDIA/NeMo/.github/actions/cancel-workflow@main" - # if: "failure()" - L2_RAG_Pipeline_Indexing: needs: [cicd-test-container-setup] uses: ./.github/workflows/_test_template.yml @@ -2748,27 +1878,11 @@ jobs: trainer.precision="bf16-mixed" \ indexing.embedder.model_path="/home/TestData/nlp/rag_pipeline/testing_models/embedders/sbert_nemo.nemo" \ indexing.index_path="/home/TestData/nlp/rag_pipeline/testing_data/saved_index/sample_index" \ - generating.llm.model_path="/home/TestData/nlp/rag_pipeline/testing_models/llms/megatron_gpt_125m.nemo" \ - generating.inference.tokens_to_generate=50 \ - generating.inference.greedy=False \ - generating.inference.temperature=1.0 \ - generating.query="Which art schools did I applied to?" 
- - L2_BioMegatron_Bert_NER_Task: - needs: [cicd-test-container-setup] - uses: ./.github/workflows/_test_template.yml - if: contains(fromJSON(needs.cicd-test-container-setup.outputs.test_to_run), 'L2_BioMegatron_Bert_NER_Task') || needs.cicd-test-container-setup.outputs.all == 'true' - with: - RUNNER: self-hosted-azure - SCRIPT: | - python examples/nlp/token_classification/token_classification_train.py \ - exp_manager.exp_dir=examples/nlp/language_modeling/token_classification_results \ - trainer.max_epochs=1 \ - model.dataset.data_dir=/home/TestData/nlp/ner \ - model.language_model.pretrained_model_name=biomegatron345m_biovocab_30k_cased \ - model.tokenizer.tokenizer_name=null - AFTER_SCRIPT: | - rm -rf examples/nlp/language_modeling/token_classification_results + generating.llm.model_path="/home/TestData/nlp/rag_pipeline/testing_models/llms/megatron_gpt_125m.nemo" \ + generating.inference.tokens_to_generate=50 \ + generating.inference.greedy=False \ + generating.inference.temperature=1.0 \ + generating.query="Which art schools did I applied to?" L2_Megatron_GPT_Pretraining_and_Resume_Training_TP2: needs: [cicd-test-container-setup] @@ -3924,103 +3038,6 @@ jobs: AFTER_SCRIPT: | rm /home/TestData/nlp/megatron_gpt/TP2-Temp/test-increase.nemo - L2_Megatron_T5_Pretraining_and_Resume_Training_TP2: - needs: [cicd-test-container-setup] - uses: ./.github/workflows/_test_template.yml - if: contains(fromJSON(needs.cicd-test-container-setup.outputs.test_to_run), 'L2_Megatron_T5_Pretraining_and_Resume_Training_TP2') || needs.cicd-test-container-setup.outputs.all == 'true' - with: - RUNNER: self-hosted-azure - SCRIPT: | - python examples/nlp/language_modeling/megatron_t5_pretraining.py \ - trainer.devices=2 \ - trainer.accelerator=gpu \ - trainer.log_every_n_steps=1 \ - trainer.val_check_interval=10 \ - trainer.limit_val_batches=2 \ - trainer.accumulate_grad_batches=1 \ - trainer.max_steps=10 \ - trainer.precision=bf16 \ - model.megatron_amp_O2=True \ - trainer.gradient_clip_val=1.0 \ - exp_manager.exp_dir=examples/nlp/language_modeling/t5_pretrain_results \ - model.tensor_model_parallel_size=2 \ - model.seq_length=128 \ - model.encoder.num_layers=4 \ - model.encoder.hidden_size=64 \ - model.encoder.num_attention_heads=8 \ - model.encoder.activation=swiglu \ - model.encoder.masked_softmax_fusion=False \ - model.encoder.bias_activation_fusion=False \ - model.encoder.activations_checkpoint_method=block \ - model.encoder.activations_checkpoint_num_layers=1 \ - model.encoder.position_embedding_type=relative \ - model.decoder.num_layers=2 \ - model.decoder.hidden_size=64 \ - model.decoder.num_attention_heads=8 \ - model.decoder.activation=fast-swiglu \ - model.decoder.masked_softmax_fusion=False \ - model.decoder.bias_activation_fusion=False \ - model.decoder.activations_checkpoint_method=block \ - model.decoder.activations_checkpoint_num_layers=1 \ - model.encoder.transformer_block_type=pre_ln \ - model.decoder.transformer_block_type=pre_ln \ - model.data.data_prefix=[.5,/home/TestData/nlp/nmt/toy_data/wmt14-de-en.src,.5,/home/TestData/nlp/nmt/toy_data/wmt14-de-en.ref] \ - model.data.index_mapping_dir=examples/nlp/language_modeling/t5_index_mappings \ - model.data.data_impl=text_mmap \ - +model.data.data_impl_kwargs.newline_int=10 \ - +model.data.data_impl_kwargs.header_lines=0 \ - +model.data.data_impl_kwargs.workers=null \ - +model.data.data_impl_kwargs.sort_dataset_paths=False \ - model.share_token_embeddings=False \ - model.share_decoder_tokens_head_embeddings=False - - python 
examples/nlp/language_modeling/megatron_t5_pretraining.py \ - trainer.devices=2 \ - trainer.accelerator=gpu \ - trainer.log_every_n_steps=1 \ - trainer.val_check_interval=1 \ - trainer.limit_val_batches=2 \ - trainer.accumulate_grad_batches=1 \ - trainer.max_steps=10 \ - trainer.precision=bf16 \ - model.megatron_amp_O2=True \ - trainer.gradient_clip_val=1.0 \ - exp_manager.exp_dir=examples/nlp/language_modeling/t5_pretrain_results \ - exp_manager.resume_if_exists=True \ - model.tensor_model_parallel_size=2 \ - model.seq_length=128 \ - model.encoder.num_layers=4 \ - model.encoder.hidden_size=64 \ - model.encoder.num_attention_heads=8 \ - model.encoder.activation=swiglu \ - model.encoder.masked_softmax_fusion=False \ - model.encoder.bias_activation_fusion=False \ - model.encoder.activations_checkpoint_method=block \ - model.encoder.activations_checkpoint_num_layers=1 \ - model.encoder.position_embedding_type=relative \ - model.decoder.num_layers=2 \ - model.decoder.hidden_size=64 \ - model.decoder.num_attention_heads=8 \ - model.decoder.activation=fast-swiglu \ - model.decoder.masked_softmax_fusion=False \ - model.decoder.bias_activation_fusion=False \ - model.decoder.activations_checkpoint_method=block \ - model.decoder.activations_checkpoint_num_layers=1 \ - model.encoder.transformer_block_type=pre_ln \ - model.decoder.transformer_block_type=pre_ln \ - model.data.data_prefix=[.5,/home/TestData/nlp/nmt/toy_data/wmt14-de-en.src,.5,/home/TestData/nlp/nmt/toy_data/wmt14-de-en.ref] \ - model.data.index_mapping_dir=examples/nlp/language_modeling/t5_index_mappings \ - model.data.data_impl=text_mmap \ - +model.data.data_impl_kwargs.newline_int=10 \ - +model.data.data_impl_kwargs.header_lines=0 \ - +model.data.data_impl_kwargs.workers=null \ - +model.data.data_impl_kwargs.sort_dataset_paths=False \ - model.share_token_embeddings=False \ - model.share_decoder_tokens_head_embeddings=False - AFTER_SCRIPT: | - rm -rf examples/nlp/language_modeling/t5_pretrain_results - rm -rf examples/nlp/language_modeling/t5_index_mappings - L2_Megatron_Core_T5_Pretraining_and_Resume_Training_TP2: needs: [cicd-test-container-setup] uses: ./.github/workflows/_test_template.yml @@ -4476,18 +3493,6 @@ jobs: rm -rf examples/nlp/language_modeling/t5_pretrain_results rm -rf examples/nlp/language_modeling/t5_index_mappings - L2_Megatron_T5_Eval: - needs: [cicd-test-container-setup] - uses: ./.github/workflows/_test_template.yml - if: contains(fromJSON(needs.cicd-test-container-setup.outputs.test_to_run), 'L2_Megatron_T5_Eval') || needs.cicd-test-container-setup.outputs.all == 'true' - with: - RUNNER: self-hosted-azure - SCRIPT: | - python examples/nlp/language_modeling/megatron_t5_eval.py \ - --model_file /home/TestData/nlp/megatron_t5/8m/megatron_t5_8m-refactor.nemo \ - --prompt "How do I fix my GPU memory issue? I am seeing out of memory." \ - --tensor_model_parallel_size 1 - L2_Megatron_Core_T5_Eval: needs: [cicd-test-container-setup] uses: ./.github/workflows/_test_template.yml @@ -4500,196 +3505,6 @@ jobs: --prompt "How do I fix my GPU memory issue? I am seeing out of memory." 
\ --tensor_model_parallel_size 1 - L2_Megatron_BART_Pretraining_and_Resume_Training_TP2: - needs: [cicd-test-container-setup] - uses: ./.github/workflows/_test_template.yml - if: contains(fromJSON(needs.cicd-test-container-setup.outputs.test_to_run), 'L2_Megatron_BART_Pretraining_and_Resume_Training_TP2') || needs.cicd-test-container-setup.outputs.all == 'true' - with: - RUNNER: self-hosted-azure - SCRIPT: | - python examples/nlp/language_modeling/megatron_bart_pretraining.py \ - trainer.devices=2 \ - trainer.accelerator=gpu \ - trainer.log_every_n_steps=1 \ - trainer.val_check_interval=2 \ - trainer.limit_val_batches=2 \ - trainer.accumulate_grad_batches=1 \ - trainer.max_steps=3 \ - trainer.precision=16 \ - trainer.gradient_clip_val=1.0 \ - exp_manager.exp_dir=examples/nlp/language_modeling/bart_pretrain_results \ - model.tensor_model_parallel_size=2 \ - model.seq_length=128 \ - model.encoder.num_layers=4 \ - model.encoder.hidden_size=64 \ - model.encoder.num_attention_heads=8 \ - model.encoder.activation="reglu" \ - model.encoder.bias_activation_fusion=False \ - model.encoder.activations_checkpoint_method="block" \ - model.encoder.activations_checkpoint_num_layers=1 \ - model.decoder.num_layers=4 \ - model.decoder.hidden_size=64 \ - model.decoder.num_attention_heads=8 \ - model.decoder.activation="reglu" \ - model.decoder.bias_activation_fusion=False \ - model.decoder.activations_checkpoint_method="block" \ - model.decoder.activations_checkpoint_num_layers=1 \ - model.data.data_prefix="{train:[1.0,/home/TestData/nlp/megatron_t5/data/pile_val_small_bert_tokenizer_text_document],test:[/home/TestData/nlp/megatron_t5/data/pile_val_small_bert_tokenizer_text_document], validation:[/home/TestData/nlp/megatron_t5/data/pile_val_small_bert_tokenizer_text_document]}" - - python examples/nlp/language_modeling/megatron_bart_pretraining.py \ - trainer.devices=2 \ - trainer.accelerator=gpu \ - trainer.log_every_n_steps=1 \ - trainer.val_check_interval=2 \ - trainer.limit_val_batches=5 \ - trainer.accumulate_grad_batches=1 \ - trainer.max_steps=6 \ - trainer.precision=16 \ - trainer.gradient_clip_val=1.0 \ - exp_manager.exp_dir=examples/nlp/language_modeling/bart_pretrain_results \ - exp_manager.resume_if_exists=True \ - model.tensor_model_parallel_size=2 \ - model.seq_length=128 \ - model.encoder.num_layers=4 \ - model.encoder.hidden_size=64 \ - model.encoder.num_attention_heads=8 \ - model.encoder.activation="reglu" \ - model.encoder.bias_activation_fusion=False \ - model.encoder.activations_checkpoint_method="block" \ - model.encoder.activations_checkpoint_num_layers=1 \ - model.decoder.num_layers=4 \ - model.decoder.hidden_size=64 \ - model.decoder.num_attention_heads=8 \ - model.decoder.activation="reglu" \ - model.decoder.bias_activation_fusion=False \ - model.decoder.activations_checkpoint_method="block" \ - model.decoder.activations_checkpoint_num_layers=1 \ - model.data.data_prefix="{train:[1.0,/home/TestData/nlp/megatron_t5/data/pile_val_small_bert_tokenizer_text_document],test:[/home/TestData/nlp/megatron_t5/data/pile_val_small_bert_tokenizer_text_document], validation:[/home/TestData/nlp/megatron_t5/data/pile_val_small_bert_tokenizer_text_document]}" - AFTER_SCRIPT: | - rm -rf examples/nlp/language_modeling/bart_pretrain_results - - L2_Megatron_BART_Pretraining_and_Resume_Training_PP2: - needs: [cicd-test-container-setup] - uses: ./.github/workflows/_test_template.yml - if: contains(fromJSON(needs.cicd-test-container-setup.outputs.test_to_run), 
'L2_Megatron_BART_Pretraining_and_Resume_Training_PP2') || needs.cicd-test-container-setup.outputs.all == 'true' - with: - RUNNER: self-hosted-azure - SCRIPT: | - python examples/nlp/language_modeling/megatron_bart_pretraining.py \ - trainer.devices=2 \ - trainer.accelerator=gpu \ - trainer.log_every_n_steps=1 \ - trainer.val_check_interval=10 \ - trainer.limit_val_batches=2 \ - trainer.accumulate_grad_batches=1 \ - trainer.max_steps=10 \ - trainer.precision=16 \ - trainer.gradient_clip_val=1.0 \ - exp_manager.exp_dir=examples/nlp/language_modeling/bart_pretrain_results \ - model.pipeline_model_parallel_size=2 \ - model.pipeline_model_parallel_split_rank=1 \ - model.seq_length=256 \ - model.encoder.num_layers=4 \ - model.encoder.hidden_size=64 \ - model.encoder.num_attention_heads=8 \ - model.encoder.activation=geglu \ - model.encoder.bias_activation_fusion=False \ - model.encoder.activations_checkpoint_method=block \ - model.encoder.activations_checkpoint_num_layers=1 \ - model.decoder.num_layers=4 \ - model.decoder.hidden_size=64 \ - model.decoder.num_attention_heads=8 \ - model.decoder.activation=geglu \ - model.decoder.bias_activation_fusion=False \ - model.decoder.activations_checkpoint_method=block \ - model.decoder.activations_checkpoint_num_layers=1 \ - model.data.respect_document_boundaries=False \ - model.data.data_prefix=[.5,/home/TestData/nlp/megatron_t5/data/pile_val_small_bert_tokenizer_text_document,.5,/home/TestData/nlp/megatron_t5/data/pile_val_small_bert_tokenizer_text_document] - - python examples/nlp/language_modeling/megatron_bart_pretraining.py \ - trainer.devices=2 \ - trainer.accelerator=gpu \ - trainer.log_every_n_steps=1 \ - trainer.val_check_interval=1 \ - trainer.limit_val_batches=2 \ - trainer.accumulate_grad_batches=1 \ - trainer.max_steps=10 \ - trainer.precision=16 \ - trainer.gradient_clip_val=1.0 \ - exp_manager.exp_dir=examples/nlp/language_modeling/bart_pretrain_results \ - exp_manager.resume_if_exists=True \ - model.pipeline_model_parallel_size=2 \ - model.pipeline_model_parallel_split_rank=1 \ - model.seq_length=256 \ - model.encoder.num_layers=4 \ - model.encoder.hidden_size=64 \ - model.encoder.num_attention_heads=8 \ - model.encoder.activation=geglu \ - model.encoder.bias_activation_fusion=False \ - model.encoder.activations_checkpoint_method=block \ - model.encoder.activations_checkpoint_num_layers=1 \ - model.decoder.num_layers=4 \ - model.decoder.hidden_size=64 \ - model.decoder.num_attention_heads=8 \ - model.decoder.activation=geglu \ - model.decoder.bias_activation_fusion=False \ - model.decoder.activations_checkpoint_method=block \ - model.decoder.activations_checkpoint_num_layers=1 \ - model.data.respect_document_boundaries=False \ - model.data.data_prefix=[.5,/home/TestData/nlp/megatron_t5/data/pile_val_small_bert_tokenizer_text_document,.5,/home/TestData/nlp/megatron_t5/data/pile_val_small_bert_tokenizer_text_document] - AFTER_SCRIPT: | - rm -rf examples/nlp/language_modeling/bart_pretrain_results - - - L2_Megatron_T5_PEFT_Lora_TP2: - needs: [cicd-test-container-setup] - uses: ./.github/workflows/_test_template.yml - if: contains(fromJSON(needs.cicd-test-container-setup.outputs.test_to_run), 'L2_Megatron_T5_PEFT_Lora_TP2') || needs.cicd-test-container-setup.outputs.all == 'true' - with: - RUNNER: self-hosted-azure - SCRIPT: | - - python examples/nlp/language_modeling/tuning/megatron_t5_finetuning.py \ - trainer.devices=2 \ - trainer.log_every_n_steps=1 \ - trainer.max_epochs=9999 \ - trainer.max_steps=3 \ - trainer.val_check_interval=3 \ - 
++trainer.limit_val_batches=2 \ - trainer.precision=16 \ - exp_manager.exp_dir=/tmp/nlp_t5_lora_tuning_tp2 \ - model.pipeline_model_parallel_size=1 \ - model.tensor_model_parallel_size=2 \ - model.restore_from_path=/home/TestData/nlp/megatron_t5/8m/megatron_t5_8m_tp2.nemo \ - model.peft.peft_scheme=lora \ - model.answer_only_loss=True \ - model.micro_batch_size=1 \ - model.global_batch_size=1 \ - model.data.train_ds.file_names=[/home/TestData/nlp/megatron_sft/quarel.jsonl] \ - model.data.train_ds.concat_sampling_probabilities=[1.0] \ - model.data.train_ds.num_workers=0 \ - model.data.validation_ds.num_workers=0 \ - model.data.validation_ds.file_names=[/home/TestData/nlp/megatron_sft/quarel.jsonl] \ - model.data.validation_ds.names=[quarel] - - python examples/nlp/language_modeling/tuning/megatron_t5_generate.py \ - model.restore_from_path=/home/TestData/nlp/megatron_t5/8m/megatron_t5_8m_tp2.nemo \ - model.peft.restore_from_path=/tmp/nlp_t5_lora_tuning_tp2/megatron_t5_peft_lora_tuning/checkpoints/megatron_t5_peft_lora_tuning.nemo \ - model.peft.restore_from_ckpt_name=null \ - model.peft.restore_from_hparams_path=null \ - model.tensor_model_parallel_size=2 \ - trainer.devices=2 \ - model.data.test_ds.file_names=[/home/TestData/nlp/megatron_sft/quarel_4.jsonl] \ - model.data.test_ds.names=[quarel4] \ - model.global_batch_size=2 \ - model.micro_batch_size=1 \ - model.data.test_ds.tokens_to_generate=10 \ - model.data.test_ds.write_predictions_to_file=True \ - model.data.test_ds.output_file_path_prefix=/tmp/nlp_t5_lora_tuning_tp2/out \ - inference.greedy=True \ - inference.repetition_penalty=1.0 \ - inference.outfile_path=/tmp/nlp_t5_lora_tuning_tp2/out.jsonl - L2_Megatron_Core_T5_PEFT_Lora_TP2: needs: [cicd-test-container-setup] uses: ./.github/workflows/_test_template.yml @@ -5555,14 +4370,8 @@ jobs: - L2_Segmentation_Tool_Parallel_ctc_segmentation_test_L2_Ru_QN_with_mp3 - L2_G2P_Models_G2P_Conformer_training_evaluation_and_inference - L2_G2P_Models_HeteronymClassificationModel_training_evaluation_and_inference - - L2_Duplex_Text_Normalization_with_Tarred_dataset - L2_Intent_and_Slot_Classification_Tasks_Intent_and_Slot_Classification - L2_Intent_and_Slot_Classification_Tasks_Multi-Label_Intent_and_Slot_Classification - - L2_Parallel_NLP_Examples2_NER_finetuning_from_pretrained_Test - - L2_Parallel_NLP_Examples2_Punctuation_and_capitalization_finetuning_from_pretrained_test - - L2_Parallel_NLP_Examples2_NER_with_TurkuNLP__bert-base-finnish-cased-v1 - - L2_Parallel_NLP_Examples2_Evaluation_script_for_Token_Classification - - L2_Parallel_NLP_Examples2_Evaluation_script_for_Punctuation - L2_Pretraining_BERT_pretraining_from_Text - L2_Pretraining_BERT_from_Preprocessed - L2_NMT_Attention_is_All_You_Need_Training_NMT_Training_Post-LN @@ -5573,15 +4382,10 @@ jobs: - L2_NMT_Tarred_Dataset_Creation_Auto_Tarred_Dataset_Creation - L2_NMT_Tarred_Dataset_Creation_Script_Tarred_Dataset_Creation - L2_Megatron_NMT_Training_TP2 - - L2_Megatron_BART_Perceiver_MIM_Training_TP2 - L2_Megatron_Bert_Pretraining_and_Resume_Training_with_Pipeline_Parallelism - - L2_Megatron_Bert_Pretraining_and_Resume_Training - L2_Megatron_Core_Bert_Pretraining_and_Resume_Training - - L2_Legacy_Megatron_RETRO_Pretraining_and_Resume_Training - - L2_Megatron_RETRO_Pretraining_and_Resume_Training - L2_RAG_Pipeline_Indexing - L2_RAG_Pipeline_Generating - - L2_BioMegatron_Bert_NER_Task - L2_Megatron_GPT_Pretraining_and_Resume_Training_TP2 - L2_Megatron_GPT_Skip_Train - L2_Megatron_GPT_with_Rope_Pretraining_and_Resume_Training_TP2 @@ 
-5602,18 +4406,13 @@ jobs: - L2_Megatron_GPT_SFT_Eval_inference_seq_len_greaterThan_training_seq_len - L2_Megatron_Change_Partitions_Reduce_TP_Num_Partitions_-2_to_1-_and_PP_Num_Partitions_-1_to_2 - L2_Megatron_Change_Partitions_Increase_TP_Num_Partitions_-2_to_4-_and_PP_Num_Partitions_-1_to_2 - - L2_Megatron_T5_Pretraining_and_Resume_Training_TP2 - L2_Megatron_Core_T5_Pretraining_and_Resume_Training_TP2 - L2_Megatron_T5_with_ALiBi_Pretraining_and_Resume_Training_TP2 - L2_Megatron_T5_with_KERPLE_Pretraining_and_Resume_Training_TP2 - L2_Megatron_T5_Pretraining_and_Resume_Training_PP2 - L2_Megatron_T5_w_Mixture_of_Expert_Pretraining - L2_Megatron_UL2_Pretraining_and_Resume_Training_TP2 - - L2_Megatron_T5_Eval - L2_Megatron_Core_T5_Eval - - L2_Megatron_BART_Pretraining_and_Resume_Training_TP2 - - L2_Megatron_BART_Pretraining_and_Resume_Training_PP2 - - L2_Megatron_T5_PEFT_Lora_TP2 - L2_Megatron_Core_T5_PEFT_Lora_TP2 - L2_Megatron_Mock_Data_Generation_MockGPTDataset - L2_Megatron_Mock_Data_Generation_MockT5Dataset diff --git a/examples/nlp/duplex_text_normalization/README.md b/examples/nlp/duplex_text_normalization/README.md new file mode 100644 index 000000000000..808ed2856fb2 --- /dev/null +++ b/examples/nlp/duplex_text_normalization/README.md @@ -0,0 +1,2 @@ +> [!IMPORTANT] +> This section is no longer supported in NeMo and is scheduled for removal in the 23.11 release. diff --git a/examples/nlp/token_classification/README.md b/examples/nlp/token_classification/README.md new file mode 100644 index 000000000000..808ed2856fb2 --- /dev/null +++ b/examples/nlp/token_classification/README.md @@ -0,0 +1,2 @@ +> [!IMPORTANT] +> This section is no longer supported in NeMo and is scheduled for removal in the 23.11 release. diff --git a/tutorials/nlp/Joint_Intent_and_Slot_Classification.ipynb b/tutorials/nlp/Joint_Intent_and_Slot_Classification.ipynb deleted file mode 100644 index 608685254a0d..000000000000 --- a/tutorials/nlp/Joint_Intent_and_Slot_Classification.ipynb +++ /dev/null @@ -1,827 +0,0 @@ -{ - "cells": [ - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "pycharm": { - "name": "#%%\n" - } - }, - "outputs": [], - "source": [ - "\"\"\"\n", - "You can run either this notebook locally (if you have all the dependencies and a GPU) or on Google Colab.\n", - "\n", - "Instructions for setting up Colab are as follows:\n", - "1. Open a new Python 3 notebook.\n", - "2. Import this notebook from GitHub (File -> Upload Notebook -> \"GITHUB\" tab -> copy/paste GitHub URL)\n", - "3. Connect to an instance with a GPU (Runtime -> Change runtime type -> select \"GPU\" for hardware accelerator)\n", - "4. 
Run this cell to set up dependencies.\n", - "\"\"\"\n", - "# If you're using Google Colab and not running locally, run this cell\n", - "\n", - "# install NeMo\n", - "BRANCH = 'main'\n", - "!python -m pip install git+https://github.com/NVIDIA/NeMo.git@$BRANCH#egg=nemo_toolkit[nlp]" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "pycharm": { - "name": "#%%\n" - } - }, - "outputs": [], - "source": [ - "from nemo.collections import nlp as nemo_nlp\n", - "from nemo.utils.exp_manager import exp_manager\n", - "from nemo.utils import logging\n", - "\n", - "import os\n", - "import wget\n", - "import torch\n", - "import pytorch_lightning as pl\n", - "from omegaconf import OmegaConf" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "# Task Description\n", - "**Joint Intent and Slot classification** - is a task of classifying an Intent and detecting all relevant Slots (Entities)\n", - "for this Intent in a query.\n", - "For example, in the query: `What is the weather in Santa Clara tomorrow morning?`, we would like to classify the query\n", - "as a `weather` Intent, and detect `Santa Clara` as a `location` slot and `tomorrow morning` as a `date_time` slot.\n", - "Intents and Slots names are usually task specific and defined as labels in the training data.\n", - "This is a fundamental step that is executed in any task-driven Conversational Assistant.\n", - "\n", - "Our Bert based model implementation enables to train and then detect both of these tasks together.\n", - "\n", - "**Multi Label Joint Intent and Slot classification** - is very similar to the task above, but instead of only classifying a single Intent, the task can predict multiple different intents for each query. For example, for the query `Yes, please tell me the weather`, we might want the intents for this utterance to be `yes` and `weather`. You can skip to that tutorial [here](#multi-label)\n" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "# Dataset and NeMo data format\n", - "\n", - "In this tutorial we are going to use a virtual assistant interaction data set that can be downloaded from here: https://github.com/xliuhw/NLU-Evaluation-Data.\n", - "There are about 10K training and 1K testing queries which cover 64 various Intents and 55 Slots. \n", - "\n", - "To work with NeMo NLP classification model, this dataset should be first converted to the NeMo format, which requires next files:\n", - "- **dict.intents.csv** - list of all intent names in the data. One line per an intent name.\n", - "- **dict.slots.csv** - list of all slot names in the data. One line per a slot name. It is possible to use both: B- I- notations, for separating between first and intermediate tokens for multi token slots. Or just use one slot type for each token of multi token slot. Our recommendation is to use later one, since it is simpler and there is no visible degradation in performance.\n", - "- **train.tsv/test.tsv** - contain original queries, one per line, and intent number separated by tab. For example: `what alarms do i have set right now\t0`. Intent numbers are according to the intent line in the intent dictionary file (dict.intents.csv) starting from 0. First line of these files contains a header line: `sentence \\tab label`.\n", - "- **train_slot.tvs/test_slot.tsv** - contain one line per a query, where instead each token there is a number of the token from the slots dictionary file (dict.slots.csv), starting from 0. 
Last 'out-of scope' token is usually located in the last line of the dictionary. Example: `54 0 0 54 54 12 12` (numbers separated by space). No header line in these files.\n", - "\n", - "NeMo provides **import_dataset.py** converter for few reference datasets (Assistant / Atis / Snips) which converts them to the NeMo data format for the Intent and Slot classification model. If you have your own annotated dataset in a different format, you will need to write a data converter. Possible recommended format for your own annotation, is to have one text file per all examples of one intent. With one line per query in a form like: `did i set an alarm to [alarm_type : wake up] in the [timeofday : morning]`, using brackets to define slot names. This is very similar to the assistant format from this example and you can use its converter to NeMo format with small changes. \n", - "\n", - "You can run this utility as follows:\n", - "\n", - "**python examples/nlp/intent_slot_classification/data/import_datasets.py --dataset_name=assistant --source_data_dir=source_dir_name --target_data_dir=target_dir_name**\n" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "# Download, preprocess and explore the dataset\n", - "## Download the dataset and convert it to the NeMo format" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "# you can replace DATA_DIR and NEMO_DIR with your own locations\n", - "DATA_DIR = \".\"\n", - "NEMO_DIR = '.'\n", - "\n", - "# download the converter files from github for the purpose of this tutorial\n", - "wget.download(f'https://raw.githubusercontent.com/NVIDIA/NeMo/{BRANCH}/scripts/dataset_processing/nlp/intent_and_slot/import_datasets.py', NEMO_DIR)\n", - "wget.download(f'https://raw.githubusercontent.com/NVIDIA/NeMo/{BRANCH}/scripts/dataset_processing/nlp/intent_and_slot/assistant_utils.py', NEMO_DIR)" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "pycharm": { - "name": "#%%\n" - } - }, - "outputs": [], - "source": [ - "# download and unzip the example dataset from github\n", - "print('Downloading dataset...')\n", - "wget.download('https://github.com/xliuhw/NLU-Evaluation-Data/archive/master.zip', DATA_DIR)\n", - "! unzip {DATA_DIR}/NLU-Evaluation-Data-master.zip -d {DATA_DIR}" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "# convert the dataset to the NeMo format\n", - "!python {NEMO_DIR}/import_datasets.py --dataset_name=assistant --source_data_dir={DATA_DIR}/NLU-Evaluation-Data-master --target_data_dir={DATA_DIR}/nemo_format\n" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## Data exploration\n", - "You can see the dataset in both the original and NeMo's formats. We have here 65 different Intents and 55 Slots, which could be typical commands for virtual assistants. Out of scope slot has the name 'O' and is the last in the dictionary of Slots. And we can see examples of queries and also format of training intent and slot files. " - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "# list of queries divided by intent files in the original training dataset\n", - "! 
ls -l {DATA_DIR}/NLU-Evaluation-Data-master/dataset/trainset" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "# print all intents from the NeMo format intent dictionary\n", - "!echo 'Intents: ' $(wc -l < {DATA_DIR}/nemo_format/dict.intents.csv)\n", - "! cat {DATA_DIR}/nemo_format/dict.intents.csv" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "# print all slots from the NeMo format slot dictionary\n", - "!echo 'Slots: ' $(wc -l < {DATA_DIR}/nemo_format/dict.slots.csv)\n", - "! cat {DATA_DIR}/nemo_format/dict.slots.csv" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "# examples from the intent training file\n", - "! head -n 10 {DATA_DIR}/nemo_format/train.tsv" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "# examples from the slot training file\n", - "! head -n 10 {DATA_DIR}/nemo_format/train_slots.tsv" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "# Training model" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## Model configuration\n", - "\n", - "Our Joint Intent and Slot classification model is comprised of the pretrained [BERT](https://arxiv.org/pdf/1810.04805.pdf) model with an Intent and Slot Classification layer on top of it.\n", - "\n", - "All model and training parameters are defined in the **intent_slot_classification_config.yaml** config file. This file is located in the folder **examples/nlp/intent_slot_classification/conf/**. It contains 2 main sections:\n", - "- **model**: All arguments that are related to the Model - language model, token classifier, optimizer and schedulers, datasets and any other related information\n", - "\n", - "- **trainer**: Any argument to be passed to PyTorch Lightning\n", - "\n", - "We will download the config file from repository for the purpose of the tutorial. If you have a version of NeMo installed locally, you can use it from the above folder." - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "# download the model config file from repository for the purpose of this example\n", - "wget.download(f'https://raw.githubusercontent.com/NVIDIA/NeMo/{BRANCH}/examples/nlp/intent_slot_classification/conf/intent_slot_classification_config.yaml', NEMO_DIR)\n", - "\n", - "# print content of the config file\n", - "config_file = \"intent_slot_classification_config.yaml\"\n", - "print(config_file)\n", - "config = OmegaConf.load(config_file)\n", - "print(OmegaConf.to_yaml(config))" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## Setting up Data within the config\n", - "\n", - "Among other things, the config file contains dictionaries called train_ds and validation_ds. These are configurations used to setup the Dataset and DataLoaders of the corresponding config.\n", - "\n", - "The converter utility creates both training and evaluation files in the same directory, so we need to specify `model.data_dir` parameter to this directory. Also notice that some config lines, including `model.data_dir`, have `???` in place of paths, this means that values for these fields are required to be specified by the user.\n", - "\n", - "`config.model.intent_loss_weight` parameter - is a balance of training loss between Intent and Slot losses, a number between 0 to 1. 
Its default value is 0.6 which gives slightly higher priority to the Intent loss and it empirically works quite well. You can experiment with this value if you like.\n", - "Also you can try to change `config.model.class_balancing` parameter to `weighted_loss` and see if you get better accuracy.\n", - "\n", - "Let's now add the data directory path to the config." - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "config.model.data_dir = f'{DATA_DIR}/nemo_format'" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## Building the PyTorch Lightning Trainer\n", - "\n", - "NeMo models are primarily PyTorch Lightning modules - and therefore are entirely compatible with the PyTorch Lightning ecosystem. `config.trainer.max_epochs` - param defines number of training epochs. Usually 50-100 epochs or less should be enough to train on your data. Let's instantiate the Trainer object." - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "# lets modify some trainer configs\n", - "# checks if we have GPU available and uses it\n", - "accelerator = 'gpu' if torch.cuda.is_available() else 'cpu'\n", - "config.trainer.devices = 1\n", - "config.trainer.accelerator = accelerator\n", - "\n", - "config.trainer.precision = 16 if torch.cuda.is_available() else 32\n", - "\n", - "# for mixed precision training, uncomment the line below (precision should be set to 16 and amp_level to O1):\n", - "# config.trainer.amp_level = O1\n", - "\n", - "# remove distributed training flags\n", - "config.trainer.strategy = 'auto'\n", - "\n", - "# setup a small number of epochs for demonstration purposes of this tutorial\n", - "config.trainer.max_epochs = 5\n", - "\n", - "trainer = pl.Trainer(**config.trainer)" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## Setting up a NeMo Experiment\n", - "\n", - "NeMo has an experiment manager that handles logging and checkpointing for us, so let's use it. Model check points during training will be saved in this directory. " - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "exp_dir = exp_manager(trainer, config.get(\"exp_manager\", None))\n", - "# the exp_dir provides a path to the current experiment for easy access\n", - "print(str(exp_dir))" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## Initializing the model and Training\n", - "\n", - "Initial statistics of the dataset will be displayed at the beginning of the training and then Intent and Slot classification report will be displayed after each training epoch." 
- ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "# initialize the model\n", - "model = nemo_nlp.models.IntentSlotClassificationModel(config.model, trainer=trainer)\n", - "\n", - "# train\n", - "trainer.fit(model)" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "After training for 5 epochs, which should take no more than few minutes, you can expect training precision for this data set to be around these numbers (the accuracy will gradually continue to improve for this dataset up to about 50 epochs of training): \n", - "```\n", - "Intents:\n", - " label precision recall f1 support \n", - " alarm_query (label_id: 0) 94.74 94.74 94.74 19\n", - " alarm_remove (label_id: 1) 100.00 100.00 100.00 11\n", - " alarm_set (label_id: 2) 85.71 94.74 90.00 19\n", - " audio_volume_down (label_id: 3) 0.00 0.00 0.00 8\n", - " audio_volume_mute (label_id: 4) 100.00 86.67 92.86 15\n", - " audio_volume_up (label_id: 5) 56.52 100.00 72.22 13\n", - " calendar_query (label_id: 6) 55.00 57.89 56.41 19\n", - " calendar_remove (label_id: 7) 88.89 84.21 86.49 19\n", - " calendar_set (label_id: 8) 81.25 68.42 74.29 19\n", - " cooking_recipe (label_id: 9) 86.36 100.00 92.68 19\n", - " datetime_convert (label_id: 10) 0.00 0.00 0.00 8\n", - " datetime_query (label_id: 11) 65.52 100.00 79.17 19\n", - " email_addcontact (label_id: 12) 100.00 12.50 22.22 8\n", - " email_query (label_id: 13) 83.33 78.95 81.08 19\n", - " email_querycontact (label_id: 14) 62.50 78.95 69.77 19\n", - " email_sendemail (label_id: 15) 70.83 89.47 79.07 19\n", - " general_affirm (label_id: 16) 95.00 100.00 97.44 19\n", - " general_commandstop (label_id: 17) 100.00 100.00 100.00 19\n", - " general_confirm (label_id: 18) 100.00 100.00 100.00 19\n", - " general_dontcare (label_id: 19) 100.00 100.00 100.00 19\n", - " general_explain (label_id: 20) 100.00 94.74 97.30 19\n", - " general_joke (label_id: 21) 100.00 100.00 100.00 12\n", - " general_negate (label_id: 22) 95.00 100.00 97.44 19\n", - " general_praise (label_id: 23) 100.00 94.74 97.30 19\n", - " general_quirky (label_id: 24) 40.00 10.53 16.67 19\n", - " general_repeat (label_id: 25) 100.00 100.00 100.00 19\n", - " iot_cleaning (label_id: 26) 84.21 100.00 91.43 16\n", - " iot_coffee (label_id: 27) 94.74 94.74 94.74 19\n", - " iot_hue_lightchange (label_id: 28) 94.44 89.47 91.89 19\n", - " iot_hue_lightdim (label_id: 29) 100.00 83.33 90.91 12\n", - " iot_hue_lightoff (label_id: 30) 89.47 89.47 89.47 19\n", - " iot_hue_lighton (label_id: 31) 0.00 0.00 0.00 3\n", - " iot_hue_lightup (label_id: 32) 81.25 92.86 86.67 14\n", - " iot_wemo_off (label_id: 33) 60.00 100.00 75.00 9\n", - " iot_wemo_on (label_id: 34) 100.00 14.29 25.00 7\n", - " lists_createoradd (label_id: 35) 78.95 78.95 78.95 19\n", - " lists_query (label_id: 36) 78.95 78.95 78.95 19\n", - " lists_remove (label_id: 37) 90.00 94.74 92.31 19\n", - " music_likeness (label_id: 38) 70.59 66.67 68.57 18\n", - " music_query (label_id: 39) 77.78 73.68 75.68 19\n", - " music_settings (label_id: 40) 0.00 0.00 0.00 7\n", - " news_query (label_id: 41) 77.78 73.68 75.68 19\n", - " play_audiobook (label_id: 42) 90.00 94.74 92.31 19\n", - " play_game (label_id: 43) 80.00 84.21 82.05 19\n", - " play_music (label_id: 44) 53.85 73.68 62.22 19\n", - " play_podcasts (label_id: 45) 89.47 89.47 89.47 19\n", - " play_radio (label_id: 46) 93.75 78.95 85.71 19\n", - " qa_currency (label_id: 47) 95.00 100.00 97.44 19\n", - " qa_definition (label_id: 48) 85.00 89.47 87.18 
19\n", - " qa_factoid (label_id: 49) 45.16 73.68 56.00 19\n", - " qa_maths (label_id: 50) 100.00 100.00 100.00 14\n", - " qa_stock (label_id: 51) 95.00 100.00 97.44 19\n", - " recommendation_events (label_id: 52) 94.44 89.47 91.89 19\n", - " recommendation_locations (label_id: 53) 94.74 94.74 94.74 19\n", - " recommendation_movies (label_id: 54) 100.00 100.00 100.00 10\n", - " social_post (label_id: 55) 90.00 94.74 92.31 19\n", - " social_query (label_id: 56) 94.74 100.00 97.30 18\n", - " takeaway_order (label_id: 57) 93.75 78.95 85.71 19\n", - " takeaway_query (label_id: 58) 85.71 94.74 90.00 19\n", - " transport_query (label_id: 59) 83.33 78.95 81.08 19\n", - " transport_taxi (label_id: 60) 100.00 100.00 100.00 18\n", - " transport_ticket (label_id: 61) 89.47 89.47 89.47 19\n", - " transport_traffic (label_id: 62) 100.00 100.00 100.00 19\n", - " weather_query (label_id: 63) 100.00 89.47 94.44 19\n", - " -------------------\n", - " micro avg 85.04 85.04 85.04 1076\n", - " macro avg 81.13 80.81 79.36 1076\n", - " weighted avg 84.10 85.04 83.54 1076\n", - " \n", - "Slots:\n", - " label precision recall f1 support \n", - " alarm_type (label_id: 0) 0.00 0.00 0.00 0\n", - " app_name (label_id: 1) 0.00 0.00 0.00 6\n", - " artist_name (label_id: 2) 0.00 0.00 0.00 21\n", - " audiobook_author (label_id: 3) 0.00 0.00 0.00 1\n", - " audiobook_name (label_id: 4) 0.00 0.00 0.00 18\n", - " business_name (label_id: 5) 60.00 56.60 58.25 53\n", - " business_type (label_id: 6) 0.00 0.00 0.00 24\n", - " change_amount (label_id: 7) 0.00 0.00 0.00 25\n", - " coffee_type (label_id: 8) 0.00 0.00 0.00 4\n", - " color_type (label_id: 9) 0.00 0.00 0.00 12\n", - " cooking_type (label_id: 10) 0.00 0.00 0.00 0\n", - " currency_name (label_id: 11) 84.09 75.51 79.57 49\n", - " date (label_id: 12) 57.95 91.07 70.83 112\n", - " definition_word (label_id: 13) 0.00 0.00 0.00 20\n", - " device_type (label_id: 14) 74.55 51.25 60.74 80\n", - " drink_type (label_id: 15) 0.00 0.00 0.00 0\n", - " email_address (label_id: 16) 0.00 0.00 0.00 14\n", - " email_folder (label_id: 17) 0.00 0.00 0.00 1\n", - " event_name (label_id: 18) 100.00 13.24 23.38 68\n", - " food_type (label_id: 19) 51.72 69.77 59.41 43\n", - " game_name (label_id: 20) 60.00 14.29 23.08 21\n", - " game_type (label_id: 21) 0.00 0.00 0.00 0\n", - " general_frequency (label_id: 22) 0.00 0.00 0.00 9\n", - " house_place (label_id: 23) 93.33 42.42 58.33 33\n", - " ingredient (label_id: 24) 0.00 0.00 0.00 6\n", - " joke_type (label_id: 25) 0.00 0.00 0.00 4\n", - " list_name (label_id: 26) 0.00 0.00 0.00 21\n", - " meal_type (label_id: 27) 0.00 0.00 0.00 0\n", - " media_type (label_id: 28) 0.00 0.00 0.00 37\n", - " movie_name (label_id: 29) 0.00 0.00 0.00 0\n", - " movie_type (label_id: 30) 0.00 0.00 0.00 0\n", - " music_album (label_id: 31) 0.00 0.00 0.00 0\n", - " music_descriptor (label_id: 32) 0.00 0.00 0.00 3\n", - " music_genre (label_id: 33) 0.00 0.00 0.00 9\n", - " news_topic (label_id: 34) 0.00 0.00 0.00 17\n", - " order_type (label_id: 35) 0.00 0.00 0.00 17\n", - " person (label_id: 36) 44.86 92.31 60.38 52\n", - " personal_info (label_id: 37) 0.00 0.00 0.00 20\n", - " place_name (label_id: 38) 71.25 77.03 74.03 148\n", - " player_setting (label_id: 39) 0.00 0.00 0.00 1\n", - " playlist_name (label_id: 40) 0.00 0.00 0.00 1\n", - " podcast_descriptor (label_id: 41) 0.00 0.00 0.00 13\n", - " podcast_name (label_id: 42) 0.00 0.00 0.00 4\n", - " radio_name (label_id: 43) 66.67 10.53 18.18 38\n", - " relation (label_id: 44) 0.00 0.00 0.00 17\n", - " song_name 
(label_id: 45) 0.00 0.00 0.00 22\n", - " time (label_id: 46) 70.27 78.20 74.02 133\n", - " time_zone (label_id: 47) 0.00 0.00 0.00 9\n", - " timeofday (label_id: 48) 0.00 0.00 0.00 28\n", - " transport_agency (label_id: 49) 0.00 0.00 0.00 9\n", - " transport_descriptor (label_id: 50) 0.00 0.00 0.00 0\n", - " transport_name (label_id: 51) 0.00 0.00 0.00 4\n", - " transport_type (label_id: 52) 78.38 82.86 80.56 35\n", - " weather_descriptor (label_id: 53) 0.00 0.00 0.00 17\n", - " O (label_id: 54) 92.42 98.80 95.50 5920\n", - " -------------------\n", - " micro avg 89.10 89.10 89.10 7199\n", - " macro avg 21.86 18.56 18.18 7199\n", - " weighted avg 84.42 89.10 86.01 7199\n", - "```" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## Evaluation\n", - "To see how the model performs, we can evaluate the performance of the trained model on a test data file. Here we will reload the model from the `.nemo` file saved during training. By default, the `.nemo` file contains the final checkpoint. We will use the same trainer for testing." - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "# extract the path of the best checkpoint from the training, you may update it to any other saved checkpoint file\n", - "checkpoint_path = trainer.checkpoint_callback.best_model_path\n", - "\n", - "# load the model from this checkpoint\n", - "eval_model = nemo_nlp.models.IntentSlotClassificationModel.load_from_checkpoint(checkpoint_path=checkpoint_path)" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "# we will setup testing data reusing the same config (test section)\n", - "eval_model.setup_test_data(test_data_config=config.model.test_ds)\n", - "\n", - "# run the evaluation on the test dataset\n", - "trainer.test(model=eval_model, ckpt_path=None, verbose=False)" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## Inference from Examples\n", - "Next step to see how the trained model will classify Intents and Slots for given queries from this domain. To improve the predictions you may need to train the model for more than 5 epochs.\n" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "queries = [\n", - " 'set alarm for seven thirty am',\n", - " 'lower volume by fifty percent',\n", - " 'what is my schedule for tomorrow',\n", - "]\n", - "\n", - "pred_intents, pred_slots = eval_model.predict_from_examples(queries, config.model.test_ds)\n", - "\n", - "logging.info('The prediction results of some sample queries with the trained model:')\n", - "for query, intent, slots in zip(queries, pred_intents, pred_slots):\n", - " logging.info(f'Query : {query}')\n", - " logging.info(f'Predicted Intent: {intent}')\n", - " logging.info(f'Predicted Slots: {slots}')" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## Training Script\n", - "\n", - "If you have NeMo installed locally (eg. 
cloned from the Github), you can also train the model with the example script: `examples/nlp/intent_slot_classification/intent_slot_classification.py.`\n", - "This script contains an example on how to train, evaluate and perform inference with the IntentSlotClassificationModel.\n", - "\n", - "To run a training script, use:\n", - "\n", - "`cd examples/nlp/intent_slot_classification`\n", - "\n", - "`python intent_slot_classification.py model.data_dir=PATH_TO_DATA_DIR`\n", - "\n", - "By default, this script uses examples/nlp/intent_slot_classification/conf/intent_slot_classification_config.py config file, and you may update all the params inside of this config file or alternatively providing them in the command line." - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "\n", - "# Multi-Label Intent Classification\n", - "---\n", - "\n", - "As mentioned above, our multi-label model will be very similar the single intent classification model, with the added functionality of predicting multiple different intents for a single query. For example, the query `show all flights and fares from denver to san francisco` would have intents `atis_airfare` and `atis_flight`. From our list of intents found in `dict.intents.csv`, the model checks whether each individual intent is suitable for the given query.\n", - "\n", - "For this tutorial, we will be using the ATIS (Airline Travel Information System) dataset, converting it to a multi-label data format, and then using the new data to train our model.\n", - "\n" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## Download the dataset and convert it to the NeMo format" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "# download the converter files from github for the purpose of this tutorial\n", - "DATA_DIR = './multiatis'\n", - "NEMO_DIR = './atis'\n", - "\n", - "!mkdir {DATA_DIR}\n", - "!mkdir {NEMO_DIR}\n", - "\n", - "\n", - "files = [f'https://raw.githubusercontent.com/howl-anderson/ATIS_dataset/master/data/raw_data/ms-cntk-atis/atis.dict.intent.csv', \n", - " f'https://raw.githubusercontent.com/howl-anderson/ATIS_dataset/master/data/raw_data/ms-cntk-atis/atis.dict.slots.csv',\n", - " f'https://raw.githubusercontent.com/howl-anderson/ATIS_dataset/master/data/raw_data/ms-cntk-atis/atis.dict.vocab.csv',\n", - " f'https://raw.githubusercontent.com/howl-anderson/ATIS_dataset/master/data/raw_data/ms-cntk-atis/atis.test.intent.csv',\n", - " f'https://raw.githubusercontent.com/howl-anderson/ATIS_dataset/master/data/raw_data/ms-cntk-atis/atis.test.pkl', \n", - " f'https://raw.githubusercontent.com/howl-anderson/ATIS_dataset/master/data/raw_data/ms-cntk-atis/atis.test.query.csv',\n", - " f'https://raw.githubusercontent.com/howl-anderson/ATIS_dataset/master/data/raw_data/ms-cntk-atis/atis.test.slots.csv', \n", - " f'https://raw.githubusercontent.com/howl-anderson/ATIS_dataset/master/data/raw_data/ms-cntk-atis/atis.train.intent.csv',\n", - " f'https://raw.githubusercontent.com/howl-anderson/ATIS_dataset/master/data/raw_data/ms-cntk-atis/atis.train.pkl',\n", - " f'https://raw.githubusercontent.com/howl-anderson/ATIS_dataset/master/data/raw_data/ms-cntk-atis/atis.train.query.csv',\n", - " f'https://raw.githubusercontent.com/howl-anderson/ATIS_dataset/master/data/raw_data/ms-cntk-atis/atis.train.slots.csv']\n", - "\n", - " \n", - "for file in files:\n", - " wget.download(file, DATA_DIR)\n", - "\n", - "\n", - "# download the converter files from github for 
the purpose of this tutorial\n", - "wget.download(f'https://raw.githubusercontent.com/NVIDIA/NeMo/{BRANCH}/scripts/dataset_processing/nlp/intent_and_slot/import_datasets.py', NEMO_DIR)\n", - "wget.download(f'https://raw.githubusercontent.com/NVIDIA/NeMo/{BRANCH}/scripts/dataset_processing/nlp/intent_and_slot/assistant_utils.py', NEMO_DIR)\n", - "wget.download(f'https://raw.githubusercontent.com/NVIDIA/NeMo/{BRANCH}/scripts/dataset_processing/nlp/intent_and_slot/convert_datasets.py', NEMO_DIR)\n", - "\n", - "# Get original atis dataset\n", - "!python {NEMO_DIR}/import_datasets.py --dataset_name=atis --source_data_dir={DATA_DIR} --target_data_dir={DATA_DIR}/nemo_format\n", - "# Script will create new files at {DATA_DIR}/new_format\n", - "!mkdir {DATA_DIR}/new_format\n", - "!python {NEMO_DIR}/convert_datasets.py --source_data_dir={DATA_DIR}/nemo_format --target_data_dir={DATA_DIR}/new_format" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## Data Augmentation (Optional)\n", - "---\n", - "\n", - "In scenarios when we don't have many training examples with multiple intent labels, data augmentation can be very useful. This can be done by concatenating utterances together, and adding it to our training data. Some ways of concatenating include adding a period or \\\"and\\\" between the two utterances. A script has been provided below to help with augmentation, but it can be changed depending on your use case." - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "# download the data augmentation script\n", - "wget.download(f'https://raw.githubusercontent.com/NVIDIA/NeMo/{BRANCH}/scripts/dataset_processing/nlp/intent_and_slot/augment_training_data.py', NEMO_DIR)" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "The script augment_training_data.py allows for four command line arguments to be passed in: \n", - "\n", - "source_data_dir: directory that contains the original multi-label data
\n", - "target_data_dir: directory to store the new data directory
\n", - "num_mixed: number of new utterances to add to dataset per class pair (utterances with labels 1 and 2)
\n", - "link_string: string that is in between the two utterances (\".\", \"\", \"and\", \"with\")
" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "!python {NEMO_DIR}/augment_training_data.py --source_data_dir={DATA_DIR}/new_format --target_data_dir={DATA_DIR}/augmented_data --num_mixed=10" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## Training the Model" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "# download the model config file from repository for the purpose of this example\n", - "wget.download(f'https://raw.githubusercontent.com/NVIDIA/NeMo/{BRANCH}/examples/nlp/intent_slot_classification/conf/multi_label_intent_slot_classification_config.yaml', NEMO_DIR)\n", - "\n", - "# print content of the config file\n", - "config_file = f\"{NEMO_DIR}/multi_label_intent_slot_classification_config.yaml\"\n", - "print(config_file)\n", - "config = OmegaConf.load(config_file)\n", - "print(OmegaConf.to_yaml(config))" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "config.model.data_dir = f\"{DATA_DIR}/new_format\"\n", - "config.model.validation_ds.prefix = \"dev\"\n", - "config.model.test_ds.prefix = \"dev\"\n", - "config.model.class_balancing = \"weighted_loss\"\n", - "config.trainer.max_epochs = 5\n", - "run_name = \"test\"\n", - "\n", - "# checks if we have GPU available and uses it\n", - "accelerator = 'gpu' if torch.cuda.is_available() else 'cpu'\n", - "config.trainer.devices = 1\n", - "config.trainer.accelerator = accelerator\n", - "\n", - "# remove distributed training flags\n", - "config.trainer.strategy = 'auto'\n", - "\n", - "trainer = pl.Trainer(**config.trainer)\n", - "config.exp_manager.exp_dir = os.path.join(DATA_DIR, \"output/\" + run_name)\n", - "config.exp_manager.create_checkpoint_callback = True\n", - "\n", - "exp_dir = exp_manager(trainer, config.get(\"exp_manager\", None))\n", - "model = nemo_nlp.models.MultiLabelIntentSlotClassificationModel(config.model, trainer=trainer)\n", - "trainer.fit(model)" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## Evaluation" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "To see how the model performs, we can evaluate the performance of the trained model on a test data file. Here we will reload the model from the `.nemo` file saved during training. By default, the `.nemo` file contains the final checkpoint. We will use the same trainer for testing." - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "# specify checkpoint path with .nemo file\n", - "checkpoint_path = os.path.join(exp_dir, \"checkpoints\", \"MultiLabelIntentSlot.nemo\")\n", - "\n", - "# load the model from this checkpoint\n", - "eval_model = nemo_nlp.models.MultiLabelIntentSlotClassificationModel.restore_from(checkpoint_path)" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "### Optimizing Threshold\n", - "\n", - "As mentioned above, when classifying a given query such as `show all flights and fares from denver to san francisco`, our model checks whether each individual intent would be suitable. Before assigning the final labels for a query, the model assigns a probability an intent matches the query. For example, if our `dict.intents.csv` had 5 different intents, then the model could output for a given query \\[0.52, 0.38, 0.21, 0.67. 
0.80\\] where each value represents the probability that query matches that particular intent. \n", - "\n", - "We need to use these probabilities to generate final label predictions of 0 or 1 for each label. While we can use 0.5 as the probability threshold, it is usually the case that there is a better threshold to use depending on the metric we want to optimize. For this tutorial, we will be finding the threshold that gives us the best micro-F1 score on the validation set. After running the `optimize_threshold` method, the threshold attribute for our model will be updated." - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "eval_model.optimize_threshold(config.model.test_ds, 'dev')" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "eval_model.threshold" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "### Inference from Examples\n", - "Similar to the previous example we can run inference to see how the trained model will classify Intents and Slots for given queries from this domain. To improve the predictions you may need to train the model for more than 10 epochs.\n" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "queries = [\n", - " 'i would like to find a flight from charlotte to las vegas that makes a stop in st. louis',\n", - " 'on april first i need a ticket from tacoma to san jose departing before 7 am',\n", - " 'how much is the limousine service in boston',\n", - "]\n", - "\n", - "# We use the optimized threshold for predictions\n", - "pred_intents, pred_slots, pred_list = eval_model.predict_from_examples(queries, config.model.test_ds)\n", - "logging.info('The prediction results of some sample queries with the trained model:')\n", - " \n", - "for query, intent, slots in zip(queries, pred_intents, pred_slots):\n", - " logging.info(f'Query : {query}')\n", - " logging.info(f'Predicted Intents: {intent}')\n", - " logging.info(f'Predicted Slots: {slots}')" - ] - } - ], - "metadata": { - "kernelspec": { - "display_name": "Python 3 (ipykernel)", - "language": "python", - "name": "python3" - }, - "language_info": { - "codemirror_mode": { - "name": "ipython", - "version": 3 - }, - "file_extension": ".py", - "mimetype": "text/x-python", - "name": "python", - "nbconvert_exporter": "python", - "pygments_lexer": "ipython3", - "version": "3.8.12" - } - }, - "nbformat": 4, - "nbformat_minor": 1 -} From b53e8ba373adfbc39840a16a9c4bf98d572f1959 Mon Sep 17 00:00:00 2001 From: Pablo Garay Date: Sat, 19 Oct 2024 08:23:11 -0700 Subject: [PATCH 18/37] [Nemo CICD] Remove deprecated tests (#10960) * remove deprecated tutorial Signed-off-by: dimapihtar * remove deprecated ci tests Signed-off-by: dimapihtar * add deprecation note Signed-off-by: dimapihtar * add deprecation note Signed-off-by: dimapihtar * remove bart tests Signed-off-by: dimapihtar * Remove deleted CI tests --------- Signed-off-by: dimapihtar Signed-off-by: Pablo Garay Co-authored-by: dimapihtar --- .github/workflows/cicd-main.yml | 2 -- 1 file changed, 2 deletions(-) diff --git a/.github/workflows/cicd-main.yml b/.github/workflows/cicd-main.yml index b576ddfd4d50..4d201b9c55a8 100644 --- a/.github/workflows/cicd-main.yml +++ b/.github/workflows/cicd-main.yml @@ -4370,8 +4370,6 @@ jobs: - L2_Segmentation_Tool_Parallel_ctc_segmentation_test_L2_Ru_QN_with_mp3 - L2_G2P_Models_G2P_Conformer_training_evaluation_and_inference 
- L2_G2P_Models_HeteronymClassificationModel_training_evaluation_and_inference - - L2_Intent_and_Slot_Classification_Tasks_Intent_and_Slot_Classification - - L2_Intent_and_Slot_Classification_Tasks_Multi-Label_Intent_and_Slot_Classification - L2_Pretraining_BERT_pretraining_from_Text - L2_Pretraining_BERT_from_Preprocessed - L2_NMT_Attention_is_All_You_Need_Training_NMT_Training_Post-LN From 8949924d7e28dacfa5a573ee1e92c64cf8beb7c9 Mon Sep 17 00:00:00 2001 From: Adi Renduchintala Date: Sat, 19 Oct 2024 14:33:59 -0700 Subject: [PATCH 19/37] Adithyare/oai chat completion (#10785) * updates Signed-off-by: adithyare * open ai chat completion wip Signed-off-by: adithyare * responding with model responses Signed-off-by: adithyare * Apply isort and black reformatting Signed-off-by: arendu * also support general completion Signed-off-by: adithyare * Apply isort and black reformatting Signed-off-by: arendu --------- Signed-off-by: adithyare Signed-off-by: arendu Co-authored-by: arendu --- .../conf/megatron_gpt_finetuning_config.yaml | 7 + .../modules/common/text_generation_server.py | 210 +++++++++++++++++- 2 files changed, 215 insertions(+), 2 deletions(-) diff --git a/examples/nlp/language_modeling/tuning/conf/megatron_gpt_finetuning_config.yaml b/examples/nlp/language_modeling/tuning/conf/megatron_gpt_finetuning_config.yaml index 06551f46486c..79a07ce4e2c0 100644 --- a/examples/nlp/language_modeling/tuning/conf/megatron_gpt_finetuning_config.yaml +++ b/examples/nlp/language_modeling/tuning/conf/megatron_gpt_finetuning_config.yaml @@ -126,6 +126,13 @@ model: tunable_base_param_names: ["self_attention", "word_embeddings"] # TODO: regex support @adithyre data: + chat: False # whether use chatbot data or not + chat_prompt_tokens: # special tokens for the chat prompts, a dictionary of {token_type: token}. note that some tokenizer may combine the characters at the junction between {end_of_turn}{turn_start}. e.g. '', the '><' sometimes is merged to be a single token. 
This is not supported, try to avoid + system_turn_start: "\x00" + turn_start: "\x11" + label_start: "\x12" + end_of_turn: "\x0A" # \0x0A is '\n' + end_of_name: "\x0A" # \0x0A is '\n' train_ds: # Example of how to specify paths to multiple datasets # file_names: diff --git a/nemo/collections/nlp/modules/common/text_generation_server.py b/nemo/collections/nlp/modules/common/text_generation_server.py index 6c257317b99f..3f8e34b94134 100644 --- a/nemo/collections/nlp/modules/common/text_generation_server.py +++ b/nemo/collections/nlp/modules/common/text_generation_server.py @@ -15,11 +15,17 @@ import json import threading +import time +import uuid import torch from flask import Flask, jsonify, request from flask_restful import Api, Resource +from nemo.collections.nlp.data.language_modeling.megatron.gpt_sft_chat_dataset import ( + _get_header_conversation_type_mask_role, + get_prompt_template_example, +) from nemo.collections.nlp.modules.common.retro_inference_strategies import ( RetroModelTextGenerationStrategy, RetroQAModelTextGenerationStrategy, @@ -61,6 +67,189 @@ def send_do_generate(): choice = torch.cuda.LongTensor([GENERATE_NUM]) torch.distributed.broadcast(choice, 0) + def convert_messages(self, input_list): + output_dict = { + 'system': '', + 'conversations': [], + 'mask': 'User', + 'type': 'VALUE_TO_TEXT', + } + + # Extract the system message + for msg in input_list: + if msg['role'] == 'system': + output_dict['system'] = msg['content'] + break # Assuming only one system message + + # Build the conversations list + for msg in input_list: + if msg['role'] != 'system': + conversation_entry = { + 'from': msg['role'].capitalize(), # Capitalize 'user' and 'assistant' + 'value': msg['content'], + 'label': None, + } + output_dict['conversations'].append(conversation_entry) + + return output_dict + + def completion(self, data): + output_sentence = "" + with lock: # Need to get lock to keep multiple threads from hitting code + MegatronGenerate.send_do_generate() # Tell other ranks we're doing generate + extra = {} + if self.inference_strategy is not None: + extra['strategy'] = self.inference_strategy + + all_probs = False + add_BOS = False + top_p = data.get("top_p", 1.0) + top_k = data.get("top_k", 0) + max_tokens = data.get("max_tokens", 32) + temperature = data.get("temperature", 0.0) + logprobs = data.get("logprobs", False) + greedy = temperature == 0.0 + end_strings = ['<|endoftext|>'] + data.get("end_strings", []) + prompt = data["prompt"] + random_seed = data.get("seed", 1234) + + output = generate( + self.model, + [prompt], + tokens_to_generate=max_tokens, + all_probs=all_probs, + temperature=temperature, + add_BOS=add_BOS, + top_k=top_k, + top_p=top_p, + greedy=greedy, + repetition_penalty=1.0, + end_strings=end_strings, + min_tokens_to_generate=0, + compute_logprob=logprobs, + random_seed=random_seed, + **extra, + ) + for k in output: + if isinstance(output[k], torch.Tensor): + output[k] = output[k].tolist() + + output_sentence = output['sentences'][0][len(prompt) :] + tokens = output['tokens'][0] + logprobs = output['logprob'][0] if output['logprob'] is not None else None + num_prompt_tokens = len(prompt.split()) + num_output_sentence = len(output_sentence.split()) + + return jsonify( + { + "choices": [ + { + "finish_reason": "", + "index": 0, + "logprobs": logprobs, + "text": output_sentence, + "tokens": tokens, + } + ], + "created": int(time.time()), + "id": f"cmpl-{uuid.uuid4()}", + "model": "nemo model", + "object": "text_completion", + "usage": { + "completion_tokens": 
num_output_sentence, + "prompt_tokens": num_prompt_tokens, + "total_tokens": num_output_sentence + num_prompt_tokens, + }, + } + ) + + def chat_completion(self, data): + data['messages'] = data['messages'] + [ + {'role': 'assistant', 'content': ''} + ] # adding trailing assistant message so that prompt ends with Assistant tag. + special_tokens = self.model.cfg.data.chat_prompt_tokens + nemo_source = self.convert_messages(data['messages']) + header, conversation, data_type, mask_role = _get_header_conversation_type_mask_role( + nemo_source, special_tokens + ) + len_strip = len(special_tokens['end_of_turn'] + special_tokens['turn_start']) + conversation = conversation[:-len_strip] + # Return a response mimicking the OpenAI ChatCompletion API format + with lock: # Need to get lock to keep multiple threads from hitting code + MegatronGenerate.send_do_generate() # Tell other ranks we're doing generate + extra = {} + if self.inference_strategy is not None: + extra['strategy'] = self.inference_strategy + + all_probs = False + add_BOS = False + top_k = 0 + greedy = data['temperature'] == 0.0 + logprobs = data.get("logprobs", False) + end_strings = ['<|endoftext|>', special_tokens['turn_start'], special_tokens['label_start']] + random_seed = None + + output = generate( + self.model, + [conversation], + data.get('max_tokens', 32), + all_probs=all_probs, + temperature=data.get('temperature', 1.0), + add_BOS=add_BOS, + top_k=top_k, + top_p=data.get("top_p", 0.95), + greedy=greedy, + repetition_penalty=1.0, + end_strings=end_strings, + min_tokens_to_generate=0, + compute_logprob=logprobs, + random_seed=random_seed, + **extra, + ) + for k in output: + if isinstance(output[k], torch.Tensor): + output[k] = output[k].tolist() + + output_sentence = output['sentences'][0][len(conversation) :] + tokens = output['tokens'][0] + logprobs = output['logprob'][0] if output['logprob'] is not None else None + num_prompt_tokens = len(conversation.split()) # @adithyare only produces an approx. 
number of tokens + num_output_sentence = len(output_sentence.split()) + + return jsonify( + { + "id": f"chatcmpl-{uuid.uuid4()}", + "object": "chat.completion", + "created": int(time.time()), + "model": data.get("model", "nemo model"), + "choices": [ + { + "index": 0, + "message": {"role": "assistant", "content": output_sentence}, + "logprobs": logprobs, + "tokens": tokens, + "finish_reason": "", + } + ], + "usage": { + "prompt_tokens": num_prompt_tokens, + "completion_tokens": num_output_sentence, + "total_tokens": num_output_sentence + num_prompt_tokens, + }, + } + ) + + def post(self): + # Access the request data if needed + if request.endpoint == "oai_completions": + data = request.get_json() + return self.completion(data) + elif request.endpoint == "oai_chat_completions": + data = request.get_json() + return self.chat_completion(data) + else: + raise RuntimeError("Unknown enpoint requested.") + def put(self): logging.info("request IP: " + str(request.remote_addr)) logging.info(json.dumps(request.get_json())) @@ -135,7 +324,7 @@ def put(self): if not (0.0 <= top_p <= 1.0): return "top_p must be a positive number less than or equal to 1.0" - repetition_penalty = 1.2 + repetition_penalty = 1.0 if "repetition_penalty" in request.get_json(): repetition_penalty = request.get_json()["repetition_penalty"] if not (type(repetition_penalty) == int or type(repetition_penalty) == float): @@ -231,7 +420,24 @@ class MegatronServer(object): def __init__(self, model, inference_strategy=None): self.app = Flask(__name__, static_url_path='') api = Api(self.app) - api.add_resource(MegatronGenerate, '/generate', resource_class_args=[model, inference_strategy]) + api.add_resource( + MegatronGenerate, + '/generate', + endpoint="generate", + resource_class_kwargs={"model": model, "inference_strategy": inference_strategy}, + ) + api.add_resource( + MegatronGenerate, + '/v1/completions', + endpoint="oai_completions", + resource_class_kwargs={"model": model, "inference_strategy": inference_strategy}, + ) + api.add_resource( + MegatronGenerate, + '/v1/chat/completions', + endpoint="oai_chat_completions", + resource_class_kwargs={"model": model, "inference_strategy": inference_strategy}, + ) def run(self, url, port=5000): self.app.run(url, threaded=True, port=port, debug=False) From 7a3dd6bfe17c0ff16abf04b55c37a5e56f564227 Mon Sep 17 00:00:00 2001 From: Huy Vu <86480512+huvunvidia@users.noreply.github.com> Date: Sat, 19 Oct 2024 19:36:03 -0400 Subject: [PATCH 20/37] Update megatron_t5_pretraining.py (#10952) Signed-off-by: Huy Vu <86480512+huvunvidia@users.noreply.github.com> --- tests/collections/llm/megatron_t5_pretraining.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/tests/collections/llm/megatron_t5_pretraining.py b/tests/collections/llm/megatron_t5_pretraining.py index 29d7eb2ebf2b..5d8f55a7f26f 100644 --- a/tests/collections/llm/megatron_t5_pretraining.py +++ b/tests/collections/llm/megatron_t5_pretraining.py @@ -59,16 +59,16 @@ def get_args(): paths=args.data_path, seq_length=512, seq_length_dec=128, - micro_batch_size=args.devices, - global_batch_size=2 * args.devices, + micro_batch_size=64, + global_batch_size=512, seed=1234, tokenizer=tokenizer, split="99982,9,9", index_mapping_dir=args.index_mapping_dir, ) t5_config = llm.t5.model.t5.T5Config( - num_layers=args.devices, - encoder_num_layers=args.devices, + num_layers=12, + encoder_num_layers=12, hidden_size=768, ffn_hidden_size=3072, num_attention_heads=12, From b77c74302ab81ba4e9ee21ccf800f852b6fcad96 Mon Sep 17 00:00:00 
2001 From: Hemil Desai Date: Sun, 20 Oct 2024 17:57:47 -0700 Subject: [PATCH 21/37] Convert perf plugin env vars to strings (#10947) Signed-off-by: Hemil Desai --- nemo/lightning/run/plugins.py | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/nemo/lightning/run/plugins.py b/nemo/lightning/run/plugins.py index 45905729b8b1..dfcc7c1650ce 100644 --- a/nemo/lightning/run/plugins.py +++ b/nemo/lightning/run/plugins.py @@ -287,16 +287,16 @@ def setup(self, task: run.Partial | run.Script, executor: run.Executor): tp_size = task.trainer.strategy.tensor_model_parallel_size cp_size = task.trainer.strategy.context_parallel_size if tp_size > 1 and cp_size > 1: - executor.env_vars["CUDA_DEVICE_MAX_CONNECTIONS"] = 1 + executor.env_vars["CUDA_DEVICE_MAX_CONNECTIONS"] = "1" # Set LayerNorm SM margin to support the overlap with LayerNorm kernel if self.enable_layernorm_sm_margin: - executor.env_vars["NVTE_FWD_LAYERNORM_SM_MARGIN"] = self.layernorm_sm_margin - executor.env_vars["NVTE_BWD_LAYERNORM_SM_MARGIN"] = self.layernorm_sm_margin + executor.env_vars["NVTE_FWD_LAYERNORM_SM_MARGIN"] = str(self.layernorm_sm_margin) + executor.env_vars["NVTE_BWD_LAYERNORM_SM_MARGIN"] = str(self.layernorm_sm_margin) # Force Transformer Engine to use cuDNN attention over HazyResearch's Flash Attention - executor.env_vars["NVTE_FLASH_ATTN"] = 0 - executor.env_vars["NVTE_FUSED_ATTN"] = 1 + executor.env_vars["NVTE_FLASH_ATTN"] = "0" + executor.env_vars["NVTE_FUSED_ATTN"] = "1" # Improve perf by steering power to tensor cores, may not work on all systems if self.enable_vboost and isinstance(executor, run.SlurmExecutor): From dab509fceea21d7072e2e626b9c2bf7315038b2f Mon Sep 17 00:00:00 2001 From: Alexandros Koumparoulis <153118171+akoumpa@users.noreply.github.com> Date: Mon, 21 Oct 2024 01:18:12 -0700 Subject: [PATCH 22/37] disable dynamo for ddp checker (#10961) Signed-off-by: Alexandros Koumparoulis --- .github/workflows/cicd-main.yml | 11 +++++------ 1 file changed, 5 insertions(+), 6 deletions(-) diff --git a/.github/workflows/cicd-main.yml b/.github/workflows/cicd-main.yml index 4d201b9c55a8..345482e9a1a8 100644 --- a/.github/workflows/cicd-main.yml +++ b/.github/workflows/cicd-main.yml @@ -3887,7 +3887,7 @@ jobs: rm -rf tests/collections/llm/gpt_pretrain_results rm -rf tests/collections/llm/gpt_index_mappings - OPTIONAL_L2_NeMo_2_GPT_DDP_Param_Parity_check: + L2_NeMo_2_GPT_DDP_Param_Parity_check: needs: [cicd-test-container-setup] uses: ./.github/workflows/_test_template.yml if: contains(fromJSON(needs.cicd-test-container-setup.outputs.test_to_run), 'OPTIONAL_L2_NeMo_2_GPT_DDP_Param_Parity_check') || needs.cicd-test-container-setup.outputs.all == 'true' @@ -3895,7 +3895,7 @@ jobs: RUNNER: self-hosted-azure SCRIPT: | - python tests/lightning/test_ddp_parity_checker.py \ + TORCHDYNAMO_DISABLE=1 python tests/lightning/test_ddp_parity_checker.py \ --vocab-path=/home/TestData/nlp/megatron_gpt/data/gpt/vocab.json \ --merges-path=/home/TestData/nlp/megatron_gpt/data/gpt/merges.txt \ --data-path=/home/TestData/nlp/megatron_gpt/data/gpt/simple_wiki_gpt_preproc_text_document @@ -3903,8 +3903,7 @@ jobs: AFTER_SCRIPT: | rm -rf tests/collections/llm/gpt_pretrain_results rm -rf tests/collections/llm/gpt_index_mappings - IS_OPTIONAL: true - + L2_NeMo_2_SSM_Pretraining: needs: [cicd-test-container-setup] uses: ./.github/workflows/_test_template.yml @@ -4423,7 +4422,7 @@ jobs: - Speech_Checkpoints_tests - L2_Stable_Diffusion_Training - L2_NeMo_2_GPT_Pretraining_no_transformer_engine - #- 
OPTIONAL_L2_NeMo_2_GPT_DDP_Param_Parity_check + - L2_NeMo_2_GPT_DDP_Param_Parity_check - L2_NeMo_2_HF_MODEL_IMPORT - L2_NeMo_2_SSM_Pretraining - L2_NeMo_2_SSM_Finetuning @@ -4587,4 +4586,4 @@ jobs: - name: "Pipeline not successful, set exit code to 1" if: ${{ always() && steps.pipeline-conclusion.outputs.SUCCESS == 'false' }} - run: exit 1 \ No newline at end of file + run: exit 1 From cec3e0a6a0161e0bd87eb90e257a56d2dd1dc761 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?oliver=20k=C3=B6nig?= Date: Mon, 21 Oct 2024 12:40:40 +0200 Subject: [PATCH 23/37] =?UTF-8?q?[=F0=9F=A4=A0]:=20Howdy=20folks,=20let's?= =?UTF-8?q?=20bump=20`Dockerfile.ci`=20to=20db7d37b=20!=20(#10965)?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Signed-off-by: github-actions[bot] <41898282+github-actions[bot]@users.noreply.github.com> Co-authored-by: pablo-garay <7166088+pablo-garay@users.noreply.github.com> --- Dockerfile.ci | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Dockerfile.ci b/Dockerfile.ci index dbcd92cfcb65..f01025873628 100644 --- a/Dockerfile.ci +++ b/Dockerfile.ci @@ -53,7 +53,7 @@ RUN pip install nemo_run@git+https://github.com/NVIDIA/NeMo-Run.git@${NEMO_RUN_T # Install NeMo requirements ARG TE_TAG=7d576ed25266a17a7b651f2c12e8498f67e0baea ARG MODELOPT_VERSION=0.17.0 -ARG MCORE_TAG=0d89fc4c0d4394f915fffff11212d6957652337f +ARG MCORE_TAG=db7d37b54ef96e35f7afc56e29fffb60f5c957b9 ARG APEX_TAG=810ffae374a2b9cb4b5c5e28eaeca7d7998fca0c RUN \ From e8a801b58ea64f7ddb61c30e3bb71e966f8c103c Mon Sep 17 00:00:00 2001 From: Alexandros Koumparoulis <153118171+akoumpa@users.noreply.github.com> Date: Mon, 21 Oct 2024 04:20:58 -0700 Subject: [PATCH 24/37] Mistral-NeMo-12B recipe (#10607) * Mistral-NeMo-12B recipe Signed-off-by: Alexandros Koumparoulis * rename mistral to mistral_7b Signed-off-by: Alexandros Koumparoulis * include mistral_nemo_12b in __init__ Signed-off-by: Alexandros Koumparoulis * Apply isort and black reformatting Signed-off-by: akoumpa * add to __init__ Signed-off-by: Alexandros Koumparoulis * Apply isort and black reformatting Signed-off-by: akoumpa * Remove stale imports Signed-off-by: Alexandros Koumparoulis * TP=2 Signed-off-by: Alexandros Koumparoulis * remove finetune_reci[e Signed-off-by: Alexandros Koumparoulis * Rename MistralNeMo2407Config12B to MistralNeMoConfig12B per review's suggestion Signed-off-by: Alexandros Koumparoulis * update config names in tests Signed-off-by: Alexandros Koumparoulis * mistral-nemo-12b from llama_8b Signed-off-by: Alexandros Koumparoulis * TP=2; SP=True Signed-off-by: Alexandros Koumparoulis * fix overlap value Signed-off-by: Alexandros Koumparoulis * Apply isort and black reformatting Signed-off-by: akoumpa * update mistral-nemo-base-12b finetune recipe Signed-off-by: Alexandros Koumparoulis * Apply isort and black reformatting Signed-off-by: akoumpa --------- Signed-off-by: Alexandros Koumparoulis Signed-off-by: akoumpa Co-authored-by: akoumpa --- nemo/collections/llm/__init__.py | 2 + nemo/collections/llm/gpt/model/__init__.py | 2 +- nemo/collections/llm/gpt/model/mistral.py | 4 +- nemo/collections/llm/recipes/__init__.py | 6 +- .../llm/recipes/{mistral.py => mistral_7b.py} | 2 +- .../llm/recipes/mistral_nemo_12b.py | 285 ++++++++++++++++++ .../collections/llm/gpt/model/test_mistral.py | 6 +- tests/collections/llm/recipes/test_mistral.py | 2 +- tests/lightning/test_nemo_run.py | 4 +- 9 files changed, 301 insertions(+), 12 deletions(-) rename nemo/collections/llm/recipes/{mistral.py => mistral_7b.py} 
(99%) create mode 100644 nemo/collections/llm/recipes/mistral_nemo_12b.py diff --git a/nemo/collections/llm/__init__.py b/nemo/collections/llm/__init__.py index 5ddbcf5913ad..4205c401eea8 100644 --- a/nemo/collections/llm/__init__.py +++ b/nemo/collections/llm/__init__.py @@ -70,6 +70,7 @@ MaskedTokenLossReduction, MistralConfig7B, MistralModel, + MistralNeMoConfig12B, MixtralConfig8x3B, MixtralConfig8x7B, MixtralConfig8x22B, @@ -115,6 +116,7 @@ "t5_forward_step", "MaskedTokenLossReduction", "MistralConfig7B", + "MistralNeMoConfig12B", "MistralModel", "MixtralConfig8x3B", "MixtralConfig8x7B", diff --git a/nemo/collections/llm/gpt/model/__init__.py b/nemo/collections/llm/gpt/model/__init__.py index aa3615b3ddfd..ebecc06140fe 100644 --- a/nemo/collections/llm/gpt/model/__init__.py +++ b/nemo/collections/llm/gpt/model/__init__.py @@ -53,7 +53,7 @@ LlamaConfig, LlamaModel, ) -from nemo.collections.llm.gpt.model.mistral import MistralConfig7B, MistralModel +from nemo.collections.llm.gpt.model.mistral import MistralConfig7B, MistralModel, MistralNeMoConfig12B from nemo.collections.llm.gpt.model.mixtral import ( MixtralConfig8x3B, MixtralConfig8x7B, diff --git a/nemo/collections/llm/gpt/model/mistral.py b/nemo/collections/llm/gpt/model/mistral.py index f353362c9cbd..b9f4b6fb8f65 100644 --- a/nemo/collections/llm/gpt/model/mistral.py +++ b/nemo/collections/llm/gpt/model/mistral.py @@ -59,7 +59,7 @@ class MistralConfig7B(GPTConfig): @dataclass -class MistralNeMo2407Config12B(MistralConfig7B): +class MistralNeMoConfig12B(MistralConfig7B): """ https://mistral.ai/news/mistral-nemo/ """ @@ -75,7 +75,7 @@ class MistralNeMo2407Config12B(MistralConfig7B): @dataclass -class MistralNeMo2407Config123B(MistralConfig7B): +class MistralNeMoConfig123B(MistralConfig7B): """ https://mistral.ai/news/mistral-large-2407/ """ diff --git a/nemo/collections/llm/recipes/__init__.py b/nemo/collections/llm/recipes/__init__.py index 6bee8c882ffd..7a21633b79ec 100644 --- a/nemo/collections/llm/recipes/__init__.py +++ b/nemo/collections/llm/recipes/__init__.py @@ -21,7 +21,8 @@ llama3_70b_16k, llama3_70b_64k, llama31_405b, - mistral, + mistral_7b, + mistral_nemo_12b, mixtral_8x7b, mixtral_8x7b_16k, mixtral_8x7b_64k, @@ -48,7 +49,8 @@ "llama3_70b_16k", "llama3_70b_64k", "llama31_405b", - "mistral", + "mistral_7b", + "mistral_nemo_12b", "mixtral_8x7b", "mixtral_8x7b_16k", "mixtral_8x7b_64k", diff --git a/nemo/collections/llm/recipes/mistral.py b/nemo/collections/llm/recipes/mistral_7b.py similarity index 99% rename from nemo/collections/llm/recipes/mistral.py rename to nemo/collections/llm/recipes/mistral_7b.py index 2b8c42e54ee7..6e82df598140 100644 --- a/nemo/collections/llm/recipes/mistral.py +++ b/nemo/collections/llm/recipes/mistral_7b.py @@ -33,7 +33,7 @@ from nemo.collections.llm.recipes.precision.mixed_precision import bf16_mixed from nemo.utils.exp_manager import TimingCallback -NAME = "mistral" +NAME = "mistral_7b" @run.cli.factory(name=NAME) diff --git a/nemo/collections/llm/recipes/mistral_nemo_12b.py b/nemo/collections/llm/recipes/mistral_nemo_12b.py new file mode 100644 index 000000000000..e74fa5435b62 --- /dev/null +++ b/nemo/collections/llm/recipes/mistral_nemo_12b.py @@ -0,0 +1,285 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + + +from typing import Callable, Optional + +import nemo_run as run +import pytorch_lightning as pl +import torch +from megatron.core.distributed import DistributedDataParallelConfig +from pytorch_lightning.callbacks.callback import Callback + +from nemo import lightning as nl +from nemo.collections.llm.api import finetune, pretrain +from nemo.collections.llm.gpt.data.mock import MockDataModule +from nemo.collections.llm.gpt.data.squad import SquadDataModule +from nemo.collections.llm.gpt.model.mistral import MistralModel, MistralNeMoConfig12B +from nemo.collections.llm.peft.lora import LoRA +from nemo.collections.llm.recipes.finetune_default import default_finetune_recipe +from nemo.collections.llm.recipes.log.default import default_log, default_resume, tensorboard_logger +from nemo.collections.llm.recipes.optim.adam import distributed_fused_adam_with_cosine_annealing +from nemo.collections.llm.recipes.precision.mixed_precision import bf16_mixed +from nemo.lightning.pytorch.callbacks.megatron_comm_overlap import MegatronCommOverlapCallback +from nemo.utils.exp_manager import TimingCallback + +NAME = "mistral_nemo_base_12b" + + +@run.cli.factory(name=NAME) +def model() -> run.Config[pl.LightningModule]: + """ + Factory function to create a Mistral-Nemo-Base-12B model configuration. + + Returns: + run.Config[pl.LightningModule]: Configuration for the Mistral-Nemo-Base-12B model. + + Examples: + CLI usage: + $ nemo llm pretrain model=mistral_nemo_base_12b ... + + Python API usage: + >>> model_config = model() + >>> print(model_config) + """ + return run.Config(MistralModel, config=run.Config(MistralNeMoConfig12B)) + + +def trainer( + tensor_parallelism: int = 2, + pipeline_parallelism: int = 1, + pipeline_parallelism_type: Optional[torch.dtype] = None, + virtual_pipeline_parallelism: Optional[int] = None, + context_parallelism: int = 2, + sequence_parallelism: bool = True, + num_nodes: int = 1, + num_gpus_per_node: int = 8, + max_steps: int = 1168251, + callbacks: Optional[list[run.Config[Callback]]] = None, +) -> run.Config[nl.Trainer]: + """ + Configure the NeMo Lightning Trainer for Mistral-Nemo-Base-12B model. + + This function sets up the distributed training strategy and other training parameters. + + Args: + tensor_parallelism (int): Degree of tensor model parallelism. + pipeline_parallelism (int): Degree of pipeline model parallelism. + pipeline_parallelism_type (Optional[torch.dtype]): Data type for pipeline parallelism. + virtual_pipeline_parallelism (Optional[int]): Size of virtual pipeline parallelism. + context_parallelism (int): Degree of context parallelism. + sequence_parallelism (bool): Whether to use sequence parallelism. + num_nodes (int): Number of compute nodes to use. + num_gpus_per_node (int): Number of GPUs per node. + max_steps (int): Maximum number of training steps. + callbacks (Optional[list[run.Config[Callback]]]): List of callback configurations. + + Returns: + run.Config[nl.Trainer]: Configuration for the NeMo Lightning Trainer. + + Examples: + CLI usage: + $ nemo llm pretrain trainer=mistral_nemo_base_12b ... 
+ + Python API usage: + >>> trainer_config = trainer(num_nodes=2, num_gpus_per_node=8) + >>> print(trainer_config) + + Note: + For more information on distributed training strategies, refer to the + NeMo documentation on multi-GPU and multi-node training. + """ + strategy = run.Config( + nl.MegatronStrategy, + tensor_model_parallel_size=tensor_parallelism, + pipeline_model_parallel_size=pipeline_parallelism, + pipeline_dtype=pipeline_parallelism_type, + virtual_pipeline_model_parallel_size=virtual_pipeline_parallelism, + context_parallel_size=context_parallelism, + sequence_parallel=sequence_parallelism, + gradient_as_bucket_view=True, + ckpt_async_save=True, + ckpt_parallel_load=True, + ddp=run.Config( + DistributedDataParallelConfig, + check_for_nan_in_grad=True, + grad_reduce_in_fp32=True, + overlap_grad_reduce=True, + overlap_param_gather=True, + ), + ) + + trainer = run.Config( + nl.Trainer, + accelerator="gpu", + accumulate_grad_batches=1, + callbacks=callbacks, + devices=num_gpus_per_node, + limit_test_batches=50, + limit_val_batches=32, + log_every_n_steps=10, + max_steps=max_steps, + num_nodes=num_nodes, + plugins=bf16_mixed(), + strategy=strategy, + use_distributed_sampler=False, + val_check_interval=2000, + ) + + return trainer + + +@run.cli.factory(target=pretrain, name=NAME) +def pretrain_recipe( + dir: Optional[str] = None, name: str = "default", num_nodes: int = 1, num_gpus_per_node: int = 8, fn=pretrain +) -> run.Partial: + """ + Create a pre-training recipe for Mistral-Nemo-Base-12B model. + + This function sets up a complete configuration for pre-training, including + model, trainer, data, logging, optimization, and resumption settings. + + Args: + dir (Optional[str]): Directory for saving logs and checkpoints. + name (str): Name of the pre-training run. + num_nodes (int): Number of compute nodes to use. + num_gpus_per_node (int): Number of GPUs per node. + fn (Callable): The pre-training function to use. + + Returns: + run.Partial: Partial configuration for pre-training. + + Examples: + CLI usage: + $ nemo llm pretrain --factory mistral_nemo_base_12b + $ nemo llm pretrain --factory "mistral_nemo_base_12b(num_nodes=2, name='my_pretrain')" + + Python API usage: + >>> recipe = pretrain_recipe(name="mistral_nemo_base_12b", num_nodes=2) + >>> print(recipe) + + Note: + For more details on pre-training LLMs with NeMo, see the pre-training + guide in the `examples/llm/pretrain/` directory. + """ + return run.Partial( + fn, + model=model(), + trainer=trainer( + num_nodes=num_nodes, + num_gpus_per_node=num_gpus_per_node, + callbacks=[run.Config(TimingCallback)], + ), + data=run.Config(MockDataModule, seq_length=8192, global_batch_size=512, micro_batch_size=1), + log=default_log(dir=dir, name=name, tensorboard_logger=tensorboard_logger(name=name)), + optim=distributed_fused_adam_with_cosine_annealing(max_lr=3e-4), + resume=default_resume(), + ) + + +@run.cli.factory(target=pretrain, name=NAME + "_optimized") +def pretrain_recipe_performance( + dir: Optional[str] = None, + name: str = "default", + num_nodes: int = 1, + num_gpus_per_node: int = 8, + fn: Callable = pretrain, +) -> run.Partial: + """ + Create a performance-optimized pre-training recipe for Mistral-Nemo-Base-12B model. + + This recipe enables performance optimizations that may not be suitable for all use cases. + It builds upon the standard pre-training recipe and adds additional performance enhancements. + + Args: + dir (Optional[str]): Directory for saving logs and checkpoints. 
+ name (str): Name of the pre-training run. + num_nodes (int): Number of compute nodes to use. + num_gpus_per_node (int): Number of GPUs per node. + fn (Callable): The pre-training function to use. + + Returns: + run.Partial: Partial configuration for performance-optimized pre-training. + + Examples: + $ nemo llm pretrain --factory mistral_nemo_base_12b_optimized + + Python API usage: + >>> recipe = pretrain_recipe_performance(name="mistral_nemo_base_12b_perf", num_nodes=4) + >>> print(recipe) + + Note: + Use this recipe with caution and only when you need maximum performance. + It may not be suitable for all hardware configurations or use cases. + """ + recipe = pretrain_recipe(name=name, dir=dir, num_nodes=num_nodes, num_gpus_per_node=num_gpus_per_node, fn=fn) + + recipe.trainer.callbacks.append( + run.Config( + MegatronCommOverlapCallback, + tp_comm_overlap=True, + ) + ) + return recipe + + +@run.cli.factory(target=finetune, name=NAME) +def finetune_recipe( + dir: Optional[str] = None, + name: str = "default", + num_nodes: int = 1, + num_gpus_per_node: int = 8, + peft_scheme: Optional[str] = 'lora', +) -> run.Partial: + """ + Create a fine-tuning recipe for Mistral-Nemo-Base-12B model. + + This function sets up a complete configuration for fine-tuning, including + model, trainer, data, logging, optimization, and resumption settings. + The recipe uses LoRA (Low-Rank Adaptation) for efficient fine-tuning, unless peft_scheme is set to None. + + Args: + dir (Optional[str]): Directory for saving logs and checkpoints. + name (str): Name of the fine-tuning run. + num_nodes (int): Number of compute nodes to use. + num_gpus_per_node (int): Number of GPUs per node. + peft_scheme (Optional[str]): Name of the peft scheme to use for fine-tuning. Allowed values: 'lora', 'none'/None. + + Returns: + run.Partial: Partial configuration for fine-tuning. + + Examples: + CLI usage: + $ nemo llm finetune --factory mistral_nemo_base_12b + + Python API usage: + >>> recipe = finetune_recipe(name="mistral_nemo_base_12b_finetune", num_nodes=2) + >>> print(recipe) + + Note: + This recipe uses the SQuAD dataset for fine-tuning. For more information + on fine-tuning LLMs with NeMo, see the fine-tuning guide in the + `examples/llm/finetune/` directory. 
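A minimal sketch of how the recipe defined here might be launched end to end, assuming NeMo-Run's `LocalExecutor` and the default arguments above (the launch script itself is an illustration and not part of this patch):

```python
# Hypothetical launch script for the Mistral-NeMo-Base-12B LoRA fine-tuning recipe.
# Assumes NeMo-Run is installed; the choice of executor is an assumption, not mandated here.
import nemo_run as run

from nemo.collections.llm.recipes import mistral_nemo_12b

if __name__ == "__main__":
    # Build the fine-tuning recipe (LoRA is the default PEFT scheme).
    recipe = mistral_nemo_12b.finetune_recipe(
        name="mistral_nemo_base_12b_lora",
        num_nodes=1,
        num_gpus_per_node=8,
        peft_scheme="lora",
    )

    # Run on the local machine; a Slurm executor could be substituted for multi-node jobs.
    run.run(recipe, executor=run.LocalExecutor())
```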
+ """ + recipe = default_finetune_recipe( + model(), "mistralai/Mistral-Nemo-Base-2407", dir, name, num_nodes, num_gpus_per_node + ) + if peft_scheme is None or peft_scheme.lower() == 'none': + recipe.optim.config.lr = 5e-6 + elif peft_scheme.lower() == 'lora': + recipe.peft = run.Config(LoRA, target_modules=['linear_qkv', 'linear_proj'], dim=32) + recipe.optim.config.lr = 1e-4 + else: + raise ValueError(f"Unrecognized peft scheme: {peft_scheme}") + return recipe diff --git a/tests/collections/llm/gpt/model/test_mistral.py b/tests/collections/llm/gpt/model/test_mistral.py index 365bb35b2725..025ea35dd6e9 100644 --- a/tests/collections/llm/gpt/model/test_mistral.py +++ b/tests/collections/llm/gpt/model/test_mistral.py @@ -1,6 +1,6 @@ import torch.nn.functional as F -from nemo.collections.llm.gpt.model.mistral import MistralConfig7B, MistralNeMo2407Config12B, MistralNeMo2407Config123B +from nemo.collections.llm.gpt.model.mistral import MistralConfig7B, MistralNeMoConfig12B, MistralNeMoConfig123B def test_mistral_config7b(): @@ -25,7 +25,7 @@ def test_mistral_config7b(): def test_mistral_nemo_config_12b(): - config = MistralNeMo2407Config12B() + config = MistralNeMoConfig12B() assert config.normalization == "RMSNorm" assert config.activation_func == F.silu assert config.position_embedding_type == "rope" @@ -49,7 +49,7 @@ def test_mistral_nemo_config_12b(): def test_mistral_nemo_config_123b(): - config = MistralNeMo2407Config123B() + config = MistralNeMoConfig123B() assert config.normalization == "RMSNorm" assert config.activation_func == F.silu assert config.position_embedding_type == "rope" diff --git a/tests/collections/llm/recipes/test_mistral.py b/tests/collections/llm/recipes/test_mistral.py index 490f26a363fc..a7d83edcc370 100644 --- a/tests/collections/llm/recipes/test_mistral.py +++ b/tests/collections/llm/recipes/test_mistral.py @@ -6,7 +6,7 @@ from nemo.collections.llm.gpt.data.squad import SquadDataModule from nemo.collections.llm.gpt.model.mistral import MistralConfig7B, MistralModel from nemo.collections.llm.peft.lora import LoRA -from nemo.collections.llm.recipes import mistral +from nemo.collections.llm.recipes import mistral_7b as mistral from nemo.lightning import AutoResume, Trainer diff --git a/tests/lightning/test_nemo_run.py b/tests/lightning/test_nemo_run.py index 8d7814bfe530..947930c84847 100644 --- a/tests/lightning/test_nemo_run.py +++ b/tests/lightning/test_nemo_run.py @@ -17,8 +17,8 @@ ("llama3_70b_16k", "pretrain_recipe", "llama3_70b_16k_pretrain"), ("llama3_70b_64k", "pretrain_recipe", "llama3_70b_64k_pretrain"), ("llama31_405b", "pretrain_recipe", "llama31_405b_pretrain"), - ("mistral", "pretrain_recipe", "mistral_pretrain"), - ("mistral", "finetune_recipe", "mistral_finetune"), + ("mistral_7b", "pretrain_recipe", "mistral_pretrain"), + ("mistral_7b", "finetune_recipe", "mistral_finetune"), ("mixtral_8x7b", "pretrain_recipe", "mixtral_8x7b_pretrain"), ("mixtral_8x7b", "finetune_recipe", "mixtral_8x7b_finetune"), ("mixtral_8x7b_16k", "pretrain_recipe", "mixtral_8x7b_16k_pretrain"), From ff4e519dd0eaaa1d22c815382c9337442bc16215 Mon Sep 17 00:00:00 2001 From: Jason Date: Mon, 21 Oct 2024 10:37:10 -0400 Subject: [PATCH 25/37] Make nemo text processing optional in TTS (#10584) * move TN guard to better location; make guard print error message rather than throwing error Signed-off-by: Jason * Apply isort and black reformatting Signed-off-by: blisc * Forgot to add the actual normalizer Signed-off-by: Jason * Apply isort and black reformatting Signed-off-by: blisc 
--------- Signed-off-by: Jason Signed-off-by: blisc Co-authored-by: blisc --- nemo/collections/tts/data/dataset.py | 3 +- nemo/collections/tts/models/aligner.py | 26 +------ nemo/collections/tts/models/base.py | 54 +++++++++++--- nemo/collections/tts/models/fastpitch.py | 73 ++++++++++-------- nemo/collections/tts/models/mixer_tts.py | 94 +++++++++++++++--------- nemo/collections/tts/models/radtts.py | 60 +++++++-------- nemo/collections/tts/models/tacotron2.py | 29 +------- nemo/collections/tts/models/vits.py | 59 +++++++-------- 8 files changed, 213 insertions(+), 185 deletions(-) diff --git a/nemo/collections/tts/data/dataset.py b/nemo/collections/tts/data/dataset.py index 83d2b969ea91..901b4168130f 100644 --- a/nemo/collections/tts/data/dataset.py +++ b/nemo/collections/tts/data/dataset.py @@ -204,7 +204,8 @@ def __init__( self.text_normalizer_call = None elif not PYNINI_AVAILABLE: raise ImportError( - "`nemo_text_processing` is not installed, see https://github.com/NVIDIA/NeMo-text-processing for details" + "`nemo_text_processing` is not installed, see https://github.com/NVIDIA/NeMo-text-processing for details. " + "If you wish to continue without text normalization, please remove the text_normalizer part in your TTS yaml file." ) else: self.text_normalizer_call = ( diff --git a/nemo/collections/tts/models/aligner.py b/nemo/collections/tts/models/aligner.py index 72d023e9ee10..d8e65d6e6821 100644 --- a/nemo/collections/tts/models/aligner.py +++ b/nemo/collections/tts/models/aligner.py @@ -24,6 +24,7 @@ from torch import nn from nemo.collections.tts.losses.aligner_loss import BinLoss, ForwardSumLoss +from nemo.collections.tts.models.base import NeedsNormalizer from nemo.collections.tts.parts.utils.helpers import ( binarize_attention, g2p_backward_compatible_support, @@ -41,7 +42,7 @@ HAVE_WANDB = False -class AlignerModel(ModelPT): +class AlignerModel(NeedsNormalizer, ModelPT): """Speech-to-text alignment model (https://arxiv.org/pdf/2108.10447.pdf) that is used to learn alignments between mel spectrogram and text.""" def __init__(self, cfg: DictConfig, trainer: 'Trainer' = None): @@ -77,29 +78,6 @@ def __init__(self, cfg: DictConfig, trainer: 'Trainer' = None): self.bin_loss_start_ratio = cfg.bin_loss_start_ratio self.bin_loss_warmup_epochs = cfg.bin_loss_warmup_epochs - def _setup_normalizer(self, cfg): - if "text_normalizer" in cfg: - normalizer_kwargs = {} - - if "whitelist" in cfg.text_normalizer: - normalizer_kwargs["whitelist"] = self.register_artifact( - 'text_normalizer.whitelist', cfg.text_normalizer.whitelist - ) - - try: - import nemo_text_processing - - self.normalizer = instantiate(cfg.text_normalizer, **normalizer_kwargs) - except Exception as e: - logging.error(e) - raise ImportError( - "`nemo_text_processing` not installed, see https://github.com/NVIDIA/NeMo-text-processing for more details" - ) - - self.text_normalizer_call = self.normalizer.normalize - if "text_normalizer_call_kwargs" in cfg: - self.text_normalizer_call_kwargs = cfg.text_normalizer_call_kwargs - def _setup_tokenizer(self, cfg): text_tokenizer_kwargs = {} if "g2p" in cfg.text_tokenizer: diff --git a/nemo/collections/tts/models/base.py b/nemo/collections/tts/models/base.py index fe19ae75a3b3..b4b0ea9c43fa 100644 --- a/nemo/collections/tts/models/base.py +++ b/nemo/collections/tts/models/base.py @@ -18,6 +18,7 @@ from typing import List, Optional import torch +from hydra.utils import instantiate from omegaconf import DictConfig from tqdm import tqdm @@ -28,9 +29,39 @@ from 
nemo.core.neural_types.neural_type import NeuralType from nemo.utils import logging, model_utils +PYNINI_AVAILABLE = True +try: + import nemo_text_processing +except (ImportError, ModuleNotFoundError): + PYNINI_AVAILABLE = False -class SpectrogramGenerator(ModelPT, ABC): - """ Base class for all TTS models that turn text into a spectrogram """ + +class NeedsNormalizer: + """Base class for all TTS models that needs text normalization(TN)""" + + def _setup_normalizer(self, cfg): + if "text_normalizer" in cfg: + if not PYNINI_AVAILABLE: + logging.error( + "`nemo_text_processing` not installed, see https://github.com/NVIDIA/NeMo-text-processing for more details." + ) + logging.error("The normalizer will be disabled.") + return + normalizer_kwargs = {} + + if "whitelist" in cfg.text_normalizer: + normalizer_kwargs["whitelist"] = self.register_artifact( + 'text_normalizer.whitelist', cfg.text_normalizer.whitelist + ) + + self.normalizer = instantiate(cfg.text_normalizer, **normalizer_kwargs) + self.text_normalizer_call = self.normalizer.normalize + if "text_normalizer_call_kwargs" in cfg: + self.text_normalizer_call_kwargs = cfg.text_normalizer_call_kwargs + + +class SpectrogramGenerator(NeedsNormalizer, ModelPT, ABC): + """Base class for all TTS models that turn text into a spectrogram""" @abstractmethod def parse(self, str_input: str, **kwargs) -> 'torch.tensor': @@ -115,7 +146,7 @@ def list_available_models(cls) -> 'List[PretrainedModelInfo]': class GlowVocoder(Vocoder): - """ Base class for all Vocoders that use a Glow or reversible Flow-based setup. All child class are expected + """Base class for all Vocoders that use a Glow or reversible Flow-based setup. All child class are expected to have a parameter called audio_to_melspec_precessor that is an instance of nemo.collections.asr.parts.FilterbankFeatures""" @@ -175,7 +206,11 @@ def yet_another_patch(audio, n_fft, hop_length, win_length, window): return torch.sqrt(spec.pow(2).sum(-1)), torch.atan2(spec[..., -1], spec[..., 0]) self.stft = lambda x: yet_another_patch( - x, n_fft=n_fft, hop_length=hop_length, win_length=win_length, window=window, + x, + n_fft=n_fft, + hop_length=hop_length, + win_length=win_length, + window=window, ) self.istft = lambda x, y: torch.istft( torch.complex(x * torch.cos(y), x * torch.sin(y)), @@ -252,15 +287,15 @@ def list_available_models(cls) -> 'List[PretrainedModelInfo]': return list_of_models -class TextToWaveform(ModelPT, ABC): - """ Base class for all end-to-end TTS models that generate a waveform from text """ +class TextToWaveform(NeedsNormalizer, ModelPT, ABC): + """Base class for all end-to-end TTS models that generate a waveform from text""" @abstractmethod def parse(self, str_input: str, **kwargs) -> 'torch.tensor': """ - A helper function that accepts a raw python string and turns it into a tensor. The tensor should have 2 - dimensions. The first is the batch, which should be of size 1. The second should represent time. The tensor - should represent either tokenized or embedded text, depending on the model. + A helper function that accepts a raw python string and turns it into a tensor. The tensor should have 2 + dimensions. The first is the batch, which should be of size 1. The second should represent time. The tensor + should represent either tokenized or embedded text, depending on the model. """ @abstractmethod @@ -299,7 +334,6 @@ def convert_graphemes_to_phonemes( num_workers: int = 0, pred_field: Optional[str] = "pred_text", ) -> List[str]: - """ Main function for Inference. 
Converts grapheme entries from the manifest "graheme_field" to phonemes Args: diff --git a/nemo/collections/tts/models/fastpitch.py b/nemo/collections/tts/models/fastpitch.py index 3235a096a04b..b1e702c89124 100644 --- a/nemo/collections/tts/models/fastpitch.py +++ b/nemo/collections/tts/models/fastpitch.py @@ -200,28 +200,6 @@ def _get_default_text_tokenizer_conf(self): text_tokenizer: TextTokenizerConfig = TextTokenizerConfig() return OmegaConf.create(OmegaConf.to_yaml(text_tokenizer)) - def _setup_normalizer(self, cfg): - if "text_normalizer" in cfg: - normalizer_kwargs = {} - - if "whitelist" in cfg.text_normalizer: - normalizer_kwargs["whitelist"] = self.register_artifact( - 'text_normalizer.whitelist', cfg.text_normalizer.whitelist - ) - try: - import nemo_text_processing - - self.normalizer = instantiate(cfg.text_normalizer, **normalizer_kwargs) - except Exception as e: - logging.error(e) - raise ImportError( - "`nemo_text_processing` not installed, see https://github.com/NVIDIA/NeMo-text-processing for more details" - ) - - self.text_normalizer_call = self.normalizer.normalize - if "text_normalizer_call_kwargs" in cfg: - self.text_normalizer_call_kwargs = cfg.text_normalizer_call_kwargs - def _setup_tokenizer(self, cfg): text_tokenizer_kwargs = {} @@ -240,12 +218,14 @@ def _setup_tokenizer(self, cfg): if "phoneme_dict" in cfg.text_tokenizer.g2p: g2p_kwargs["phoneme_dict"] = self.register_artifact( - 'text_tokenizer.g2p.phoneme_dict', cfg.text_tokenizer.g2p.phoneme_dict, + 'text_tokenizer.g2p.phoneme_dict', + cfg.text_tokenizer.g2p.phoneme_dict, ) if "heteronyms" in cfg.text_tokenizer.g2p: g2p_kwargs["heteronyms"] = self.register_artifact( - 'text_tokenizer.g2p.heteronyms', cfg.text_tokenizer.g2p.heteronyms, + 'text_tokenizer.g2p.heteronyms', + cfg.text_tokenizer.g2p.heteronyms, ) # for backward compatability @@ -478,16 +458,25 @@ def training_step(self, batch, batch_idx): ) spec_predict = mels_pred[0].data.cpu().float().numpy() self.tb_logger.add_image( - "train_mel_predicted", plot_spectrogram_to_numpy(spec_predict), self.global_step, dataformats="HWC", + "train_mel_predicted", + plot_spectrogram_to_numpy(spec_predict), + self.global_step, + dataformats="HWC", ) if self.learn_alignment: attn = attn_hard[0].data.cpu().float().numpy().squeeze() self.tb_logger.add_image( - "train_attn", plot_alignment_to_numpy(attn.T), self.global_step, dataformats="HWC", + "train_attn", + plot_alignment_to_numpy(attn.T), + self.global_step, + dataformats="HWC", ) soft_attn = attn_soft[0].data.cpu().float().numpy().squeeze() self.tb_logger.add_image( - "train_soft_attn", plot_alignment_to_numpy(soft_attn.T), self.global_step, dataformats="HWC", + "train_soft_attn", + plot_alignment_to_numpy(soft_attn.T), + self.global_step, + dataformats="HWC", ) return loss @@ -527,7 +516,20 @@ def validation_step(self, batch, batch_idx): ) # Calculate val loss on ground truth durations to better align L2 loss in time - (mels_pred, _, _, log_durs_pred, pitch_pred, _, _, _, attn_hard_dur, pitch, energy_pred, energy_tgt,) = self( + ( + mels_pred, + _, + _, + log_durs_pred, + pitch_pred, + _, + _, + _, + attn_hard_dur, + pitch, + energy_pred, + energy_tgt, + ) = self( text=text, durs=durs, pitch=pitch, @@ -587,7 +589,10 @@ def on_validation_epoch_end(self): ) spec_predict = spec_predict[0].data.cpu().float().numpy() self.tb_logger.add_image( - "val_mel_predicted", plot_spectrogram_to_numpy(spec_predict), self.global_step, dataformats="HWC", + "val_mel_predicted", + plot_spectrogram_to_numpy(spec_predict), + 
self.global_step, + dataformats="HWC", ) self.log_train_images = True self.validation_step_outputs.clear() # free memory) @@ -598,7 +603,10 @@ def _setup_train_dataloader(self, cfg): phon_mode = self.vocab.set_phone_prob(self.vocab.phoneme_probability) with phon_mode: - dataset = instantiate(cfg.dataset, text_tokenizer=self.vocab,) + dataset = instantiate( + cfg.dataset, + text_tokenizer=self.vocab, + ) sampler = dataset.get_sampler(cfg.dataloader_params.batch_size, world_size=self.trainer.world_size) return torch.utils.data.DataLoader( @@ -611,7 +619,10 @@ def _setup_test_dataloader(self, cfg): phon_mode = self.vocab.set_phone_prob(0.0) with phon_mode: - dataset = instantiate(cfg.dataset, text_tokenizer=self.vocab,) + dataset = instantiate( + cfg.dataset, + text_tokenizer=self.vocab, + ) return torch.utils.data.DataLoader(dataset, collate_fn=dataset.collate_fn, **cfg.dataloader_params) diff --git a/nemo/collections/tts/models/mixer_tts.py b/nemo/collections/tts/models/mixer_tts.py index 1a44cd5b31c8..c260df22e3c0 100644 --- a/nemo/collections/tts/models/mixer_tts.py +++ b/nemo/collections/tts/models/mixer_tts.py @@ -123,29 +123,6 @@ def __init__(self, cfg: DictConfig, trainer: 'Trainer' = None): self.decoder = instantiate(cfg.decoder) self.proj = nn.Linear(self.decoder.d_model, cfg.n_mel_channels) - def _setup_normalizer(self, cfg): - if "text_normalizer" in cfg: - normalizer_kwargs = {} - - if "whitelist" in cfg.text_normalizer: - normalizer_kwargs["whitelist"] = self.register_artifact( - 'text_normalizer.whitelist', cfg.text_normalizer.whitelist - ) - - try: - import nemo_text_processing - - self.normalizer = instantiate(cfg.text_normalizer, **normalizer_kwargs) - except Exception as e: - logging.error(e) - raise ImportError( - "`nemo_text_processing` not installed, see https://github.com/NVIDIA/NeMo-text-processing for more details" - ) - - self.text_normalizer_call = self.normalizer.normalize - if "text_normalizer_call_kwargs" in cfg: - self.text_normalizer_call_kwargs = cfg.text_normalizer_call_kwargs - def _setup_tokenizer(self, cfg): text_tokenizer_kwargs = {} if "g2p" in cfg.text_tokenizer: @@ -163,12 +140,14 @@ def _setup_tokenizer(self, cfg): if "phoneme_dict" in cfg.text_tokenizer.g2p: g2p_kwargs["phoneme_dict"] = self.register_artifact( - 'text_tokenizer.g2p.phoneme_dict', cfg.text_tokenizer.g2p.phoneme_dict, + 'text_tokenizer.g2p.phoneme_dict', + cfg.text_tokenizer.g2p.phoneme_dict, ) if "heteronyms" in cfg.text_tokenizer.g2p: g2p_kwargs["heteronyms"] = self.register_artifact( - 'text_tokenizer.g2p.heteronyms', cfg.text_tokenizer.g2p.heteronyms, + 'text_tokenizer.g2p.heteronyms', + cfg.text_tokenizer.g2p.heteronyms, ) text_tokenizer_kwargs["g2p"] = instantiate(cfg.text_tokenizer.g2p, **g2p_kwargs) @@ -269,7 +248,10 @@ def _metrics( def run_aligner(self, text, text_len, text_mask, spect, spect_len, attn_prior): text_emb = self.symbol_emb(text) attn_soft, attn_logprob = self.aligner( - spect, text_emb.permute(0, 2, 1), mask=text_mask == 0, attn_prior=attn_prior, + spect, + text_emb.permute(0, 2, 1), + mask=text_mask == 0, + attn_prior=attn_prior, ) attn_hard = binarize_attention_parallel(attn_soft, text_len, spect_len) attn_hard_dur = attn_hard.sum(2)[:, 0, :] @@ -444,7 +426,16 @@ def training_step(self, batch, batch_idx): pitch = (pitch - self.pitch_mean) / self.pitch_std pitch[zero_pitch_idx] = 0.0 - (pred_spect, _, pred_log_durs, pred_pitch, attn_soft, attn_logprob, attn_hard, attn_hard_dur,) = self( + ( + pred_spect, + _, + pred_log_durs, + pred_pitch, + attn_soft, + 
attn_logprob, + attn_hard, + attn_hard_dur, + ) = self( text=text, text_len=text_len, pitch=pitch, @@ -454,7 +445,17 @@ def training_step(self, batch, batch_idx): lm_tokens=lm_tokens, ) - (loss, durs_loss, acc, acc_dist_1, acc_dist_3, pitch_loss, mel_loss, ctc_loss, bin_loss,) = self._metrics( + ( + loss, + durs_loss, + acc, + acc_dist_1, + acc_dist_3, + pitch_loss, + mel_loss, + ctc_loss, + bin_loss, + ) = self._metrics( pred_durs=pred_log_durs, pred_pitch=pred_pitch, true_durs=attn_hard_dur, @@ -496,7 +497,16 @@ def validation_step(self, batch, batch_idx): pitch = (pitch - self.pitch_mean) / self.pitch_std pitch[zero_pitch_idx] = 0.0 - (pred_spect, _, pred_log_durs, pred_pitch, attn_soft, attn_logprob, attn_hard, attn_hard_dur,) = self( + ( + pred_spect, + _, + pred_log_durs, + pred_pitch, + attn_soft, + attn_logprob, + attn_hard, + attn_hard_dur, + ) = self( text=text, text_len=text_len, pitch=pitch, @@ -506,7 +516,17 @@ def validation_step(self, batch, batch_idx): lm_tokens=lm_tokens, ) - (loss, durs_loss, acc, acc_dist_1, acc_dist_3, pitch_loss, mel_loss, ctc_loss, bin_loss,) = self._metrics( + ( + loss, + durs_loss, + acc, + acc_dist_1, + acc_dist_3, + pitch_loss, + mel_loss, + ctc_loss, + bin_loss, + ) = self._metrics( pred_durs=pred_log_durs, pred_pitch=pred_pitch, true_durs=attn_hard_dur, @@ -605,7 +625,9 @@ def validation_step(self, batch, batch_idx): "raw_texts": [NeuralType(optional=True)], "lm_model": NeuralType(optional=True), }, - output_types={"spect": NeuralType(('B', 'D', 'T_spec'), MelSpectrogramType()),}, + output_types={ + "spect": NeuralType(('B', 'D', 'T_spec'), MelSpectrogramType()), + }, ) def generate_spectrogram( self, @@ -694,7 +716,9 @@ def _loader(self, cfg): text_tokenizer=self.tokenizer, ) return torch.utils.data.DataLoader( # noqa - dataset=dataset, collate_fn=dataset.collate_fn, **cfg.dataloader_params, + dataset=dataset, + collate_fn=dataset.collate_fn, + **cfg.dataloader_params, ) def setup_training_data(self, cfg): @@ -749,7 +773,11 @@ def output_types(self): def input_example(self, max_text_len=10, max_lm_tokens_len=10): text = torch.randint( - low=0, high=len(self.tokenizer.tokens), size=(1, max_text_len), device=self.device, dtype=torch.long, + low=0, + high=len(self.tokenizer.tokens), + size=(1, max_text_len), + device=self.device, + dtype=torch.long, ) inputs = {'text': text} diff --git a/nemo/collections/tts/models/radtts.py b/nemo/collections/tts/models/radtts.py index 959720910f11..82f85d1ed6a2 100644 --- a/nemo/collections/tts/models/radtts.py +++ b/nemo/collections/tts/models/radtts.py @@ -296,7 +296,9 @@ def _loader(self, cfg): text_tokenizer=self.tokenizer, ) return torch.utils.data.DataLoader( # noqa - dataset=dataset, collate_fn=dataset.collate_fn, **cfg.dataloader_params, + dataset=dataset, + collate_fn=dataset.collate_fn, + **cfg.dataloader_params, ) def setup_training_data(self, cfg): @@ -315,7 +317,9 @@ def setup_test_data(self, cfg): "speaker": NeuralType(('B'), Index(), optional=True), "sigma": NeuralType(optional=True), }, - output_types={"spect": NeuralType(('B', 'D', 'T_spec'), MelSpectrogramType()),}, + output_types={ + "spect": NeuralType(('B', 'D', 'T_spec'), MelSpectrogramType()), + }, ) def generate_spectrogram(self, tokens: 'torch.tensor', speaker: int = 0, sigma: float = 1.0) -> torch.tensor: self.eval() @@ -350,12 +354,14 @@ def _setup_tokenizer(self, cfg): if "phoneme_dict" in cfg.text_tokenizer.g2p: g2p_kwargs["phoneme_dict"] = self.register_artifact( - 'text_tokenizer.g2p.phoneme_dict', 
cfg.text_tokenizer.g2p.phoneme_dict, + 'text_tokenizer.g2p.phoneme_dict', + cfg.text_tokenizer.g2p.phoneme_dict, ) if "heteronyms" in cfg.text_tokenizer.g2p: g2p_kwargs["heteronyms"] = self.register_artifact( - 'text_tokenizer.g2p.heteronyms', cfg.text_tokenizer.g2p.heteronyms, + 'text_tokenizer.g2p.heteronyms', + cfg.text_tokenizer.g2p.heteronyms, ) text_tokenizer_kwargs["g2p"] = instantiate(cfg.text_tokenizer.g2p, **g2p_kwargs) @@ -374,29 +380,6 @@ def _setup_tokenizer(self, cfg): self.text_tokenizer_pad_id = text_tokenizer_pad_id self.tokens = tokens - def _setup_normalizer(self, cfg): - if "text_normalizer" in cfg: - normalizer_kwargs = {} - - if "whitelist" in cfg.text_normalizer: - normalizer_kwargs["whitelist"] = self.register_artifact( - 'text_normalizer.whitelist', cfg.text_normalizer.whitelist - ) - - try: - import nemo_text_processing - - self.normalizer = instantiate(cfg.text_normalizer, **normalizer_kwargs) - self.text_normalizer_call = self.normalizer.normalize - except Exception as e: - logging.error(e) - raise ImportError( - "`nemo_text_processing` not installed, see https://github.com/NVIDIA/NeMo-text-processing for more details" - ) - self.text_normalizer_call = self.normalizer.normalize - if "text_normalizer_call_kwargs" in cfg: - self.text_normalizer_call_kwargs = cfg.text_normalizer_call_kwargs - def parse(self, text: str, normalize=False) -> torch.Tensor: if self.training: logging.warning("parse() is meant to be called in eval mode.") @@ -479,7 +462,11 @@ def input_example(self, max_batch=1, max_dim=400): inp[inp == pad_id] = pad_id - 1 if pad_id > 0 else pad_id + 1 inputs.update( - {'speaker_id': speaker, 'speaker_id_text': speaker, 'speaker_id_attributes': speaker,} + { + 'speaker_id': speaker, + 'speaker_id_text': speaker, + 'speaker_id_attributes': speaker, + } ) new_inputs = { 'text': inp, @@ -495,11 +482,24 @@ def input_example(self, max_batch=1, max_dim=400): return (new_inputs,) def forward_for_export( - self, text, batch_lengths, speaker_id, speaker_id_text, speaker_id_attributes, pitch, pace, volume, + self, + text, + batch_lengths, + speaker_id, + speaker_id_text, + speaker_id_attributes, + pitch, + pace, + volume, ): if self.export_config["enable_ragged_batches"]: text, pitch, pace, volume_tensor, lens = batch_from_ragged( - text, pitch, pace, batch_lengths=batch_lengths, padding_idx=self.tokenizer_pad, volume=volume, + text, + pitch, + pace, + batch_lengths=batch_lengths, + padding_idx=self.tokenizer_pad, + volume=volume, ) if volume is not None: volume = volume_tensor diff --git a/nemo/collections/tts/models/tacotron2.py b/nemo/collections/tts/models/tacotron2.py index 3fcdee9832ef..2fb005d80ca6 100644 --- a/nemo/collections/tts/models/tacotron2.py +++ b/nemo/collections/tts/models/tacotron2.py @@ -322,29 +322,6 @@ def on_validation_epoch_end(self): self.log('val_loss', avg_loss) self.validation_step_outputs.clear() # free memory - def _setup_normalizer(self, cfg): - if "text_normalizer" in cfg: - normalizer_kwargs = {} - - if "whitelist" in cfg.text_normalizer: - normalizer_kwargs["whitelist"] = self.register_artifact( - 'text_normalizer.whitelist', cfg.text_normalizer.whitelist - ) - - try: - import nemo_text_processing - - self.normalizer = instantiate(cfg.text_normalizer, **normalizer_kwargs) - except Exception as e: - logging.error(e) - raise ImportError( - "`nemo_text_processing` not installed, see https://github.com/NVIDIA/NeMo-text-processing for more details" - ) - - self.text_normalizer_call = self.normalizer.normalize - if 
"text_normalizer_call_kwargs" in cfg: - self.text_normalizer_call_kwargs = cfg.text_normalizer_call_kwargs - def _setup_tokenizer(self, cfg): text_tokenizer_kwargs = {} if "g2p" in cfg.text_tokenizer and cfg.text_tokenizer.g2p is not None: @@ -362,12 +339,14 @@ def _setup_tokenizer(self, cfg): if "phoneme_dict" in cfg.text_tokenizer.g2p: g2p_kwargs["phoneme_dict"] = self.register_artifact( - 'text_tokenizer.g2p.phoneme_dict', cfg.text_tokenizer.g2p.phoneme_dict, + 'text_tokenizer.g2p.phoneme_dict', + cfg.text_tokenizer.g2p.phoneme_dict, ) if "heteronyms" in cfg.text_tokenizer.g2p: g2p_kwargs["heteronyms"] = self.register_artifact( - 'text_tokenizer.g2p.heteronyms', cfg.text_tokenizer.g2p.heteronyms, + 'text_tokenizer.g2p.heteronyms', + cfg.text_tokenizer.g2p.heteronyms, ) text_tokenizer_kwargs["g2p"] = instantiate(cfg.text_tokenizer.g2p, **g2p_kwargs) diff --git a/nemo/collections/tts/models/vits.py b/nemo/collections/tts/models/vits.py index 319221d04ee0..4a891fa8823e 100644 --- a/nemo/collections/tts/models/vits.py +++ b/nemo/collections/tts/models/vits.py @@ -92,28 +92,6 @@ def __init__(self, cfg: DictConfig, trainer: 'Trainer' = None): self.automatic_optimization = False - def _setup_normalizer(self, cfg): - if "text_normalizer" in cfg: - normalizer_kwargs = {} - - if "whitelist" in cfg.text_normalizer: - normalizer_kwargs["whitelist"] = self.register_artifact( - 'text_normalizer.whitelist', cfg.text_normalizer.whitelist - ) - - try: - import nemo_text_processing - - self.normalizer = instantiate(cfg.text_normalizer, **normalizer_kwargs) - self.text_normalizer_call = self.normalizer.normalize - except Exception as e: - logging.error(e) - raise ImportError( - "`nemo_text_processing` not installed, see https://github.com/NVIDIA/NeMo-text-processing for more details" - ) - if "text_normalizer_call_kwargs" in cfg: - self.text_normalizer_call_kwargs = cfg.text_normalizer_call_kwargs - def _setup_tokenizer(self, cfg): text_tokenizer_kwargs = {} if "g2p" in cfg.text_tokenizer and cfg.text_tokenizer.g2p is not None: @@ -131,12 +109,14 @@ def _setup_tokenizer(self, cfg): if "phoneme_dict" in cfg.text_tokenizer.g2p: g2p_kwargs["phoneme_dict"] = self.register_artifact( - 'text_tokenizer.g2p.phoneme_dict', cfg.text_tokenizer.g2p.phoneme_dict, + 'text_tokenizer.g2p.phoneme_dict', + cfg.text_tokenizer.g2p.phoneme_dict, ) if "heteronyms" in cfg.text_tokenizer.g2p: g2p_kwargs["heteronyms"] = self.register_artifact( - 'text_tokenizer.g2p.heteronyms', cfg.text_tokenizer.g2p.heteronyms, + 'text_tokenizer.g2p.heteronyms', + cfg.text_tokenizer.g2p.heteronyms, ) text_tokenizer_kwargs["g2p"] = instantiate(cfg.text_tokenizer.g2p, **g2p_kwargs) @@ -164,8 +144,14 @@ def configure_optimizers(self): sched_config = optim_config.pop("sched", None) OmegaConf.set_struct(optim_config, True) - optim_g = instantiate(optim_config, params=self.net_g.parameters(),) - optim_d = instantiate(optim_config, params=self.net_d.parameters(),) + optim_g = instantiate( + optim_config, + params=self.net_g.parameters(), + ) + optim_d = instantiate( + optim_config, + params=self.net_d.parameters(), + ) if sched_config is not None: if sched_config.name == 'ExponentialLR': @@ -173,10 +159,14 @@ def configure_optimizers(self): scheduler_g = torch.optim.lr_scheduler.ExponentialLR(optim_g, gamma=sched_config.lr_decay) elif sched_config.name == 'CosineAnnealing': scheduler_g = CosineAnnealing( - optimizer=optim_g, max_steps=sched_config.max_steps, min_lr=sched_config.min_lr, + optimizer=optim_g, + max_steps=sched_config.max_steps, + 
min_lr=sched_config.min_lr, ) scheduler_d = CosineAnnealing( - optimizer=optim_d, max_steps=sched_config.max_steps, min_lr=sched_config.min_lr, + optimizer=optim_d, + max_steps=sched_config.max_steps, + min_lr=sched_config.min_lr, ) else: raise ValueError("Unknown optimizer.") @@ -362,7 +352,9 @@ def _loader(self, cfg): text_tokenizer=self.tokenizer, ) return torch.utils.data.DataLoader( # noqa - dataset=dataset, collate_fn=dataset.collate_fn, **cfg.dataloader_params, + dataset=dataset, + collate_fn=dataset.collate_fn, + **cfg.dataloader_params, ) def train_dataloader(self): @@ -377,7 +369,10 @@ def train_dataloader(self): train_sampler = DistributedBucketSampler(dataset, **self.cfg.train_ds.batch_sampler) dataloader = torch.utils.data.DataLoader( - dataset, collate_fn=dataset.collate_fn, batch_sampler=train_sampler, **self.cfg.train_ds.dataloader_params, + dataset, + collate_fn=dataset.collate_fn, + batch_sampler=train_sampler, + **self.cfg.train_ds.dataloader_params, ) return dataloader @@ -412,7 +407,9 @@ def list_available_models(cls) -> 'List[PretrainedModelInfo]': return list_of_models @typecheck( - input_types={"tokens": NeuralType(('B', 'T_text'), TokenIndex(), optional=True),}, + input_types={ + "tokens": NeuralType(('B', 'T_text'), TokenIndex(), optional=True), + }, output_types={"audio": NeuralType(('B', 'T_audio'), AudioSignal())}, ) def convert_text_to_waveform(self, *, tokens, speakers=None): From 4be4e8f59ceb70cde23029108e53de168b15a748 Mon Sep 17 00:00:00 2001 From: Alexandros Koumparoulis <153118171+akoumpa@users.noreply.github.com> Date: Mon, 21 Oct 2024 09:40:17 -0700 Subject: [PATCH 26/37] respect warnings' filters (#10953) * respect warnings' filters Signed-off-by: Alexandros Koumparoulis * Apply isort and black reformatting Signed-off-by: akoumpa --------- Signed-off-by: Alexandros Koumparoulis Signed-off-by: akoumpa Co-authored-by: akoumpa --- nemo/utils/nemo_logging.py | 20 ++++++++++++++++---- 1 file changed, 16 insertions(+), 4 deletions(-) diff --git a/nemo/utils/nemo_logging.py b/nemo/utils/nemo_logging.py index 95e17e5c5f6c..bcc7ad199603 100644 --- a/nemo/utils/nemo_logging.py +++ b/nemo/utils/nemo_logging.py @@ -76,7 +76,7 @@ def __init__(self, capture_warnings=True): self.rank = 0 if is_global_rank_zero() else "UNK" def _define_logger(self, capture_warnings=True): - """ Creates the logger if not already created. Called in init""" + """Creates the logger if not already created. Called in init""" # Use double-checked locking to avoid taking lock unnecessarily. if self._logger is not None: @@ -126,7 +126,7 @@ def record_factory(*args, **kwargs): self._logger.propagate = False def remove_stream_handlers(self): - """ Removes StreamHandler that log to stdout and stderr from the logger.""" + """Removes StreamHandler that log to stdout and stderr from the logger.""" if self._logger is None: raise RuntimeError("Impossible to set handlers if the Logger is not predefined") @@ -236,7 +236,7 @@ def set_verbosity(self, verbosity_level): @contextmanager def patch_stderr_handler(self, stream): - """ Sends messages that should log to stderr to stream instead. Useful for unittests """ + """Sends messages that should log to stderr to stream instead. Useful for unittests""" if self._logger is not None: try: old_stream = self._handlers["stream_stderr"].stream @@ -268,7 +268,7 @@ def patch_stderr_handler(self, stream): @contextmanager def patch_stdout_handler(self, stream): - """ Sends messages that should log to stdout to stream instead. 
Useful for unittests """
+        """Sends messages that should log to stdout to stream instead. Useful for unittests"""
         if self._logger is not None:
             try:
                 old_stream = self._handlers["stream_stdout"].stream
@@ -339,6 +339,16 @@ def captureWarnings(self, capture):
                     warnings.showwarning = self.old_warnings_showwarning
                     self.old_warnings_showwarning = None

+    def _warning_is_ignored(self, category):
+        from warnings import filters
+
+        # Search the filters
+        for action, msg, cat, mod, ln in filters:
+            # least-common denominator if multiple filters exist for the same class.
+            if cat == category and action == 'ignore':
+                return True
+        return False
+
     def _showwarning(self, message, category, filename, lineno, file=None, line=None):
         """
         Implementation of showwarnings which redirects to logging.
@@ -346,6 +356,8 @@ def _showwarning(self, message, category, filename, lineno, file=None, line=None
        with level logging.WARNING.
        """
        s = warnings.formatwarning(message, category, filename, lineno, line)
+        if self._warning_is_ignored(category):
+            return
        self.warning("%s", s)

    def _logged_once(self, msg, mode):

From 6607e760926b7dd3afc1bcf0050a5df408ab1950 Mon Sep 17 00:00:00 2001
From: Huy Vu <86480512+huvunvidia@users.noreply.github.com>
Date: Mon, 21 Oct 2024 14:15:46 -0400
Subject: [PATCH 27/37] Update T5 tokenizer (adding additional tokens to tokenizer config) (#10972)

* initial commit
* restore t5_pretraining
* Apply isort and black reformatting

Signed-off-by: huvunvidia

---------

Signed-off-by: huvunvidia
Co-authored-by: Huy Vu2
Co-authored-by: huvunvidia
---
 .../common/tokenizers/huggingface/auto_tokenizer.py    | 11 ++++++++++-
 nemo/collections/llm/t5/data/fine_tuning.py            |  2 --
 nemo/collections/llm/t5/data/pre_training.py           |  4 ----
 .../collections/nlp/modules/common/tokenizer_utils.py  |  9 +++++++--
 tests/collections/llm/megatron_t5_finetuning.py        |  3 +++
 tests/collections/llm/megatron_t5_pretraining.py       |  3 +++
 6 files changed, 23 insertions(+), 9 deletions(-)

diff --git a/nemo/collections/common/tokenizers/huggingface/auto_tokenizer.py b/nemo/collections/common/tokenizers/huggingface/auto_tokenizer.py
index 76dca1268c3b..439322b8e810 100644
--- a/nemo/collections/common/tokenizers/huggingface/auto_tokenizer.py
+++ b/nemo/collections/common/tokenizers/huggingface/auto_tokenizer.py
@@ -13,7 +13,7 @@
 # limitations under the License.

 from collections import OrderedDict
-from typing import Optional
+from typing import List, Optional

 from transformers import AutoTokenizer as AUTOTOKENIZER

@@ -43,6 +43,7 @@ def __init__(
         sep_token: Optional[str] = None,
         cls_token: Optional[str] = None,
         unk_token: Optional[str] = None,
+        additional_special_tokens: Optional[List] = [],
         use_fast: Optional[bool] = False,
         trust_remote_code: Optional[bool] = False,
     ):
@@ -60,6 +61,7 @@ def __init__(
             sep_token: token used for separating sequences
             cls_token: class token. Usually equal to bos_token
             unk_token: token to use for unknown tokens
+            additional_special_tokens: list of other tokens besides standard special tokens (bos, eos, pad, etc.). For example, sentinel tokens for T5 (<extra_id_0>, <extra_id_1>, etc.)
use_fast: whether to use fast HuggingFace tokenizer
        """
        try:
@@ -124,10 +126,17 @@ def __init__(
         elif self.tokenizer.cls_token is None and self.tokenizer.bos_token:
             special_tokens_dict["cls_token"] = self.tokenizer.bos_token

+        # add additional special tokens (not standard special tokens such as bos, eod, sep)
+        if additional_special_tokens is not None:
+            special_tokens_dict["additional_special_tokens"] = additional_special_tokens
+
         new_tokens_in_vocab = []
         for token in [mask_token, bos_token, eos_token, pad_token, sep_token, cls_token, unk_token]:
             if token is not None and token not in self.tokenizer.get_vocab():
                 new_tokens_in_vocab.append(token)
+        for token in additional_special_tokens:
+            if token is not None and token not in self.tokenizer.get_vocab():
+                new_tokens_in_vocab.append(token)

         if len(new_tokens_in_vocab) > 0:
             """
diff --git a/nemo/collections/llm/t5/data/fine_tuning.py b/nemo/collections/llm/t5/data/fine_tuning.py
index b1315f7a708a..9326dabe7b84 100644
--- a/nemo/collections/llm/t5/data/fine_tuning.py
+++ b/nemo/collections/llm/t5/data/fine_tuning.py
@@ -61,8 +61,6 @@ def __init__(
         from nemo.collections.nlp.modules.common.tokenizer_utils import get_nmt_tokenizer

         self.tokenizer = tokenizer or get_nmt_tokenizer("megatron", "BertWordPieceCase")
-        additional_tokens = {'additional_special_tokens': [f'<extra_id_{i}>' for i in range(100)]}
-        self.tokenizer.add_special_tokens(additional_tokens)

         self.memmap_workers = memmap_workers
         self.num_workers = num_workers
diff --git a/nemo/collections/llm/t5/data/pre_training.py b/nemo/collections/llm/t5/data/pre_training.py
index 2c73e0b78b11..e6f619972284 100644
--- a/nemo/collections/llm/t5/data/pre_training.py
+++ b/nemo/collections/llm/t5/data/pre_training.py
@@ -130,10 +130,6 @@ def __init__(
         # add additional tokens for T5 tokenizer
         from nemo.collections.nlp.modules.common.tokenizer_utils import get_nmt_tokenizer

-        self.tokenizer = tokenizer or get_nmt_tokenizer("megatron", "BertWordPieceCase")
-        additional_tokens = {'additional_special_tokens': [f'<extra_id_{i}>' for i in range(100)]}
-        self.tokenizer.add_special_tokens(additional_tokens)
-
         self.data_sampler = MegatronDataSampler(
             seq_len=self.seq_length,
             micro_batch_size=micro_batch_size,
diff --git a/nemo/collections/nlp/modules/common/tokenizer_utils.py b/nemo/collections/nlp/modules/common/tokenizer_utils.py
index 4e6f9e15b839..dfc55a6c9065 100644
--- a/nemo/collections/nlp/modules/common/tokenizer_utils.py
+++ b/nemo/collections/nlp/modules/common/tokenizer_utils.py
@@ -69,7 +69,8 @@ def get_tokenizer(
             To see the list of all HuggingFace pretrained models, use: nemo_nlp.modules.common.get_huggingface_pretrained_lm_models_list()
         tokenizer_model: tokenizer model file of sentencepiece
-        special_tokens: dict of special tokens
+        special_tokens: dict of special tokens.
+            For additional special tokens besides standard special tokens (bos, eos, pad, etc.), such as sentinel tokens for T5 (<extra_id_0>, <extra_id_1>, etc.), use key 'additional_special_tokens'
         vocab_file: path to vocab file
         use_fast: (only for HuggingFace AutoTokenizer) set to True to use fast HuggingFace tokenizer
         bpe_dropout: (experimental) BPE dropout tries to corrupt the standard segmentation
@@ -224,7 +225,11 @@ def get_nmt_tokenizer(
             f'Getting Megatron tokenizer for pretrained model name: {model_name}, custom vocab file: {vocab_file}, and merges file: {merges_file}'
         )
         return get_tokenizer(
-            tokenizer_name=model_name, vocab_file=vocab_file, merges_file=merges_file, chat_template=chat_template
+            tokenizer_name=model_name,
+            vocab_file=vocab_file,
+            merges_file=merges_file,
+            special_tokens=special_tokens_dict,
+            chat_template=chat_template,
         )
     elif library == 'tabular':
         from nemo.collections.common.tokenizers.tabular_tokenizer import TabularTokenizer
diff --git a/tests/collections/llm/megatron_t5_finetuning.py b/tests/collections/llm/megatron_t5_finetuning.py
index a204e6797926..f54e858cfb43 100644
--- a/tests/collections/llm/megatron_t5_finetuning.py
+++ b/tests/collections/llm/megatron_t5_finetuning.py
@@ -35,9 +35,12 @@ def get_args():

     args = get_args()

+    special_tokens = {}
+    special_tokens['additional_special_tokens'] = [f'<extra_id_{i}>' for i in range(100)]
     tokenizer = get_nmt_tokenizer(
         "megatron",
         "BertWordPieceCase",
+        special_tokens=special_tokens,
     )

     data = SquadDataModule(
diff --git a/tests/collections/llm/megatron_t5_pretraining.py b/tests/collections/llm/megatron_t5_pretraining.py
index 5d8f55a7f26f..a5460be3d154 100644
--- a/tests/collections/llm/megatron_t5_pretraining.py
+++ b/tests/collections/llm/megatron_t5_pretraining.py
@@ -50,10 +50,13 @@ def get_args():

     args = get_args()

+    special_tokens = {}
+    special_tokens['additional_special_tokens'] = [f'<extra_id_{i}>' for i in range(100)]
     tokenizer = get_nmt_tokenizer(
         "megatron",
         "BertWordPieceCase",
         vocab_file=args.vocab_path,
+        special_tokens=special_tokens,
     )
     data = PreTrainingDataModule(
         paths=args.data_path,

From b1cbd06f3c4be1f17cd447e6f054add8b46af923 Mon Sep 17 00:00:00 2001
From: Ali Taghibakhshi <71892896+JRD971000@users.noreply.github.com>
Date: Mon, 21 Oct 2024 21:37:16 +0300
Subject: [PATCH 28/37] Alit/mamba recipe (#10935)

* add some mamba recipe
* add 130m
* add the rest of the recipes
* add tokenizer
* add tokenizer
* minor fix
* minor fix
* minor fix
* minor fix
* minor fix
* minor fix
* minor fix
* minor fix
* minor fix
* minor fix
* minor fix
* add fixes to ssm for nemorun recipes
* add hybrid tokenizer
* updating some recipes
* Apply isort and black reformatting

Signed-off-by: JRD971000

* remove comments
* update gbs
* fix ckpt resume
* fix ckpt resume
* fix ckpt resume
* update recipes final
* Apply isort and black reformatting

Signed-off-by: JRD971000

* remove redundant imports
* ckpt convertor dtype fix
* Apply isort and black reformatting

Signed-off-by: JRD971000

---------

Signed-off-by: JRD971000
Signed-off-by: Ali Taghibakhshi <71892896+JRD971000@users.noreply.github.com>
Co-authored-by: JRD971000
---
 nemo/collections/llm/gpt/model/ssm.py       |  14 +-
 nemo/collections/llm/recipes/__init__.py    |  14 +
 nemo/collections/llm/recipes/mamba2_130m.py | 321 +++++++++++++++++
 nemo/collections/llm/recipes/mamba2_1_3b.py | 321 +++++++++++++++++
 nemo/collections/llm/recipes/mamba2_2_7b.py | 321 +++++++++++++++++
 nemo/collections/llm/recipes/mamba2_370m.py | 321 +++++++++++++++++
 nemo/collections/llm/recipes/mamba2_780m.py | 321 +++++++++++++++++
nemo/collections/llm/recipes/mamba2_8b.py | 321 +++++++++++++++++ .../llm/recipes/mamba2_hybrid_8b.py | 323 ++++++++++++++++++ nemo/lightning/io/connector.py | 6 +- .../llm/gpt/model/megatron_ssm_finetuning.py | 1 + 11 files changed, 2281 insertions(+), 3 deletions(-) create mode 100644 nemo/collections/llm/recipes/mamba2_130m.py create mode 100644 nemo/collections/llm/recipes/mamba2_1_3b.py create mode 100644 nemo/collections/llm/recipes/mamba2_2_7b.py create mode 100644 nemo/collections/llm/recipes/mamba2_370m.py create mode 100644 nemo/collections/llm/recipes/mamba2_780m.py create mode 100644 nemo/collections/llm/recipes/mamba2_8b.py create mode 100644 nemo/collections/llm/recipes/mamba2_hybrid_8b.py diff --git a/nemo/collections/llm/gpt/model/ssm.py b/nemo/collections/llm/gpt/model/ssm.py index 954fa8bfe9f7..c7228951fa78 100644 --- a/nemo/collections/llm/gpt/model/ssm.py +++ b/nemo/collections/llm/gpt/model/ssm.py @@ -53,6 +53,9 @@ class SSMConfig(TransformerConfig, io.IOMixin): fp16_lm_cross_entropy: bool = False parallel_output: bool = True share_embeddings_and_output_weights: bool = False + params_dtype: torch.dtype = torch.bfloat16 + fp16: bool = False + bf16: bool = True num_layers: int = 2 mamba_ssm_ngroups: int = 8 num_attention_heads: int = 1 @@ -81,6 +84,7 @@ class SSMConfig(TransformerConfig, io.IOMixin): forward_step_fn: Callable = ssm_forward_step data_step_fn: Callable = gpt_data_step + tokenizer_model_path: str = None def configure_model(self, tokenizer) -> "MCoreMambaModel": @@ -127,9 +131,17 @@ def __init__(self, state_dict): def state_dict(self): return self._state_dict + def to(self, dtype): + for k, v in self._state_dict.items(): + if v.dtype != dtype: + logging.warning(f"Converting {k} from {v.dtype} (source model) to {dtype} (target model)") + self._state_dict[k] = v.to(dtype) + source = ModelState(source) target = self.init() - trainer = self.nemo_setup(target) + trainer = self.nemo_setup(target, ckpt_async_save=False) + source.to(self.config.params_dtype) + target.to(self.config.params_dtype) self.convert_state(source, target) self.nemo_save(output_path, trainer) diff --git a/nemo/collections/llm/recipes/__init__.py b/nemo/collections/llm/recipes/__init__.py index 7a21633b79ec..47cc4e71448d 100644 --- a/nemo/collections/llm/recipes/__init__.py +++ b/nemo/collections/llm/recipes/__init__.py @@ -21,6 +21,13 @@ llama3_70b_16k, llama3_70b_64k, llama31_405b, + mamba2_1_3b, + mamba2_2_7b, + mamba2_8b, + mamba2_130m, + mamba2_370m, + mamba2_780m, + mamba2_hybrid_8b, mistral_7b, mistral_nemo_12b, mixtral_8x7b, @@ -49,6 +56,13 @@ "llama3_70b_16k", "llama3_70b_64k", "llama31_405b", + "mamba2_130m", + "mamba2_370m", + "mamba2_780m", + "mamba2_1_3b", + "mamba2_2_7b", + "mamba2_8b", + "mamba2_hybrid_8b", "mistral_7b", "mistral_nemo_12b", "mixtral_8x7b", diff --git a/nemo/collections/llm/recipes/mamba2_130m.py b/nemo/collections/llm/recipes/mamba2_130m.py new file mode 100644 index 000000000000..08640604a112 --- /dev/null +++ b/nemo/collections/llm/recipes/mamba2_130m.py @@ -0,0 +1,321 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + + +from typing import Optional + +import nemo_run as run +import pytorch_lightning as pl +import torch +from megatron.core.distributed import DistributedDataParallelConfig +from pytorch_lightning.callbacks.callback import Callback + +from nemo import lightning as nl +from nemo.collections import llm +from nemo.collections.llm.api import finetune, pretrain +from nemo.collections.llm.gpt.data.mock import MockDataModule +from nemo.collections.llm.recipes.log.default import default_log, default_resume, tensorboard_logger +from nemo.collections.llm.recipes.optim.adam import distributed_fused_adam_with_cosine_annealing +from nemo.collections.llm.recipes.precision.mixed_precision import bf16_mixed +from nemo.collections.nlp.modules.common.tokenizer_utils import get_nmt_tokenizer +from nemo.utils.exp_manager import TimingCallback + +NAME = "mamba2_130m" + + +@run.cli.factory(name=NAME) +def tokenizer(tokenizer_model: str = None) -> run.Config[pl.LightningModule]: + + return run.Config( + get_nmt_tokenizer, + library='huggingface', + model_name="EleutherAI/gpt-neox-20b", + tokenizer_model=tokenizer_model, + use_fast=True, + ) + + +@run.cli.factory(name=NAME) +def model(tokenizer_model: str = None) -> run.Config[pl.LightningModule]: + """ + Factory function to create a Mamba2 130M model configuration. + + Returns: + run.Config[pl.LightningModule]: Configuration for the Mamba2 130M model. + + Examples: + CLI usage: + $ nemo llm pretrain model=mamba2_130m ... + + Python API usage: + >>> model_config = model() + >>> print(model_config) + """ + return run.Config( + llm.GPTModel, config=run.Config(llm.BaseMambaConfig130M), tokenizer=tokenizer(tokenizer_model=tokenizer_model) + ) + + +def trainer( + tensor_parallelism: int = 1, + pipeline_parallelism: int = 1, + pipeline_parallelism_type: Optional[torch.dtype] = None, + virtual_pipeline_parallelism: Optional[int] = None, + context_parallelism: int = 1, + sequence_parallelism: bool = False, + num_nodes: int = 1, + num_gpus_per_node: int = 8, + max_steps: int = 1168251, + callbacks: Optional[list[run.Config[Callback]]] = None, +) -> run.Config[nl.Trainer]: + """ + Configure the NeMo Lightning Trainer for Mamba2 130M model. + + This function sets up the distributed training strategy and other training parameters. + + Args: + tensor_parallelism (int): Degree of tensor model parallelism. + pipeline_parallelism (int): Degree of pipeline model parallelism. + pipeline_parallelism_type (Optional[torch.dtype]): Data type for pipeline parallelism. + virtual_pipeline_parallelism (Optional[int]): Size of virtual pipeline parallelism. + context_parallelism (int): Degree of context parallelism. + sequence_parallelism (bool): Whether to use sequence parallelism. + num_nodes (int): Number of compute nodes to use. + num_gpus_per_node (int): Number of GPUs per node. + max_steps (int): Maximum number of training steps. + callbacks (Optional[list[run.Config[Callback]]]): List of callback configurations. + + Returns: + run.Config[nl.Trainer]: Configuration for the NeMo Lightning Trainer. + + Examples: + CLI usage: + $ nemo llm pretrain trainer=mamba2_130m ... 
+ + Python API usage: + >>> trainer_config = trainer(num_nodes=1, num_gpus_per_node=1) + >>> print(trainer_config) + + Note: + For more information on distributed training strategies, refer to the + NeMo documentation on multi-GPU and multi-node training. + """ + strategy = run.Config( + nl.MegatronStrategy, + tensor_model_parallel_size=tensor_parallelism, + pipeline_model_parallel_size=pipeline_parallelism, + pipeline_dtype=pipeline_parallelism_type, + virtual_pipeline_model_parallel_size=virtual_pipeline_parallelism, + context_parallel_size=context_parallelism, + sequence_parallel=sequence_parallelism, + gradient_as_bucket_view=True, + ckpt_async_save=False, + ckpt_parallel_load=True, + ddp=run.Config( + DistributedDataParallelConfig, + check_for_nan_in_grad=True, + grad_reduce_in_fp32=True, + overlap_grad_reduce=True, + overlap_param_gather=True, + ), + ) + + trainer = run.Config( + nl.Trainer, + accelerator="gpu", + accumulate_grad_batches=1, + callbacks=callbacks, + devices=num_gpus_per_node, + limit_test_batches=50, + limit_val_batches=32, + log_every_n_steps=10, + max_steps=max_steps, + num_nodes=num_nodes, + plugins=bf16_mixed(), + strategy=strategy, + use_distributed_sampler=False, + val_check_interval=2000, + ) + + return trainer + + +@run.cli.factory(target=pretrain, name=NAME) +def pretrain_recipe( + dir: Optional[str] = None, + name: str = "default", + tokenizer_model: str = None, + num_nodes: int = 1, + num_gpus_per_node: int = 8, + fn=pretrain, +) -> run.Partial: + """ + Create a pre-training recipe for Mamba2 130M model. + + This function sets up a complete configuration for pre-training, including + model, trainer, data, logging, optimization, and resumption settings. + + Args: + dir (Optional[str]): Directory for saving logs and checkpoints. + name (str): Name of the pre-training run. + num_nodes (int): Number of compute nodes to use. + num_gpus_per_node (int): Number of GPUs per node. + fn (Callable): The pre-training function to use. + + Returns: + run.Partial: Partial configuration for pre-training. + + Examples: + CLI usage: + $ nemo llm pretrain --factory mamba2_130M + $ nemo llm pretrain --factory "mamba2_130M(num_nodes=1, name='my_pretrain')" + + Python API usage: + >>> recipe = pretrain_recipe(name="mamba2_130M_pretrain", num_nodes=1) + >>> print(recipe) + + Note: + For more details on pre-training LLMs with NeMo, see the pre-training + guide in the `examples/llm/pretrain/` directory. + """ + return run.Partial( + fn, + model=model(), + trainer=trainer( + num_nodes=num_nodes, + num_gpus_per_node=num_gpus_per_node, + callbacks=[run.Config(TimingCallback)], + ), + data=run.Config( + MockDataModule, + seq_length=4096, + global_batch_size=8, + micro_batch_size=1, + tokenizer=tokenizer(tokenizer_model=tokenizer_model), + ), + log=default_log(dir=dir, name=name, tensorboard_logger=tensorboard_logger(name=name)), + optim=distributed_fused_adam_with_cosine_annealing(max_lr=3e-4), + resume=default_resume(), + ) + + +@run.cli.factory(target=finetune, name=NAME) +def finetune_recipe( + dir: Optional[str] = None, + name: str = "default", + resume_path: str = None, + tokenizer_model: str = None, + num_nodes: int = 1, + num_gpus_per_node: int = 8, + gbs: int = 8, + mbs: int = 1, + peft_scheme: Optional[str] = 'none', +) -> run.Partial: + """ + Create a fine-tuning recipe for Mamba2 130M model. + + This function sets up a complete configuration for fine-tuning, including + model, trainer, data, logging, optimization, and resumption settings. 
+ + Args: + dir (Optional[str]): Directory for saving logs and checkpoints. + name (str): Name of the fine-tuning run. + resume_path (str): Path to the NeMo checkpoint (refer to notes below + on how to convert a pytorch checkpoint to NeMo) + tokenizer_model (str): Path to tokenizer model (defaults to None) + num_nodes (int): Number of compute nodes to use. + num_gpus_per_node (int): Number of GPUs per node. + Returns: + run.Partial: Partial configuration for fine-tuning. + + Examples: + CLI usage: + $ nemo llm finetune --factory mamba2_130m + + Python API usage: + >>> recipe = finetune_recipe(name="mamba2_130m_finetune", num_nodes=1) + >>> print(recipe) + + Note: + This recipe uses the SQuAD dataset for fine-tuning. For more information + on fine-tuning LLMs with NeMo, see the fine-tuning guide in the + `examples/llm/finetune/` directory. + For converting an SSM pytorch checkpoint, use the following line of python code: + + llm.GPTModel(llm.BaseMambaConfig130M(), tokenizer=tokenizer()).import_ckpt( + path="pytorch://ABSOLUTE_PATH_TO_CKPT/your_pytorch_state_dict_file", + model_config=llm.BaseMambaConfig130M()) + This line will cache the nemo checkpoint to following directory: + /root/.cache/nemo/models/your_pytorch_state_dict_file + + """ + nemo_resume = run.Config( + nl.AutoResume, + restore_config=run.Config(nl.RestoreConfig, path=resume_path), + ) + strategy = run.Config( + nl.MegatronStrategy, + tensor_model_parallel_size=1, + pipeline_model_parallel_size=1, + gradient_as_bucket_view=True, + ckpt_load_optimizer=False, + ckpt_save_optimizer=False, + ckpt_async_save=False, + ) + checkpoint_callback = run.Config( + nl.ModelCheckpoint, + every_n_train_steps=10, + dirpath=dir, + ) + trainer = run.Config( + nl.Trainer, + accelerator="gpu", + accumulate_grad_batches=1, + devices=num_gpus_per_node, + limit_test_batches=10, + limit_val_batches=10, + log_every_n_steps=20, + max_steps=100, + num_nodes=num_nodes, + plugins=run.Config( + nl.MegatronMixedPrecision, + precision="bf16-mixed", + params_dtype=torch.bfloat16, + ), + callbacks=[checkpoint_callback], + strategy=strategy, + use_distributed_sampler=False, + val_check_interval=20, + ) + recipe = run.Partial( + llm.finetune, + model=model(tokenizer_model=tokenizer_model), + trainer=trainer, + data=run.Config( + llm.SquadDataModule, + seq_length=2048, + global_batch_size=gbs, + micro_batch_size=mbs, + tokenizer=tokenizer(tokenizer_model=tokenizer_model), + ), + log=llm.default_log(dir=dir, name=name, tensorboard_logger=tensorboard_logger(name=name)), + optim=distributed_fused_adam_with_cosine_annealing(max_lr=1e-4, min_lr=0, warmup_steps=50), + resume=nemo_resume, + ) + if peft_scheme is None or peft_scheme.lower() == 'none': + recipe.trainer.strategy.tensor_model_parallel_size = 1 + recipe.optim.config.lr = 5e-6 + else: + raise ValueError(f"Unrecognized peft scheme: {peft_scheme}") + return recipe diff --git a/nemo/collections/llm/recipes/mamba2_1_3b.py b/nemo/collections/llm/recipes/mamba2_1_3b.py new file mode 100644 index 000000000000..58eaf049b059 --- /dev/null +++ b/nemo/collections/llm/recipes/mamba2_1_3b.py @@ -0,0 +1,321 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + + +from typing import Optional + +import nemo_run as run +import pytorch_lightning as pl +import torch +from megatron.core.distributed import DistributedDataParallelConfig +from pytorch_lightning.callbacks.callback import Callback + +from nemo import lightning as nl +from nemo.collections import llm +from nemo.collections.llm.api import finetune, pretrain +from nemo.collections.llm.gpt.data.mock import MockDataModule +from nemo.collections.llm.recipes.log.default import default_log, default_resume, tensorboard_logger +from nemo.collections.llm.recipes.optim.adam import distributed_fused_adam_with_cosine_annealing +from nemo.collections.llm.recipes.precision.mixed_precision import bf16_mixed +from nemo.collections.nlp.modules.common.tokenizer_utils import get_nmt_tokenizer +from nemo.utils.exp_manager import TimingCallback + +NAME = "mamba2_1_3b" + + +@run.cli.factory(name=NAME) +def tokenizer(tokenizer_model: str = None) -> run.Config[pl.LightningModule]: + + return run.Config( + get_nmt_tokenizer, + library='huggingface', + model_name="EleutherAI/gpt-neox-20b", + tokenizer_model=tokenizer_model, + use_fast=True, + ) + + +@run.cli.factory(name=NAME) +def model(tokenizer_model: str = None) -> run.Config[pl.LightningModule]: + """ + Factory function to create a Mamba2 1.3B model configuration. + + Returns: + run.Config[pl.LightningModule]: Configuration for the Mamba2 1.3B model. + + Examples: + CLI usage: + $ nemo llm pretrain model=mamba2_1_3B ... + + Python API usage: + >>> model_config = model() + >>> print(model_config) + """ + return run.Config( + llm.GPTModel, config=run.Config(llm.BaseMambaConfig1_3B), tokenizer=tokenizer(tokenizer_model=tokenizer_model) + ) + + +def trainer( + tensor_parallelism: int = 1, + pipeline_parallelism: int = 1, + pipeline_parallelism_type: Optional[torch.dtype] = None, + virtual_pipeline_parallelism: Optional[int] = None, + context_parallelism: int = 1, + sequence_parallelism: bool = False, + num_nodes: int = 1, + num_gpus_per_node: int = 8, + max_steps: int = 1168251, + callbacks: Optional[list[run.Config[Callback]]] = None, +) -> run.Config[nl.Trainer]: + """ + Configure the NeMo Lightning Trainer for Mamba2 1.3B model. + + This function sets up the distributed training strategy and other training parameters. + + Args: + tensor_parallelism (int): Degree of tensor model parallelism. + pipeline_parallelism (int): Degree of pipeline model parallelism. + pipeline_parallelism_type (Optional[torch.dtype]): Data type for pipeline parallelism. + virtual_pipeline_parallelism (Optional[int]): Size of virtual pipeline parallelism. + context_parallelism (int): Degree of context parallelism. + sequence_parallelism (bool): Whether to use sequence parallelism. + num_nodes (int): Number of compute nodes to use. + num_gpus_per_node (int): Number of GPUs per node. + max_steps (int): Maximum number of training steps. + callbacks (Optional[list[run.Config[Callback]]]): List of callback configurations. + + Returns: + run.Config[nl.Trainer]: Configuration for the NeMo Lightning Trainer. + + Examples: + CLI usage: + $ nemo llm pretrain trainer=mamba2_1_3b ... 
+ + Python API usage: + >>> trainer_config = trainer(num_nodes=1, num_gpus_per_node=1) + >>> print(trainer_config) + + Note: + For more information on distributed training strategies, refer to the + NeMo documentation on multi-GPU and multi-node training. + """ + strategy = run.Config( + nl.MegatronStrategy, + tensor_model_parallel_size=tensor_parallelism, + pipeline_model_parallel_size=pipeline_parallelism, + pipeline_dtype=pipeline_parallelism_type, + virtual_pipeline_model_parallel_size=virtual_pipeline_parallelism, + context_parallel_size=context_parallelism, + sequence_parallel=sequence_parallelism, + gradient_as_bucket_view=True, + ckpt_async_save=False, + ckpt_parallel_load=True, + ddp=run.Config( + DistributedDataParallelConfig, + check_for_nan_in_grad=True, + grad_reduce_in_fp32=True, + overlap_grad_reduce=True, + overlap_param_gather=True, + ), + ) + + trainer = run.Config( + nl.Trainer, + accelerator="gpu", + accumulate_grad_batches=1, + callbacks=callbacks, + devices=num_gpus_per_node, + limit_test_batches=50, + limit_val_batches=32, + log_every_n_steps=10, + max_steps=max_steps, + num_nodes=num_nodes, + plugins=bf16_mixed(), + strategy=strategy, + use_distributed_sampler=False, + val_check_interval=2000, + ) + + return trainer + + +@run.cli.factory(target=pretrain, name=NAME) +def pretrain_recipe( + dir: Optional[str] = None, + name: str = "default", + tokenizer_model: str = None, + num_nodes: int = 1, + num_gpus_per_node: int = 8, + fn=pretrain, +) -> run.Partial: + """ + Create a pre-training recipe for Mamba2 1.3B model. + + This function sets up a complete configuration for pre-training, including + model, trainer, data, logging, optimization, and resumption settings. + + Args: + dir (Optional[str]): Directory for saving logs and checkpoints. + name (str): Name of the pre-training run. + num_nodes (int): Number of compute nodes to use. + num_gpus_per_node (int): Number of GPUs per node. + fn (Callable): The pre-training function to use. + + Returns: + run.Partial: Partial configuration for pre-training. + + Examples: + CLI usage: + $ nemo llm pretrain --factory mamba2_1_3b + $ nemo llm pretrain --factory "mamba2_1_3b(num_nodes=1, name='my_pretrain')" + + Python API usage: + >>> recipe = pretrain_recipe(name="mamba2_1_3b_pretrain", num_nodes=1) + >>> print(recipe) + + Note: + For more details on pre-training LLMs with NeMo, see the pre-training + guide in the `examples/llm/pretrain/` directory. + """ + return run.Partial( + fn, + model=model(), + trainer=trainer( + num_nodes=num_nodes, + num_gpus_per_node=num_gpus_per_node, + callbacks=[run.Config(TimingCallback)], + ), + data=run.Config( + MockDataModule, + seq_length=4096, + global_batch_size=8, + micro_batch_size=1, + tokenizer=tokenizer(tokenizer_model=tokenizer_model), + ), + log=default_log(dir=dir, name=name, tensorboard_logger=tensorboard_logger(name=name)), + optim=distributed_fused_adam_with_cosine_annealing(max_lr=3e-4), + resume=default_resume(), + ) + + +@run.cli.factory(target=finetune, name=NAME) +def finetune_recipe( + dir: Optional[str] = None, + name: str = "default", + resume_path: str = None, + tokenizer_model: str = None, + num_nodes: int = 1, + num_gpus_per_node: int = 8, + gbs: int = 8, + mbs: int = 1, + peft_scheme: Optional[str] = 'none', +) -> run.Partial: + """ + Create a fine-tuning recipe for Mamba2 1.3B model. + + This function sets up a complete configuration for fine-tuning, including + model, trainer, data, logging, optimization, and resumption settings. 
+ + Args: + dir (Optional[str]): Directory for saving logs and checkpoints. + name (str): Name of the fine-tuning run. + resume_path (str): Path to the NeMo checkpoint (refer to notes below + on how to convert a pytorch checkpoint to NeMo) + tokenizer_model (str): Path to tokenizer model (defaults to None) + num_nodes (int): Number of compute nodes to use. + num_gpus_per_node (int): Number of GPUs per node. + Returns: + run.Partial: Partial configuration for fine-tuning. + + Examples: + CLI usage: + $ nemo llm finetune --factory mamba2_1_3b + + Python API usage: + >>> recipe = finetune_recipe(name="mamba2_1_3b_finetune", num_nodes=1) + >>> print(recipe) + + Note: + This recipe uses the SQuAD dataset for fine-tuning. For more information + on fine-tuning LLMs with NeMo, see the fine-tuning guide in the + `examples/llm/finetune/` directory. + For converting an SSM pytorch checkpoint, use the following line of python code: + + llm.GPTModel(llm.BaseMambaConfig1_3B(), tokenizer=tokenizer()).import_ckpt( + path="pytorch://ABSOLUTE_PATH_TO_CKPT/your_pytorch_state_dict_file", + model_config=llm.BaseMambaConfig1_3B()) + This line will cache the nemo checkpoint to following directory: + /root/.cache/nemo/models/your_pytorch_state_dict_file + + """ + nemo_resume = run.Config( + nl.AutoResume, + restore_config=run.Config(nl.RestoreConfig, path=resume_path), + ) + strategy = run.Config( + nl.MegatronStrategy, + tensor_model_parallel_size=1, + pipeline_model_parallel_size=1, + gradient_as_bucket_view=True, + ckpt_load_optimizer=False, + ckpt_save_optimizer=False, + ckpt_async_save=False, + ) + checkpoint_callback = run.Config( + nl.ModelCheckpoint, + every_n_train_steps=10, + dirpath=dir, + ) + trainer = run.Config( + nl.Trainer, + accelerator="gpu", + accumulate_grad_batches=1, + devices=num_gpus_per_node, + limit_test_batches=10, + limit_val_batches=10, + log_every_n_steps=20, + max_steps=100, + num_nodes=num_nodes, + plugins=run.Config( + nl.MegatronMixedPrecision, + precision="bf16-mixed", + params_dtype=torch.bfloat16, + ), + callbacks=[checkpoint_callback], + strategy=strategy, + use_distributed_sampler=False, + val_check_interval=20, + ) + recipe = run.Partial( + llm.finetune, + model=model(tokenizer_model=tokenizer_model), + trainer=trainer, + data=run.Config( + llm.SquadDataModule, + seq_length=2048, + global_batch_size=gbs, + micro_batch_size=mbs, + tokenizer=tokenizer(tokenizer_model=tokenizer_model), + ), + log=llm.default_log(dir=dir, name=name, tensorboard_logger=tensorboard_logger(name=name)), + optim=distributed_fused_adam_with_cosine_annealing(max_lr=1e-4, min_lr=0, warmup_steps=50), + resume=nemo_resume, + ) + if peft_scheme is None or peft_scheme.lower() == 'none': + recipe.trainer.strategy.tensor_model_parallel_size = 1 + recipe.optim.config.lr = 5e-6 + else: + raise ValueError(f"Unrecognized peft scheme: {peft_scheme}") + return recipe diff --git a/nemo/collections/llm/recipes/mamba2_2_7b.py b/nemo/collections/llm/recipes/mamba2_2_7b.py new file mode 100644 index 000000000000..5cb37c6a02a5 --- /dev/null +++ b/nemo/collections/llm/recipes/mamba2_2_7b.py @@ -0,0 +1,321 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + + +from typing import Optional + +import nemo_run as run +import pytorch_lightning as pl +import torch +from megatron.core.distributed import DistributedDataParallelConfig +from pytorch_lightning.callbacks.callback import Callback + +from nemo import lightning as nl +from nemo.collections import llm +from nemo.collections.llm.api import finetune, pretrain +from nemo.collections.llm.gpt.data.mock import MockDataModule +from nemo.collections.llm.recipes.log.default import default_log, default_resume, tensorboard_logger +from nemo.collections.llm.recipes.optim.adam import distributed_fused_adam_with_cosine_annealing +from nemo.collections.llm.recipes.precision.mixed_precision import bf16_mixed +from nemo.collections.nlp.modules.common.tokenizer_utils import get_nmt_tokenizer +from nemo.utils.exp_manager import TimingCallback + +NAME = "mamba2_2_7b" + + +@run.cli.factory(name=NAME) +def tokenizer(tokenizer_model: str = None) -> run.Config[pl.LightningModule]: + + return run.Config( + get_nmt_tokenizer, + library='huggingface', + model_name="EleutherAI/gpt-neox-20b", + tokenizer_model=tokenizer_model, + use_fast=True, + ) + + +@run.cli.factory(name=NAME) +def model(tokenizer_model: str = None) -> run.Config[pl.LightningModule]: + """ + Factory function to create a Mamba2 2.7B model configuration. + + Returns: + run.Config[pl.LightningModule]: Configuration for the Mamba2 2.7B model. + + Examples: + CLI usage: + $ nemo llm pretrain model=mamba2_2_7B ... + + Python API usage: + >>> model_config = model() + >>> print(model_config) + """ + return run.Config( + llm.GPTModel, config=run.Config(llm.BaseMambaConfig2_7B), tokenizer=tokenizer(tokenizer_model=tokenizer_model) + ) + + +def trainer( + tensor_parallelism: int = 1, + pipeline_parallelism: int = 1, + pipeline_parallelism_type: Optional[torch.dtype] = None, + virtual_pipeline_parallelism: Optional[int] = None, + context_parallelism: int = 1, + sequence_parallelism: bool = False, + num_nodes: int = 1, + num_gpus_per_node: int = 8, + max_steps: int = 1168251, + callbacks: Optional[list[run.Config[Callback]]] = None, +) -> run.Config[nl.Trainer]: + """ + Configure the NeMo Lightning Trainer for Mamba2 2.7B model. + + This function sets up the distributed training strategy and other training parameters. + + Args: + tensor_parallelism (int): Degree of tensor model parallelism. + pipeline_parallelism (int): Degree of pipeline model parallelism. + pipeline_parallelism_type (Optional[torch.dtype]): Data type for pipeline parallelism. + virtual_pipeline_parallelism (Optional[int]): Size of virtual pipeline parallelism. + context_parallelism (int): Degree of context parallelism. + sequence_parallelism (bool): Whether to use sequence parallelism. + num_nodes (int): Number of compute nodes to use. + num_gpus_per_node (int): Number of GPUs per node. + max_steps (int): Maximum number of training steps. + callbacks (Optional[list[run.Config[Callback]]]): List of callback configurations. + + Returns: + run.Config[nl.Trainer]: Configuration for the NeMo Lightning Trainer. + + Examples: + CLI usage: + $ nemo llm pretrain trainer=mamba2_2_7b ... 
+ + Python API usage: + >>> trainer_config = trainer(num_nodes=1, num_gpus_per_node=1) + >>> print(trainer_config) + + Note: + For more information on distributed training strategies, refer to the + NeMo documentation on multi-GPU and multi-node training. + """ + strategy = run.Config( + nl.MegatronStrategy, + tensor_model_parallel_size=tensor_parallelism, + pipeline_model_parallel_size=pipeline_parallelism, + pipeline_dtype=pipeline_parallelism_type, + virtual_pipeline_model_parallel_size=virtual_pipeline_parallelism, + context_parallel_size=context_parallelism, + sequence_parallel=sequence_parallelism, + gradient_as_bucket_view=True, + ckpt_async_save=False, + ckpt_parallel_load=True, + ddp=run.Config( + DistributedDataParallelConfig, + check_for_nan_in_grad=True, + grad_reduce_in_fp32=True, + overlap_grad_reduce=True, + overlap_param_gather=True, + ), + ) + + trainer = run.Config( + nl.Trainer, + accelerator="gpu", + accumulate_grad_batches=1, + callbacks=callbacks, + devices=num_gpus_per_node, + limit_test_batches=50, + limit_val_batches=32, + log_every_n_steps=10, + max_steps=max_steps, + num_nodes=num_nodes, + plugins=bf16_mixed(), + strategy=strategy, + use_distributed_sampler=False, + val_check_interval=2000, + ) + + return trainer + + +@run.cli.factory(target=pretrain, name=NAME) +def pretrain_recipe( + dir: Optional[str] = None, + name: str = "default", + tokenizer_model: str = None, + num_nodes: int = 1, + num_gpus_per_node: int = 8, + fn=pretrain, +) -> run.Partial: + """ + Create a pre-training recipe for Mamba2 2.7B model. + + This function sets up a complete configuration for pre-training, including + model, trainer, data, logging, optimization, and resumption settings. + + Args: + dir (Optional[str]): Directory for saving logs and checkpoints. + name (str): Name of the pre-training run. + num_nodes (int): Number of compute nodes to use. + num_gpus_per_node (int): Number of GPUs per node. + fn (Callable): The pre-training function to use. + + Returns: + run.Partial: Partial configuration for pre-training. + + Examples: + CLI usage: + $ nemo llm pretrain --factory mamba2_2_7b + $ nemo llm pretrain --factory "mamba2_2_7b(num_nodes=1, name='my_pretrain')" + + Python API usage: + >>> recipe = pretrain_recipe(name="mamba2_2_7b_pretrain", num_nodes=1) + >>> print(recipe) + + Note: + For more details on pre-training LLMs with NeMo, see the pre-training + guide in the `examples/llm/pretrain/` directory. + """ + return run.Partial( + fn, + model=model(), + trainer=trainer( + num_nodes=num_nodes, + num_gpus_per_node=num_gpus_per_node, + callbacks=[run.Config(TimingCallback)], + ), + data=run.Config( + MockDataModule, + seq_length=4096, + global_batch_size=8, + micro_batch_size=1, + tokenizer=tokenizer(tokenizer_model=tokenizer_model), + ), + log=default_log(dir=dir, name=name, tensorboard_logger=tensorboard_logger(name=name)), + optim=distributed_fused_adam_with_cosine_annealing(max_lr=3e-4), + resume=default_resume(), + ) + + +@run.cli.factory(target=finetune, name=NAME) +def finetune_recipe( + dir: Optional[str] = None, + name: str = "default", + resume_path: str = None, + tokenizer_model: str = None, + num_nodes: int = 1, + num_gpus_per_node: int = 8, + gbs: int = 8, + mbs: int = 1, + peft_scheme: Optional[str] = 'none', +) -> run.Partial: + """ + Create a fine-tuning recipe for Mamba2 2.7B model. + + This function sets up a complete configuration for fine-tuning, including + model, trainer, data, logging, optimization, and resumption settings. 
+ + Args: + dir (Optional[str]): Directory for saving logs and checkpoints. + name (str): Name of the fine-tuning run. + resume_path (str): Path to the NeMo checkpoint (refer to notes below + on how to convert a pytorch checkpoint to NeMo) + tokenizer_model (str): Path to tokenizer model (defaults to None) + num_nodes (int): Number of compute nodes to use. + num_gpus_per_node (int): Number of GPUs per node. + Returns: + run.Partial: Partial configuration for fine-tuning. + + Examples: + CLI usage: + $ nemo llm finetune --factory mamba2_2_7b + + Python API usage: + >>> recipe = finetune_recipe(name="mamba2_2_7b_finetune", num_nodes=1) + >>> print(recipe) + + Note: + This recipe uses the SQuAD dataset for fine-tuning. For more information + on fine-tuning LLMs with NeMo, see the fine-tuning guide in the + `examples/llm/finetune/` directory. + For converting an SSM pytorch checkpoint, use the following line of python code: + + llm.GPTModel(llm.BaseMambaConfig2_7B(), tokenizer=tokenizer()).import_ckpt( + path="pytorch://ABSOLUTE_PATH_TO_CKPT/your_pytorch_state_dict_file", + model_config=llm.BaseMambaConfig2_7B()) + This line will cache the nemo checkpoint to following directory: + /root/.cache/nemo/models/your_pytorch_state_dict_file + + """ + nemo_resume = run.Config( + nl.AutoResume, + restore_config=run.Config(nl.RestoreConfig, path=resume_path), + ) + strategy = run.Config( + nl.MegatronStrategy, + tensor_model_parallel_size=1, + pipeline_model_parallel_size=1, + gradient_as_bucket_view=True, + ckpt_load_optimizer=False, + ckpt_save_optimizer=False, + ckpt_async_save=False, + ) + checkpoint_callback = run.Config( + nl.ModelCheckpoint, + every_n_train_steps=10, + dirpath=dir, + ) + trainer = run.Config( + nl.Trainer, + accelerator="gpu", + accumulate_grad_batches=1, + devices=num_gpus_per_node, + limit_test_batches=10, + limit_val_batches=10, + log_every_n_steps=20, + max_steps=100, + num_nodes=num_nodes, + plugins=run.Config( + nl.MegatronMixedPrecision, + precision="bf16-mixed", + params_dtype=torch.bfloat16, + ), + callbacks=[checkpoint_callback], + strategy=strategy, + use_distributed_sampler=False, + val_check_interval=20, + ) + recipe = run.Partial( + llm.finetune, + model=model(tokenizer_model=tokenizer_model), + trainer=trainer, + data=run.Config( + llm.SquadDataModule, + seq_length=2048, + global_batch_size=gbs, + micro_batch_size=mbs, + tokenizer=tokenizer(tokenizer_model=tokenizer_model), + ), + log=llm.default_log(dir=dir, name=name, tensorboard_logger=tensorboard_logger(name=name)), + optim=distributed_fused_adam_with_cosine_annealing(max_lr=1e-4, min_lr=0, warmup_steps=50), + resume=nemo_resume, + ) + if peft_scheme is None or peft_scheme.lower() == 'none': + recipe.trainer.strategy.tensor_model_parallel_size = 1 + recipe.optim.config.lr = 5e-6 + else: + raise ValueError(f"Unrecognized peft scheme: {peft_scheme}") + return recipe diff --git a/nemo/collections/llm/recipes/mamba2_370m.py b/nemo/collections/llm/recipes/mamba2_370m.py new file mode 100644 index 000000000000..bb8bddc4045a --- /dev/null +++ b/nemo/collections/llm/recipes/mamba2_370m.py @@ -0,0 +1,321 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + + +from typing import Optional + +import nemo_run as run +import pytorch_lightning as pl +import torch +from megatron.core.distributed import DistributedDataParallelConfig +from pytorch_lightning.callbacks.callback import Callback + +from nemo import lightning as nl +from nemo.collections import llm +from nemo.collections.llm.api import finetune, pretrain +from nemo.collections.llm.gpt.data.mock import MockDataModule +from nemo.collections.llm.recipes.log.default import default_log, default_resume, tensorboard_logger +from nemo.collections.llm.recipes.optim.adam import distributed_fused_adam_with_cosine_annealing +from nemo.collections.llm.recipes.precision.mixed_precision import bf16_mixed +from nemo.collections.nlp.modules.common.tokenizer_utils import get_nmt_tokenizer +from nemo.utils.exp_manager import TimingCallback + +NAME = "mamba2_370m" + + +@run.cli.factory(name=NAME) +def tokenizer(tokenizer_model: str = None) -> run.Config[pl.LightningModule]: + + return run.Config( + get_nmt_tokenizer, + library='huggingface', + model_name="EleutherAI/gpt-neox-20b", + tokenizer_model=tokenizer_model, + use_fast=True, + ) + + +@run.cli.factory(name=NAME) +def model(tokenizer_model: str = None) -> run.Config[pl.LightningModule]: + """ + Factory function to create a Mamba2 370M model configuration. + + Returns: + run.Config[pl.LightningModule]: Configuration for the Mamba2 370M model. + + Examples: + CLI usage: + $ nemo llm pretrain model=mamba2_370m ... + + Python API usage: + >>> model_config = model() + >>> print(model_config) + """ + return run.Config( + llm.GPTModel, config=run.Config(llm.BaseMambaConfig370M), tokenizer=tokenizer(tokenizer_model=tokenizer_model) + ) + + +def trainer( + tensor_parallelism: int = 1, + pipeline_parallelism: int = 1, + pipeline_parallelism_type: Optional[torch.dtype] = None, + virtual_pipeline_parallelism: Optional[int] = None, + context_parallelism: int = 1, + sequence_parallelism: bool = False, + num_nodes: int = 1, + num_gpus_per_node: int = 8, + max_steps: int = 1168251, + callbacks: Optional[list[run.Config[Callback]]] = None, +) -> run.Config[nl.Trainer]: + """ + Configure the NeMo Lightning Trainer for Mamba2 370M model. + + This function sets up the distributed training strategy and other training parameters. + + Args: + tensor_parallelism (int): Degree of tensor model parallelism. + pipeline_parallelism (int): Degree of pipeline model parallelism. + pipeline_parallelism_type (Optional[torch.dtype]): Data type for pipeline parallelism. + virtual_pipeline_parallelism (Optional[int]): Size of virtual pipeline parallelism. + context_parallelism (int): Degree of context parallelism. + sequence_parallelism (bool): Whether to use sequence parallelism. + num_nodes (int): Number of compute nodes to use. + num_gpus_per_node (int): Number of GPUs per node. + max_steps (int): Maximum number of training steps. + callbacks (Optional[list[run.Config[Callback]]]): List of callback configurations. + + Returns: + run.Config[nl.Trainer]: Configuration for the NeMo Lightning Trainer. + + Examples: + CLI usage: + $ nemo llm pretrain trainer=mamba2_370m ... 
+ + Python API usage: + >>> trainer_config = trainer(num_nodes=1, num_gpus_per_node=1) + >>> print(trainer_config) + + Note: + For more information on distributed training strategies, refer to the + NeMo documentation on multi-GPU and multi-node training. + """ + strategy = run.Config( + nl.MegatronStrategy, + tensor_model_parallel_size=tensor_parallelism, + pipeline_model_parallel_size=pipeline_parallelism, + pipeline_dtype=pipeline_parallelism_type, + virtual_pipeline_model_parallel_size=virtual_pipeline_parallelism, + context_parallel_size=context_parallelism, + sequence_parallel=sequence_parallelism, + gradient_as_bucket_view=True, + ckpt_async_save=False, + ckpt_parallel_load=True, + ddp=run.Config( + DistributedDataParallelConfig, + check_for_nan_in_grad=True, + grad_reduce_in_fp32=True, + overlap_grad_reduce=True, + overlap_param_gather=True, + ), + ) + + trainer = run.Config( + nl.Trainer, + accelerator="gpu", + accumulate_grad_batches=1, + callbacks=callbacks, + devices=num_gpus_per_node, + limit_test_batches=50, + limit_val_batches=32, + log_every_n_steps=10, + max_steps=max_steps, + num_nodes=num_nodes, + plugins=bf16_mixed(), + strategy=strategy, + use_distributed_sampler=False, + val_check_interval=2000, + ) + + return trainer + + +@run.cli.factory(target=pretrain, name=NAME) +def pretrain_recipe( + dir: Optional[str] = None, + name: str = "default", + tokenizer_model: str = None, + num_nodes: int = 1, + num_gpus_per_node: int = 8, + fn=pretrain, +) -> run.Partial: + """ + Create a pre-training recipe for Mamba2 370M model. + + This function sets up a complete configuration for pre-training, including + model, trainer, data, logging, optimization, and resumption settings. + + Args: + dir (Optional[str]): Directory for saving logs and checkpoints. + name (str): Name of the pre-training run. + num_nodes (int): Number of compute nodes to use. + num_gpus_per_node (int): Number of GPUs per node. + fn (Callable): The pre-training function to use. + + Returns: + run.Partial: Partial configuration for pre-training. + + Examples: + CLI usage: + $ nemo llm pretrain --factory mamba2_370M + $ nemo llm pretrain --factory "mamba2_370M(num_nodes=1, name='my_pretrain')" + + Python API usage: + >>> recipe = pretrain_recipe(name="mamba2_370M_pretrain", num_nodes=1) + >>> print(recipe) + + Note: + For more details on pre-training LLMs with NeMo, see the pre-training + guide in the `examples/llm/pretrain/` directory. + """ + return run.Partial( + fn, + model=model(), + trainer=trainer( + num_nodes=num_nodes, + num_gpus_per_node=num_gpus_per_node, + callbacks=[run.Config(TimingCallback)], + ), + data=run.Config( + MockDataModule, + seq_length=4096, + global_batch_size=8, + micro_batch_size=1, + tokenizer=tokenizer(tokenizer_model=tokenizer_model), + ), + log=default_log(dir=dir, name=name, tensorboard_logger=tensorboard_logger(name=name)), + optim=distributed_fused_adam_with_cosine_annealing(max_lr=3e-4), + resume=default_resume(), + ) + + +@run.cli.factory(target=finetune, name=NAME) +def finetune_recipe( + dir: Optional[str] = None, + name: str = "default", + resume_path: str = None, + tokenizer_model: str = None, + num_nodes: int = 1, + num_gpus_per_node: int = 8, + gbs: int = 8, + mbs: int = 1, + peft_scheme: Optional[str] = 'none', +) -> run.Partial: + """ + Create a fine-tuning recipe for Mamba2 370M model. + + This function sets up a complete configuration for fine-tuning, including + model, trainer, data, logging, optimization, and resumption settings. 
+ + Args: + dir (Optional[str]): Directory for saving logs and checkpoints. + name (str): Name of the fine-tuning run. + resume_path (str): Path to the NeMo checkpoint (refer to notes below + on how to convert a pytorch checkpoint to NeMo) + tokenizer_model (str): Path to tokenizer model (defaults to None) + num_nodes (int): Number of compute nodes to use. + num_gpus_per_node (int): Number of GPUs per node. + Returns: + run.Partial: Partial configuration for fine-tuning. + + Examples: + CLI usage: + $ nemo llm finetune --factory mamba2_370m + + Python API usage: + >>> recipe = finetune_recipe(name="mamba2_370m_finetune", num_nodes=1) + >>> print(recipe) + + Note: + This recipe uses the SQuAD dataset for fine-tuning. For more information + on fine-tuning LLMs with NeMo, see the fine-tuning guide in the + `examples/llm/finetune/` directory. + For converting an SSM pytorch checkpoint, use the following line of python code: + + llm.GPTModel(llm.BaseMambaConfig370M(), tokenizer=tokenizer()).import_ckpt( + path="pytorch://ABSOLUTE_PATH_TO_CKPT/your_pytorch_state_dict_file", + model_config=llm.BaseMambaConfig370M()) + This line will cache the nemo checkpoint to following directory: + /root/.cache/nemo/models/your_pytorch_state_dict_file + + """ + nemo_resume = run.Config( + nl.AutoResume, + restore_config=run.Config(nl.RestoreConfig, path=resume_path), + ) + strategy = run.Config( + nl.MegatronStrategy, + tensor_model_parallel_size=1, + pipeline_model_parallel_size=1, + gradient_as_bucket_view=True, + ckpt_load_optimizer=False, + ckpt_save_optimizer=False, + ckpt_async_save=False, + ) + checkpoint_callback = run.Config( + nl.ModelCheckpoint, + every_n_train_steps=10, + dirpath=dir, + ) + trainer = run.Config( + nl.Trainer, + accelerator="gpu", + accumulate_grad_batches=1, + devices=num_gpus_per_node, + limit_test_batches=10, + limit_val_batches=10, + log_every_n_steps=20, + max_steps=100, + num_nodes=num_nodes, + plugins=run.Config( + nl.MegatronMixedPrecision, + precision="bf16-mixed", + params_dtype=torch.bfloat16, + ), + callbacks=[checkpoint_callback], + strategy=strategy, + use_distributed_sampler=False, + val_check_interval=20, + ) + recipe = run.Partial( + llm.finetune, + model=model(tokenizer_model=tokenizer_model), + trainer=trainer, + data=run.Config( + llm.SquadDataModule, + seq_length=2048, + global_batch_size=gbs, + micro_batch_size=mbs, + tokenizer=tokenizer(tokenizer_model=tokenizer_model), + ), + log=llm.default_log(dir=dir, name=name, tensorboard_logger=tensorboard_logger(name=name)), + optim=distributed_fused_adam_with_cosine_annealing(max_lr=1e-4, min_lr=0, warmup_steps=50), + resume=nemo_resume, + ) + if peft_scheme is None or peft_scheme.lower() == 'none': + recipe.trainer.strategy.tensor_model_parallel_size = 1 + recipe.optim.config.lr = 5e-6 + else: + raise ValueError(f"Unrecognized peft scheme: {peft_scheme}") + return recipe diff --git a/nemo/collections/llm/recipes/mamba2_780m.py b/nemo/collections/llm/recipes/mamba2_780m.py new file mode 100644 index 000000000000..2f6ab6717ae1 --- /dev/null +++ b/nemo/collections/llm/recipes/mamba2_780m.py @@ -0,0 +1,321 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + + +from typing import Optional + +import nemo_run as run +import pytorch_lightning as pl +import torch +from megatron.core.distributed import DistributedDataParallelConfig +from pytorch_lightning.callbacks.callback import Callback + +from nemo import lightning as nl +from nemo.collections import llm +from nemo.collections.llm.api import finetune, pretrain +from nemo.collections.llm.gpt.data.mock import MockDataModule +from nemo.collections.llm.recipes.log.default import default_log, default_resume, tensorboard_logger +from nemo.collections.llm.recipes.optim.adam import distributed_fused_adam_with_cosine_annealing +from nemo.collections.llm.recipes.precision.mixed_precision import bf16_mixed +from nemo.collections.nlp.modules.common.tokenizer_utils import get_nmt_tokenizer +from nemo.utils.exp_manager import TimingCallback + +NAME = "mamba2_780m" + + +@run.cli.factory(name=NAME) +def tokenizer(tokenizer_model: str = None) -> run.Config[pl.LightningModule]: + + return run.Config( + get_nmt_tokenizer, + library='huggingface', + model_name="EleutherAI/gpt-neox-20b", + tokenizer_model=tokenizer_model, + use_fast=True, + ) + + +@run.cli.factory(name=NAME) +def model(tokenizer_model: str = None) -> run.Config[pl.LightningModule]: + """ + Factory function to create a Mamba2 780M model configuration. + + Returns: + run.Config[pl.LightningModule]: Configuration for the Mamba2 780M model. + + Examples: + CLI usage: + $ nemo llm pretrain model=mamba2_780m ... + + Python API usage: + >>> model_config = model() + >>> print(model_config) + """ + return run.Config( + llm.GPTModel, config=run.Config(llm.BaseMambaConfig780M), tokenizer=tokenizer(tokenizer_model=tokenizer_model) + ) + + +def trainer( + tensor_parallelism: int = 1, + pipeline_parallelism: int = 1, + pipeline_parallelism_type: Optional[torch.dtype] = None, + virtual_pipeline_parallelism: Optional[int] = None, + context_parallelism: int = 1, + sequence_parallelism: bool = False, + num_nodes: int = 1, + num_gpus_per_node: int = 8, + max_steps: int = 1168251, + callbacks: Optional[list[run.Config[Callback]]] = None, +) -> run.Config[nl.Trainer]: + """ + Configure the NeMo Lightning Trainer for Mamba2 780M model. + + This function sets up the distributed training strategy and other training parameters. + + Args: + tensor_parallelism (int): Degree of tensor model parallelism. + pipeline_parallelism (int): Degree of pipeline model parallelism. + pipeline_parallelism_type (Optional[torch.dtype]): Data type for pipeline parallelism. + virtual_pipeline_parallelism (Optional[int]): Size of virtual pipeline parallelism. + context_parallelism (int): Degree of context parallelism. + sequence_parallelism (bool): Whether to use sequence parallelism. + num_nodes (int): Number of compute nodes to use. + num_gpus_per_node (int): Number of GPUs per node. + max_steps (int): Maximum number of training steps. + callbacks (Optional[list[run.Config[Callback]]]): List of callback configurations. + + Returns: + run.Config[nl.Trainer]: Configuration for the NeMo Lightning Trainer. + + Examples: + CLI usage: + $ nemo llm pretrain trainer=mamba2_780m ... 
+ + Python API usage: + >>> trainer_config = trainer(num_nodes=1, num_gpus_per_node=1) + >>> print(trainer_config) + + Note: + For more information on distributed training strategies, refer to the + NeMo documentation on multi-GPU and multi-node training. + """ + strategy = run.Config( + nl.MegatronStrategy, + tensor_model_parallel_size=tensor_parallelism, + pipeline_model_parallel_size=pipeline_parallelism, + pipeline_dtype=pipeline_parallelism_type, + virtual_pipeline_model_parallel_size=virtual_pipeline_parallelism, + context_parallel_size=context_parallelism, + sequence_parallel=sequence_parallelism, + gradient_as_bucket_view=True, + ckpt_async_save=False, + ckpt_parallel_load=True, + ddp=run.Config( + DistributedDataParallelConfig, + check_for_nan_in_grad=True, + grad_reduce_in_fp32=True, + overlap_grad_reduce=True, + overlap_param_gather=True, + ), + ) + + trainer = run.Config( + nl.Trainer, + accelerator="gpu", + accumulate_grad_batches=1, + callbacks=callbacks, + devices=num_gpus_per_node, + limit_test_batches=50, + limit_val_batches=32, + log_every_n_steps=10, + max_steps=max_steps, + num_nodes=num_nodes, + plugins=bf16_mixed(), + strategy=strategy, + use_distributed_sampler=False, + val_check_interval=2000, + ) + + return trainer + + +@run.cli.factory(target=pretrain, name=NAME) +def pretrain_recipe( + dir: Optional[str] = None, + name: str = "default", + tokenizer_model: str = None, + num_nodes: int = 1, + num_gpus_per_node: int = 8, + fn=pretrain, +) -> run.Partial: + """ + Create a pre-training recipe for Mamba2 780M model. + + This function sets up a complete configuration for pre-training, including + model, trainer, data, logging, optimization, and resumption settings. + + Args: + dir (Optional[str]): Directory for saving logs and checkpoints. + name (str): Name of the pre-training run. + num_nodes (int): Number of compute nodes to use. + num_gpus_per_node (int): Number of GPUs per node. + fn (Callable): The pre-training function to use. + + Returns: + run.Partial: Partial configuration for pre-training. + + Examples: + CLI usage: + $ nemo llm pretrain --factory mamba2_780M + $ nemo llm pretrain --factory "mamba2_780M(num_nodes=1, name='my_pretrain')" + + Python API usage: + >>> recipe = pretrain_recipe(name="mamba2_780M_pretrain", num_nodes=1) + >>> print(recipe) + + Note: + For more details on pre-training LLMs with NeMo, see the pre-training + guide in the `examples/llm/pretrain/` directory. + """ + return run.Partial( + fn, + model=model(), + trainer=trainer( + num_nodes=num_nodes, + num_gpus_per_node=num_gpus_per_node, + callbacks=[run.Config(TimingCallback)], + ), + data=run.Config( + MockDataModule, + seq_length=4096, + global_batch_size=8, + micro_batch_size=1, + tokenizer=tokenizer(tokenizer_model=tokenizer_model), + ), + log=default_log(dir=dir, name=name, tensorboard_logger=tensorboard_logger(name=name)), + optim=distributed_fused_adam_with_cosine_annealing(max_lr=3e-4), + resume=default_resume(), + ) + + +@run.cli.factory(target=finetune, name=NAME) +def finetune_recipe( + dir: Optional[str] = None, + name: str = "default", + resume_path: str = None, + tokenizer_model: str = None, + num_nodes: int = 1, + num_gpus_per_node: int = 8, + gbs: int = 8, + mbs: int = 1, + peft_scheme: Optional[str] = 'none', +) -> run.Partial: + """ + Create a fine-tuning recipe for Mamba2 780M model. + + This function sets up a complete configuration for fine-tuning, including + model, trainer, data, logging, optimization, and resumption settings. 
+ + Args: + dir (Optional[str]): Directory for saving logs and checkpoints. + name (str): Name of the fine-tuning run. + resume_path (str): Path to the NeMo checkpoint (refer to notes below + on how to convert a pytorch checkpoint to NeMo) + tokenizer_model (str): Path to tokenizer model (defaults to None) + num_nodes (int): Number of compute nodes to use. + num_gpus_per_node (int): Number of GPUs per node. + Returns: + run.Partial: Partial configuration for fine-tuning. + + Examples: + CLI usage: + $ nemo llm finetune --factory mamba2_780m + + Python API usage: + >>> recipe = finetune_recipe(name="mamba2_780m_finetune", num_nodes=1) + >>> print(recipe) + + Note: + This recipe uses the SQuAD dataset for fine-tuning. For more information + on fine-tuning LLMs with NeMo, see the fine-tuning guide in the + `examples/llm/finetune/` directory. + For converting an SSM pytorch checkpoint, use the following line of python code: + + llm.GPTModel(llm.BaseMambaConfig780M(), tokenizer=tokenizer()).import_ckpt( + path="pytorch://ABSOLUTE_PATH_TO_CKPT/your_pytorch_state_dict_file", + model_config=llm.BaseMambaConfig780M()) + This line will cache the nemo checkpoint to following directory: + /root/.cache/nemo/models/your_pytorch_state_dict_file + + """ + nemo_resume = run.Config( + nl.AutoResume, + restore_config=run.Config(nl.RestoreConfig, path=resume_path), + ) + strategy = run.Config( + nl.MegatronStrategy, + tensor_model_parallel_size=1, + pipeline_model_parallel_size=1, + gradient_as_bucket_view=True, + ckpt_load_optimizer=False, + ckpt_save_optimizer=False, + ckpt_async_save=False, + ) + checkpoint_callback = run.Config( + nl.ModelCheckpoint, + every_n_train_steps=10, + dirpath=dir, + ) + trainer = run.Config( + nl.Trainer, + accelerator="gpu", + accumulate_grad_batches=1, + devices=num_gpus_per_node, + limit_test_batches=10, + limit_val_batches=10, + log_every_n_steps=20, + max_steps=100, + num_nodes=num_nodes, + plugins=run.Config( + nl.MegatronMixedPrecision, + precision="bf16-mixed", + params_dtype=torch.bfloat16, + ), + callbacks=[checkpoint_callback], + strategy=strategy, + use_distributed_sampler=False, + val_check_interval=20, + ) + recipe = run.Partial( + llm.finetune, + model=model(tokenizer_model=tokenizer_model), + trainer=trainer, + data=run.Config( + llm.SquadDataModule, + seq_length=2048, + global_batch_size=gbs, + micro_batch_size=mbs, + tokenizer=tokenizer(tokenizer_model=tokenizer_model), + ), + log=llm.default_log(dir=dir, name=name, tensorboard_logger=tensorboard_logger(name=name)), + optim=distributed_fused_adam_with_cosine_annealing(max_lr=1e-4, min_lr=0, warmup_steps=50), + resume=nemo_resume, + ) + if peft_scheme is None or peft_scheme.lower() == 'none': + recipe.trainer.strategy.tensor_model_parallel_size = 1 + recipe.optim.config.lr = 5e-6 + else: + raise ValueError(f"Unrecognized peft scheme: {peft_scheme}") + return recipe diff --git a/nemo/collections/llm/recipes/mamba2_8b.py b/nemo/collections/llm/recipes/mamba2_8b.py new file mode 100644 index 000000000000..58883deba732 --- /dev/null +++ b/nemo/collections/llm/recipes/mamba2_8b.py @@ -0,0 +1,321 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + + +from typing import Optional + +import nemo_run as run +import pytorch_lightning as pl +import torch +from megatron.core.distributed import DistributedDataParallelConfig +from pytorch_lightning.callbacks.callback import Callback + +from nemo import lightning as nl +from nemo.collections import llm +from nemo.collections.llm.api import finetune, pretrain +from nemo.collections.llm.gpt.data.mock import MockDataModule +from nemo.collections.llm.recipes.log.default import default_log, default_resume, tensorboard_logger +from nemo.collections.llm.recipes.optim.adam import distributed_fused_adam_with_cosine_annealing +from nemo.collections.llm.recipes.precision.mixed_precision import bf16_mixed +from nemo.collections.nlp.modules.common.tokenizer_utils import get_nmt_tokenizer +from nemo.utils.exp_manager import TimingCallback + +NAME = "mamba2_8b" + + +@run.cli.factory(name=NAME) +def tokenizer(tokenizer_model: str = None) -> run.Config[pl.LightningModule]: + + return run.Config( + get_nmt_tokenizer, + library='megatron', + model_name="GPTSentencePieceTokenizer", + tokenizer_model=tokenizer_model, + use_fast=True, + ) + + +@run.cli.factory(name=NAME) +def model(tokenizer_model: str = None) -> run.Config[pl.LightningModule]: + """ + Factory function to create a Mamba2 8B model configuration. + + Returns: + run.Config[pl.LightningModule]: Configuration for the Mamba2 Hybrid 8B model. + + Examples: + CLI usage: + $ nemo llm pretrain model=mamba2_8b ... + + Python API usage: + >>> model_config = model() + >>> print(model_config) + """ + return run.Config( + llm.GPTModel, config=run.Config(llm.NVIDIAMambaConfig8B), tokenizer=tokenizer(tokenizer_model=tokenizer_model) + ) + + +def trainer( + tensor_parallelism: int = 8, + pipeline_parallelism: int = 1, + pipeline_parallelism_type: Optional[torch.dtype] = None, + virtual_pipeline_parallelism: Optional[int] = None, + context_parallelism: int = 1, + sequence_parallelism: bool = False, + num_nodes: int = 1, + num_gpus_per_node: int = 8, + max_steps: int = 1168251, + callbacks: Optional[list[run.Config[Callback]]] = None, +) -> run.Config[nl.Trainer]: + """ + Configure the NeMo Lightning Trainer for Mamba2 Hybrid 8B model. + + This function sets up the distributed training strategy and other training parameters. + + Args: + tensor_parallelism (int): Degree of tensor model parallelism. + pipeline_parallelism (int): Degree of pipeline model parallelism. + pipeline_parallelism_type (Optional[torch.dtype]): Data type for pipeline parallelism. + virtual_pipeline_parallelism (Optional[int]): Size of virtual pipeline parallelism. + context_parallelism (int): Degree of context parallelism. + sequence_parallelism (bool): Whether to use sequence parallelism. + num_nodes (int): Number of compute nodes to use. + num_gpus_per_node (int): Number of GPUs per node. + max_steps (int): Maximum number of training steps. + callbacks (Optional[list[run.Config[Callback]]]): List of callback configurations. + + Returns: + run.Config[nl.Trainer]: Configuration for the NeMo Lightning Trainer. + + Examples: + CLI usage: + $ nemo llm pretrain trainer=mamba2_8b ... 
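+
+            # A hypothetical multi-node sketch that keeps the default tensor_parallelism=8
+            # (one full 8-GPU node per tensor-parallel group); the dotted-override syntax is
+            # assumed from nemo-run and the values are illustrative only:
+            $ nemo llm pretrain trainer=mamba2_8b trainer.num_nodes=2 trainer.num_gpus_per_node=8 ...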
+ + Python API usage: + >>> trainer_config = trainer(num_nodes=1, num_gpus_per_node=1) + >>> print(trainer_config) + + Note: + For more information on distributed training strategies, refer to the + NeMo documentation on multi-GPU and multi-node training. + """ + strategy = run.Config( + nl.MegatronStrategy, + tensor_model_parallel_size=tensor_parallelism, + pipeline_model_parallel_size=pipeline_parallelism, + pipeline_dtype=pipeline_parallelism_type, + virtual_pipeline_model_parallel_size=virtual_pipeline_parallelism, + context_parallel_size=context_parallelism, + sequence_parallel=sequence_parallelism, + gradient_as_bucket_view=True, + ckpt_async_save=False, + ckpt_parallel_load=True, + ddp=run.Config( + DistributedDataParallelConfig, + check_for_nan_in_grad=True, + grad_reduce_in_fp32=True, + overlap_grad_reduce=True, + overlap_param_gather=True, + ), + ) + + trainer = run.Config( + nl.Trainer, + accelerator="gpu", + accumulate_grad_batches=1, + callbacks=callbacks, + devices=num_gpus_per_node, + limit_test_batches=50, + limit_val_batches=32, + log_every_n_steps=10, + max_steps=max_steps, + num_nodes=num_nodes, + plugins=bf16_mixed(), + strategy=strategy, + use_distributed_sampler=False, + val_check_interval=2000, + ) + + return trainer + + +@run.cli.factory(target=pretrain, name=NAME) +def pretrain_recipe( + dir: Optional[str] = None, + name: str = "default", + tokenizer_model: str = None, + num_nodes: int = 1, + num_gpus_per_node: int = 8, + fn=pretrain, +) -> run.Partial: + """ + Create a pre-training recipe for Mamba2 Hybrid 8B model. + + This function sets up a complete configuration for pre-training, including + model, trainer, data, logging, optimization, and resumption settings. + + Args: + dir (Optional[str]): Directory for saving logs and checkpoints. + name (str): Name of the pre-training run. + num_nodes (int): Number of compute nodes to use. + num_gpus_per_node (int): Number of GPUs per node. + fn (Callable): The pre-training function to use. + + Returns: + run.Partial: Partial configuration for pre-training. + + Examples: + CLI usage: + $ nemo llm pretrain --factory mamba2_8b + $ nemo llm pretrain --factory "mamba2_8b(num_nodes=1, name='my_pretrain')" + + Python API usage: + >>> recipe = pretrain_recipe(name="mamba2_8b_pretrain", num_nodes=1) + >>> print(recipe) + + Note: + For more details on pre-training LLMs with NeMo, see the pre-training + guide in the `examples/llm/pretrain/` directory. + """ + return run.Partial( + fn, + model=model(), + trainer=trainer( + num_nodes=num_nodes, + num_gpus_per_node=num_gpus_per_node, + callbacks=[run.Config(TimingCallback)], + ), + data=run.Config( + MockDataModule, + seq_length=4096, + global_batch_size=8, + micro_batch_size=1, + tokenizer=tokenizer(tokenizer_model=tokenizer_model), + ), + log=default_log(dir=dir, name=name, tensorboard_logger=tensorboard_logger(name=name)), + optim=distributed_fused_adam_with_cosine_annealing(max_lr=3e-4), + resume=default_resume(), + ) + + +@run.cli.factory(target=finetune, name=NAME) +def finetune_recipe( + resume_path, + tokenizer_model, + dir: Optional[str] = None, + name: str = "default", + num_nodes: int = 1, + num_gpus_per_node: int = 8, + gbs: int = 8, + mbs: int = 1, + peft_scheme: Optional[str] = 'none', +) -> run.Partial: + """ + Create a fine-tuning recipe for Mamba2 8B model. + + This function sets up a complete configuration for fine-tuning, including + model, trainer, data, logging, optimization, and resumption settings. 
+ + Args: + dir (Optional[str]): Directory for saving logs and checkpoints. + name (str): Name of the fine-tuning run. + resume_path (str): Path to the NeMo checkpoint (refer to notes below + on how to convert a pytorch checkpoint to NeMo) + tokenizer_model (str): Path to tokenizer model (defaults to None) + num_nodes (int): Number of compute nodes to use. + num_gpus_per_node (int): Number of GPUs per node. + Returns: + run.Partial: Partial configuration for fine-tuning. + + Examples: + CLI usage: + $ nemo llm finetune --factory mamba2_8b + + Python API usage: + >>> recipe = finetune_recipe(name="mamba2_8b_finetune", num_nodes=1) + >>> print(recipe) + + Note: + This recipe uses the SQuAD dataset for fine-tuning. For more information + on fine-tuning LLMs with NeMo, see the fine-tuning guide in the + `examples/llm/finetune/` directory. + For converting an SSM pytorch checkpoint, use the following line of python code: + + llm.GPTModel(llm.NVIDIAMambaConfig8B(), tokenizer=tokenizer(tokenizer_model=tokenizer_model)).import_ckpt( + path="pytorch://ABSOLUTE_PATH_TO_CKPT/your_pytorch_state_dict_file", + model_config=llm.NVIDIAMambaConfig8B()) + This line will cache the nemo checkpoint to following directory: + /root/.cache/nemo/models/your_pytorch_state_dict_file + + """ + nemo_resume = run.Config( + nl.AutoResume, + restore_config=run.Config(nl.RestoreConfig, path=resume_path), + ) + strategy = run.Config( + nl.MegatronStrategy, + tensor_model_parallel_size=8, + pipeline_model_parallel_size=1, + gradient_as_bucket_view=True, + ckpt_load_optimizer=False, + ckpt_save_optimizer=False, + ckpt_async_save=False, + ) + checkpoint_callback = run.Config( + nl.ModelCheckpoint, + every_n_train_steps=10, + dirpath=dir, + ) + trainer = run.Config( + nl.Trainer, + accelerator="gpu", + accumulate_grad_batches=1, + devices=num_gpus_per_node, + limit_test_batches=10, + limit_val_batches=10, + log_every_n_steps=20, + max_steps=100, + num_nodes=num_nodes, + plugins=run.Config( + nl.MegatronMixedPrecision, + precision="bf16-mixed", + params_dtype=torch.bfloat16, + ), + callbacks=[checkpoint_callback], + strategy=strategy, + use_distributed_sampler=False, + val_check_interval=20, + ) + recipe = run.Partial( + llm.finetune, + model=model(tokenizer_model=tokenizer_model), + trainer=trainer, + data=run.Config( + llm.SquadDataModule, + seq_length=2048, + global_batch_size=gbs, + micro_batch_size=mbs, + tokenizer=tokenizer(tokenizer_model=tokenizer_model), + ), + log=llm.default_log(dir=dir, name=name, tensorboard_logger=tensorboard_logger(name=name)), + optim=distributed_fused_adam_with_cosine_annealing(max_lr=1e-4, min_lr=0, warmup_steps=50), + resume=nemo_resume, + ) + if peft_scheme is None or peft_scheme.lower() == 'none': + recipe.trainer.strategy.tensor_model_parallel_size = 8 + recipe.optim.config.lr = 5e-6 + else: + raise ValueError(f"Unrecognized peft scheme: {peft_scheme}") + return recipe diff --git a/nemo/collections/llm/recipes/mamba2_hybrid_8b.py b/nemo/collections/llm/recipes/mamba2_hybrid_8b.py new file mode 100644 index 000000000000..eff37da46fca --- /dev/null +++ b/nemo/collections/llm/recipes/mamba2_hybrid_8b.py @@ -0,0 +1,323 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+
+from typing import Optional
+
+import nemo_run as run
+import pytorch_lightning as pl
+import torch
+from megatron.core.distributed import DistributedDataParallelConfig
+from pytorch_lightning.callbacks.callback import Callback
+
+from nemo import lightning as nl
+from nemo.collections import llm
+from nemo.collections.llm.api import finetune, pretrain
+from nemo.collections.llm.gpt.data.mock import MockDataModule
+from nemo.collections.llm.recipes.log.default import default_log, default_resume, tensorboard_logger
+from nemo.collections.llm.recipes.optim.adam import distributed_fused_adam_with_cosine_annealing
+from nemo.collections.llm.recipes.precision.mixed_precision import bf16_mixed
+from nemo.collections.nlp.modules.common.tokenizer_utils import get_nmt_tokenizer
+from nemo.utils.exp_manager import TimingCallback
+
+NAME = "mamba2_hybrid_8b"
+
+
+@run.cli.factory(name=NAME)
+def tokenizer(tokenizer_model: str = None) -> run.Config[pl.LightningModule]:
+
+    return run.Config(
+        get_nmt_tokenizer,
+        library='megatron',
+        model_name="GPTSentencePieceTokenizer",
+        tokenizer_model=tokenizer_model,
+        use_fast=True,
+    )
+
+
+@run.cli.factory(name=NAME)
+def model(tokenizer_model: str = None) -> run.Config[pl.LightningModule]:
+    """
+    Factory function to create a Mamba2 Hybrid 8B model configuration.
+
+    Returns:
+        run.Config[pl.LightningModule]: Configuration for the Mamba2 Hybrid 8B model.
+
+    Examples:
+        CLI usage:
+            $ nemo llm pretrain model=mamba2_hybrid_8b ...
+
+        Python API usage:
+            >>> model_config = model()
+            >>> print(model_config)
+    """
+    return run.Config(
+        llm.GPTModel,
+        config=run.Config(llm.NVIDIAMambaHybridConfig8B),
+        tokenizer=tokenizer(tokenizer_model=tokenizer_model),
+    )
+
+
+def trainer(
+    tensor_parallelism: int = 8,
+    pipeline_parallelism: int = 1,
+    pipeline_parallelism_type: Optional[torch.dtype] = None,
+    virtual_pipeline_parallelism: Optional[int] = None,
+    context_parallelism: int = 1,
+    sequence_parallelism: bool = False,
+    num_nodes: int = 1,
+    num_gpus_per_node: int = 8,
+    max_steps: int = 1168251,
+    callbacks: Optional[list[run.Config[Callback]]] = None,
+) -> run.Config[nl.Trainer]:
+    """
+    Configure the NeMo Lightning Trainer for Mamba2 Hybrid 8B model.
+
+    This function sets up the distributed training strategy and other training parameters.
+
+    Args:
+        tensor_parallelism (int): Degree of tensor model parallelism.
+        pipeline_parallelism (int): Degree of pipeline model parallelism.
+        pipeline_parallelism_type (Optional[torch.dtype]): Data type for pipeline parallelism.
+        virtual_pipeline_parallelism (Optional[int]): Size of virtual pipeline parallelism.
+        context_parallelism (int): Degree of context parallelism.
+        sequence_parallelism (bool): Whether to use sequence parallelism.
+        num_nodes (int): Number of compute nodes to use.
+        num_gpus_per_node (int): Number of GPUs per node.
+        max_steps (int): Maximum number of training steps.
+        callbacks (Optional[list[run.Config[Callback]]]): List of callback configurations.
+
+    Returns:
+        run.Config[nl.Trainer]: Configuration for the NeMo Lightning Trainer.
+ + Examples: + CLI usage: + $ nemo llm pretrain trainer=mamba2_hybrid_8b ... + + Python API usage: + >>> trainer_config = trainer(num_nodes=1, num_gpus_per_node=1) + >>> print(trainer_config) + + Note: + For more information on distributed training strategies, refer to the + NeMo documentation on multi-GPU and multi-node training. + """ + strategy = run.Config( + nl.MegatronStrategy, + tensor_model_parallel_size=tensor_parallelism, + pipeline_model_parallel_size=pipeline_parallelism, + pipeline_dtype=pipeline_parallelism_type, + virtual_pipeline_model_parallel_size=virtual_pipeline_parallelism, + context_parallel_size=context_parallelism, + sequence_parallel=sequence_parallelism, + gradient_as_bucket_view=True, + ckpt_async_save=False, + ckpt_parallel_load=True, + ddp=run.Config( + DistributedDataParallelConfig, + check_for_nan_in_grad=True, + grad_reduce_in_fp32=True, + overlap_grad_reduce=True, + overlap_param_gather=True, + ), + ) + + trainer = run.Config( + nl.Trainer, + accelerator="gpu", + accumulate_grad_batches=1, + callbacks=callbacks, + devices=num_gpus_per_node, + limit_test_batches=50, + limit_val_batches=32, + log_every_n_steps=10, + max_steps=max_steps, + num_nodes=num_nodes, + plugins=bf16_mixed(), + strategy=strategy, + use_distributed_sampler=False, + val_check_interval=2000, + ) + + return trainer + + +@run.cli.factory(target=pretrain, name=NAME) +def pretrain_recipe( + dir: Optional[str] = None, + name: str = "default", + tokenizer_model: str = None, + num_nodes: int = 1, + num_gpus_per_node: int = 8, + fn=pretrain, +) -> run.Partial: + """ + Create a pre-training recipe for Mamba2 Hybrid 8B model. + + This function sets up a complete configuration for pre-training, including + model, trainer, data, logging, optimization, and resumption settings. + + Args: + dir (Optional[str]): Directory for saving logs and checkpoints. + name (str): Name of the pre-training run. + num_nodes (int): Number of compute nodes to use. + num_gpus_per_node (int): Number of GPUs per node. + fn (Callable): The pre-training function to use. + + Returns: + run.Partial: Partial configuration for pre-training. + + Examples: + CLI usage: + $ nemo llm pretrain --factory mamba2_hybrid_8b + $ nemo llm pretrain --factory "mamba2_hybrid_8b(num_nodes=1, name='my_pretrain')" + + Python API usage: + >>> recipe = pretrain_recipe(name="mamba2_hybrid_8b_pretrain", num_nodes=1) + >>> print(recipe) + + Note: + For more details on pre-training LLMs with NeMo, see the pre-training + guide in the `examples/llm/pretrain/` directory. + """ + return run.Partial( + fn, + model=model(), + trainer=trainer( + num_nodes=num_nodes, + num_gpus_per_node=num_gpus_per_node, + callbacks=[run.Config(TimingCallback)], + ), + data=run.Config( + MockDataModule, + seq_length=4096, + global_batch_size=8, + micro_batch_size=1, + tokenizer=tokenizer(tokenizer_model=tokenizer_model), + ), + log=default_log(dir=dir, name=name, tensorboard_logger=tensorboard_logger(name=name)), + optim=distributed_fused_adam_with_cosine_annealing(max_lr=3e-4), + resume=default_resume(), + ) + + +@run.cli.factory(target=finetune, name=NAME) +def finetune_recipe( + resume_path, + tokenizer_model, + dir: Optional[str] = None, + name: str = "default", + num_nodes: int = 1, + num_gpus_per_node: int = 8, + gbs: int = 8, + mbs: int = 1, + peft_scheme: Optional[str] = 'none', +) -> run.Partial: + """ + Create a fine-tuning recipe for Mamba2 Hybrid 8B model. 
+ + This function sets up a complete configuration for fine-tuning, including + model, trainer, data, logging, optimization, and resumption settings. + + Args: + dir (Optional[str]): Directory for saving logs and checkpoints. + name (str): Name of the fine-tuning run. + resume_path (str): Path to the NeMo checkpoint (refer to notes below + on how to convert a pytorch checkpoint to NeMo) + tokenizer_model (str): Path to tokenizer model (defaults to None) + num_nodes (int): Number of compute nodes to use. + num_gpus_per_node (int): Number of GPUs per node. + Returns: + run.Partial: Partial configuration for fine-tuning. + + Examples: + CLI usage: + $ nemo llm finetune --factory mamba2_hybrid_8b + + Python API usage: + >>> recipe = finetune_recipe(name="mamba2_hybrid_8b_finetune", num_nodes=1) + >>> print(recipe) + + Note: + This recipe uses the SQuAD dataset for fine-tuning. For more information + on fine-tuning LLMs with NeMo, see the fine-tuning guide in the + `examples/llm/finetune/` directory. + For converting an SSM pytorch checkpoint, use the following line of python code: + + llm.GPTModel(llm.NVIDIAMambaHybridConfig8B(), tokenizer=tokenizer(tokenizer_model=tokenizer_model)).import_ckpt( + path="pytorch://ABSOLUTE_PATH_TO_CKPT/your_pytorch_state_dict_file", + model_config=llm.NVIDIAMambaHybridConfig8B()) + This line will cache the nemo checkpoint to following directory: + /root/.cache/nemo/models/your_pytorch_state_dict_file + + """ + nemo_resume = run.Config( + nl.AutoResume, + restore_config=run.Config(nl.RestoreConfig, path=resume_path), + ) + strategy = run.Config( + nl.MegatronStrategy, + tensor_model_parallel_size=8, + pipeline_model_parallel_size=1, + gradient_as_bucket_view=True, + ckpt_load_optimizer=False, + ckpt_save_optimizer=False, + ckpt_async_save=False, + ) + checkpoint_callback = run.Config( + nl.ModelCheckpoint, + every_n_train_steps=10, + dirpath=dir, + ) + trainer = run.Config( + nl.Trainer, + accelerator="gpu", + accumulate_grad_batches=1, + devices=num_gpus_per_node, + limit_test_batches=10, + limit_val_batches=10, + log_every_n_steps=20, + max_steps=100, + num_nodes=num_nodes, + plugins=run.Config( + nl.MegatronMixedPrecision, + precision="bf16-mixed", + params_dtype=torch.bfloat16, + ), + callbacks=[checkpoint_callback], + strategy=strategy, + use_distributed_sampler=False, + val_check_interval=20, + ) + recipe = run.Partial( + llm.finetune, + model=model(tokenizer_model=tokenizer_model), + trainer=trainer, + data=run.Config( + llm.SquadDataModule, + seq_length=2048, + global_batch_size=gbs, + micro_batch_size=mbs, + tokenizer=tokenizer(tokenizer_model=tokenizer_model), + ), + log=llm.default_log(dir=dir, name=name, tensorboard_logger=tensorboard_logger(name=name)), + optim=distributed_fused_adam_with_cosine_annealing(max_lr=1e-4, min_lr=0, warmup_steps=50), + resume=nemo_resume, + ) + if peft_scheme is None or peft_scheme.lower() == 'none': + recipe.trainer.strategy.tensor_model_parallel_size = 8 + recipe.optim.config.lr = 5e-6 + else: + raise ValueError(f"Unrecognized peft scheme: {peft_scheme}") + return recipe diff --git a/nemo/lightning/io/connector.py b/nemo/lightning/io/connector.py index 38fbda42c67d..e7ba67b277f8 100644 --- a/nemo/lightning/io/connector.py +++ b/nemo/lightning/io/connector.py @@ -134,7 +134,9 @@ class ModelConnector(Connector, Generic[SourceT, TargetT]): Loads a model from the specified path, optionally using a CPU-focused strategy, and returns the model and trainer. 
""" - def nemo_setup(self, model: pl.LightningModule, trainer: Optional[pl.Trainer] = None) -> pl.Trainer: + def nemo_setup( + self, model: pl.LightningModule, trainer: Optional[pl.Trainer] = None, *args, **kwargs + ) -> pl.Trainer: """ Sets up the model and trainer using a specified strategy, preparing it for training or inference. @@ -150,7 +152,7 @@ def nemo_setup(self, model: pl.LightningModule, trainer: Optional[pl.Trainer] = _trainer = trainer or Trainer( devices=1, accelerator="cpu", - strategy=MegatronStrategy(ckpt_save_optimizer=False, always_save_context=True), + strategy=MegatronStrategy(ckpt_save_optimizer=False, always_save_context=True, *args, **kwargs), ) # Note: set trainer to fitting state to avoid the following code path. Feel free to refactor if we no longer # need to avoid this: diff --git a/tests/collections/llm/gpt/model/megatron_ssm_finetuning.py b/tests/collections/llm/gpt/model/megatron_ssm_finetuning.py index 67174974f9a3..e0b9862f23e1 100644 --- a/tests/collections/llm/gpt/model/megatron_ssm_finetuning.py +++ b/tests/collections/llm/gpt/model/megatron_ssm_finetuning.py @@ -59,6 +59,7 @@ def get_args(): strategy=nl.MegatronStrategy( ckpt_load_optimizer=False, ckpt_save_optimizer=False, + ckpt_async_save=False, tensor_model_parallel_size=1, ), plugins=nl.MegatronMixedPrecision( From b39e679ba9991269d712bd473ebbcf74520e9c20 Mon Sep 17 00:00:00 2001 From: Youngeun Kwon Date: Mon, 21 Oct 2024 14:42:24 -0700 Subject: [PATCH 29/37] Long context performance doc hot fix (#10946) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit * long context perf Signed-off-by: Youngeun Kwon * update the long context perf Signed-off-by: Youngeun Kwon * Akoumparouli/mcore microbatch calculator fix (#10780) * move tests/lightning/{,_}io Signed-off-by: Alexandros Koumparoulis * add microbatch calculator context manager Signed-off-by: Alexandros Koumparoulis * use microbatch calculator context manager Signed-off-by: Alexandros Koumparoulis * add on_load_checkpoint test to ValidateModelRestoration; use ctx manager to reconfigure microbatch calculator; update save/restore path; add cleanup step at the end Signed-off-by: Alexandros Koumparoulis * remove unused var Signed-off-by: Alexandros Koumparoulis * fix Signed-off-by: Alexandros Koumparoulis * Apply isort and black reformatting Signed-off-by: akoumpa --------- Signed-off-by: Alexandros Koumparoulis Signed-off-by: akoumpa Co-authored-by: akoumpa Signed-off-by: Youngeun Kwon * remove 8x3b recipes (#10764) * remove 8x3b recipes Signed-off-by: Alexandros Koumparoulis * remove 8x3b from test_nemo_run Signed-off-by: Alexandros Koumparoulis * rm from __init__ Signed-off-by: Alexandros Koumparoulis --------- Signed-off-by: Alexandros Koumparoulis Signed-off-by: Youngeun Kwon * change the figure file name Signed-off-by: Youngeun Kwon * Accommodating the reviewer's comment Signed-off-by: Youngeun Kwon * update the y-axis title Signed-off-by: Youngeun Kwon * [🤠]: Howdy folks, let's bump `Dockerfile.ci` to 3f90b98 ! 
(#10789) Signed-off-by: github-actions[bot] <41898282+github-actions[bot]@users.noreply.github.com> Co-authored-by: pablo-garay <7166088+pablo-garay@users.noreply.github.com> Signed-off-by: Youngeun Kwon * Add ModelOpt transformer model pruning example for Llama models, default to llama3.1-8b-base (#10294) * Add ModelOpt transformer model pruning example for Llama3 model Signed-off-by: Shengliang Xu * Apply isort and black reformatting Signed-off-by: shengliangxu Signed-off-by: Shengliang Xu * examples code is at wrong dir, move them Signed-off-by: Shengliang Xu * changes as suggested in comment remove some logging and unused config code, update example model to llama3.1 Signed-off-by: Shengliang Xu * Add pruning of hidden_size into example Signed-off-by: Shengliang Xu * Apply isort and black reformatting Signed-off-by: shengliangxu Signed-off-by: Shengliang Xu * Update examples/nlp/language_modeling/conf/megatron_gpt_prune.yaml Signed-off-by: Keval Morabia <28916987+kevalmorabia97@users.noreply.github.com> * Add pruning test to cicd-main.yml Signed-off-by: Keval Morabia <28916987+kevalmorabia97@users.noreply.github.com> * Update cicd-main.yml Signed-off-by: Keval Morabia <28916987+kevalmorabia97@users.noreply.github.com> * Update cicd-main.yml Signed-off-by: Keval Morabia <28916987+kevalmorabia97@users.noreply.github.com> * Update cicd-main.yml Signed-off-by: Keval Morabia <28916987+kevalmorabia97@users.noreply.github.com> * Update cicd-main.yml Signed-off-by: Keval Morabia <28916987+kevalmorabia97@users.noreply.github.com> * Update cicd-main.yml Signed-off-by: Keval Morabia <28916987+kevalmorabia97@users.noreply.github.com> --------- Signed-off-by: Shengliang Xu Signed-off-by: shengliangxu Signed-off-by: Keval Morabia <28916987+kevalmorabia97@users.noreply.github.com> Co-authored-by: shengliangxu Co-authored-by: Keval Morabia <28916987+kevalmorabia97@users.noreply.github.com> Signed-off-by: Youngeun Kwon * Update mamba.rst after dist ckpt addition (#10800) Signed-off-by: Ali Taghibakhshi <71892896+JRD971000@users.noreply.github.com> Signed-off-by: Youngeun Kwon * fix chunked infer (#10581) Signed-off-by: stevehuang52 Signed-off-by: Youngeun Kwon * fix state transform (#10728) Signed-off-by: Chen Cui Signed-off-by: Youngeun Kwon * use ckpt_to_weights_subdir in restore (#10786) * use ckpt_to_weights_subdir in restore Signed-off-by: Alexandros Koumparoulis * make ckpt_to_{weight,context}_subdir idempotent Signed-off-by: Alexandros Koumparoulis * Apply isort and black reformatting Signed-off-by: akoumpa --------- Signed-off-by: Alexandros Koumparoulis Signed-off-by: akoumpa Co-authored-by: akoumpa Signed-off-by: Youngeun Kwon * Mixtral set seq_length=4k (#10704) * enable SP & set seq_lenght=4k Signed-off-by: Alexandros Koumparoulis * update test expected values Signed-off-by: Alexandros Koumparoulis * 8x22b 4k Signed-off-by: Alexandros Koumparoulis --------- Signed-off-by: Alexandros Koumparoulis Signed-off-by: Youngeun Kwon * Fix for crashes with tensorboard_logger=false and VP + LoRA (#10792) * Fix for crashes with tensorboard_logger=false and virtual pipeline parallel + LoRA Signed-off-by: Valerie Sarge * Apply isort and black reformatting Signed-off-by: vysarge --------- Signed-off-by: Valerie Sarge Signed-off-by: vysarge Co-authored-by: vysarge Signed-off-by: Youngeun Kwon * Disable checkpoint conversion inside AutoResume (#10645) * Disable checkpoint conversion inside AutoResume Signed-off-by: Hemil Desai * Apply isort and black reformatting Signed-off-by: hemildesai * Update resume 
docstrings Signed-off-by: Hemil Desai * fix Signed-off-by: Hemil Desai * add default finetuning recipe and refactor llama3 8b recipe Signed-off-by: Chen Cui * Apply isort and black reformatting Signed-off-by: cuichenx * address comment Signed-off-by: Chen Cui * refactor other recipes Signed-off-by: Chen Cui * Apply isort and black reformatting Signed-off-by: cuichenx * remove 8x3b finetuning recipe for now because HF version not available Signed-off-by: Chen Cui * add copyright header Signed-off-by: Chen Cui * adjust unit tests based on recipe fixes Signed-off-by: Chen Cui * fix failed unit test Signed-off-by: Chen Cui --------- Signed-off-by: Hemil Desai Signed-off-by: hemildesai Signed-off-by: Chen Cui Signed-off-by: cuichenx Co-authored-by: hemildesai Co-authored-by: Chen Cui Co-authored-by: cuichenx Signed-off-by: Youngeun Kwon * replace png file to github assets Signed-off-by: Youngeun Kwon * change image url to github release Signed-off-by: Youngeun Kwon * hot fix on table style Signed-off-by: Youngeun Kwon --------- Signed-off-by: Youngeun Kwon Signed-off-by: Alexandros Koumparoulis Signed-off-by: akoumpa Signed-off-by: github-actions[bot] <41898282+github-actions[bot]@users.noreply.github.com> Signed-off-by: Shengliang Xu Signed-off-by: shengliangxu Signed-off-by: Keval Morabia <28916987+kevalmorabia97@users.noreply.github.com> Signed-off-by: Ali Taghibakhshi <71892896+JRD971000@users.noreply.github.com> Signed-off-by: stevehuang52 Signed-off-by: Chen Cui Signed-off-by: Valerie Sarge Signed-off-by: vysarge Signed-off-by: Hemil Desai Signed-off-by: hemildesai Signed-off-by: cuichenx Co-authored-by: Alexandros Koumparoulis <153118171+akoumpa@users.noreply.github.com> Co-authored-by: akoumpa Co-authored-by: oliver könig Co-authored-by: pablo-garay <7166088+pablo-garay@users.noreply.github.com> Co-authored-by: Shengliang Xu <106840466+shengliangxu@users.noreply.github.com> Co-authored-by: shengliangxu Co-authored-by: Keval Morabia <28916987+kevalmorabia97@users.noreply.github.com> Co-authored-by: Ali Taghibakhshi <71892896+JRD971000@users.noreply.github.com> Co-authored-by: He Huang (Steve) <105218074+stevehuang52@users.noreply.github.com> Co-authored-by: Chen Cui Co-authored-by: Valerie Sarge Co-authored-by: vysarge Co-authored-by: Hemil Desai Co-authored-by: hemildesai Co-authored-by: cuichenx --- .../performance/performance_long_sequence.md | 21 ------------------- 1 file changed, 21 deletions(-) diff --git a/docs/source/performance/performance_long_sequence.md b/docs/source/performance/performance_long_sequence.md index 9dc9c6c52be3..d9f26dcf0d61 100644 --- a/docs/source/performance/performance_long_sequence.md +++ b/docs/source/performance/performance_long_sequence.md @@ -7,27 +7,6 @@ - Container: [NeMo24.03.01.framework](https://catalog.ngc.nvidia.com/orgs/nvidia/containers/nemo/tags) - System: DGX-H100 - - From c7a539a6cb4e7cb59b51eab67dad627862c2c9f9 Mon Sep 17 00:00:00 2001 From: malay-nagda <164242706+malay-nagda@users.noreply.github.com> Date: Tue, 22 Oct 2024 03:20:21 +0530 Subject: [PATCH 30/37] Performance mode (#10926) * llama3 performance mode Signed-off-by: Malay Nagda * llama3 performance mode tests Signed-off-by: Malay Nagda * mixtral performance mode Signed-off-by: Malay Nagda * remove unused Signed-off-by: Malay Nagda * nemotron perf mode Signed-off-by: Malay Nagda * 405b, 174b perf mode Signed-off-by: Malay Nagda * perf mode comment Signed-off-by: Malay Nagda * Apply isort and black reformatting Signed-off-by: malay-nagda --------- Signed-off-by: Malay Nagda 
Signed-off-by: malay-nagda <164242706+malay-nagda@users.noreply.github.com> Signed-off-by: malay-nagda Co-authored-by: malay-nagda --- nemo/collections/llm/recipes/gpt3_175b.py | 52 ++++++++----------- nemo/collections/llm/recipes/llama31_405b.py | 52 ++++++++----------- nemo/collections/llm/recipes/llama3_70b.py | 51 +++++++++--------- nemo/collections/llm/recipes/llama3_8b.py | 43 ++++++--------- nemo/collections/llm/recipes/mixtral_8x22b.py | 50 ++++++++---------- nemo/collections/llm/recipes/mixtral_8x7b.py | 50 ++++++++---------- nemo/collections/llm/recipes/nemotron3_8b.py | 36 +++++-------- nemo/collections/llm/recipes/nemotron4_15b.py | 37 +++++-------- nemo/collections/llm/recipes/nemotron4_22b.py | 45 ++++++---------- .../collections/llm/recipes/nemotron4_340b.py | 45 ++++++---------- .../llm/recipes/test_llama3_70b.py | 6 +-- .../collections/llm/recipes/test_llama3_8b.py | 6 +-- 12 files changed, 193 insertions(+), 280 deletions(-) diff --git a/nemo/collections/llm/recipes/gpt3_175b.py b/nemo/collections/llm/recipes/gpt3_175b.py index 7e016154aa3e..1abe8a218e82 100644 --- a/nemo/collections/llm/recipes/gpt3_175b.py +++ b/nemo/collections/llm/recipes/gpt3_175b.py @@ -142,7 +142,12 @@ def trainer( @run.cli.factory(target=pretrain, name=NAME) def pretrain_recipe( - dir: Optional[str] = None, name: str = "default", num_nodes: int = 1, num_gpus_per_node: int = 8, fn=pretrain + dir: Optional[str] = None, + name: str = "default", + num_nodes: int = 1, + num_gpus_per_node: int = 8, + performance_mode: bool = False, + fn: Callable = pretrain, ) -> run.Partial: """ Create a pre-training recipe for GPT3 175B model. @@ -155,6 +160,7 @@ def pretrain_recipe( name (str): Name of the pre-training run. num_nodes (int): Number of compute nodes to use. num_gpus_per_node (int): Number of GPUs per node. + performance_mode (bool): If true, enables optimizations for maximum performance. fn (Callable): The pre-training function to use. Returns: @@ -172,7 +178,7 @@ def pretrain_recipe( Note: This recipe is optimized for the large 175B model and requires significant computational resources. """ - return run.Partial( + recipe = run.Partial( fn, model=model(), trainer=trainer( @@ -186,49 +192,35 @@ def pretrain_recipe( resume=default_resume(), ) + if performance_mode: + recipe = pretrain_performance_optimizations(recipe) -@run.cli.factory(target=pretrain, name=NAME + "_performance") -def pretrain_recipe_performance( - dir: Optional[str] = None, - name: str = "default", - num_nodes: int = 1, - num_gpus_per_node: int = 8, - fn: Callable = pretrain, -) -> run.Partial: + return recipe + + +def pretrain_performance_optimizations(recipe: run.Partial) -> run.Partial: """ Create a performance-optimized pre-training recipe for GPT3 175B model. - This recipe enables performance optimizations that may not be suitable for all use cases. + This method enables performance optimizations that may not be suitable for all use cases. It builds upon the standard pre-training recipe and adds additional performance enhancements. Args: - dir (Optional[str]): Directory for saving logs and checkpoints. - name (str): Name of the pre-training run. - num_nodes (int): Number of compute nodes to use. - num_gpus_per_node (int): Number of GPUs per node. - fn (Callable): The pre-training function to use. + recipe (run.Partial): Base pre-train recipe to which performance optimizations will be added Returns: run.Partial: Partial configuration for performance-optimized pre-training. 
- Examples: - CLI usage: - $ nemo llm pretrain --factory "gpt3_175b.pretrain_recipe_performance(num_nodes=64, name='perf_pretrain')" - - Python API usage: - >>> recipe = pretrain_recipe_performance(name="gpt3_175b_perf", num_nodes=64) - >>> print(recipe) - Note: - Use this recipe with caution and only when you need maximum performance. + Use this method with caution and only when you need maximum performance. It may not be suitable for all hardware configurations or use cases. """ - recipe = pretrain_recipe(name=name, dir=dir, num_nodes=num_nodes, num_gpus_per_node=num_gpus_per_node, fn=fn) - # 'overlap_param_gather_with_optimizer_step' and 'align_param_gather' params are set automatically by MegatronCommOverlapCallback - # They are added here for user's knowledge - # overlap_param_gather_with_optimizer_step- If true, overlap param all-gather of first bucket with optimizer step. - # align_param_gather- If true, all PP stages launch param all-gathers simultaneously, else each PP stage launches independently as needed + # 'overlap_param_gather_with_optimizer_step' and 'align_param_gather' params are set automatically + # by MegatronCommOverlapCallback. They are added here for user's knowledge. + # overlap_param_gather_with_optimizer_step- Overlap param all-gather of first bucket with optimizer step. + # align_param_gather- If true, all PP stages launch param all-gathers simultaneously, else + # each PP stage launches independently as needed. recipe.trainer.callbacks.append( run.Config( diff --git a/nemo/collections/llm/recipes/llama31_405b.py b/nemo/collections/llm/recipes/llama31_405b.py index 45efedc3cbd6..055e9a06fcba 100644 --- a/nemo/collections/llm/recipes/llama31_405b.py +++ b/nemo/collections/llm/recipes/llama31_405b.py @@ -144,7 +144,12 @@ def trainer( @run.cli.factory(target=pretrain, name=NAME) def pretrain_recipe( - dir: Optional[str] = None, name: str = "default", num_nodes: int = 1, num_gpus_per_node: int = 8, fn=pretrain + dir: Optional[str] = None, + name: str = "default", + num_nodes: int = 1, + num_gpus_per_node: int = 8, + performance_mode: bool = False, + fn: Callable = pretrain, ) -> run.Partial: """ Create a pre-training recipe for Llama3.1 405B model. @@ -157,6 +162,7 @@ def pretrain_recipe( name (str): Name of the pre-training run. num_nodes (int): Number of compute nodes to use. num_gpus_per_node (int): Number of GPUs per node. + performance_mode (bool): If true, enables optimizations for maximum performance. fn (Callable): The pre-training function to use. Returns: @@ -174,7 +180,7 @@ def pretrain_recipe( Note: This recipe is optimized for the large 405B model and requires significant computational resources. """ - return run.Partial( + recipe = run.Partial( fn, model=model(), trainer=trainer( @@ -188,49 +194,35 @@ def pretrain_recipe( resume=default_resume(), ) + if performance_mode: + recipe = pretrain_performance_optimizations(recipe) -@run.cli.factory(target=pretrain, name=NAME + "_performance") -def pretrain_recipe_performance( - dir: Optional[str] = None, - name: str = "default", - num_nodes: int = 1, - num_gpus_per_node: int = 8, - fn: Callable = pretrain, -) -> run.Partial: + return recipe + + +def pretrain_performance_optimizations(recipe: run.Partial) -> run.Partial: """ Create a performance-optimized pre-training recipe for Llama3.1 405B model. - This recipe enables performance optimizations that may not be suitable for all use cases. + This method enables performance optimizations that may not be suitable for all use cases. 
It builds upon the standard pre-training recipe and adds additional performance enhancements. Args: - dir (Optional[str]): Directory for saving logs and checkpoints. - name (str): Name of the pre-training run. - num_nodes (int): Number of compute nodes to use. - num_gpus_per_node (int): Number of GPUs per node. - fn (Callable): The pre-training function to use. + recipe (run.Partial): Base pre-train recipe to which performance optimizations will be added Returns: run.Partial: Partial configuration for performance-optimized pre-training. - Examples: - CLI usage: - $ nemo llm pretrain --factory "llama31_405b.pretrain_recipe_performance(num_nodes=4, name='perf_pretrain')" - - Python API usage: - >>> recipe = pretrain_recipe_performance(name="llama31_405b_perf", num_nodes=4) - >>> print(recipe) - Note: - Use this recipe with caution and only when you need maximum performance. + Use this method with caution and only when you need maximum performance. It may not be suitable for all hardware configurations or use cases. """ - recipe = pretrain_recipe(name=name, dir=dir, num_nodes=num_nodes, num_gpus_per_node=num_gpus_per_node, fn=fn) - # 'overlap_param_gather_with_optimizer_step' and 'align_param_gather' params are set automatically by MegatronCommOverlapCallback - # They are added here for user's knowledge - # overlap_param_gather_with_optimizer_step- If true, overlap param all-gather of first bucket with optimizer step. - # align_param_gather- If true, all PP stages launch param all-gathers simultaneously, else each PP stage launches independently as needed + # 'overlap_param_gather_with_optimizer_step' and 'align_param_gather' params are set automatically + # by MegatronCommOverlapCallback. They are added here for user's knowledge. + # overlap_param_gather_with_optimizer_step- Overlap param all-gather of first bucket with optimizer step. + # align_param_gather- If true, all PP stages launch param all-gathers simultaneously, else + # each PP stage launches independently as needed. recipe.trainer.callbacks.append( run.Config( diff --git a/nemo/collections/llm/recipes/llama3_70b.py b/nemo/collections/llm/recipes/llama3_70b.py index ffd4a833885e..b283c68b222b 100644 --- a/nemo/collections/llm/recipes/llama3_70b.py +++ b/nemo/collections/llm/recipes/llama3_70b.py @@ -13,7 +13,7 @@ # limitations under the License. -from typing import Optional +from typing import Callable, Optional import nemo_run as run import pytorch_lightning as pl @@ -142,7 +142,12 @@ def trainer( @run.cli.factory(target=pretrain, name=NAME) def pretrain_recipe( - dir: Optional[str] = None, name: str = "default", num_nodes: int = 4, num_gpus_per_node: int = 8, fn=pretrain + dir: Optional[str] = None, + name: str = "default", + num_nodes: int = 1, + num_gpus_per_node: int = 8, + performance_mode: bool = False, + fn: Callable = pretrain, ) -> run.Partial: """ Create a pre-training recipe for Llama3 70B model. @@ -155,6 +160,7 @@ def pretrain_recipe( name (str): Name of the pre-training run. num_nodes (int): Number of compute nodes to use. num_gpus_per_node (int): Number of GPUs per node. + performance_mode (bool): If true, enables optimizations for maximum performance. fn (Callable): The pre-training function to use. Returns: @@ -172,7 +178,8 @@ def pretrain_recipe( Note: This recipe is optimized for the large 70B model and requires significant computational resources. 
""" - return run.Partial( + + recipe = run.Partial( fn, model=model(), trainer=trainer( @@ -186,45 +193,35 @@ def pretrain_recipe( resume=default_resume(), ) + if performance_mode: + recipe = pretrain_performance_optimizations(recipe) -@run.cli.factory(target=pretrain, name=NAME + "_performance") -def pretrain_recipe_performance( - dir: Optional[str] = None, name: str = "default", num_nodes: int = 4, num_gpus_per_node: int = 8, fn=pretrain -) -> run.Partial: + return recipe + + +def pretrain_performance_optimizations(recipe: run.Partial) -> run.Partial: """ Create a performance-optimized pre-training recipe for Llama3 70B model. - This recipe enables performance optimizations that may not be suitable for all use cases. + This method enables performance optimizations that may not be suitable for all use cases. It builds upon the standard pre-training recipe and adds additional performance enhancements. Args: - dir (Optional[str]): Directory for saving logs and checkpoints. - name (str): Name of the pre-training run. - num_nodes (int): Number of compute nodes to use. - num_gpus_per_node (int): Number of GPUs per node. - fn (Callable): The pre-training function to use. + recipe (run.Partial): Base pre-train recipe to which performance optimizations will be added Returns: run.Partial: Partial configuration for performance-optimized pre-training. - Examples: - CLI usage: - $ nemo llm pretrain --factory "llama3_70b.pretrain_recipe_performance(num_nodes=4, name='perf_pretrain')" - - Python API usage: - >>> recipe = pretrain_recipe_performance(name="llama3_70b_perf", num_nodes=4) - >>> print(recipe) - Note: - Use this recipe with caution and only when you need maximum performance. + Use this method with caution and only when you need maximum performance. It may not be suitable for all hardware configurations or use cases. """ - recipe = pretrain_recipe(name=name, dir=dir, num_nodes=num_nodes, num_gpus_per_node=num_gpus_per_node, fn=fn) - # 'overlap_param_gather_with_optimizer_step' and 'align_param_gather' params are set automatically by MegatronCommOverlapCallback - # They are added here for user's knowledge - # overlap_param_gather_with_optimizer_step- If true, overlap param all-gather of first bucket with optimizer step. - # align_param_gather- If true, all PP stages launch param all-gathers simultaneously, else each PP stage launches independently as needed + # 'overlap_param_gather_with_optimizer_step' and 'align_param_gather' params are set automatically + # by MegatronCommOverlapCallback. They are added here for user's knowledge. + # overlap_param_gather_with_optimizer_step- Overlap param all-gather of first bucket with optimizer step. + # align_param_gather- If true, all PP stages launch param all-gathers simultaneously, else + # each PP stage launches independently as needed. 
recipe.trainer.callbacks.append( run.Config( diff --git a/nemo/collections/llm/recipes/llama3_8b.py b/nemo/collections/llm/recipes/llama3_8b.py index dd162ed29914..269eb7865dcf 100644 --- a/nemo/collections/llm/recipes/llama3_8b.py +++ b/nemo/collections/llm/recipes/llama3_8b.py @@ -143,7 +143,12 @@ def trainer( @run.cli.factory(target=pretrain, name=NAME) def pretrain_recipe( - dir: Optional[str] = None, name: str = "default", num_nodes: int = 1, num_gpus_per_node: int = 8, fn=pretrain + dir: Optional[str] = None, + name: str = "default", + num_nodes: int = 1, + num_gpus_per_node: int = 8, + performance_mode: bool = False, + fn: Callable = pretrain, ) -> run.Partial: """ Create a pre-training recipe for Llama3 8B model. @@ -156,6 +161,7 @@ def pretrain_recipe( name (str): Name of the pre-training run. num_nodes (int): Number of compute nodes to use. num_gpus_per_node (int): Number of GPUs per node. + performance_mode (bool): If true, enables optimizations for maximum performance. fn (Callable): The pre-training function to use. Returns: @@ -174,7 +180,7 @@ def pretrain_recipe( For more details on pre-training LLMs with NeMo, see the pre-training guide in the `examples/llm/pretrain/` directory. """ - return run.Partial( + recipe = run.Partial( fn, model=model(), trainer=trainer( @@ -188,44 +194,29 @@ def pretrain_recipe( resume=default_resume(), ) + if performance_mode: + recipe = pretrain_performance_optimizations(recipe) -@run.cli.factory(target=pretrain, name=NAME + "_performance") -def pretrain_recipe_performance( - dir: Optional[str] = None, - name: str = "default", - num_nodes: int = 1, - num_gpus_per_node: int = 8, - fn: Callable = pretrain, -) -> run.Partial: + return recipe + + +def pretrain_performance_optimizations(recipe: run.Partial) -> run.Partial: """ Create a performance-optimized pre-training recipe for Llama3 8B model. - This recipe enables performance optimizations that may not be suitable for all use cases. + This method enables performance optimizations that may not be suitable for all use cases. It builds upon the standard pre-training recipe and adds additional performance enhancements. Args: - dir (Optional[str]): Directory for saving logs and checkpoints. - name (str): Name of the pre-training run. - num_nodes (int): Number of compute nodes to use. - num_gpus_per_node (int): Number of GPUs per node. - fn (Callable): The pre-training function to use. + recipe (run.Partial): Base pre-train recipe to which performance optimizations will be added Returns: run.Partial: Partial configuration for performance-optimized pre-training. - Examples: - $ nemo llm pretrain --factory llama3_8b_optimized - - Python API usage: - >>> recipe = pretrain_recipe_performance(name="llama3_8b_perf", num_nodes=4) - >>> print(recipe) - Note: - Use this recipe with caution and only when you need maximum performance. + Use this method with caution and only when you need maximum performance. It may not be suitable for all hardware configurations or use cases. """ - recipe = pretrain_recipe(name=name, dir=dir, num_nodes=num_nodes, num_gpus_per_node=num_gpus_per_node, fn=fn) - recipe.trainer.callbacks.append( run.Config( MegatronCommOverlapCallback, diff --git a/nemo/collections/llm/recipes/mixtral_8x22b.py b/nemo/collections/llm/recipes/mixtral_8x22b.py index f023eae01440..1bfef9be5582 100644 --- a/nemo/collections/llm/recipes/mixtral_8x22b.py +++ b/nemo/collections/llm/recipes/mixtral_8x22b.py @@ -13,7 +13,7 @@ # limitations under the License. 
-from typing import Optional +from typing import Callable, Optional import nemo_run as run import pytorch_lightning as pl @@ -145,7 +145,12 @@ def trainer( @run.cli.factory(target=pretrain, name=NAME) def pretrain_recipe( - dir: Optional[str] = None, name: str = "default", num_nodes: int = 16, num_gpus_per_node: int = 8, fn=pretrain + dir: Optional[str] = None, + name: str = "default", + num_nodes: int = 16, + num_gpus_per_node: int = 8, + performance_mode: bool = False, + fn: Callable = pretrain, ) -> run.Partial: """ Create a pre-training recipe for Mixtral 8x22B model. @@ -158,6 +163,7 @@ def pretrain_recipe( name (str): Name of the pre-training run. num_nodes (int): Number of compute nodes to use. num_gpus_per_node (int): Number of GPUs per node. + performance_mode (bool): If true, enables optimizations for maximum performance. fn (Callable): The pre-training function to use. Returns: @@ -172,7 +178,7 @@ def pretrain_recipe( >>> recipe = pretrain_recipe(name="mixtral_pretrain", num_nodes=16) >>> print(recipe) """ - return run.Partial( + recipe = run.Partial( fn, model=model(), trainer=trainer( @@ -184,45 +190,35 @@ def pretrain_recipe( resume=default_resume(), ) + if performance_mode: + recipe = pretrain_performance_optimizations(recipe) -@run.cli.factory(target=pretrain, name=NAME + "_performance") -def pretrain_recipe_performance( - dir: Optional[str] = None, name: str = "default", num_nodes: int = 8, num_gpus_per_node: int = 8, fn=pretrain -) -> run.Partial: + return recipe + + +def pretrain_performance_optimizations(recipe: run.Partial) -> run.Partial: """ Create a performance-optimized pre-training recipe for Mixtral 8x22B model. - This recipe enables performance optimizations that may not be suitable for all use cases. + This method enables performance optimizations that may not be suitable for all use cases. It builds upon the standard pre-training recipe and adds additional performance enhancements. Args: - dir (Optional[str]): Directory for saving logs and checkpoints. - name (str): Name of the pre-training run. - num_nodes (int): Number of compute nodes to use. - num_gpus_per_node (int): Number of GPUs per node. - fn (Callable): The pre-training function to use. + recipe (run.Partial): Base pre-train recipe to which performance optimizations will be added Returns: run.Partial: Partial configuration for performance-optimized pre-training. - Examples: - CLI usage: - $ nemo llm pretrain --factory "mixtral_8x22b.pretrain_recipe_performance(num_nodes=8, name='perf_pretrain')" - - Python API usage: - >>> recipe = pretrain_recipe_performance(name="mixtral_8x22b_perf", num_nodes=8) - >>> print(recipe) - Note: - Use this recipe with caution and only when you need maximum performance. + Use this method with caution and only when you need maximum performance. It may not be suitable for all hardware configurations or use cases. """ - recipe = pretrain_recipe(name=name, dir=dir, num_nodes=num_nodes, num_gpus_per_node=num_gpus_per_node, fn=fn) - # 'overlap_param_gather_with_optimizer_step' and 'align_param_gather' params are set automatically by MegatronCommOverlapCallback - # They are added here for user's knowledge - # overlap_param_gather_with_optimizer_step- If true, overlap param all-gather of first bucket with optimizer step. 
- # align_param_gather- If true, all PP stages launch param all-gathers simultaneously, else each PP stage launches independently as needed + # 'overlap_param_gather_with_optimizer_step' and 'align_param_gather' params are set automatically + # by MegatronCommOverlapCallback. They are added here for user's knowledge. + # overlap_param_gather_with_optimizer_step- Overlap param all-gather of first bucket with optimizer step. + # align_param_gather- If true, all PP stages launch param all-gathers simultaneously, else + # each PP stage launches independently as needed. recipe.trainer.callbacks.extend( [ diff --git a/nemo/collections/llm/recipes/mixtral_8x7b.py b/nemo/collections/llm/recipes/mixtral_8x7b.py index e80be03e3217..8e39e73aab76 100644 --- a/nemo/collections/llm/recipes/mixtral_8x7b.py +++ b/nemo/collections/llm/recipes/mixtral_8x7b.py @@ -13,7 +13,7 @@ # limitations under the License. -from typing import Optional +from typing import Callable, Optional import nemo_run as run import pytorch_lightning as pl @@ -142,7 +142,12 @@ def trainer( @run.cli.factory(target=pretrain, name=NAME) def pretrain_recipe( - dir: Optional[str] = None, name: str = "default", num_nodes: int = 8, num_gpus_per_node: int = 8, fn=pretrain + dir: Optional[str] = None, + name: str = "default", + num_nodes: int = 8, + num_gpus_per_node: int = 8, + performance_mode: bool = False, + fn: Callable = pretrain, ) -> run.Partial: """ Create a pre-training recipe for Mixtral 8x7B model. @@ -155,6 +160,7 @@ def pretrain_recipe( name (str): Name of the pre-training run. num_nodes (int): Number of compute nodes to use. num_gpus_per_node (int): Number of GPUs per node. + performance_mode (bool): If true, enables optimizations for maximum performance. fn (Callable): The pre-training function to use. Returns: @@ -169,7 +175,7 @@ def pretrain_recipe( >>> recipe = pretrain_recipe(name="mixtral_8x7b_pretrain", num_nodes=8) >>> print(recipe) """ - return run.Partial( + recipe = run.Partial( fn, model=model(), trainer=trainer( @@ -181,45 +187,35 @@ def pretrain_recipe( resume=default_resume(), ) + if performance_mode: + recipe = pretrain_performance_optimizations(recipe) -@run.cli.factory(target=pretrain, name=NAME + "_performance") -def pretrain_recipe_performance( - dir: Optional[str] = None, name: str = "default", num_nodes: int = 8, num_gpus_per_node: int = 8, fn=pretrain -) -> run.Partial: + return recipe + + +def pretrain_performance_optimizations(recipe: run.Partial) -> run.Partial: """ Create a performance-optimized pre-training recipe for Mixtral 8x7B model. - This recipe enables performance optimizations that may not be suitable for all use cases. + This method enables performance optimizations that may not be suitable for all use cases. It builds upon the standard pre-training recipe and adds additional performance enhancements. Args: - dir (Optional[str]): Directory for saving logs and checkpoints. - name (str): Name of the pre-training run. - num_nodes (int): Number of compute nodes to use. - num_gpus_per_node (int): Number of GPUs per node. - fn (Callable): The pre-training function to use. + recipe (run.Partial): Base pre-train recipe to which performance optimizations will be added Returns: run.Partial: Partial configuration for performance-optimized pre-training. 
- Examples: - CLI usage: - $ nemo llm pretrain --factory "mixtral_8x3b.pretrain_recipe_performance(num_nodes=8, name='perf_pretrain')" - - Python API usage: - >>> recipe = pretrain_recipe_performance(name="mixtral_8x7b_perf", num_nodes=8) - >>> print(recipe) - Note: - Use this recipe with caution and only when you need maximum performance. + Use this method with caution and only when you need maximum performance. It may not be suitable for all hardware configurations or use cases. """ - recipe = pretrain_recipe(name=name, dir=dir, num_nodes=num_nodes, num_gpus_per_node=num_gpus_per_node, fn=fn) - # 'overlap_param_gather_with_optimizer_step' and 'align_param_gather' params are set automatically by MegatronCommOverlapCallback - # They are added here for user's knowledge - # overlap_param_gather_with_optimizer_step- If true, overlap param all-gather of first bucket with optimizer step. - # align_param_gather- If true, all PP stages launch param all-gathers simultaneously, else each PP stage launches independently as needed + # 'overlap_param_gather_with_optimizer_step' and 'align_param_gather' params are set automatically + # by MegatronCommOverlapCallback. They are added here for user's knowledge. + # overlap_param_gather_with_optimizer_step- Overlap param all-gather of first bucket with optimizer step. + # align_param_gather- If true, all PP stages launch param all-gathers simultaneously, else + # each PP stage launches independently as needed. recipe.trainer.callbacks.extend( [ diff --git a/nemo/collections/llm/recipes/nemotron3_8b.py b/nemo/collections/llm/recipes/nemotron3_8b.py index 928f0d177947..7dcebe17f872 100644 --- a/nemo/collections/llm/recipes/nemotron3_8b.py +++ b/nemo/collections/llm/recipes/nemotron3_8b.py @@ -83,6 +83,7 @@ def pretrain_recipe( constant_steps=0, min_lr=3.0e-5, max_lr=3e-4, + performance_mode: bool = False, # Training function fn=pretrain, ) -> run.Partial: @@ -118,6 +119,7 @@ def pretrain_recipe( constant_steps (int): Number of constant steps. min_lr (float): Minimum learning rate. max_lr (float): Maximum learning rate. + performance_mode (bool): If true, enables optimizations for maximum performance. fn (Callable): The pre-training function to use. Returns: @@ -135,7 +137,7 @@ def pretrain_recipe( Note: This recipe uses a mock dataset, look for the finetune examples to see how to change the dataset. """ - return run.Partial( + recipe = run.Partial( fn, model=model(), trainer=nemotron_trainer( @@ -174,43 +176,29 @@ def pretrain_recipe( resume=default_resume(), ) + if performance_mode: + recipe = pretrain_performance_optimizations(recipe) -@run.cli.factory(target=pretrain, name=NAME + "_performance") -def pretrain_recipe_performance( - dir: Optional[str] = None, - name: str = "default", - num_nodes: int = 1, - num_gpus_per_node: int = 8, - fn: Callable = pretrain, -) -> run.Partial: + return recipe + + +def pretrain_performance_optimizations(recipe: run.Partial) -> run.Partial: """ Create a performance-optimized pre-training recipe for Nemotron3 8B model. - This recipe enables performance optimizations that may not be suitable for all use cases. + This method enables performance optimizations that may not be suitable for all use cases. It builds upon the standard pre-training recipe and adds additional performance enhancements. Args: - dir (Optional[str]): Directory for saving logs and checkpoints. - name (str): Name of the pre-training run. - num_nodes (int): Number of compute nodes to use. - num_gpus_per_node (int): Number of GPUs per node. 
- fn (Callable): The pre-training function to use. + recipe (run.Partial): Base pre-train recipe to which performance optimizations will be added Returns: run.Partial: Partial configuration for performance-optimized pre-training. - Examples: - $ nemo llm pretrain --factory nemotron3_8b_optimized - - Python API usage: - >>> recipe = pretrain_recipe_performance(name="nemotron3_8b_perf", num_nodes=4) - >>> print(recipe) - Note: - Use this recipe with caution and only when you need maximum performance. + Use this method with caution and only when you need maximum performance. It may not be suitable for all hardware configurations or use cases. """ - recipe = pretrain_recipe(name=name, dir=dir, num_nodes=num_nodes, num_gpus_per_node=num_gpus_per_node, fn=fn) recipe.trainer.callbacks.append( run.Config( diff --git a/nemo/collections/llm/recipes/nemotron4_15b.py b/nemo/collections/llm/recipes/nemotron4_15b.py index 9f184a92d94b..16ae7b2b1e79 100644 --- a/nemo/collections/llm/recipes/nemotron4_15b.py +++ b/nemo/collections/llm/recipes/nemotron4_15b.py @@ -80,6 +80,7 @@ def pretrain_recipe( constant_steps=0, min_lr=4.5e-5, max_lr=4.5e-5, + performance_mode: bool = False, # Training function fn=pretrain, ) -> run.Partial: @@ -115,6 +116,7 @@ def pretrain_recipe( constant_steps (int): Number of constant steps. min_lr (float): Minimum learning rate. max_lr (float): Maximum learning rate. + performance_mode (bool): If true, enables optimizations for maximum performance. fn (Callable): The pre-training function to use. Returns: @@ -132,7 +134,7 @@ def pretrain_recipe( Note: This recipe uses a mock dataset, look for the finetune examples to see how to change the dataset. """ - return run.Partial( + recipe = run.Partial( fn, model=model(), trainer=nemotron_trainer( @@ -171,44 +173,29 @@ def pretrain_recipe( resume=default_resume(), ) + if performance_mode: + recipe = pretrain_performance_optimizations(recipe) -@run.cli.factory(target=pretrain, name=NAME + "_performance") -def pretrain_recipe_performance( - dir: Optional[str] = None, - name: str = "default", - num_nodes: int = 8, - num_gpus_per_node: int = 8, - fn: Callable = pretrain, -) -> run.Partial: + return recipe + + +def pretrain_performance_optimizations(recipe: run.Partial) -> run.Partial: """ Create a performance-optimized pre-training recipe for Nemotron4 15B model. - This recipe enables performance optimizations that may not be suitable for all use cases. + This method enables performance optimizations that may not be suitable for all use cases. It builds upon the standard pre-training recipe and adds additional performance enhancements. Args: - dir (Optional[str]): Directory for saving logs and checkpoints. - name (str): Name of the pre-training run. - num_nodes (int): Number of compute nodes to use. - num_gpus_per_node (int): Number of GPUs per node. - fn (Callable): The pre-training function to use. + recipe (run.Partial): Base pre-train recipe to which performance optimizations will be added Returns: run.Partial: Partial configuration for performance-optimized pre-training. - Examples: - $ nemo llm pretrain --factory nemotron4_15b_optimized - - Python API usage: - >>> recipe = pretrain_recipe_performance(name="nemotron4_15b_perf", num_nodes=4) - >>> print(recipe) - Note: - Use this recipe with caution and only when you need maximum performance. + Use this method with caution and only when you need maximum performance. It may not be suitable for all hardware configurations or use cases. 
""" - recipe = pretrain_recipe(name=name, dir=dir, num_nodes=num_nodes, num_gpus_per_node=num_gpus_per_node, fn=fn) - recipe.trainer.callbacks.append( run.Config( MegatronCommOverlapCallback, diff --git a/nemo/collections/llm/recipes/nemotron4_22b.py b/nemo/collections/llm/recipes/nemotron4_22b.py index 4fb697c006fc..a20afedfea56 100644 --- a/nemo/collections/llm/recipes/nemotron4_22b.py +++ b/nemo/collections/llm/recipes/nemotron4_22b.py @@ -80,6 +80,7 @@ def pretrain_recipe( constant_steps=0, min_lr=1e-5, max_lr=1e-4, + performance_mode: bool = False, # Training function fn=pretrain, ) -> run.Partial: @@ -115,6 +116,7 @@ def pretrain_recipe( constant_steps (int): Number of constant steps. min_lr (float): Minimum learning rate. max_lr (float): Maximum learning rate. + performance_mode (bool): If true, enables optimizations for maximum performance. fn (Callable): The pre-training function to use. Returns: @@ -132,7 +134,7 @@ def pretrain_recipe( Note: This recipe uses a mock dataset, look for the finetune examples to see how to change the dataset. """ - return run.Partial( + recipe = run.Partial( fn, model=model(), trainer=nemotron_trainer( @@ -171,48 +173,35 @@ def pretrain_recipe( resume=default_resume(), ) + if performance_mode: + recipe = pretrain_performance_optimizations(recipe) -@run.cli.factory(target=pretrain, name=NAME + "_performance") -def pretrain_recipe_performance( - dir: Optional[str] = None, - name: str = "default", - num_nodes: int = 8, - num_gpus_per_node: int = 8, - fn: Callable = pretrain, -) -> run.Partial: + return recipe + + +def pretrain_performance_optimizations(recipe: run.Partial) -> run.Partial: """ Create a performance-optimized pre-training recipe for Nemotron4 22B model. - This recipe enables performance optimizations that may not be suitable for all use cases. + This method enables performance optimizations that may not be suitable for all use cases. It builds upon the standard pre-training recipe and adds additional performance enhancements. Args: - dir (Optional[str]): Directory for saving logs and checkpoints. - name (str): Name of the pre-training run. - num_nodes (int): Number of compute nodes to use. - num_gpus_per_node (int): Number of GPUs per node. - fn (Callable): The pre-training function to use. + recipe (run.Partial): Base pre-train recipe to which performance optimizations will be added Returns: run.Partial: Partial configuration for performance-optimized pre-training. - Examples: - $ nemo llm pretrain --factory nemotron4_22b_optimized - - Python API usage: - >>> recipe = pretrain_recipe_performance(name="nemotron4_22b_perf", num_nodes=4) - >>> print(recipe) - Note: - Use this recipe with caution and only when you need maximum performance. + Use this method with caution and only when you need maximum performance. It may not be suitable for all hardware configurations or use cases. """ - recipe = pretrain_recipe(name=name, dir=dir, num_nodes=num_nodes, num_gpus_per_node=num_gpus_per_node, fn=fn) - # 'overlap_param_gather_with_optimizer_step' and 'align_param_gather' params are set automatically by MegatronCommOverlapCallback - # They are added here for user's knowledge - # overlap_param_gather_with_optimizer_step- If true, overlap param all-gather of first bucket with optimizer step. 
- # align_param_gather- If true, all PP stages launch param all-gathers simultaneously, else each PP stage launches independently as needed + # 'overlap_param_gather_with_optimizer_step' and 'align_param_gather' params are set automatically + # by MegatronCommOverlapCallback. They are added here for user's knowledge. + # overlap_param_gather_with_optimizer_step- Overlap param all-gather of first bucket with optimizer step. + # align_param_gather- If true, all PP stages launch param all-gathers simultaneously, else + # each PP stage launches independently as needed. recipe.trainer.callbacks.append( run.Config( diff --git a/nemo/collections/llm/recipes/nemotron4_340b.py b/nemo/collections/llm/recipes/nemotron4_340b.py index cc9c7995c9e4..8268b2a87791 100644 --- a/nemo/collections/llm/recipes/nemotron4_340b.py +++ b/nemo/collections/llm/recipes/nemotron4_340b.py @@ -83,6 +83,7 @@ def pretrain_recipe( constant_steps=0, min_lr=1.0e-5, max_lr=1.0e-4, + performance_mode: bool = False, # Training function fn=pretrain, ) -> run.Partial: @@ -118,6 +119,7 @@ def pretrain_recipe( constant_steps (int): Number of constant steps. min_lr (float): Minimum learning rate. max_lr (float): Maximum learning rate. + performance_mode (bool): If true, enables optimizations for maximum performance. fn (Callable): The pre-training function to use. Returns: @@ -135,7 +137,7 @@ def pretrain_recipe( Note: This recipe uses a mock dataset, look for the finetune examples to see how to change the dataset. """ - return run.Partial( + recipe = run.Partial( fn, model=model(), trainer=nemotron_trainer( @@ -174,48 +176,35 @@ def pretrain_recipe( resume=default_resume(), ) + if performance_mode: + recipe = pretrain_performance_optimizations(recipe) -@run.cli.factory(target=pretrain, name=NAME + "_performance") -def pretrain_recipe_performance( - dir: Optional[str] = None, - name: str = "default", - num_nodes: int = 16, - num_gpus_per_node: int = 8, - fn: Callable = pretrain, -) -> run.Partial: + return recipe + + +def pretrain_performance_optimizations(recipe: run.Partial) -> run.Partial: """ Create a performance-optimized pre-training recipe for Nemotron4 340B model. - This recipe enables performance optimizations that may not be suitable for all use cases. + This method enables performance optimizations that may not be suitable for all use cases. It builds upon the standard pre-training recipe and adds additional performance enhancements. Args: - dir (Optional[str]): Directory for saving logs and checkpoints. - name (str): Name of the pre-training run. - num_nodes (int): Number of compute nodes to use. - num_gpus_per_node (int): Number of GPUs per node. - fn (Callable): The pre-training function to use. + recipe (run.Partial): Base pre-train recipe to which performance optimizations will be added Returns: run.Partial: Partial configuration for performance-optimized pre-training. - Examples: - $ nemo llm pretrain --factory nemotron4_340b_optimized - - Python API usage: - >>> recipe = pretrain_recipe_performance(name="nemotron4_340b_perf", num_nodes=16) - >>> print(recipe) - Note: - Use this recipe with caution and only when you need maximum performance. + Use this method with caution and only when you need maximum performance. It may not be suitable for all hardware configurations or use cases. 
""" - recipe = pretrain_recipe(name=name, dir=dir, num_nodes=num_nodes, num_gpus_per_node=num_gpus_per_node, fn=fn) - # 'overlap_param_gather_with_optimizer_step' and 'align_param_gather' params are set automatically by MegatronCommOverlapCallback - # They are added here for user's knowledge - # overlap_param_gather_with_optimizer_step- If true, overlap param all-gather of first bucket with optimizer step. - # align_param_gather- If true, all PP stages launch param all-gathers simultaneously, else each PP stage launches independently as needed + # 'overlap_param_gather_with_optimizer_step' and 'align_param_gather' params are set automatically + # by MegatronCommOverlapCallback. They are added here for user's knowledge. + # overlap_param_gather_with_optimizer_step- Overlap param all-gather of first bucket with optimizer step. + # align_param_gather- If true, all PP stages launch param all-gathers simultaneously, else + # each PP stage launches independently as needed. recipe.trainer.callbacks.append( run.Config( diff --git a/tests/collections/llm/recipes/test_llama3_70b.py b/tests/collections/llm/recipes/test_llama3_70b.py index cc77ec921de7..d47b674b7b70 100644 --- a/tests/collections/llm/recipes/test_llama3_70b.py +++ b/tests/collections/llm/recipes/test_llama3_70b.py @@ -79,10 +79,8 @@ def test_pretrain_recipe_with_different_configurations(self, recipe_module, num_ assert recipe.trainer.num_nodes == num_nodes assert recipe.trainer.devices == num_gpus_per_node - def test_pretrain_recipe_performance(self, recipe_module): - recipe = recipe_module.pretrain_recipe_performance( - name="test_perf", dir="/tmp", num_nodes=4, num_gpus_per_node=8 - ) + def test_pretrain_performance_optimizations(self, recipe_module): + recipe = recipe_module.pretrain_recipe(performance_mode=True) assert any( isinstance(cb, run.Config) and cb.__fn_or_cls__ == MegatronCommOverlapCallback for cb in recipe.trainer.callbacks diff --git a/tests/collections/llm/recipes/test_llama3_8b.py b/tests/collections/llm/recipes/test_llama3_8b.py index df4f05eec2ae..88fab6d6325a 100644 --- a/tests/collections/llm/recipes/test_llama3_8b.py +++ b/tests/collections/llm/recipes/test_llama3_8b.py @@ -90,10 +90,8 @@ def test_pretrain_recipe_with_different_configurations(self, recipe_module, num_ assert recipe.trainer.num_nodes == num_nodes assert recipe.trainer.devices == num_gpus_per_node - def test_pretrain_recipe_performance(self, recipe_module): - recipe = recipe_module.pretrain_recipe_performance( - name="test_perf", dir="/tmp", num_nodes=1, num_gpus_per_node=8 - ) + def test_pretrain_performance_optimizations(self, recipe_module): + recipe = recipe_module.pretrain_recipe(performance_mode=True) assert any(cb.__fn_or_cls__.__name__ == "MegatronCommOverlapCallback" for cb in recipe.trainer.callbacks) def test_trainer_parallelism_options(self, recipe_module): From 47f2446a01128e783e5cc9ac8b2058a081c7474f Mon Sep 17 00:00:00 2001 From: Mingyuan Ma <111467530+Victor49152@users.noreply.github.com> Date: Mon, 21 Oct 2024 17:24:02 -0700 Subject: [PATCH 31/37] Add flux inference pipeline (#10752) * Vae added and matched flux checkpoint Signed-off-by: mingyuanm * Flux model added. 
Signed-off-by: mingyuanm * Copying FlowMatchEulerScheduler over Signed-off-by: mingyuanm * WIP: Start to test the pipeline forward pass Signed-off-by: mingyuanm * Vae added and matched flux checkpoint Signed-off-by: mingyuanm * Inference pipeline runs with offloading function Signed-off-by: mingyuanm * Start to test image generation Signed-off-by: mingyuanm * Decoding with VAE part has been verified. Still need to check the denoising loop. Signed-off-by: mingyuanm * The inference pipeline is verified. Signed-off-by: mingyuanm * Add arg parsers and refactoring Signed-off-by: mingyuanm * Tested on multi batch sizes and prompts. Signed-off-by: mingyuanm * Add headers Signed-off-by: mingyuanm * Apply isort and black reformatting Signed-off-by: Victor49152 * Renaming Signed-off-by: mingyuanm * Move shceduler to sampler folder Signed-off-by: mingyuanm * Merging folders. Signed-off-by: mingyuanm * Apply isort and black reformatting Signed-off-by: Victor49152 * Tested after path changing. Signed-off-by: mingyuanm * Apply isort and black reformatting Signed-off-by: Victor49152 * Move MMDIT block to NeMo Signed-off-by: mingyuanm * Apply isort and black reformatting Signed-off-by: Victor49152 * Add joint attention and single attention to NeMo Signed-off-by: mingyuanm * Apply isort and black reformatting Signed-off-by: Victor49152 * Joint attention updated Signed-off-by: mingyuanm * Apply isort and black reformatting Signed-off-by: Victor49152 * Remove redundant importing Signed-off-by: mingyuanm * Refactor to inherit megatron module Signed-off-by: mingyuanm * Apply isort and black reformatting Signed-off-by: Victor49152 --------- Signed-off-by: mingyuanm Signed-off-by: Victor49152 Co-authored-by: Victor49152 --- .../diffusion/encoders/__init__.py | 13 + .../diffusion/encoders/conditioner.py | 199 ++++++++ nemo/collections/diffusion/flux_infer.py | 113 +++++ .../diffusion/models/dit/dit_attention.py | 428 ++++++++++++++++++ .../diffusion/models/dit/dit_layer_spec.py | 357 ++++++++++++++- .../diffusion/models/flux/__init__.py | 13 + .../diffusion/models/flux/layers.py | 173 +++++++ .../diffusion/models/flux/model.py | 156 +++++++ .../diffusion/models/flux/pipeline.py | 342 ++++++++++++++ .../sampler/flow_matching/__init__.py | 13 + .../flow_match_euler_discrete.py | 284 ++++++++++++ nemo/collections/diffusion/utils/__init__.py | 13 + .../diffusion/utils/flux_ckpt_converter.py | 206 +++++++++ .../diffusion/utils/flux_pipeline_utils.py | 76 ++++ .../diffusion/utils/mcore_parallel_utils.py | 80 ++++ nemo/collections/diffusion/vae/autoencoder.py | 334 ++++++++++++++ nemo/collections/diffusion/vae/blocks.py | 180 ++++++++ 17 files changed, 2971 insertions(+), 9 deletions(-) create mode 100644 nemo/collections/diffusion/encoders/__init__.py create mode 100644 nemo/collections/diffusion/encoders/conditioner.py create mode 100644 nemo/collections/diffusion/flux_infer.py create mode 100644 nemo/collections/diffusion/models/dit/dit_attention.py create mode 100644 nemo/collections/diffusion/models/flux/__init__.py create mode 100644 nemo/collections/diffusion/models/flux/layers.py create mode 100644 nemo/collections/diffusion/models/flux/model.py create mode 100644 nemo/collections/diffusion/models/flux/pipeline.py create mode 100644 nemo/collections/diffusion/sampler/flow_matching/__init__.py create mode 100644 nemo/collections/diffusion/sampler/flow_matching/flow_match_euler_discrete.py create mode 100644 nemo/collections/diffusion/utils/__init__.py create mode 100644 
nemo/collections/diffusion/utils/flux_ckpt_converter.py create mode 100644 nemo/collections/diffusion/utils/flux_pipeline_utils.py create mode 100644 nemo/collections/diffusion/utils/mcore_parallel_utils.py create mode 100644 nemo/collections/diffusion/vae/autoencoder.py create mode 100644 nemo/collections/diffusion/vae/blocks.py diff --git a/nemo/collections/diffusion/encoders/__init__.py b/nemo/collections/diffusion/encoders/__init__.py new file mode 100644 index 000000000000..9e3250071955 --- /dev/null +++ b/nemo/collections/diffusion/encoders/__init__.py @@ -0,0 +1,13 @@ +# Copyright (c) 2020, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. diff --git a/nemo/collections/diffusion/encoders/conditioner.py b/nemo/collections/diffusion/encoders/conditioner.py new file mode 100644 index 000000000000..2bfb008c5d84 --- /dev/null +++ b/nemo/collections/diffusion/encoders/conditioner.py @@ -0,0 +1,199 @@ +# Copyright (c) 2020, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
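# A minimal usage sketch for the frozen text encoders defined below, assuming a CUDA
# device and access to the default Hugging Face checkpoints (output shapes in the
# comments are indicative only):
#
#     clip = FrozenCLIPEmbedder(version="openai/clip-vit-large-patch14", device="cuda",
#                               always_return_pooled=True)
#     t5 = FrozenT5Embedder(version="google/t5-v1_1-xxl", device="cuda")
#     prompts = ["A cat holding a sign that says hello world"]
#     clip_emb, pooled = clip(prompts)               # [batch, padded_seq, hidden], [batch, hidden]
#     t5_emb = t5(prompts, max_sequence_length=256)  # [batch, 256, hidden]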
+ +from typing import Union + +import torch +import torch.nn as nn +from transformers import CLIPTextModel, CLIPTokenizer, T5EncoderModel, T5Tokenizer + + +class AbstractEmbModel(nn.Module): + def __init__(self, enable_lora_finetune=False, target_block=[], target_module=[]): + super().__init__() + self._is_trainable = None + self._ucg_rate = None + self._input_key = None + + self.TARGET_BLOCK = target_block + self.TARGET_MODULE = target_module + if enable_lora_finetune: + self.lora_layers = [] + + @property + def is_trainable(self) -> bool: + return self._is_trainable + + @property + def ucg_rate(self) -> Union[float, torch.Tensor]: + return self._ucg_rate + + @property + def input_key(self) -> str: + return self._input_key + + @is_trainable.setter + def is_trainable(self, value: bool): + self._is_trainable = value + + @ucg_rate.setter + def ucg_rate(self, value: Union[float, torch.Tensor]): + self._ucg_rate = value + + @input_key.setter + def input_key(self, value: str): + self._input_key = value + + @is_trainable.deleter + def is_trainable(self): + del self._is_trainable + + @ucg_rate.deleter + def ucg_rate(self): + del self._ucg_rate + + @input_key.deleter + def input_key(self): + del self._input_key + + def encode(self, *args, **kwargs): + raise NotImplementedError + + def _enable_lora(self, lora_model): + for module_name, module in lora_model.named_modules(): + if module.__class__.__name__ in self.TARGET_BLOCK: + tmp = {} + for sub_name, sub_module in module.named_modules(): + if sub_module.__class__.__name__ in self.TARGET_MODULE: + if hasattr(sub_module, "input_size") and hasattr( + sub_module, "output_size" + ): # for megatron ParallelLinear + lora = LoraWrapper(sub_module, sub_module.input_size, sub_module.output_size) + else: # for nn.Linear + lora = LoraWrapper(sub_module, sub_module.in_features, sub_module.out_features) + self.lora_layers.append(lora) + if sub_name not in tmp.keys(): + tmp.update({sub_name: lora}) + else: + print(f"Duplicate subnames are found in module {module_name}") + for sub_name, lora_layer in tmp.items(): + lora_name = f'{sub_name}_lora' + module.add_module(lora_name, lora_layer) + + +class FrozenCLIPEmbedder(AbstractEmbModel): + """Uses the CLIP transformer encoder for text (from Hugging Face)""" + + LAYERS = ["last", "pooled", "hidden"] + + def __init__( + self, + version="openai/clip-vit-large-patch14", + device="cuda", + max_length=77, + enable_lora_finetune=False, + layer="last", + layer_idx=None, + always_return_pooled=False, + dtype=torch.float, + ): + super().__init__(enable_lora_finetune, target_block=["CLIPAttention", "CLIPMLP"], target_module=["Linear"]) + self.tokenizer = CLIPTokenizer.from_pretrained("openai/clip-vit-large-patch14") + self.transformer = CLIPTextModel.from_pretrained(version, torch_dtype=dtype).to(device) + self.device = device + self.max_length = max_length + self.freeze() + if enable_lora_finetune: + self._enable_lora(self.transformer) + print(f"CLIP transformer encoder add {len(self.lora_layers)} lora layers.") + + self.layer = layer + self.layer_idx = layer_idx + self.return_pooled = always_return_pooled + if layer == "hidden": + assert layer_idx is not None + assert 0 <= abs(layer_idx) <= 12 + + def freeze(self): + self.transformer = self.transformer.eval() + for param in self.parameters(): + param.requires_grad = False + + def forward(self, text, max_sequence_length=None): + batch_encoding = self.tokenizer( + text, + truncation=True, + max_length=max_sequence_length if max_sequence_length else self.max_length, + 
return_length=True, + return_overflowing_tokens=False, + padding="max_length", + return_tensors="pt", + ) + tokens = batch_encoding["input_ids"].to(self.transformer.device, non_blocking=True) + outputs = self.transformer(input_ids=tokens, output_hidden_states=(self.layer == "hidden")) + + if self.layer == "last": + z = outputs.last_hidden_state + elif self.layer == "pooled": + z = outputs.pooler_output[:, None, :] + else: + z = outputs.hidden_states[self.layer_idx] + + # Pad the seq length to multiple of 8 + seq_len = (z.shape[1] + 8 - 1) // 8 * 8 + z = torch.nn.functional.pad(z, (0, 0, 0, seq_len - z.shape[1]), value=0.0) + if self.return_pooled: + return z, outputs.pooler_output + return z + + def encode(self, text): + return self(text) + + +class FrozenT5Embedder(AbstractEmbModel): + def __init__( + self, + version="google/t5-v1_1-xxl", + max_length=512, + device="cuda", + dtype=torch.float, + ): + super().__init__() + self.tokenizer = T5Tokenizer.from_pretrained("google/t5-v1_1-xxl", max_length=max_length) + self.transformer = T5EncoderModel.from_pretrained(version, torch_dtype=dtype).to(device) + self.max_length = max_length + self.freeze() + self.device = device + self.dtype = dtype + + def freeze(self): + self.transformer = self.transformer.eval() + for param in self.parameters(): + param.requires_grad = False + + def forward(self, text, max_sequence_length=None): + batch_encoding = self.tokenizer( + text, + truncation=True, + max_length=max_sequence_length if max_sequence_length else self.max_length, + return_length=False, + return_overflowing_tokens=False, + padding="max_length", + return_tensors="pt", + ) + + tokens = batch_encoding["input_ids"].to(self.transformer.device, non_blocking=True) + outputs = self.transformer(input_ids=tokens, output_hidden_states=None) + + return outputs.last_hidden_state diff --git a/nemo/collections/diffusion/flux_infer.py b/nemo/collections/diffusion/flux_infer.py new file mode 100644 index 000000000000..f914dbf50258 --- /dev/null +++ b/nemo/collections/diffusion/flux_infer.py @@ -0,0 +1,113 @@ +# Copyright (c) 2020, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
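# A typical single-GPU invocation sketch for this script (checkpoint paths are
# placeholders; the pipeline below initializes a 1x1x1 model-parallel layout):
#
#     python nemo/collections/diffusion/flux_infer.py \
#         --flux_ckpt /ckpts/flux/transformer \
#         --vae_ckpt /ckpts/ae.safetensors \
#         --clip_version /ckpts/text_encoder \
#         --t5_version /ckpts/text_encoder_2 \
#         --do_convert_from_hf --bf16 --offload \
#         --height 1024 --width 1024 --inference_steps 30 \
#         --prompts "A cat holding a sign that says hello world"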
+
+import argparse
+
+import torch
+
+from nemo.collections.diffusion.models.flux.pipeline import FluxInferencePipeline
+from nemo.collections.diffusion.utils.flux_pipeline_utils import configs
+from nemo.collections.diffusion.utils.mcore_parallel_utils import Utils
+
+
+def parse_args():
+    parser = argparse.ArgumentParser(
+        description="The Flux inference pipeline uses the Megatron Core transformer.\nPlease prepare the necessary Flux checkpoints on local disk before using this script."
+    )
+
+    parser.add_argument("--flux_ckpt", type=str, default="", help="Path to Flux transformer checkpoint(s)")
+    parser.add_argument("--vae_ckpt", type=str, default="/ckpts/ae.safetensors", help="Path to \'ae.safetensors\'")
+    parser.add_argument(
+        "--clip_version",
+        type=str,
+        default='/ckpts/text_encoder',
+        help="CLIP version; provide either a checkpoint dir or a CLIP version like openai/clip-vit-large-patch14",
+    )
+    parser.add_argument(
+        "--t5_version",
+        type=str,
+        default='/ckpts/text_encoder_2',
+        help="T5 version; provide either a checkpoint dir or a T5 version like google/t5-v1_1-xxl",
+    )
+    parser.add_argument(
+        "--do_convert_from_hf",
+        action='store_true',
+        default=False,
+        help="Must be set if the provided checkpoint has not already been converted to the NeMo format",
+    )
+    parser.add_argument(
+        "--save_converted_model",
+        action="store_true",
+        default=False,
+        help="Whether to save the converted NeMo transformer checkpoint for Flux",
+    )
+    parser.add_argument(
+        "--version",
+        type=str,
+        default='dev',
+        choices=['dev', 'schnell'],
+        help="Must align with the checkpoint provided.",
+    )
+    parser.add_argument("--height", type=int, default=1024, help="Image height.")
+    parser.add_argument("--width", type=int, default=1024, help="Image width.")
+    parser.add_argument("--inference_steps", type=int, default=10, help="Number of inference steps to run.")
+    parser.add_argument(
+        "--num_images_per_prompt", type=int, default=1, help="Number of images to generate for each prompt."
+    )
+    parser.add_argument("--guidance", type=float, default=0.0, help="Guidance scale.")
+    parser.add_argument(
+        "--offload", action='store_true', default=False, help="Offload modules to CPU after they are called."
+ ) + parser.add_argument( + "--prompts", + type=str, + default="A cat holding a sign that says hello world", + help="Inference prompts, use \',\' to separate if multiple prompts are provided.", + ) + parser.add_argument("--bf16", action='store_true', default=False, help="Use bf16 in inference.") + args = parser.parse_args() + return args + + +if __name__ == '__main__': + args = parse_args() + print('Initializing model parallel config') + Utils.initialize_distributed(1, 1, 1) + + print('Initializing flux inference pipeline') + params = configs[args.version] + params.vae_params.ckpt = args.vae_ckpt + params.clip_params['version'] = args.clip_version + params.t5_params['version'] = args.t5_version + pipe = FluxInferencePipeline(params) + + print('Loading transformer weights') + pipe.load_from_pretrained( + args.flux_ckpt, + do_convert_from_hf=args.do_convert_from_hf, + save_converted_model=args.save_converted_model, + ) + dtype = torch.bfloat16 if args.bf16 else torch.float32 + text = args.prompts.split(',') + pipe( + text, + max_sequence_length=256, + height=args.height, + width=args.width, + num_inference_steps=args.inference_steps, + num_images_per_prompt=args.num_images_per_prompt, + offload=args.offload, + guidance_scale=args.guidance, + dtype=dtype, + ) diff --git a/nemo/collections/diffusion/models/dit/dit_attention.py b/nemo/collections/diffusion/models/dit/dit_attention.py new file mode 100644 index 000000000000..9e60b11dd1c6 --- /dev/null +++ b/nemo/collections/diffusion/models/dit/dit_attention.py @@ -0,0 +1,428 @@ +# Copyright (c) 2023, NVIDIA CORPORATION. All rights reserved. +from dataclasses import dataclass +from typing import Union + +import torch +from megatron.core.models.common.embeddings.rotary_pos_embedding import apply_rotary_pos_emb +from megatron.core.transformer.attention import Attention, SelfAttention +from megatron.core.transformer.custom_layers.transformer_engine import SplitAlongDim +from megatron.core.transformer.enums import AttnMaskType +from megatron.core.transformer.spec_utils import ModuleSpec, build_module +from megatron.core.transformer.transformer_config import TransformerConfig + + +@dataclass +class JointSelfAttentionSubmodules: + linear_qkv: Union[ModuleSpec, type] = None + added_linear_qkv: Union[ModuleSpec, type] = None + core_attention: Union[ModuleSpec, type] = None + linear_proj: Union[ModuleSpec, type] = None + q_layernorm: Union[ModuleSpec, type] = None + k_layernorm: Union[ModuleSpec, type] = None + added_q_layernorm: Union[ModuleSpec, type] = None + added_k_layernorm: Union[ModuleSpec, type] = None + + +class JointSelfAttention(Attention): + """Joint Self-attention layer class + + Used for MMDIT-like transformer block. 
+ """ + + def __init__( + self, + config: TransformerConfig, + submodules: JointSelfAttentionSubmodules, + layer_number: int, + attn_mask_type=AttnMaskType.padding, + context_pre_only: bool = False, + ): + super().__init__( + config=config, + submodules=submodules, + layer_number=layer_number, + attn_mask_type=attn_mask_type, + attention_type="self", + ) + + self.linear_qkv = build_module( + submodules.linear_qkv, + self.config.hidden_size, + self.query_projection_size + 2 * self.kv_projection_size, + config=self.config, + init_method=self.config.init_method, + gather_output=False, + bias=self.config.add_bias_linear or self.config.add_qkv_bias, + skip_bias_add=False, + is_expert=False, + tp_comm_buffer_name='qkv', + ) + + if submodules.added_linear_qkv is not None: + self.added_linear_qkv = build_module( + submodules.added_linear_qkv, + self.config.hidden_size, + self.query_projection_size + 2 * self.kv_projection_size, + config=self.config, + init_method=self.config.init_method, + gather_output=False, + bias=self.config.add_qkv_bias, + skip_bias_add=False, + is_expert=False, + tp_comm_buffer_name='qkv', + ) + + if not context_pre_only: + self.added_linear_proj = build_module( + submodules.linear_proj, + self.query_projection_size, + self.config.hidden_size, + config=self.config, + init_method=self.config.output_layer_init_method, + bias=self.config.add_bias_linear, + input_is_parallel=True, + skip_bias_add=True, + is_expert=False, + tp_comm_buffer_name='proj', + ) + + if submodules.q_layernorm is not None: + self.q_layernorm = build_module( + submodules.q_layernorm, + hidden_size=self.hidden_size_per_attention_head, + config=self.config, + eps=self.config.layernorm_epsilon, + ) + else: + self.q_layernorm = None + + if submodules.k_layernorm is not None: + self.k_layernorm = build_module( + submodules.k_layernorm, + hidden_size=self.hidden_size_per_attention_head, + config=self.config, + eps=self.config.layernorm_epsilon, + ) + else: + self.k_layernorm = None + + if submodules.added_q_layernorm is not None: + self.added_q_layernorm = build_module( + submodules.added_q_layernorm, + hidden_size=self.hidden_size_per_attention_head, + config=self.config, + eps=self.config.layernorm_epsilon, + ) + else: + self.added_q_layernorm = None + + if submodules.added_k_layernorm is not None: + self.added_k_layernorm = build_module( + submodules.added_k_layernorm, + hidden_size=self.hidden_size_per_attention_head, + config=self.config, + eps=self.config.layernorm_epsilon, + ) + else: + self.added_k_layernorm = None + + def _split_qkv(self, mixed_qkv): + # [sq, b, hp] --> [sq, b, ng, (np/ng + 2) * hn] + new_tensor_shape = mixed_qkv.size()[:-1] + ( + self.num_query_groups_per_partition, + ( + (self.num_attention_heads_per_partition // self.num_query_groups_per_partition + 2) + * self.hidden_size_per_attention_head + ), + ) + mixed_qkv = mixed_qkv.view(*new_tensor_shape) + + split_arg_list = [ + ( + self.num_attention_heads_per_partition + // self.num_query_groups_per_partition + * self.hidden_size_per_attention_head + ), + self.hidden_size_per_attention_head, + self.hidden_size_per_attention_head, + ] + + if SplitAlongDim is not None: + + # [sq, b, ng, (np/ng + 2) * hn] --> [sq, b, ng, np/ng * hn], [sq, b, ng, hn], [sq, b, ng, hn] + (query, key, value) = SplitAlongDim( + mixed_qkv, + 3, + split_arg_list, + ) + else: + + # [sq, b, ng, (np/ng + 2) * hn] --> [sq, b, ng, np/ng * hn], [sq, b, ng, hn], [sq, b, ng, hn] + (query, key, value) = torch.split( + mixed_qkv, + split_arg_list, + dim=3, + ) + + # [sq, b, 
ng, np/ng * hn] -> [sq, b, np, hn] + query = query.reshape(query.size(0), query.size(1), -1, self.hidden_size_per_attention_head) + return query, key, value + + def get_query_key_value_tensors(self, hidden_states, key_value_states=None): + """ + Derives `query`, `key` and `value` tensors from `hidden_states`. + """ + # Attention heads [sq, b, h] --> [sq, b, ng * (np/ng + 2) * hn)] + mixed_qkv, _ = self.linear_qkv(hidden_states) + + query, key, value = self._split_qkv(mixed_qkv) + + if self.config.test_mode: + self.run_realtime_tests() + + if self.q_layernorm is not None: + query = self.q_layernorm(query) + + if self.k_layernorm is not None: + key = self.k_layernorm(key) + + return query, key, value + + def get_added_query_key_value_tensors(self, added_hidden_states, key_value_states=None): + """ + Derives `query`, `key` and `value` tensors from `hidden_states`. + """ + # Attention heads [sq, b, h] --> [sq, b, ng * (np/ng + 2) * hn)] + mixed_qkv, _ = self.added_linear_qkv(added_hidden_states) + + query, key, value = self._split_qkv(mixed_qkv) + + if self.config.test_mode: + self.run_realtime_tests() + + if self.added_q_layernorm is not None: + query = self.added_q_layernorm(query) + + if self.added_k_layernorm is not None: + key = self.added_k_layernorm(key) + + return query, key, value + + def forward( + self, + hidden_states, + attention_mask, + key_value_states=None, + inference_params=None, + rotary_pos_emb=None, + packed_seq_params=None, + additional_hidden_states=None, + ): + # hidden_states: [sq, b, h] + + # For self attention we just duplicate the rotary_pos_emb if it isn't already + if rotary_pos_emb is not None and not isinstance(rotary_pos_emb, tuple): + rotary_pos_emb = (rotary_pos_emb,) * 2 + + # ===================== + # Query, Key, and Value + # ===================== + # Get the query, key and value tensors based on the type of attention - + # self or cross attn. + + query, key, value = self.get_query_key_value_tensors(hidden_states) + added_query, added_key, added_value = self.get_added_query_key_value_tensors(additional_hidden_states) + + query = torch.cat([added_query, query], dim=0) + key = torch.cat([added_key, key], dim=0) + value = torch.cat([added_value, value], dim=0) + + # =================================================== + # Adjust key, value, and rotary_pos_emb for inference + # =================================================== + key, value, rotary_pos_emb, attn_mask_type = self._adjust_key_value_for_inference( + inference_params, key, value, rotary_pos_emb + ) + + if packed_seq_params is not None: + query = query.squeeze(1) + key = key.squeeze(1) + value = value.squeeze(1) + + # ================================================ + # relative positional embedding (rotary embedding) + # ================================================ + if rotary_pos_emb is not None: + q_pos_emb, k_pos_emb = rotary_pos_emb + + if packed_seq_params is not None: + cu_seqlens_q = packed_seq_params.cu_seqlens_q + cu_seqlens_kv = packed_seq_params.cu_seqlens_kv + else: + cu_seqlens_q = cu_seqlens_kv = None + query = apply_rotary_pos_emb( + query, + q_pos_emb, + config=self.config, + cu_seqlens=cu_seqlens_q, + ) + key = apply_rotary_pos_emb( + key, + k_pos_emb, + config=self.config, + cu_seqlens=cu_seqlens_kv, + ) + + # TODO, can apply positional embedding to value_layer so it has + # absolute positional embedding. 
+ # otherwise, only relative positional embedding takes effect + # value_layer = apply_rotary_pos_emb(value_layer, k_pos_emb) + + # ================================== + # core attention computation + # ================================== + if self.checkpoint_core_attention and self.training: + core_attn_out = self._checkpointed_attention_forward( + query, + key, + value, + attention_mask, + attn_mask_type=attn_mask_type, + packed_seq_params=packed_seq_params, + ) + else: + core_attn_out = self.core_attention( + query, + key, + value, + attention_mask, + attn_mask_type=attn_mask_type, + packed_seq_params=packed_seq_params, + ) + + if packed_seq_params is not None: + # reshape to same output shape as unpacked case + # (t, np, hn) -> (t, b=1, h=np*hn) + # t is the pack size = sum (sq_i) + # note that batch is a dummy dimension in the packed case + core_attn_out = core_attn_out.reshape(core_attn_out.size(0), 1, -1) + + # ================= + # Output. [sq, b, h] + # ================= + encoder_attention_output = core_attn_out[: additional_hidden_states.shape[0], :, :] + attention_output = core_attn_out[additional_hidden_states.shape[0] :, :, :] + + output, bias = self.linear_proj(attention_output) + encoder_output, encoder_bias = self.added_linear_proj(encoder_attention_output) + + output = output + bias + encoder_output = encoder_output + encoder_bias + + return output, encoder_output + + +class FluxSingleAttention(SelfAttention): + """Self-attention layer class + + Self-attention layer takes input with size [s, b, h] + and returns output of the same size. + """ + + def forward( + self, + hidden_states, + attention_mask, + key_value_states=None, + inference_params=None, + rotary_pos_emb=None, + packed_seq_params=None, + ): + # hidden_states: [sq, b, h] + + # For self attention we just duplicate the rotary_pos_emb if it isn't already + if rotary_pos_emb is not None and not isinstance(rotary_pos_emb, tuple): + rotary_pos_emb = (rotary_pos_emb,) * 2 + + # ===================== + # Query, Key, and Value + # ===================== + # Get the query, key and value tensors based on the type of attention - + # self or cross attn. 
+ query, key, value = self.get_query_key_value_tensors(hidden_states, key_value_states) + # print(f'megatron q before ln: {query.transpose(0, 1).contiguous()}, {query.transpose(0, 1).contiguous().shape}') + # print(f'megatron k before ln: {key.transpose(0, 1).contiguous()}, {key.transpose(0, 1).contiguous().shape}') + # print(f'megatron v before ln: {value.transpose(0, 1).contiguous()}, {value.transpose(0, 1).contiguous().shape}') + + # =================================================== + # Adjust key, value, and rotary_pos_emb for inference + # =================================================== + key, value, rotary_pos_emb, attn_mask_type = self._adjust_key_value_for_inference( + inference_params, key, value, rotary_pos_emb + ) + + if packed_seq_params is not None: + query = query.squeeze(1) + key = key.squeeze(1) + value = value.squeeze(1) + + # ================================================ + # relative positional embedding (rotary embedding) + # ================================================ + if rotary_pos_emb is not None: + q_pos_emb, k_pos_emb = rotary_pos_emb + + if packed_seq_params is not None: + cu_seqlens_q = packed_seq_params.cu_seqlens_q + cu_seqlens_kv = packed_seq_params.cu_seqlens_kv + else: + cu_seqlens_q = cu_seqlens_kv = None + query = apply_rotary_pos_emb( + query, + q_pos_emb, + config=self.config, + cu_seqlens=cu_seqlens_q, + ) + key = apply_rotary_pos_emb( + key, + k_pos_emb, + config=self.config, + cu_seqlens=cu_seqlens_kv, + ) + + # TODO, can apply positional embedding to value_layer so it has + # absolute positional embedding. + # otherwise, only relative positional embedding takes effect + # value_layer = apply_rotary_pos_emb(value_layer, k_pos_emb) + + # ================================== + # core attention computation + # ================================== + + if self.checkpoint_core_attention and self.training: + core_attn_out = self._checkpointed_attention_forward( + query, + key, + value, + attention_mask, + attn_mask_type=attn_mask_type, + packed_seq_params=packed_seq_params, + ) + else: + core_attn_out = self.core_attention( + query, + key, + value, + attention_mask, + attn_mask_type=attn_mask_type, + packed_seq_params=packed_seq_params, + ) + + if packed_seq_params is not None: + # reshape to same output shape as unpacked case + # (t, np, hn) -> (t, b=1, h=np*hn) + # t is the pack size = sum (sq_i) + # note that batch is a dummy dimension in the packed case + core_attn_out = core_attn_out.reshape(core_attn_out.size(0), 1, -1) + + return core_attn_out diff --git a/nemo/collections/diffusion/models/dit/dit_layer_spec.py b/nemo/collections/diffusion/models/dit/dit_layer_spec.py index 672dcff3ba00..cb7c520493f0 100644 --- a/nemo/collections/diffusion/models/dit/dit_layer_spec.py +++ b/nemo/collections/diffusion/models/dit/dit_layer_spec.py @@ -42,6 +42,12 @@ from megatron.core.transformer.transformer_layer import TransformerLayer, TransformerLayerSubmodules from megatron.core.utils import make_viewless_tensor +from nemo.collections.diffusion.models.dit.dit_attention import ( + FluxSingleAttention, + JointSelfAttention, + JointSelfAttentionSubmodules, +) + @dataclass class DiTWithAdaLNSubmodules(TransformerLayerSubmodules): @@ -75,7 +81,14 @@ class AdaLN(MegatronModule): Adaptive Layer Normalization Module for DiT. 
""" - def __init__(self, config: TransformerConfig, n_adaln_chunks=9, norm=nn.LayerNorm): + def __init__( + self, + config: TransformerConfig, + n_adaln_chunks=9, + norm=nn.LayerNorm, + modulation_bias=False, + use_second_norm=False, + ): super().__init__(config) if norm == TENorm: self.ln = norm(config, config.hidden_size, config.layernorm_epsilon) @@ -83,8 +96,11 @@ def __init__(self, config: TransformerConfig, n_adaln_chunks=9, norm=nn.LayerNor self.ln = norm(config.hidden_size, elementwise_affine=False, eps=self.config.layernorm_epsilon) self.n_adaln_chunks = n_adaln_chunks self.adaLN_modulation = nn.Sequential( - nn.SiLU(), nn.Linear(config.hidden_size, self.n_adaln_chunks * config.hidden_size, bias=False) + nn.SiLU(), nn.Linear(config.hidden_size, self.n_adaln_chunks * config.hidden_size, bias=modulation_bias) ) + self.use_second_norm = use_second_norm + if self.use_second_norm: + self.ln2 = nn.LayerNorm(config.hidden_size, elementwise_affine=False, eps=1e-6) nn.init.constant_(self.adaLN_modulation[-1].weight, 0) setattr(self.adaLN_modulation[-1].weight, "sequence_parallel", config.sequence_parallel) @@ -92,29 +108,59 @@ def __init__(self, config: TransformerConfig, n_adaln_chunks=9, norm=nn.LayerNor def forward(self, timestep_emb): return self.adaLN_modulation(timestep_emb).chunk(self.n_adaln_chunks, dim=-1) - @jit_fuser + # @jit_fuser def modulate(self, x, shift, scale): return x * (1 + scale) + shift - @jit_fuser + # @jit_fuser def scale_add(self, residual, x, gate): return residual + gate * x - @jit_fuser - def modulated_layernorm(self, x, shift, scale): + # @jit_fuser + def modulated_layernorm(self, x, shift, scale, layernorm_idx=0): + if self.use_second_norm and layernorm_idx == 1: + layernorm = self.ln2 + else: + layernorm = self.ln # Optional Input Layer norm - input_layernorm_output = self.ln(x).type_as(x) + input_layernorm_output = layernorm(x).type_as(x) # DiT block specific return self.modulate(input_layernorm_output, shift, scale) # @jit_fuser - def scaled_modulated_layernorm(self, residual, x, gate, shift, scale): + def scaled_modulated_layernorm(self, residual, x, gate, shift, scale, layernorm_idx=0): hidden_states = self.scale_add(residual, x, gate) - shifted_pre_mlp_layernorm_output = self.modulated_layernorm(hidden_states, shift, scale) + shifted_pre_mlp_layernorm_output = self.modulated_layernorm(hidden_states, shift, scale, layernorm_idx) return hidden_states, shifted_pre_mlp_layernorm_output +class AdaLNContinuous(MegatronModule): + def __init__( + self, + config: TransformerConfig, + conditioning_embedding_dim: int, + modulation_bias: bool = True, + norm_type: str = "layer_norm", + ): + super().__init__(config) + self.adaLN_modulation = nn.Sequential( + nn.SiLU(), nn.Linear(conditioning_embedding_dim, config.hidden_size * 2, bias=modulation_bias) + ) + if norm_type == "layer_norm": + self.norm = nn.LayerNorm(config.hidden_size, elementwise_affine=False, eps=1e-6, bias=modulation_bias) + elif norm_type == "rms_norm": + self.norm = RMSNorm(config.hidden_size, eps=1e-6) + else: + raise ValueError("Unknown normalization type {}".format(norm_type)) + + def forward(self, x: torch.Tensor, conditioning_embedding: torch.Tensor) -> torch.Tensor: + emb = self.adaLN_modulation(conditioning_embedding) + scale, shift = torch.chunk(emb, 2, dim=1) + x = self.norm(x) * (1 + scale) + shift + return x + + class STDiTLayerWithAdaLN(TransformerLayer): """A single transformer layer. 
@@ -407,6 +453,225 @@ def forward( return output, context +class DiTLayer(TransformerLayer): + """A single transformer layer. + + Transformer layer takes input with size [s, b, h] and returns an + output of the same size. + + Original DiT layer implementation from [https://arxiv.org/pdf/2212.09748]. + """ + + def __init__( + self, + config: TransformerConfig, + submodules: TransformerLayerSubmodules, + layer_number: int = 1, + mlp_ratio: int = 4, + n_adaln_chunks: int = 6, + modulation_bias: bool = True, + ): + # Modify the mlp layer hidden_size of a dit layer according to mlp_ratio + config.ffn_hidden_size = int(mlp_ratio * config.hidden_size) + super().__init__(config=config, submodules=submodules, layer_number=layer_number) + + self.adaLN = AdaLN( + config=config, n_adaln_chunks=n_adaln_chunks, modulation_bias=modulation_bias, use_second_norm=True + ) + + def forward( + self, + hidden_states, + attention_mask, + context=None, + context_mask=None, + rotary_pos_emb=None, + inference_params=None, + packed_seq_params=None, + ): + # passing in conditioning information via attention mask here + c = attention_mask + + shift_msa, scale_msa, gate_msa, shift_mlp, scale_mlp, gate_mlp = self.adaLN(c) + + shifted_input_layernorm_output = self.adaLN.modulated_layernorm( + hidden_states, shift=shift_msa, scale=scale_msa, layernorm_idx=0 + ) + + x, bias = self.self_attention(shifted_input_layernorm_output, attention_mask=None) + + hidden_states = self.adaLN.scale_add(hidden_states, x=(x + bias), gate=gate_msa) + + residual = hidden_states + + shited_pre_mlp_layernorm_output = self.adaLN.modulated_layernorm( + hidden_states, shift=shift_mlp, scale=scale_mlp, layernorm_idx=1 + ) + + x, bias = self.mlp(shited_pre_mlp_layernorm_output) + + hidden_states = self.adaLN.scale_add(residual, x=(x + bias), gate=gate_mlp) + + return hidden_states, context + + +class MMDiTLayer(TransformerLayer): + """A single transformer layer. + + Transformer layer takes input with size [s, b, h] and returns an + output of the same size. + + MMDiT layer implementation from [https://arxiv.org/pdf/2403.03206]. + """ + + def __init__( + self, + config: TransformerConfig, + submodules: TransformerLayerSubmodules, + layer_number: int = 1, + context_pre_only: bool = False, + ): + + hidden_size = config.hidden_size + super().__init__(config=config, submodules=submodules, layer_number=layer_number) + + self.adaln = AdaLN(config, modulation_bias=True, n_adaln_chunks=6, use_second_norm=True) + + self.context_pre_only = context_pre_only + context_norm_type = "ada_norm_continous" if context_pre_only else "ada_norm_zero" + + if context_norm_type == "ada_norm_continous": + self.adaln_context = AdaLNContinous(config, hidden_size, modulation_bias=True, norm_type="layer_norm") + elif context_norm_type == "ada_norm_zero": + self.adaln_context = AdaLN(config, modulation_bias=True, n_adaln_chunks=6, use_second_norm=True) + else: + raise ValueError( + f"Unknown context_norm_type: {context_norm_type}, currently only support `ada_norm_continous`, `ada_norm_zero`" + ) + # Override Cross Attention to disable CP. + # Disable TP Comm overlap as well. Not disabling will attempt re-use of buffer size same as Q and lead to incorrect tensor shapes. 
+ cp_override_config = copy.deepcopy(config) + cp_override_config.context_parallel_size = 1 + cp_override_config.tp_comm_overlap = False + + if not context_pre_only: + self.context_mlp = build_module( + submodules.mlp, + config=cp_override_config, + ) + else: + self.context_mlp = None + + def forward( + self, + hidden_states, + encoder_hidden_states, + attention_mask=None, + context=None, + context_mask=None, + rotary_pos_emb=None, + inference_params=None, + packed_seq_params=None, + emb=None, + ): + shift_msa, scale_msa, gate_msa, shift_mlp, scale_mlp, gate_mlp = self.adaln(emb) + + norm_hidden_states = self.adaln.modulated_layernorm( + hidden_states, shift=shift_msa, scale=scale_msa, layernorm_idx=0 + ) + if self.context_pre_only: + norm_encoder_hidden_states = self.adaln_context(encoder_hidden_states, emb) + else: + c_shift_msa, c_scale_msa, c_gate_msa, c_shift_mlp, c_scale_mlp, c_gate_mlp = self.adaln_context(emb) + norm_encoder_hidden_states = self.adaln_context.modulated_layernorm( + encoder_hidden_states, shift=c_shift_msa, scale=c_scale_msa, layernorm_idx=0 + ) + + attention_output, encoder_attention_output = self.self_attention( + norm_hidden_states, + attention_mask=attention_mask, + key_value_states=None, + additional_hidden_states=norm_encoder_hidden_states, + rotary_pos_emb=rotary_pos_emb, + ) + hidden_states = self.adaln.scale_add(hidden_states, x=attention_output, gate=gate_msa) + norm_hidden_states = self.adaln.modulated_layernorm( + hidden_states, shift=shift_mlp, scale=scale_mlp, layernorm_idx=1 + ) + + mlp_output, mlp_output_bias = self.mlp(norm_hidden_states) + hidden_states = self.adaln.scale_add(hidden_states, x=(mlp_output + mlp_output_bias), gate=gate_mlp) + + if self.context_pre_only: + encoder_hidden_states = None + else: + encoder_hidden_states = self.adaln_context.scale_add( + encoder_hidden_states, x=encoder_attention_output, gate=c_gate_msa + ) + norm_encoder_hidden_states = self.adaln_context.modulated_layernorm( + encoder_hidden_states, shift=c_shift_mlp, scale=c_scale_mlp, layernorm_idx=1 + ) + + context_mlp_output, context_mlp_output_bias = self.context_mlp(norm_encoder_hidden_states) + encoder_hidden_states = self.adaln.scale_add( + encoder_hidden_states, x=(context_mlp_output + context_mlp_output_bias), gate=c_gate_mlp + ) + + return hidden_states, encoder_hidden_states + + +class FluxSingleTransformerBlock(TransformerLayer): + def __init__( + self, + config: TransformerConfig, + submodules: TransformerLayerSubmodules, + layer_number: int = 1, + mlp_ratio: int = 4, + n_adaln_chunks: int = 3, + modulation_bias: bool = True, + ): + super().__init__(config=config, submodules=submodules, layer_number=layer_number) + hidden_size = config.hidden_size + self.adaln = AdaLN( + config=config, n_adaln_chunks=n_adaln_chunks, modulation_bias=modulation_bias, use_second_norm=False + ) + self.mlp_hidden_dim = int(hidden_size * mlp_ratio) + self.proj_in = nn.Linear(hidden_size, self.mlp_hidden_dim) + self.activation = nn.GELU(approximate="tanh") + self.proj_out = nn.Linear(hidden_size + self.mlp_hidden_dim, hidden_size) + + def forward( + self, + hidden_states, + attention_mask=None, + context=None, + context_mask=None, + rotary_pos_emb=None, + inference_params=None, + packed_seq_params=None, + emb=None, + ): + residual = hidden_states + + shift, scale, gate = self.adaln(emb) + + norm_hidden_states = self.adaln.modulated_layernorm(hidden_states, shift=shift, scale=scale) + + mlp_hidden_states = self.activation(self.proj_in(norm_hidden_states)) + + attention_output = 
self.self_attention( + norm_hidden_states, attention_mask=attention_mask, rotary_pos_emb=rotary_pos_emb + ) + + hidden_states = torch.cat((attention_output, mlp_hidden_states), dim=2) + + hidden_states = self.proj_out(hidden_states) + + hidden_states = self.adaln.scale_add(residual, x=hidden_states, gate=gate) + + return hidden_states + + def get_stdit_adaln_block_with_transformer_engine_spec() -> ModuleSpec: params = {"attn_mask_type": AttnMaskType.padding} return ModuleSpec( @@ -530,3 +795,77 @@ def get_official_dit_adaln_block_with_transformer_engine_spec() -> ModuleSpec: ), ), ) + + +def get_mm_dit_block_with_transformer_engine_spec() -> ModuleSpec: + + return ModuleSpec( + module=MMDiTLayer, + submodules=TransformerLayerSubmodules( + self_attention=ModuleSpec( + module=JointSelfAttention, + params={"attn_mask_type": AttnMaskType.no_mask}, + submodules=JointSelfAttentionSubmodules( + linear_qkv=TEColumnParallelLinear, + added_linear_qkv=TEColumnParallelLinear, + core_attention=TEDotProductAttention, + linear_proj=TERowParallelLinear, + ), + ), + mlp=ModuleSpec( + module=MLP, + submodules=MLPSubmodules( + linear_fc1=TEColumnParallelLinear, + linear_fc2=TERowParallelLinear, + ), + ), + ), + ) + + +def get_flux_single_transformer_engine_spec() -> ModuleSpec: + return ModuleSpec( + module=FluxSingleTransformerBlock, + submodules=TransformerLayerSubmodules( + self_attention=ModuleSpec( + module=FluxSingleAttention, + params={"attn_mask_type": AttnMaskType.no_mask}, + submodules=SelfAttentionSubmodules( + linear_qkv=TEColumnParallelLinear, + core_attention=TEDotProductAttention, + q_layernorm=RMSNorm, + k_layernorm=RMSNorm, + linear_proj=IdentityOp, + ), + ), + ), + ) + + +def get_flux_double_transformer_engine_spec() -> ModuleSpec: + return ModuleSpec( + module=MMDiTLayer, + submodules=TransformerLayerSubmodules( + self_attention=ModuleSpec( + module=JointSelfAttention, + params={"attn_mask_type": AttnMaskType.no_mask}, + submodules=JointSelfAttentionSubmodules( + q_layernorm=RMSNorm, + k_layernorm=RMSNorm, + added_q_layernorm=RMSNorm, + added_k_layernorm=RMSNorm, + linear_qkv=TEColumnParallelLinear, + added_linear_qkv=TEColumnParallelLinear, + core_attention=TEDotProductAttention, + linear_proj=TERowParallelLinear, + ), + ), + mlp=ModuleSpec( + module=MLP, + submodules=MLPSubmodules( + linear_fc1=TEColumnParallelLinear, + linear_fc2=TERowParallelLinear, + ), + ), + ), + ) diff --git a/nemo/collections/diffusion/models/flux/__init__.py b/nemo/collections/diffusion/models/flux/__init__.py new file mode 100644 index 000000000000..9e3250071955 --- /dev/null +++ b/nemo/collections/diffusion/models/flux/__init__.py @@ -0,0 +1,13 @@ +# Copyright (c) 2020, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
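For reference, the spec helpers defined above are consumed the same way the Flux model (added below in model.py) builds its blocks: the submodules of a ModuleSpec plus a TransformerConfig yield one double-stream (MMDiT) or single-stream block. The following is a rough sketch, not part of the patch; it assumes Transformer Engine is installed and Megatron model parallelism has been initialized (e.g. via Utils.initialize_distributed(1, 1, 1) as flux_infer.py does), and the config values mirror the Flux defaults:

from megatron.core.transformer.transformer_config import TransformerConfig

from nemo.collections.diffusion.models.dit.dit_layer_spec import (
    FluxSingleTransformerBlock,
    MMDiTLayer,
    get_flux_double_transformer_engine_spec,
    get_flux_single_transformer_engine_spec,
)

config = TransformerConfig(
    num_layers=1,
    hidden_size=3072,
    num_attention_heads=24,
    use_cpu_initialization=True,
    add_qkv_bias=True,
    layernorm_epsilon=1e-6,
    hidden_dropout=0,
    attention_dropout=0,
)

# One joint text/image block and one single-stream block, as stacked in the Flux model.
double_block = MMDiTLayer(
    config=config,
    submodules=get_flux_double_transformer_engine_spec().submodules,
    layer_number=0,
    context_pre_only=False,
)
single_block = FluxSingleTransformerBlock(
    config=config,
    submodules=get_flux_single_transformer_engine_spec().submodules,
    layer_number=0,
)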
diff --git a/nemo/collections/diffusion/models/flux/layers.py b/nemo/collections/diffusion/models/flux/layers.py new file mode 100644 index 000000000000..222a9a1d67ae --- /dev/null +++ b/nemo/collections/diffusion/models/flux/layers.py @@ -0,0 +1,173 @@ +# Copyright (c) 2020, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import math + +import torch +from torch import Tensor, nn + + +def rope(pos: torch.Tensor, dim: int, theta: int) -> torch.Tensor: + """ + Different from the original ROPE used for flux. + Megatron attention takes the out product and calculate sin/cos inside, so we only need to get the freqs here + in the shape of [seq, ..., dim] + """ + assert dim % 2 == 0, "The dimension must be even." + + scale = torch.arange(0, dim, 2, dtype=torch.float64, device=pos.device) / dim + omega = 1.0 / (theta**scale) + + out = torch.einsum("...n,d->...nd", pos, omega) + + return out.float() + + +class EmbedND(nn.Module): + def __init__(self, dim: int, theta: int, axes_dim: list[int]): + super().__init__() + self.dim = dim + self.theta = theta + self.axes_dim = axes_dim + + def forward(self, ids: torch.Tensor) -> torch.Tensor: + n_axes = ids.shape[-1] + emb = torch.cat( + [rope(ids[..., i], self.axes_dim[i], self.theta) for i in range(n_axes)], + dim=-1, + ) + emb = emb.unsqueeze(1).permute(2, 0, 1, 3) + return torch.stack([emb, emb], dim=-1).reshape(*emb.shape[:-1], -1) + + +class MLPEmbedder(nn.Module): + def __init__(self, in_dim: int, hidden_dim: int): + super().__init__() + self.in_layer = nn.Linear(in_dim, hidden_dim, bias=True) + self.silu = nn.SiLU() + self.out_layer = nn.Linear(hidden_dim, hidden_dim, bias=True) + + def forward(self, x: Tensor) -> Tensor: + return self.out_layer(self.silu(self.in_layer(x))) + + +def get_timestep_embedding( + timesteps: torch.Tensor, + embedding_dim: int, + flip_sin_to_cos: bool = True, + downscale_freq_shift: float = 0, + scale: float = 1, + max_period: int = 10000, +): + """ + This matches the implementation in Denoising Diffusion Probabilistic Models: Create sinusoidal timestep embeddings. + + Args + timesteps (torch.Tensor): + a 1-D Tensor of N indices, one per batch element. These may be fractional. + embedding_dim (int): + the dimension of the output. + flip_sin_to_cos (bool): + Whether the embedding order should be `cos, sin` (if True) or `sin, cos` (if False) + downscale_freq_shift (float): + Controls the delta between frequencies between dimensions + scale (float): + Scaling factor applied to the embeddings. + max_period (int): + Controls the maximum frequency of the embeddings + Returns + torch.Tensor: an [N x dim] Tensor of positional embeddings. 
+ """ + assert len(timesteps.shape) == 1, "Timesteps should be a 1d-array" + + half_dim = embedding_dim // 2 + exponent = -math.log(max_period) * torch.arange( + start=0, end=half_dim, dtype=torch.float32, device=timesteps.device + ) + exponent = exponent / (half_dim - downscale_freq_shift) + + emb = torch.exp(exponent) + emb = timesteps[:, None].float() * emb[None, :] + + # scale embeddings + emb = scale * emb + + # concat sine and cosine embeddings + emb = torch.cat([torch.sin(emb), torch.cos(emb)], dim=-1) + + # flip sine and cosine embeddings + if flip_sin_to_cos: + emb = torch.cat([emb[:, half_dim:], emb[:, :half_dim]], dim=-1) + + # zero pad + if embedding_dim % 2 == 1: + emb = torch.nn.functional.pad(emb, (0, 1, 0, 0)) + return emb + + +class Timesteps(nn.Module): + def __init__( + self, + embedding_dim: int, + flip_sin_to_cos: bool = True, + downscale_freq_shift: float = 0, + scale: float = 1, + max_period: int = 10000, + ): + super().__init__() + self.embedding_dim = embedding_dim + self.flip_sin_to_cos = flip_sin_to_cos + self.downscale_freq_shift = downscale_freq_shift + self.scale = scale + self.max_period = max_period + + def forward(self, timesteps: torch.Tensor) -> torch.Tensor: + t_emb = get_timestep_embedding( + timesteps, + self.embedding_dim, + flip_sin_to_cos=self.flip_sin_to_cos, + downscale_freq_shift=self.downscale_freq_shift, + scale=self.scale, + max_period=self.max_period, + ) + return t_emb + + +class TimeStepEmbedder(nn.Module): + def __init__( + self, + embedding_dim: int, + hidden_dim: int, + flip_sin_to_cos: bool = True, + downscale_freq_shift: float = 0, + scale: float = 1, + max_period: int = 10000, + ): + + super().__init__() + + self.time_proj = Timesteps( + embedding_dim=embedding_dim, + flip_sin_to_cos=flip_sin_to_cos, + downscale_freq_shift=downscale_freq_shift, + scale=scale, + max_period=max_period, + ) + self.time_embedder = MLPEmbedder(in_dim=embedding_dim, hidden_dim=hidden_dim) + + def forward(self, timesteps: torch.Tensor) -> torch.Tensor: + timesteps_proj = self.time_proj(timesteps) + timesteps_emb = self.time_embedder(timesteps_proj) + + return timesteps_emb diff --git a/nemo/collections/diffusion/models/flux/model.py b/nemo/collections/diffusion/models/flux/model.py new file mode 100644 index 000000000000..4d42c80a75a1 --- /dev/null +++ b/nemo/collections/diffusion/models/flux/model.py @@ -0,0 +1,156 @@ +# Copyright (c) 2020, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
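As a quick shape check on the helpers above (an illustrative sketch, not part of the patch; the dim, theta, and axes_dim values are the ones the Flux model below passes in), EmbedND turns the packed text-plus-image position ids into per-position rotary frequencies, and TimeStepEmbedder lifts scalar timesteps into the conditioning vector:

import torch

from nemo.collections.diffusion.models.flux.layers import EmbedND, TimeStepEmbedder

pos_embed = EmbedND(dim=3072, theta=10000, axes_dim=[16, 56, 56])
ids = torch.zeros(1, 256 + 4096, 3)        # concatenated text + image ids: [batch, seq, 3]
rotary_freqs = pos_embed(ids)              # [seq, batch, 1, sum(axes_dim)] = [4352, 1, 1, 128]

t_embedder = TimeStepEmbedder(embedding_dim=256, hidden_dim=3072)
t_emb = t_embedder(torch.tensor([500.0]))  # sinusoidal projection -> MLP: [1, 3072]

print(rotary_freqs.shape, t_emb.shape)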
+ +from dataclasses import dataclass +from typing import Callable + +import torch +from megatron.core.models.common.vision_module.vision_module import VisionModule +from megatron.core.transformer.transformer_config import TransformerConfig +from megatron.core.transformer.utils import openai_gelu +from torch import nn + +from nemo.collections.diffusion.models.dit.dit_layer_spec import ( + AdaLNContinuous, + FluxSingleTransformerBlock, + MMDiTLayer, + get_flux_double_transformer_engine_spec, + get_flux_single_transformer_engine_spec, +) +from nemo.collections.diffusion.models.flux.layers import EmbedND, MLPEmbedder, TimeStepEmbedder + + +@dataclass +class FluxParams: + num_joint_layers: int = 19 + num_single_layers: int = 38 + hidden_size: int = 3072 + num_attention_heads: int = 24 + activation_func: Callable = openai_gelu + add_qkv_bias: bool = True + ffn_hidden_size: int = 16384 + in_channels: int = 64 + context_dim: int = 4096 + model_channels: int = 256 + patch_size: int = 1 + guidance_embed: bool = False + vec_in_dim: int = 768 + + +class Flux(VisionModule): + def __init__(self, config: FluxParams): + + self.out_channels = config.in_channels + self.hidden_size = config.hidden_size + self.num_attention_heads = config.num_attention_heads + self.patch_size = config.patch_size + self.in_channels = config.in_channels + self.guidance_embed = config.guidance_embed + transformer_config = TransformerConfig( + num_layers=1, + hidden_size=self.hidden_size, + num_attention_heads=self.num_attention_heads, + use_cpu_initialization=True, + activation_func=config.activation_func, + hidden_dropout=0, + attention_dropout=0, + layernorm_epsilon=1e-6, + add_qkv_bias=config.add_qkv_bias, + rotary_interleaved=True, + ) + super().__init__(transformer_config) + + self.pos_embed = EmbedND(dim=self.hidden_size, theta=10000, axes_dim=[16, 56, 56]) + self.img_embed = nn.Linear(config.in_channels, self.hidden_size) + self.txt_embed = nn.Linear(config.context_dim, self.hidden_size) + self.timestep_embedding = TimeStepEmbedder(config.model_channels, self.hidden_size) + self.vector_embedding = MLPEmbedder(in_dim=config.vec_in_dim, hidden_dim=self.hidden_size) + if config.guidance_embed: + self.guidance_embedding = ( + MLPEmbedder(in_dim=config.model_channels, hidden_dim=self.hidden_size) + if config.guidance_embed + else nn.Identity() + ) + + self.double_blocks = nn.ModuleList( + [ + MMDiTLayer( + config=transformer_config, + submodules=get_flux_double_transformer_engine_spec().submodules, + layer_number=i, + context_pre_only=False, + ) + for i in range(config.num_joint_layers) + ] + ) + + self.single_blocks = nn.ModuleList( + [ + FluxSingleTransformerBlock( + config=transformer_config, + submodules=get_flux_single_transformer_engine_spec().submodules, + layer_number=i, + ) + for i in range(config.num_single_layers) + ] + ) + + self.norm_out = AdaLNContinuous(config=transformer_config, conditioning_embedding_dim=self.hidden_size) + self.proj_out = nn.Linear(self.hidden_size, self.patch_size * self.patch_size * self.out_channels, bias=True) + + def forward( + self, + img: torch.Tensor, + txt: torch.Tensor = None, + y: torch.Tensor = None, + timesteps: torch.LongTensor = None, + img_ids: torch.Tensor = None, + txt_ids: torch.Tensor = None, + guidance: torch.Tensor = None, + ): + hidden_states = self.img_embed(img) + encoder_hidden_states = self.txt_embed(txt) + + timesteps = timesteps.to(img.dtype) * 1000 + vec_emb = self.timestep_embedding(timesteps) + + if guidance is not None: + vec_emb = vec_emb + 
self.guidance_embedding(self.timestep_embedding.time_proj(guidance * 1000)) + vec_emb = vec_emb + self.vector_embedding(y) + + ids = torch.cat((txt_ids, img_ids), dim=1) + rotary_pos_emb = self.pos_embed(ids) + for id_block, block in enumerate(self.double_blocks): + hidden_states, encoder_hidden_states = block( + hidden_states=hidden_states, + encoder_hidden_states=encoder_hidden_states, + rotary_pos_emb=rotary_pos_emb, + emb=vec_emb, + ) + + hidden_states = torch.cat([encoder_hidden_states, hidden_states], dim=0) + + for id_block, block in enumerate(self.single_blocks): + hidden_states = block( + hidden_states=hidden_states, + rotary_pos_emb=rotary_pos_emb, + emb=vec_emb, + ) + + hidden_states = hidden_states[encoder_hidden_states.shape[0] :, ...] + + hidden_states = self.norm_out(hidden_states, vec_emb) + output = self.proj_out(hidden_states) + + return output diff --git a/nemo/collections/diffusion/models/flux/pipeline.py b/nemo/collections/diffusion/models/flux/pipeline.py new file mode 100644 index 000000000000..e460f8f115bd --- /dev/null +++ b/nemo/collections/diffusion/models/flux/pipeline.py @@ -0,0 +1,342 @@ +# Copyright (c) 2020, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import os +from typing import List, Optional, Union + +import numpy as np +import torch +from PIL import Image +from safetensors.torch import load_file as load_safetensors +from safetensors.torch import save_file as save_safetensors +from torch import nn +from tqdm import tqdm + +from nemo.collections.diffusion.encoders.conditioner import FrozenCLIPEmbedder, FrozenT5Embedder +from nemo.collections.diffusion.models.flux.model import Flux, FluxParams +from nemo.collections.diffusion.sampler.flow_matching.flow_match_euler_discrete import FlowMatchEulerDiscreteScheduler +from nemo.collections.diffusion.utils.flux_ckpt_converter import flux_transformer_converter +from nemo.collections.diffusion.utils.flux_pipeline_utils import FluxModelParams +from nemo.collections.diffusion.vae.autoencoder import AutoEncoder + + +class FluxInferencePipeline(nn.Module): + def __init__(self, params: FluxModelParams): + super().__init__() + self.device = params.device + params.clip_params['device'] = self.device + params.t5_params['device'] = self.device + + self.vae = AutoEncoder(params.vae_params).to(self.device).eval() + self.clip_encoder = FrozenCLIPEmbedder(**params.clip_params) + self.t5_encoder = FrozenT5Embedder(**params.t5_params) + self.transformer = Flux(params.flux_params).to(self.device).eval() + self.vae_scale_factor = 2 ** (len(self.vae.params.ch_mult)) + self.scheduler = FlowMatchEulerDiscreteScheduler(**params.scheduler_params) + self.params = params + + def load_from_pretrained(self, ckpt_path, do_convert_from_hf=True, save_converted_model=None): + if do_convert_from_hf: + ckpt = flux_transformer_converter(ckpt_path, self.transformer.config) + if save_converted_model: + save_path = os.path.join(ckpt_path, 'nemo_flux_transformer.safetensors') + save_safetensors(ckpt, 
save_path)
+            print(f'saving converted transformer checkpoint to {save_path}')
+        else:
+            ckpt = load_safetensors(ckpt_path)
+        missing, unexpected = self.transformer.load_state_dict(ckpt, strict=False)
+        missing = [
+            k for k in missing if not k.endswith('_extra_state')
+        ]  # These keys are mcore specific and should not affect the model performance
+        if len(missing) > 0:
+            print(
+                f"The following keys are missing during checkpoint loading; please check the provided ckpt, otherwise image quality may be compromised.\n {missing}"
+            )
+            print(f"Found unexpected keys: \n {unexpected}")
+
+    def encoder_prompt(
+        self,
+        prompt: Union[str, List[str]],
+        num_images_per_prompt: int = 1,
+        prompt_embeds: Optional[torch.FloatTensor] = None,
+        pooled_prompt_embeds: Optional[torch.FloatTensor] = None,
+        max_sequence_length: int = 512,
+        device: Optional[torch.device] = 'cuda',
+        dtype: Optional[torch.dtype] = torch.float,
+    ):
+        if prompt is not None:
+            batch_size = len(prompt)
+        elif prompt_embeds is not None:
+            batch_size = prompt_embeds.shape[0]
+        else:
+            raise ValueError("Either prompt or prompt_embeds must be provided.")
+        if device == 'cuda' and self.t5_encoder.device != device:
+            self.t5_encoder.to(device)
+        if prompt_embeds is None:
+            prompt_embeds = self.t5_encoder(prompt, max_sequence_length=max_sequence_length)
+        seq_len = prompt_embeds.shape[1]
+        prompt_embeds = prompt_embeds.repeat(1, num_images_per_prompt, 1)
+        prompt_embeds = prompt_embeds.view(batch_size * num_images_per_prompt, seq_len, -1).to(dtype=dtype)
+
+        if device == 'cuda' and self.clip_encoder.device != device:
+            self.clip_encoder.to(device)
+        if pooled_prompt_embeds is None:
+            _, pooled_prompt_embeds = self.clip_encoder(prompt)
+
+        pooled_prompt_embeds = pooled_prompt_embeds.repeat(1, num_images_per_prompt, 1)
+        pooled_prompt_embeds = pooled_prompt_embeds.view(batch_size * num_images_per_prompt, -1).to(dtype=dtype)
+
+        dtype = dtype if dtype is not None else self.t5_encoder.dtype
+        text_ids = torch.zeros(batch_size, prompt_embeds.shape[1], 3).to(device=device, dtype=dtype)
+        text_ids = text_ids.repeat(num_images_per_prompt, 1, 1)
+
+        return prompt_embeds.transpose(0, 1), pooled_prompt_embeds, text_ids
+
+    @staticmethod
+    def _prepare_latent_image_ids(batch_size: int, height: int, width: int, device: torch.device, dtype: torch.dtype):
+        latent_image_ids = torch.zeros(height // 2, width // 2, 3)
+        latent_image_ids[..., 1] = latent_image_ids[..., 1] + torch.arange(height // 2)[:, None]
+        latent_image_ids[..., 2] = latent_image_ids[..., 2] + torch.arange(width // 2)[None, :]
+
+        latent_image_id_height, latent_image_id_width, latent_image_id_channels = latent_image_ids.shape
+
+        latent_image_ids = latent_image_ids[None, :].repeat(batch_size, 1, 1, 1)
+        latent_image_ids = latent_image_ids.reshape(
+            batch_size, latent_image_id_height * latent_image_id_width, latent_image_id_channels
+        )
+
+        return latent_image_ids.to(device=device, dtype=dtype)
+
+    @staticmethod
+    def _pack_latents(latents, batch_size, num_channels_latents, height, width):
+        latents = latents.view(batch_size, num_channels_latents, height // 2, 2, width // 2, 2)
+        latents = latents.permute(0, 2, 4, 1, 3, 5)
+        latents = latents.reshape(batch_size, (height // 2) * (width // 2), num_channels_latents * 4)
+
+        return latents
+
+    @staticmethod
+    def _unpack_latents(latents, height, width, vae_scale_factor):
+        batch_size, num_patches, channels = latents.shape
+
+        height = height // vae_scale_factor
+        width = width // vae_scale_factor
+
+        latents = latents.view(batch_size,
height, width, channels // 4, 2, 2) + latents = latents.permute(0, 3, 1, 4, 2, 5) + + latents = latents.reshape(batch_size, channels // (2 * 2), height * 2, width * 2) + + return latents + + @staticmethod + def _calculate_shift( + image_seq_len, + base_seq_len: int = 256, + max_seq_len: int = 4096, + base_shift: float = 0.5, + max_shift: float = 1.16, + ): + m = (max_shift - base_shift) / (max_seq_len - base_seq_len) + b = base_shift - m * base_seq_len + mu = image_seq_len * m + b + return mu + + def prepare_latents( + self, + batch_size, + num_channels_latents, + height, + width, + dtype, + device, + generator, + latents=None, + ): + height = 2 * int(height) // self.vae_scale_factor + width = 2 * int(width) // self.vae_scale_factor + + shape = (batch_size, num_channels_latents, height, width) + + if latents is not None: + latent_image_ids = self._prepare_latent_image_ids(batch_size, height, width, device, dtype) + return latents.to(device=device, dtype=dtype), latent_image_ids + + if isinstance(generator, list) and len(generator) != batch_size: + raise ValueError( + f"You have passed a list of generators of length {len(generator)}, but requested an effective batch" + f" size of {batch_size}. Make sure the batch size matches the length of the generators." + ) + + latents = FluxInferencePipeline._generate_rand_latents(shape, generator=generator, device=device, dtype=dtype) + latents = self._pack_latents(latents, batch_size, num_channels_latents, height, width) + + latent_image_ids = self._prepare_latent_image_ids(batch_size, height, width, device, dtype) + + return latents.transpose(0, 1), latent_image_ids + + @staticmethod + def _generate_rand_latents( + shape, + generator, + device, + dtype, + ): + if isinstance(generator, list): + shape = (1,) + shape[1:] + latents = [ + torch.randn(shape, generator=generator[i], device=device, dtype=dtype, layout=layout) + for i in range(batch_size) + ] + latents = torch.cat(latents, dim=0).to(device=device) + else: + latents = torch.randn(shape, generator=generator, device=device, dtype=dtype) + + return latents + + @staticmethod + def numpy_to_pil(images): + """ + Convert a numpy image or a batch of images to a PIL image. + """ + if images.ndim == 3: + images = images[None, ...] 
+ images = (images * 255).round().astype("uint8") + pil_images = [Image.fromarray(image) for image in images] + + return pil_images + + @staticmethod + def torch_to_numpy(images): + numpy_images = images.float().cpu().permute(0, 2, 3, 1).numpy() + return numpy_images + + @staticmethod + def denormalize(image): + return (image / 2 + 0.5).clamp(0, 1) + + def __call__( + self, + prompt: Union[str, List[str]] = None, + height: Optional[int] = 512, + width: Optional[int] = 512, + num_inference_steps: int = 28, + timesteps: Optional[List[int]] = None, + guidance_scale: float = 7.0, + num_images_per_prompt: Optional[int] = 1, + generator: Optional[Union[torch.Generator, List[torch.Generator]]] = None, + latents: Optional[torch.FloatTensor] = None, + prompt_embeds: Optional[torch.FloatTensor] = None, + pooled_prompt_embeds: Optional[torch.FloatTensor] = None, + output_type: Optional[str] = "pil", + max_sequence_length: int = 512, + device: torch.device = 'cuda', + dtype: torch.dtype = torch.float32, + save_to_disk: bool = True, + offload: bool = True, + ): + assert device == 'cuda', 'Transformer blocks in Mcore must run on cuda devices' + + if prompt is not None and isinstance(prompt, str): + batch_size = 1 + prompt = [prompt] + elif prompt is not None and isinstance(prompt, list): + batch_size = len(prompt) + elif prompt_embeds is not None and isinstance(prompt_embeds, torch.FloatTensor): + batch_size = prompt_embeds.shape[0] + else: + raise ValueError("Either prompt or prompt_embeds must be provided.") + + ## get text prompt embeddings + prompt_embeds, pooled_prompt_embeds, text_ids = self.encoder_prompt( + prompt=prompt, + prompt_embeds=prompt_embeds, + pooled_prompt_embeds=pooled_prompt_embeds, + num_images_per_prompt=num_images_per_prompt, + max_sequence_length=max_sequence_length, + device=device, + dtype=dtype, + ) + if offload: + self.t5_encoder.to('cpu') + self.clip_encoder.to('cpu') + torch.cuda.empty_cache() + + ## prepare image latents + num_channels_latents = self.transformer.in_channels // 4 + latents, latent_image_ids = self.prepare_latents( + batch_size * num_images_per_prompt, num_channels_latents, height, width, dtype, device, generator, latents + ) + # prepare timesteps + sigmas = np.linspace(1.0, 1 / num_inference_steps, num_inference_steps) + image_seq_len = latents.shape[0] + + mu = FluxInferencePipeline._calculate_shift( + image_seq_len, + self.scheduler.base_image_seq_len, + self.scheduler.max_image_seq_len, + self.scheduler.base_shift, + self.scheduler.max_shift, + ) + + self.scheduler.set_timesteps(sigmas=sigmas, device=device, mu=mu) + timesteps = self.scheduler.timesteps + + if device == 'cuda' and device != self.device: + self.transformer.to(device) + with torch.no_grad(): + for i, t in tqdm(enumerate(timesteps)): + timestep = t.expand(latents.shape[1]).to(device=latents.device, dtype=latents.dtype) + if self.transformer.guidance_embed: + guidance = torch.tensor([guidance_scale], device=device).expand(latents.shape[1]) + else: + guidance = None + with torch.autocast(device_type='cuda', dtype=latents.dtype): + pred = self.transformer( + img=latents, + txt=prompt_embeds, + y=pooled_prompt_embeds, + timesteps=timestep / 1000, + img_ids=latent_image_ids, + txt_ids=text_ids, + guidance=guidance, + ) + latents = self.scheduler.step(pred, t, latents)[0] + if offload: + self.transformer.to('cpu') + torch.cuda.empty_cache() + + if output_type == "latent": + return latents.transpose(0, 1) + elif output_type == "pil": + latents = self._unpack_latents(latents.transpose(0, 1), 
height, width, self.vae_scale_factor) + latents = (latents / self.vae.params.scale_factor) + self.vae.params.shift_factor + if device == 'cuda' and device != self.device: + self.vae.to(device) + with torch.autocast(device_type='cuda', dtype=latents.dtype): + image = self.vae.decode(latents) + if offload: + self.vae.to('cpu') + torch.cuda.empty_cache() + image = FluxInferencePipeline.denormalize(image) + image = FluxInferencePipeline.torch_to_numpy(image) + image = FluxInferencePipeline.numpy_to_pil(image) + if save_to_disk: + print('Saving to disk') + assert len(image) == int(len(prompt) * num_images_per_prompt) + prompt = [p[:40] + f'_{idx}' for p in prompt for idx in range(num_images_per_prompt)] + for file_name, image in zip(prompt, image): + image.save(f'{file_name}.png') + + return image diff --git a/nemo/collections/diffusion/sampler/flow_matching/__init__.py b/nemo/collections/diffusion/sampler/flow_matching/__init__.py new file mode 100644 index 000000000000..d9155f923f18 --- /dev/null +++ b/nemo/collections/diffusion/sampler/flow_matching/__init__.py @@ -0,0 +1,13 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. diff --git a/nemo/collections/diffusion/sampler/flow_matching/flow_match_euler_discrete.py b/nemo/collections/diffusion/sampler/flow_matching/flow_match_euler_discrete.py new file mode 100644 index 000000000000..5bde6b0d1dc1 --- /dev/null +++ b/nemo/collections/diffusion/sampler/flow_matching/flow_match_euler_discrete.py @@ -0,0 +1,284 @@ +# Copyright 2024 Stability AI, Katherine Crowson and The HuggingFace Team. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import math +from abc import ABC +from typing import List, Optional, Tuple, Union + + +import numpy as np +import torch + + +class FlowMatchEulerDiscreteScheduler(ABC): + """ + Euler scheduler. + + Args: + num_train_timesteps (`int`, defaults to 1000): + The number of diffusion steps to train the model. + timestep_spacing (`str`, defaults to `"linspace"`): + The way the timesteps should be scaled. Refer to Table 2 of the [Common Diffusion Noise Schedules and + Sample Steps are Flawed](https://huggingface.co/papers/2305.08891) for more information. + shift (`float`, defaults to 1.0): + The shift value for the timestep schedule. 
+ """ + + _compatibles = [] + order = 1 + + def __init__( + self, + num_train_timesteps: int = 1000, + shift: float = 1.0, + use_dynamic_shifting=False, + base_shift: Optional[float] = 0.5, + max_shift: Optional[float] = 1.15, + base_image_seq_len: Optional[int] = 256, + max_image_seq_len: Optional[int] = 4096, + ): + timesteps = np.linspace(1, num_train_timesteps, num_train_timesteps, dtype=np.float32)[::-1].copy() + timesteps = torch.from_numpy(timesteps).to(dtype=torch.float32) + + sigmas = timesteps / num_train_timesteps + if not use_dynamic_shifting: + # when use_dynamic_shifting is True, we apply the timestep shifting on the fly based on the image resolution + sigmas = shift * sigmas / (1 + (shift - 1) * sigmas) + + self.timesteps = sigmas * num_train_timesteps + + self._step_index = None + self._begin_index = None + + self.sigmas = sigmas.to("cpu") # to avoid too much CPU/GPU communication + self.sigma_min = self.sigmas[-1].item() + self.sigma_max = self.sigmas[0].item() + + self.base_shift = base_shift + self.max_shift = max_shift + self.base_image_seq_len = base_image_seq_len + self.max_image_seq_len = max_image_seq_len + self.use_dynamic_shifting = use_dynamic_shifting + self.num_train_timesteps = num_train_timesteps + self.shift = shift + + @property + def step_index(self): + """ + The index counter for current timestep. It will increase 1 after each scheduler step. + """ + return self._step_index + + @property + def begin_index(self): + """ + The index for the first timestep. It should be set from pipeline with `set_begin_index` method. + """ + return self._begin_index + + # Copied from diffusers.schedulers.scheduling_dpmsolver_multistep.DPMSolverMultistepScheduler.set_begin_index + def set_begin_index(self, begin_index: int = 0): + """ + Sets the begin index for the scheduler. This function should be run from pipeline before the inference. + + Args: + begin_index (`int`): + The begin index for the scheduler. + """ + self._begin_index = begin_index + + def scale_noise( + self, + sample: torch.FloatTensor, + timestep: Union[float, torch.FloatTensor], + noise: Optional[torch.FloatTensor] = None, + ) -> torch.FloatTensor: + """ + Forward process in flow-matching + + Args: + sample (`torch.FloatTensor`): + The input sample. + timestep (`int`, *optional*): + The current timestep in the diffusion chain. + + Returns: + `torch.FloatTensor`: + A scaled input sample. 
+ """ + # Make sure sigmas and timesteps have the same device and dtype as original_samples + sigmas = self.sigmas.to(device=sample.device, dtype=sample.dtype) + + if sample.device.type == "mps" and torch.is_floating_point(timestep): + # mps does not support float64 + schedule_timesteps = self.timesteps.to(sample.device, dtype=torch.float32) + timestep = timestep.to(sample.device, dtype=torch.float32) + else: + schedule_timesteps = self.timesteps.to(sample.device) + timestep = timestep.to(sample.device) + + # self.begin_index is None when scheduler is used for training, or pipeline does not implement set_begin_index + if self.begin_index is None: + step_indices = [self.index_for_timestep(t, schedule_timesteps) for t in timestep] + elif self.step_index is not None: + # add_noise is called after first denoising step (for inpainting) + step_indices = [self.step_index] * timestep.shape[0] + else: + # add noise is called before first denoising step to create initial latent(img2img) + step_indices = [self.begin_index] * timestep.shape[0] + + sigma = sigmas[step_indices].flatten() + while len(sigma.shape) < len(sample.shape): + sigma = sigma.unsqueeze(-1) + + sample = sigma * noise + (1.0 - sigma) * sample + + return sample + + def _sigma_to_t(self, sigma): + return sigma * self.num_train_timesteps + + def time_shift(self, mu: float, sigma: float, t: torch.Tensor): + return math.exp(mu) / (math.exp(mu) + (1 / t - 1) ** sigma) + + def set_timesteps( + self, + num_inference_steps: int = None, + device: Union[str, torch.device] = None, + sigmas: Optional[List[float]] = None, + mu: Optional[float] = None, + ): + """ + Sets the discrete timesteps used for the diffusion chain (to be run before inference). + + Args: + num_inference_steps (`int`): + The number of diffusion steps used when generating samples with a pre-trained model. + device (`str` or `torch.device`, *optional*): + The device to which the timesteps should be moved to. If `None`, the timesteps are not moved. + """ + + if self.use_dynamic_shifting and mu is None: + raise ValueError(" you have a pass a value for `mu` when `use_dynamic_shifting` is set to be `True`") + + if sigmas is None: + self.num_inference_steps = num_inference_steps + timesteps = np.linspace( + self._sigma_to_t(self.sigma_max), self._sigma_to_t(self.sigma_min), num_inference_steps + ) + + sigmas = timesteps / self.num_train_timesteps + + if self.use_dynamic_shifting: + sigmas = self.time_shift(mu, 1.0, sigmas) + else: + sigmas = self.shift * sigmas / (1 + (self.shift - 1) * sigmas) + sigmas = torch.from_numpy(sigmas).to(dtype=torch.float32, device=device) + timesteps = sigmas * self.num_train_timesteps + + self.timesteps = timesteps.to(device=device) + self.sigmas = torch.cat([sigmas, torch.zeros(1, device=sigmas.device)]) + + self._step_index = None + self._begin_index = None + + def index_for_timestep(self, timestep, schedule_timesteps=None): + if schedule_timesteps is None: + schedule_timesteps = self.timesteps + + indices = (schedule_timesteps == timestep).nonzero() + + # The sigma index that is taken for the **very** first `step` + # is always the second index (or the last index if there is only 1) + # This way we can ensure we don't accidentally skip a sigma in + # case we start in the middle of the denoising schedule (e.g. 
for image-to-image) + pos = 1 if len(indices) > 1 else 0 + + return indices[pos].item() + + def _init_step_index(self, timestep): + if self.begin_index is None: + if isinstance(timestep, torch.Tensor): + timestep = timestep.to(self.timesteps.device) + self._step_index = self.index_for_timestep(timestep) + else: + self._step_index = self._begin_index + + def step( + self, + model_output: torch.FloatTensor, + timestep: Union[float, torch.FloatTensor], + sample: torch.FloatTensor, + s_churn: float = 0.0, + s_tmin: float = 0.0, + s_tmax: float = float("inf"), + s_noise: float = 1.0, + generator: Optional[torch.Generator] = None, + ) -> Tuple: + """ + Predict the sample from the previous timestep by reversing the SDE. This function propagates the diffusion + process from the learned model outputs (most often the predicted noise). + + Args: + model_output (`torch.FloatTensor`): + The direct output from learned diffusion model. + timestep (`float`): + The current discrete timestep in the diffusion chain. + sample (`torch.FloatTensor`): + A current instance of a sample created by the diffusion process. + s_churn (`float`): + s_tmin (`float`): + s_tmax (`float`): + s_noise (`float`, defaults to 1.0): + Scaling factor for noise added to the sample. + generator (`torch.Generator`, *optional*): + A random number generator. + + Returns: + A tuple is returned where the first element is the sample tensor. + """ + + if ( + isinstance(timestep, int) + or isinstance(timestep, torch.IntTensor) + or isinstance(timestep, torch.LongTensor) + ): + raise ValueError( + ( + "Passing integer indices (e.g. from `enumerate(timesteps)`) as timesteps to" + " `EulerDiscreteScheduler.step()` is not supported. Make sure to pass" + " one of the `scheduler.timesteps` as a timestep." + ), + ) + + if self.step_index is None: + self._init_step_index(timestep) + + # Upcast to avoid precision issues when computing prev_sample + sample = sample.to(torch.float32) + + sigma = self.sigmas[self.step_index] + sigma_next = self.sigmas[self.step_index + 1] + prev_sample = sample + (sigma_next - sigma) * model_output + + # Cast sample back to model compatible dtype + prev_sample = prev_sample.to(model_output.dtype) + + # upon completion increase step index by one + self._step_index += 1 + + return (prev_sample,) + + def __len__(self): + return self.num_train_timesteps diff --git a/nemo/collections/diffusion/utils/__init__.py b/nemo/collections/diffusion/utils/__init__.py new file mode 100644 index 000000000000..9e3250071955 --- /dev/null +++ b/nemo/collections/diffusion/utils/__init__.py @@ -0,0 +1,13 @@ +# Copyright (c) 2020, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. diff --git a/nemo/collections/diffusion/utils/flux_ckpt_converter.py b/nemo/collections/diffusion/utils/flux_ckpt_converter.py new file mode 100644 index 000000000000..444a77bfad68 --- /dev/null +++ b/nemo/collections/diffusion/utils/flux_ckpt_converter.py @@ -0,0 +1,206 @@ +# Copyright (c) 2020, NVIDIA CORPORATION. 
All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import os + +import torch +from safetensors.torch import load_file as load_safetensors + + +def _import_qkv_bias(transformer_config, qb, kb, vb): + + head_num = transformer_config.num_attention_heads + num_query_groups = transformer_config.num_query_groups + heads_per_group = head_num // num_query_groups + hidden_size = transformer_config.hidden_size + head_num = transformer_config.num_attention_heads + head_size = hidden_size // head_num + + new_q_bias_tensor_shape = (head_num, head_size) + new_kv_bias_tensor_shape = (num_query_groups, head_size) + + qb = qb.view(*new_q_bias_tensor_shape) + kb = kb.view(*new_kv_bias_tensor_shape) + vb = vb.view(*new_kv_bias_tensor_shape) + + qkv_bias_l = [] + for i in range(num_query_groups): + qkv_bias_l.append(qb[i * heads_per_group : (i + 1) * heads_per_group, :]) + qkv_bias_l.append(kb[i : i + 1, :]) + qkv_bias_l.append(vb[i : i + 1, :]) + + qkv_bias = torch.cat(qkv_bias_l) + qkv_bias = qkv_bias.reshape([head_size * (head_num + 2 * num_query_groups)]) + + return qkv_bias + + +def _import_qkv(transformer_config, q, k, v): + + head_num = transformer_config.num_attention_heads + num_query_groups = transformer_config.num_query_groups + heads_per_group = head_num // num_query_groups + hidden_size = transformer_config.hidden_size + head_num = transformer_config.num_attention_heads + head_size = hidden_size // head_num + + old_tensor_shape = q.size() + new_q_tensor_shape = (head_num, head_size) + old_tensor_shape[1:] + new_kv_tensor_shape = (num_query_groups, head_size) + old_tensor_shape[1:] + + q = q.view(*new_q_tensor_shape) + k = k.view(*new_kv_tensor_shape) + v = v.view(*new_kv_tensor_shape) + + qkv_weights_l = [] + for i in range(num_query_groups): + qkv_weights_l.append(q[i * heads_per_group : (i + 1) * heads_per_group, :, :]) + qkv_weights_l.append(k[i : i + 1, :, :]) + qkv_weights_l.append(v[i : i + 1, :, :]) + qkv_weights = torch.cat(qkv_weights_l) + assert qkv_weights.ndim == 3, qkv_weights.shape + assert qkv_weights.shape[0] == (heads_per_group + 2) * num_query_groups, qkv_weights.shape + assert qkv_weights.shape[1] == head_size, qkv_weights.shape + assert qkv_weights.shape[2] == old_tensor_shape[1], qkv_weights.shape + + qkv_weights = qkv_weights.reshape([head_size * (head_num + 2 * num_query_groups), hidden_size]) + + return qkv_weights + + +key_mapping = { + 'double_blocks': { + 'norm1.linear.weight': 'adaln.adaLN_modulation.1.weight', + 'norm1.linear.bias': 'adaln.adaLN_modulation.1.bias', + 'norm1_context.linear.weight': 'adaln_context.adaLN_modulation.1.weight', + 'norm1_context.linear.bias': 'adaln_context.adaLN_modulation.1.bias', + 'attn.norm_q.weight': 'self_attention.q_layernorm.weight', + 'attn.norm_k.weight': 'self_attention.k_layernorm.weight', + 'attn.norm_added_q.weight': 'self_attention.added_q_layernorm.weight', + 'attn.norm_added_k.weight': 'self_attention.added_k_layernorm.weight', + 'attn.to_out.0.weight': 'self_attention.linear_proj.weight', + 
'attn.to_out.0.bias': 'self_attention.linear_proj.bias', + 'attn.to_add_out.weight': 'self_attention.added_linear_proj.weight', + 'attn.to_add_out.bias': 'self_attention.added_linear_proj.bias', + 'ff.net.0.proj.weight': 'mlp.linear_fc1.weight', + 'ff.net.0.proj.bias': 'mlp.linear_fc1.bias', + 'ff.net.2.weight': 'mlp.linear_fc2.weight', + 'ff.net.2.bias': 'mlp.linear_fc2.bias', + 'ff_context.net.0.proj.weight': 'context_mlp.linear_fc1.weight', + 'ff_context.net.0.proj.bias': 'context_mlp.linear_fc1.bias', + 'ff_context.net.2.weight': 'context_mlp.linear_fc2.weight', + 'ff_context.net.2.bias': 'context_mlp.linear_fc2.bias', + }, + 'single_blocks': { + 'norm.linear.weight': 'adaln.adaLN_modulation.1.weight', + 'norm.linear.bias': 'adaln.adaLN_modulation.1.bias', + 'proj_mlp.weight': 'proj_in.weight', + 'proj_mlp.bias': 'proj_in.bias', + 'proj_out.weight': 'proj_out.weight', + 'proj_out.bias': 'proj_out.bias', + 'attn.norm_q.weight': 'self_attention.q_layernorm.weight', + 'attn.norm_k.weight': 'self_attention.k_layernorm.weight', + }, + 'norm_out.linear.bias': 'norm_out.adaLN_modulation.1.bias', + 'norm_out.linear.weight': 'norm_out.adaLN_modulation.1.weight', + 'proj_out.bias': 'proj_out.bias', + 'proj_out.weight': 'proj_out.weight', + 'time_text_embed.guidance_embedder.linear_1.bias': 'guidance_embedding.in_layer.bias', + 'time_text_embed.guidance_embedder.linear_1.weight': 'guidance_embedding.in_layer.weight', + 'time_text_embed.guidance_embedder.linear_2.bias': 'guidance_embedding.out_layer.bias', + 'time_text_embed.guidance_embedder.linear_2.weight': 'guidance_embedding.out_layer.weight', + 'x_embedder.bias': 'img_embed.bias', + 'x_embedder.weight': 'img_embed.weight', + 'time_text_embed.timestep_embedder.linear_1.bias': 'timestep_embedding.time_embedder.in_layer.bias', + 'time_text_embed.timestep_embedder.linear_1.weight': 'timestep_embedding.time_embedder.in_layer.weight', + 'time_text_embed.timestep_embedder.linear_2.bias': 'timestep_embedding.time_embedder.out_layer.bias', + 'time_text_embed.timestep_embedder.linear_2.weight': 'timestep_embedding.time_embedder.out_layer.weight', + 'context_embedder.bias': 'txt_embed.bias', + 'context_embedder.weight': 'txt_embed.weight', + 'time_text_embed.text_embedder.linear_1.bias': 'vector_embedding.in_layer.bias', + 'time_text_embed.text_embedder.linear_1.weight': 'vector_embedding.in_layer.weight', + 'time_text_embed.text_embedder.linear_2.bias': 'vector_embedding.out_layer.bias', + 'time_text_embed.text_embedder.linear_2.weight': 'vector_embedding.out_layer.weight', +} + + +def flux_transformer_converter(ckpt_path=None, transformer_config=None): + diffuser_state_dict = {} + if os.path.isdir(ckpt_path): + files = os.listdir(ckpt_path) + for file in files: + if file.endswith('.safetensors'): + loaded_dict = load_safetensors(os.path.join(ckpt_path, file)) + diffuser_state_dict.update(loaded_dict) + elif os.path.isfile(ckpt_path): + diffuser_state_dict = load_safetensors(ckpt_path) + else: + raise FileNotFoundError("Please provide a valid ckpt path.") + new_state_dict = {} + num_single_blocks = 0 + num_double_blocks = 0 + for key, value in diffuser_state_dict.items(): + if 'attn.to_q' in key or 'attn.to_k' in key or 'attn.to_v' in key: + continue + if 'attn.add_q_proj' in key or 'attn.add_k_proj' in key or 'attn.add_v_proj' in key: + continue + if key.startswith('transformer_blocks'): + temp = key.split('.') + idx, k = temp[1], '.'.join(temp[2:]) + num_double_blocks = max(int(idx), num_double_blocks) + new_key = '.'.join(['double_blocks', idx, 
key_mapping['double_blocks'][k]]) + elif key.startswith('single_transformer_blocks'): + temp = key.split('.') + idx, k = temp[1], '.'.join(temp[2:]) + num_single_blocks = max(int(idx), num_single_blocks) + new_key = '.'.join(['single_blocks', idx, key_mapping['single_blocks'][k]]) + else: + new_key = key_mapping[key] + new_state_dict[new_key] = value + + for i in range(num_double_blocks + 1): + new_key = f'double_blocks.{str(i)}.self_attention.linear_qkv.weight' + qk, kk, vk = [f'transformer_blocks.{str(i)}.attn.to_{n}.weight' for n in ('q', 'k', 'v')] + new_state_dict[new_key] = _import_qkv( + transformer_config, diffuser_state_dict[qk], diffuser_state_dict[kk], diffuser_state_dict[vk] + ) + new_key = f'double_blocks.{str(i)}.self_attention.linear_qkv.bias' + qk, kk, vk = [f'transformer_blocks.{str(i)}.attn.to_{n}.bias' for n in ('q', 'k', 'v')] + new_state_dict[new_key] = _import_qkv_bias( + transformer_config, diffuser_state_dict[qk], diffuser_state_dict[kk], diffuser_state_dict[vk] + ) + new_key = f'double_blocks.{str(i)}.self_attention.added_linear_qkv.weight' + qk, kk, vk = [f'transformer_blocks.{str(i)}.attn.add_{n}_proj.weight' for n in ('q', 'k', 'v')] + new_state_dict[new_key] = _import_qkv( + transformer_config, diffuser_state_dict[qk], diffuser_state_dict[kk], diffuser_state_dict[vk] + ) + new_key = f'double_blocks.{str(i)}.self_attention.added_linear_qkv.bias' + qk, kk, vk = [f'transformer_blocks.{str(i)}.attn.add_{n}_proj.bias' for n in ('q', 'k', 'v')] + new_state_dict[new_key] = _import_qkv_bias( + transformer_config, diffuser_state_dict[qk], diffuser_state_dict[kk], diffuser_state_dict[vk] + ) + + for i in range(num_single_blocks + 1): + new_key = f'single_blocks.{str(i)}.self_attention.linear_qkv.weight' + qk, kk, vk = [f'single_transformer_blocks.{str(i)}.attn.to_{n}.weight' for n in ('q', 'k', 'v')] + new_state_dict[new_key] = _import_qkv( + transformer_config, diffuser_state_dict[qk], diffuser_state_dict[kk], diffuser_state_dict[vk] + ) + new_key = f'single_blocks.{str(i)}.self_attention.linear_qkv.bias' + qk, kk, vk = [f'single_transformer_blocks.{str(i)}.attn.to_{n}.bias' for n in ('q', 'k', 'v')] + new_state_dict[new_key] = _import_qkv_bias( + transformer_config, diffuser_state_dict[qk], diffuser_state_dict[kk], diffuser_state_dict[vk] + ) + + return new_state_dict diff --git a/nemo/collections/diffusion/utils/flux_pipeline_utils.py b/nemo/collections/diffusion/utils/flux_pipeline_utils.py new file mode 100644 index 000000000000..77dcfa58450f --- /dev/null +++ b/nemo/collections/diffusion/utils/flux_pipeline_utils.py @@ -0,0 +1,76 @@ +# Copyright (c) 2020, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
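The converter above folds HuggingFace's separate `to_q`/`to_k`/`to_v` projections into Megatron's fused `linear_qkv` tensor, interleaving each query group's heads with its K and V head. A minimal shape check of that layout, assuming the converter module introduced in this patch is importable and using a stand-in config object in place of the real transformer config:

```python
# Toy check of the grouped-QKV fusion performed by _import_qkv: per query group,
# the fused weight stores `heads_per_group` query heads followed by one K and one V head.
from types import SimpleNamespace

import torch

from nemo.collections.diffusion.utils.flux_ckpt_converter import _import_qkv

cfg = SimpleNamespace(hidden_size=64, num_attention_heads=8, num_query_groups=8)  # stand-in config
head_size = cfg.hidden_size // cfg.num_attention_heads  # 8

q = torch.randn(cfg.hidden_size, cfg.hidden_size)                   # attn.to_q.weight
k = torch.randn(cfg.num_query_groups * head_size, cfg.hidden_size)  # attn.to_k.weight
v = torch.randn(cfg.num_query_groups * head_size, cfg.hidden_size)  # attn.to_v.weight

fused = _import_qkv(cfg, q, k, v)
print(fused.shape)  # head_size * (heads + 2 * groups) x hidden = torch.Size([192, 64])
```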
+ +from dataclasses import dataclass + +import torch +from megatron.core.transformer.utils import openai_gelu + +from nemo.collections.diffusion.models.flux.model import FluxParams +from nemo.collections.diffusion.vae.autoencoder import AutoEncoderParams + + +@dataclass +class FluxModelParams: + flux_params: FluxParams + vae_params: AutoEncoderParams + clip_params: dict | None + t5_params: dict | None + scheduler_params: dict | None + device: str | torch.device + + +configs = { + "dev": FluxModelParams( + flux_params=FluxParams( + num_joint_layers=19, + num_single_layers=38, + hidden_size=3072, + num_attention_heads=24, + activation_func=openai_gelu, + add_qkv_bias=True, + ffn_hidden_size=16384, + in_channels=64, + context_dim=4096, + model_channels=256, + patch_size=1, + guidance_embed=True, + vec_in_dim=768, + ), + vae_params=AutoEncoderParams( + ch_mult=[1, 2, 4, 4], + attn_resolutions=[], + resolution=256, + in_channels=3, + ch=128, + out_ch=3, + num_res_blocks=2, + z_channels=16, + scale_factor=0.3611, + shift_factor=0.1159, + ckpt=None, + ), + clip_params={ + 'max_length': 77, + 'always_return_pooled': True, + }, + t5_params={ + 'max_length': 512, + }, + scheduler_params={ + 'num_train_timesteps': 1000, + }, + device='cpu', + ) +} diff --git a/nemo/collections/diffusion/utils/mcore_parallel_utils.py b/nemo/collections/diffusion/utils/mcore_parallel_utils.py new file mode 100644 index 000000000000..0b9bdec97464 --- /dev/null +++ b/nemo/collections/diffusion/utils/mcore_parallel_utils.py @@ -0,0 +1,80 @@ +# Copyright (c) 2020, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
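A small sketch of how the `dev` preset above might be adjusted before building a pipeline; the module path comes from this patch and the overridden values are placeholders:

```python
# Copy the preset so the shared module-level config stays untouched, then override fields.
import copy

import torch

from nemo.collections.diffusion.utils.flux_pipeline_utils import configs

params = copy.deepcopy(configs["dev"])
params.device = "cuda" if torch.cuda.is_available() else "cpu"
params.vae_params.ckpt = None  # placeholder: set a path to a Flux VAE .safetensors to load weights
params.t5_params["max_length"] = 256  # a smaller prompt budget reduces T5 activation memory
```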
+ +""" +Megatron Model Parallel Initialization +""" + +import os + +import megatron.core.parallel_state as ps +import torch + + +class Utils: + world_size = torch.cuda.device_count() + # rank = int(os.environ["LOCAL_RANK"]) + rank = 0 + + @staticmethod + def initialize_distributed(tensor_model_parallel_size=1, pipeline_model_parallel_size=1, context_parallel_size=1): + ps.destroy_model_parallel() + + # Torch setup for distributed training + rank = int(os.environ['LOCAL_RANK']) + world_size = 1 # torch.cuda.device_count() + torch.cuda.set_device(rank) + torch.distributed.init_process_group(world_size=world_size, rank=rank) + + # Megatron core distributed training initialization + ps.initialize_model_parallel( + tensor_model_parallel_size, pipeline_model_parallel_size, context_parallel_size=context_parallel_size + ) + + @staticmethod + def set_world_size(world_size=None, rank=None): + Utils.world_size = torch.cuda.device_count() if world_size is None else world_size + if torch.distributed.is_initialized() and Utils.world_size != torch.distributed.get_world_size(): + torch.distributed.destroy_process_group() + + if rank is None: + # Utils.rank = int(os.environ["LOCAL_RANK"]) + Utils.rank = 0 + if Utils.rank >= Utils.world_size: + Utils.rank = -1 + else: + Utils.rank = rank + + @staticmethod + def destroy_model_parallel(): + ps.destroy_model_parallel() + torch.distributed.barrier() + + @staticmethod + def initialize_model_parallel( + tensor_model_parallel_size=1, + pipeline_model_parallel_size=1, + virtual_pipeline_model_parallel_size=None, + pipeline_model_parallel_split_rank=None, + **kwargs, + ): + ps.destroy_model_parallel() + Utils.initialize_distributed() + ps.initialize_model_parallel( + tensor_model_parallel_size, + pipeline_model_parallel_size, + virtual_pipeline_model_parallel_size, + pipeline_model_parallel_split_rank, + **kwargs, + ) diff --git a/nemo/collections/diffusion/vae/autoencoder.py b/nemo/collections/diffusion/vae/autoencoder.py new file mode 100644 index 000000000000..b356d74baac1 --- /dev/null +++ b/nemo/collections/diffusion/vae/autoencoder.py @@ -0,0 +1,334 @@ +# Copyright (c) 2020, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
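A minimal bring-up sketch for the parallel-state helper above, assuming it runs under `torchrun` (which provides the `LOCAL_RANK`, `MASTER_ADDR`, and `MASTER_PORT` environment variables it relies on):

```python
# Initialize Megatron model parallelism on a single GPU, do work, then tear down.
from nemo.collections.diffusion.utils.mcore_parallel_utils import Utils

Utils.initialize_model_parallel(tensor_model_parallel_size=1, pipeline_model_parallel_size=1)
# ... build and run Megatron-based modules here ...
Utils.destroy_model_parallel()
```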
+ +from dataclasses import dataclass + +import numpy as np +import torch +from torch import Tensor, nn + +from nemo.collections.diffusion.vae.blocks import AttnBlock, Downsample, Normalize, ResnetBlock, Upsample, make_attn + + +@dataclass +class AutoEncoderParams: + ch_mult: list[int] + attn_resolutions: list[int] + resolution: int = 256 + in_channels: int = 3 + ch: int = 128 + out_ch: int = 3 + num_res_blocks: int = 2 + z_channels: int = 16 + scale_factor: float = 0.3611 + shift_factor: float = 0.1159 + attn_type: str = 'vanilla' + double_z: bool = True + dropout: float = 0.0 + ckpt: str = None + + +def nonlinearity(x): + # swish + return torch.nn.functional.silu(x) + + +class Encoder(nn.Module): + def __init__( + self, + *, + ch: int, + out_ch: int, + ch_mult: list[int], + num_res_blocks: int, + attn_resolutions: list[int], + in_channels: int, + resolution: int, + z_channels: int, + dropout=0.0, + resamp_with_conv=True, + double_z=True, + use_linear_attn=False, + attn_type="vanilla", + ): + super().__init__() + if use_linear_attn: + attn_type = "linear" + self.ch = ch + self.temb_ch = 0 + self.num_resolutions = len(ch_mult) + self.num_res_blocks = num_res_blocks + self.resolution = resolution + self.in_channels = in_channels + + # downsampling + self.conv_in = torch.nn.Conv2d(in_channels, self.ch, kernel_size=3, stride=1, padding=1) + + curr_res = resolution + in_ch_mult = (1,) + tuple(ch_mult) + self.in_ch_mult = in_ch_mult + self.down = nn.ModuleList() + for i_level in range(self.num_resolutions): + block = nn.ModuleList() + attn = nn.ModuleList() + block_in = ch * in_ch_mult[i_level] + block_out = ch * ch_mult[i_level] + for i_block in range(self.num_res_blocks): + block.append( + ResnetBlock( + in_channels=block_in, out_channels=block_out, temb_channels=self.temb_ch, dropout=dropout + ) + ) + block_in = block_out + if curr_res in attn_resolutions: + attn.append(make_attn(block_in, attn_type=attn_type)) + down = nn.Module() + down.block = block + down.attn = attn + if i_level != self.num_resolutions - 1: + down.downsample = Downsample(block_in, resamp_with_conv) + curr_res = curr_res // 2 + self.down.append(down) + + # middle + self.mid = nn.Module() + self.mid.block_1 = ResnetBlock( + in_channels=block_in, out_channels=block_in, temb_channels=self.temb_ch, dropout=dropout + ) + self.mid.attn_1 = make_attn(block_in, attn_type=attn_type) + self.mid.block_2 = ResnetBlock( + in_channels=block_in, out_channels=block_in, temb_channels=self.temb_ch, dropout=dropout + ) + + # end + self.norm_out = Normalize(block_in) + self.conv_out = torch.nn.Conv2d( + block_in, 2 * z_channels if double_z else z_channels, kernel_size=3, stride=1, padding=1 + ) + + def forward(self, x): + # timestep embedding + temb = None + + # downsampling + hs = [self.conv_in(x)] + for i_level in range(self.num_resolutions): + for i_block in range(self.num_res_blocks): + h = self.down[i_level].block[i_block](hs[-1], temb) + if len(self.down[i_level].attn) > 0: + h = self.down[i_level].attn[i_block](h) + hs.append(h) + if i_level != self.num_resolutions - 1: + hs.append(self.down[i_level].downsample(hs[-1])) + + # middle + h = hs[-1] + h = self.mid.block_1(h, temb) + h = self.mid.attn_1(h) + h = self.mid.block_2(h, temb) + + # end + h = self.norm_out(h) + h = nonlinearity(h) + h = self.conv_out(h) + return h + + +class Decoder(nn.Module): + def __init__( + self, + *, + ch: int, + out_ch: int, + ch_mult: list[int], + num_res_blocks: int, + attn_resolutions: list[int], + in_channels: int, + resolution: int, + z_channels: 
int, + dropout=0.0, + resamp_with_conv=True, + give_pre_end=False, + tanh_out=False, + use_linear_attn=False, + attn_type="vanilla", + **ignorekwargs, + ): + super().__init__() + if use_linear_attn: + attn_type = "linear" + self.ch = ch + self.temb_ch = 0 + self.num_resolutions = len(ch_mult) + self.num_res_blocks = num_res_blocks + self.resolution = resolution + self.in_channels = in_channels + self.give_pre_end = give_pre_end + self.tanh_out = tanh_out + + # compute in_ch_mult, block_in and curr_res at lowest res + in_ch_mult = (1,) + tuple(ch_mult) + block_in = ch * ch_mult[self.num_resolutions - 1] + curr_res = resolution // 2 ** (self.num_resolutions - 1) + self.z_shape = (1, z_channels, curr_res, curr_res) + print("Working with z of shape {} = {} dimensions.".format(self.z_shape, np.prod(self.z_shape))) + + # z to block_in + self.conv_in = torch.nn.Conv2d(z_channels, block_in, kernel_size=3, stride=1, padding=1) + + # middle + self.mid = nn.Module() + self.mid.block_1 = ResnetBlock( + in_channels=block_in, out_channels=block_in, temb_channels=self.temb_ch, dropout=dropout + ) + self.mid.attn_1 = make_attn(block_in, attn_type=attn_type) + self.mid.block_2 = ResnetBlock( + in_channels=block_in, out_channels=block_in, temb_channels=self.temb_ch, dropout=dropout + ) + + # upsampling + self.up = nn.ModuleList() + for i_level in reversed(range(self.num_resolutions)): + block = nn.ModuleList() + attn = nn.ModuleList() + block_out = ch * ch_mult[i_level] + for i_block in range(self.num_res_blocks + 1): + block.append( + ResnetBlock( + in_channels=block_in, out_channels=block_out, temb_channels=self.temb_ch, dropout=dropout + ) + ) + block_in = block_out + if curr_res in attn_resolutions: + attn.append(make_attn(block_in, attn_type=attn_type)) + up = nn.Module() + up.block = block + up.attn = attn + if i_level != 0: + up.upsample = Upsample(block_in, resamp_with_conv) + curr_res = curr_res * 2 + self.up.insert(0, up) # prepend to get consistent order + + # end + self.norm_out = Normalize(block_in) + self.conv_out = torch.nn.Conv2d(block_in, out_ch, kernel_size=3, stride=1, padding=1) + + def forward(self, z): + # assert z.shape[1:] == self.z_shape[1:] + self.last_z_shape = z.shape + + # timestep embedding + temb = None + + # z to block_in + h = self.conv_in(z) + + # middle + h = self.mid.block_1(h, temb) + h = self.mid.attn_1(h) + h = self.mid.block_2(h, temb) + + # upsampling + for i_level in reversed(range(self.num_resolutions)): + for i_block in range(self.num_res_blocks + 1): + h = self.up[i_level].block[i_block](h, temb) + if len(self.up[i_level].attn) > 0: + h = self.up[i_level].attn[i_block](h) + if i_level != 0: + h = self.up[i_level].upsample(h) + + # end + if self.give_pre_end: + return h + + h = self.norm_out(h) + h = nonlinearity(h) + h = self.conv_out(h) + if self.tanh_out: + h = torch.tanh(h) + return h + + +class DiagonalGaussian(nn.Module): + def __init__(self, sample: bool = True, chunk_dim: int = 1): + super().__init__() + self.sample = sample + self.chunk_dim = chunk_dim + + def forward(self, z: Tensor) -> Tensor: + mean, logvar = torch.chunk(z, 2, dim=self.chunk_dim) + if self.sample: + std = torch.exp(0.5 * logvar) + return mean + std * torch.randn_like(mean) + else: + return mean + + +class AutoEncoder(nn.Module): + def __init__(self, params: AutoEncoderParams): + super().__init__() + self.encoder = Encoder( + resolution=params.resolution, + in_channels=params.in_channels, + ch=params.ch, + ch_mult=params.ch_mult, + num_res_blocks=params.num_res_blocks, + 
z_channels=params.z_channels, + double_z=params.double_z, + attn_type=params.attn_type, + dropout=params.dropout, + out_ch=params.out_ch, + attn_resolutions=params.attn_resolutions, + ) + self.decoder = Decoder( + resolution=params.resolution, + in_channels=params.in_channels, + ch=params.ch, + out_ch=params.out_ch, + ch_mult=params.ch_mult, + num_res_blocks=params.num_res_blocks, + z_channels=params.z_channels, + double_z=params.double_z, + attn_type=params.attn_type, + dropout=params.dropout, + attn_resolutions=params.attn_resolutions, + ) + self.reg = DiagonalGaussian() + + self.scale_factor = params.scale_factor + self.shift_factor = params.shift_factor + self.params = params + + if params.ckpt is not None: + self.load_from_checkpoint(params.ckpt) + + def encode(self, x: Tensor) -> Tensor: + z = self.reg(self.encoder(x)) + z = self.scale_factor * (z - self.shift_factor) + return z + + def decode(self, z: Tensor) -> Tensor: + z = z / self.scale_factor + self.shift_factor + return self.decoder(z) + + def forward(self, x: Tensor) -> Tensor: + return self.decode(self.encode(x)) + + def load_from_checkpoint(self, ckpt_path): + from safetensors.torch import load_file as load_sft + + state_dict = load_sft(ckpt_path) + missing, unexpected = self.load_state_dict(state_dict) + if len(missing) > 0: + logger.warning(f"Following keys are missing from checkpoint loaded: {missing}") diff --git a/nemo/collections/diffusion/vae/blocks.py b/nemo/collections/diffusion/vae/blocks.py new file mode 100644 index 000000000000..ad38a7a463cf --- /dev/null +++ b/nemo/collections/diffusion/vae/blocks.py @@ -0,0 +1,180 @@ +# Copyright (c) 2023, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
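A minimal round trip through the `AutoEncoder` defined above, assuming the apex fused GroupNorm used by `Normalize` is installed and no VAE checkpoint is loaded:

```python
# Encode a dummy image into the scaled/shifted latent space, then decode it back.
import torch

from nemo.collections.diffusion.vae.autoencoder import AutoEncoder, AutoEncoderParams

params = AutoEncoderParams(ch_mult=[1, 2, 4, 4], attn_resolutions=[], resolution=256, z_channels=16)
vae = AutoEncoder(params).eval()

x = torch.randn(1, 3, 256, 256)  # batch of RGB images
with torch.no_grad():
    z = vae.encode(x)      # latent of shape (1, 16, 32, 32)
    recon = vae.decode(z)  # reconstruction of shape (1, 3, 256, 256)
```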
+ +import torch +from einops import rearrange +from torch import Tensor, nn + +try: + from apex.contrib.group_norm import GroupNorm + + OPT_GROUP_NORM = True +except Exception: + print('Fused optimized group norm has not been installed.') + OPT_GROUP_NORM = False + + +def Normalize(in_channels, num_groups=32, act=""): + return GroupNorm(num_groups=num_groups, num_channels=in_channels, eps=1e-6, affine=True, act=act) + + +class ResnetBlock(nn.Module): + def __init__(self, in_channels, out_channels=None, conv_shortcut=False, dropout=0.0, temb_channels=0): + super().__init__() + self.in_channels = in_channels + out_channels = in_channels if out_channels is None else out_channels + self.out_channels = out_channels + self.use_conv_shortcut = conv_shortcut + + self.norm1 = Normalize(in_channels, act="silu") + self.conv1 = torch.nn.Conv2d(in_channels, out_channels, kernel_size=3, stride=1, padding=1) + if temb_channels > 0: + self.temb_proj = torch.nn.Linear(temb_channels, out_channels) + self.norm2 = Normalize(out_channels, act="silu") + self.dropout = torch.nn.Dropout(dropout) + self.conv2 = torch.nn.Conv2d(out_channels, out_channels, kernel_size=3, stride=1, padding=1) + if self.in_channels != self.out_channels: + if self.use_conv_shortcut: + self.conv_shortcut = torch.nn.Conv2d(in_channels, out_channels, kernel_size=3, stride=1, padding=1) + else: + self.nin_shortcut = torch.nn.Conv2d(in_channels, out_channels, kernel_size=1, stride=1, padding=0) + + def forward(self, x, temb): + h = x + h = self.norm1(h) + h = self.conv1(h) + + if temb is not None: + h = h + self.temb_proj(nonlinearity(temb))[:, :, None, None] + + h = self.norm2(h) + h = self.dropout(h) + h = self.conv2(h) + + if self.in_channels != self.out_channels: + if self.use_conv_shortcut: + x = self.conv_shortcut(x) + else: + x = self.nin_shortcut(x) + + return x + h + + +class Upsample(nn.Module): + def __init__(self, in_channels, with_conv): + super().__init__() + self.with_conv = with_conv + if self.with_conv: + self.conv = torch.nn.Conv2d(in_channels, in_channels, kernel_size=3, stride=1, padding=1) + + def forward(self, x): + # Cast to float32 to as 'upsample_nearest2d_out_frame' op does not support bfloat16 + # TODO(yuya): Remove this cast once the issue is fixed in PyTorch + # https://github.com/pytorch/pytorch/issues/86679 + dtype = x.dtype + if dtype == torch.bfloat16: + x = x.to(torch.float32) + x = torch.nn.functional.interpolate(x, scale_factor=2.0, mode="nearest") + if dtype == torch.bfloat16: + x = x.to(dtype) + if self.with_conv: + x = self.conv(x) + return x + + +class Downsample(nn.Module): + def __init__(self, in_channels, with_conv): + super().__init__() + self.with_conv = with_conv + if self.with_conv: + # no asymmetric padding in torch conv, must do it ourselves + self.conv = torch.nn.Conv2d(in_channels, in_channels, kernel_size=3, stride=2, padding=0) + + def forward(self, x): + if self.with_conv: + pad = (0, 1, 0, 1) + x = torch.nn.functional.pad(x, pad, mode="constant", value=0) + x = self.conv(x) + else: + x = torch.nn.functional.avg_pool2d(x, kernel_size=2, stride=2) + return x + + +class AttnBlock(nn.Module): + def __init__(self, in_channels: int): + super().__init__() + self.in_channels = in_channels + + self.norm = Normalize(in_channels, act="silu") + + self.q = nn.Conv2d(in_channels, in_channels, kernel_size=1) + self.k = nn.Conv2d(in_channels, in_channels, kernel_size=1) + self.v = nn.Conv2d(in_channels, in_channels, kernel_size=1) + self.proj_out = nn.Conv2d(in_channels, in_channels, kernel_size=1) + + 
def attention(self, h_: Tensor) -> Tensor: + h_ = self.norm(h_) + q = self.q(h_) + k = self.k(h_) + v = self.v(h_) + + b, c, h, w = q.shape + q = rearrange(q, "b c h w -> b 1 (h w) c").contiguous() + k = rearrange(k, "b c h w -> b 1 (h w) c").contiguous() + v = rearrange(v, "b c h w -> b 1 (h w) c").contiguous() + h_ = nn.functional.scaled_dot_product_attention(q, k, v) + + return rearrange(h_, "b 1 (h w) c -> b c h w", h=h, w=w, c=c, b=b) + + def forward(self, x: Tensor) -> Tensor: + return x + self.proj_out(self.attention(x)) + + +class LinearAttention(nn.Module): + def __init__(self, dim, heads=4, dim_head=32): + super().__init__() + self.heads = heads + hidden_dim = dim_head * heads + self.to_qkv = nn.Conv2d(dim, hidden_dim * 3, 1, bias=False) + self.to_out = nn.Conv2d(hidden_dim, dim, 1) + + def forward(self, x): + b, c, h, w = x.shape + qkv = self.to_qkv(x) + q, k, v = rearrange(qkv, 'b (qkv heads c) h w -> qkv b heads c (h w)', heads=self.heads, qkv=3) + k = k.softmax(dim=-1) + context = torch.einsum('bhdn,bhen->bhde', k, v) + out = torch.einsum('bhde,bhdn->bhen', context, q) + out = rearrange(out, 'b heads c (h w) -> b (heads c) h w', heads=self.heads, h=h, w=w) + return self.to_out(out) + + +class LinAttnBlock(LinearAttention): + """ + to match AttnBlock usage + """ + + def __init__(self, in_channels): + super().__init__(dim=in_channels, heads=1, dim_head=in_channels) + + +def make_attn(in_channels, attn_type="vanilla"): + assert attn_type in ["vanilla", "linear", "none"], f'attn_type {attn_type} unknown' + print(f"making attention of type '{attn_type}' with {in_channels} in_channels") + if attn_type == "vanilla": + return AttnBlock(in_channels) + elif attn_type == "none": + return nn.Identity(in_channels) + else: + return LinAttnBlock(in_channels) From f37d1691087c10fa3dfa9ebf88dc89a7e1f52692 Mon Sep 17 00:00:00 2001 From: BoxiangW <45734921+BoxiangW@users.noreply.github.com> Date: Tue, 22 Oct 2024 00:49:58 -0700 Subject: [PATCH 32/37] Add assertion for always save nemo add model parallel size (#10690) * Add assertion for always save nemo add model parallel size Signed-off-by: Boxiang Wang * Add assertions Signed-off-by: Boxiang Wang * Fix typo Signed-off-by: Boxiang Wang * Apply isort and black reformatting Signed-off-by: BoxiangW * Revert nemo_model_checkpoint.py changes Signed-off-by: Boxiang Wang * Add test Signed-off-by: Boxiang Wang * Fix typo * Fix test bug Signed-off-by: Boxiang Wang * Fix test Signed-off-by: Boxiang Wang --------- Signed-off-by: Boxiang Wang Signed-off-by: BoxiangW Co-authored-by: BoxiangW --- nemo/utils/exp_manager.py | 10 +++++ tests/core/test_exp_manager.py | 72 ++++++++++++++++++++++++++++++++++ 2 files changed, 82 insertions(+) diff --git a/nemo/utils/exp_manager.py b/nemo/utils/exp_manager.py index 3d4b7189f56e..2bfb40e89e15 100644 --- a/nemo/utils/exp_manager.py +++ b/nemo/utils/exp_manager.py @@ -1169,6 +1169,16 @@ def configure_checkpointing( params.filename = f'{name}--{{{params.monitor}:.4f}}-{{epoch}}' if params.prefix is None: params.prefix = name + if params.always_save_nemo: + app_state = AppState() + if (app_state.tensor_model_parallel_size is not None and app_state.tensor_model_parallel_size > 1) or (app_state.pipeline_model_parallel_size is not None and app_state.pipeline_model_parallel_size > 1) or (app_state.context_parallel_size is not None and app_state.context_parallel_size > 1): + raise LoggerMisconfigurationError( + "always_save_nemo is set to True, please ensure that model parallel is not used." 
+ f"tensor_model_parallel_size: {app_state.tensor_model_parallel_size}," + f"pipeline_model_parallel_size: {app_state.pipeline_model_parallel_size}," + f"context_parallel_size: {app_state.context_parallel_size}," + ) + NeMoModelCheckpoint.CHECKPOINT_NAME_LAST = params.filename + '-last' logging.debug(params.dirpath) diff --git a/tests/core/test_exp_manager.py b/tests/core/test_exp_manager.py index a0b69927ecc0..fa2eeae9b538 100644 --- a/tests/core/test_exp_manager.py +++ b/tests/core/test_exp_manager.py @@ -29,6 +29,7 @@ from nemo.collections.nlp.parts.nlp_overrides import NLPDDPStrategy from nemo.constants import NEMO_ENV_VARNAME_VERSION from nemo.core.classes import ModelPT +from nemo.utils.app_state import AppState from nemo.utils.callbacks import NeMoModelCheckpoint from nemo.utils.exp_manager import ( CheckpointMisconfigurationError, @@ -1097,3 +1098,74 @@ def test_doesnt_silently_start_from_scratch_dist(self, tmp_path): restored_trainer, {"resume_if_exists": True, "resume_ignore_no_checkpoint": True, "explicit_log_dir": str(test_dir)}, ) + + @pytest.mark.unit + def test_save_nemo_not_comp_with_model_parallel(self, tmp_path): + """ + Ensure that always_save_nemo is not compatible with model parallelism. + """ + + test_dir = tmp_path / "test" + + with pytest.raises(LoggerMisconfigurationError): + appstate = AppState() + appstate.tensor_model_parallel_size = 2 + appstate.pipeline_model_parallel_size = 1 + appstate.context_parallel_size = 1 + test_trainer = pl.Trainer(accelerator='cpu', enable_checkpointing=False, logger=False, max_epochs=1) + exp_manager( + test_trainer, + { + "checkpoint_callback_params": { + "always_save_nemo": True, + }, + "explicit_log_dir": str(test_dir), + } + ) + + with pytest.raises(LoggerMisconfigurationError): + appstate = AppState() + appstate.tensor_model_parallel_size = 1 + appstate.pipeline_model_parallel_size = 2 + appstate.context_parallel_size = 1 + test_trainer = pl.Trainer(accelerator='cpu', enable_checkpointing=False, logger=False, max_epochs=1) + exp_manager( + test_trainer, + { + "checkpoint_callback_params": { + "always_save_nemo": True, + }, + "explicit_log_dir": str(test_dir), + }, + ) + + with pytest.raises(LoggerMisconfigurationError): + appstate = AppState() + appstate.tensor_model_parallel_size = 1 + appstate.pipeline_model_parallel_size = 1 + appstate.context_parallel_size = 2 + test_trainer = pl.Trainer(accelerator='cpu', enable_checkpointing=False, logger=False, max_epochs=1) + exp_manager( + test_trainer, + { + "checkpoint_callback_params": { + "always_save_nemo": True, + }, + "explicit_log_dir": str(test_dir), + }, + ) + + appstate = AppState() + appstate.tensor_model_parallesl_size = 1 + appstate.pipeline_model_parallel_size = 1 + appstate.context_parallel_size = 1 + test_trainer = pl.Trainer(accelerator='cpu', enable_checkpointing=False, logger=False, max_epochs=1) + exp_manager( + test_trainer, + { + "checkpoint_callback_params": { + "always_save_nemo": True, + }, + "explicit_log_dir": str(test_dir), + }, + ) From bc4bce71d01234f568c1327f0848001d86143b3d Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?oliver=20k=C3=B6nig?= Date: Tue, 22 Oct 2024 12:12:05 +0200 Subject: [PATCH 33/37] =?UTF-8?q?[=F0=9F=A4=A0]:=20Howdy=20folks,=20let's?= =?UTF-8?q?=20bump=20`Dockerfile.ci`=20to=20563d5d1=20!=20(#10979)?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Signed-off-by: github-actions[bot] <41898282+github-actions[bot]@users.noreply.github.com> Co-authored-by: pablo-garay 
<7166088+pablo-garay@users.noreply.github.com> --- Dockerfile.ci | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Dockerfile.ci b/Dockerfile.ci index f01025873628..09ffe9674e5d 100644 --- a/Dockerfile.ci +++ b/Dockerfile.ci @@ -53,7 +53,7 @@ RUN pip install nemo_run@git+https://github.com/NVIDIA/NeMo-Run.git@${NEMO_RUN_T # Install NeMo requirements ARG TE_TAG=7d576ed25266a17a7b651f2c12e8498f67e0baea ARG MODELOPT_VERSION=0.17.0 -ARG MCORE_TAG=db7d37b54ef96e35f7afc56e29fffb60f5c957b9 +ARG MCORE_TAG=563d5d1726012e8077895b732d5bc81b6e975e8d ARG APEX_TAG=810ffae374a2b9cb4b5c5e28eaeca7d7998fca0c RUN \ From 58da88610f73a712684429b8207a5d9039924869 Mon Sep 17 00:00:00 2001 From: Marc Romeyn Date: Tue, 22 Oct 2024 17:58:21 +0200 Subject: [PATCH 34/37] Reflect CLI change nemorun -> nemo (#10443) Signed-off-by: Marc Romeijn Co-authored-by: Alexandros Koumparoulis <153118171+akoumpa@users.noreply.github.com> --- examples/llm/pretrain/README.md | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/examples/llm/pretrain/README.md b/examples/llm/pretrain/README.md index c9bb7331f972..61f64d7792bb 100644 --- a/examples/llm/pretrain/README.md +++ b/examples/llm/pretrain/README.md @@ -3,7 +3,7 @@ ### Listing the available recipes for pretraining ```bash -nemorun llm pretrain --help +nemo llm pretrain --help ``` ![recipe-listing](https://github.com/NVIDIA/NeMo/releases/download/v2.0.0rc0/list-recipes.png) @@ -12,7 +12,7 @@ nemorun llm pretrain --help ### Run pre-training with a default recipe ```bash -nemorun llm pretrain --factory llama3_8b +nemo llm pretrain --factory llama3_8b ``` ![llama3_70b](https://github.com/NVIDIA/NeMo/releases/download/v2.0.0rc0/llama3_70b.png) @@ -20,7 +20,7 @@ nemorun llm pretrain --factory llama3_8b We can also call the factory function with custom parameters: ```bash -nemorun llm pretrain --factory "llama3_70b(num_nodes=128)" +nemo llm pretrain --factory "llama3_70b(num_nodes=128)" ``` ![llama3_70b-128-nodes](https://github.com/NVIDIA/NeMo/releases/download/v2.0.0rc0/llama3_70b_128nodes.png) @@ -29,13 +29,13 @@ nemorun llm pretrain --factory "llama3_70b(num_nodes=128)" The CLI allows you to overwrite any parameter. For example, to run the recipe with 2000 steps: ```bash -nemorun llm pretrain --factory llama3_70b trainer.max_steps=2000 +nemo llm pretrain --factory llama3_70b trainer.max_steps=2000 ``` The syntax of the CLI is the same as the Python code. Which is great but in some cases you might want to inspect & edit a recipe interactively. An easy way to do this using the cli is the use the `--repl` flag. 
```bash -nemorun llm pretrain --factory llama3_70b --repl +nemo llm pretrain --factory llama3_70b --repl ``` ![repl](https://github.com/NVIDIA/NeMo/releases/download/v2.0.0rc0/repl.gif) From 746203add92094e385a97bfe54f819b1dd45146e Mon Sep 17 00:00:00 2001 From: Ali Taghibakhshi <71892896+JRD971000@users.noreply.github.com> Date: Tue, 22 Oct 2024 19:02:32 +0300 Subject: [PATCH 35/37] minor fix (#10990) Co-authored-by: Ali Taghibakhshi --- scripts/checkpoint_converters/convert_llama_hf_to_nemo_load.py | 2 +- .../checkpoint_converters/convert_llama_hf_to_nemo_save_dict.py | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/scripts/checkpoint_converters/convert_llama_hf_to_nemo_load.py b/scripts/checkpoint_converters/convert_llama_hf_to_nemo_load.py index f395e34765d0..42d3e77ce4c8 100644 --- a/scripts/checkpoint_converters/convert_llama_hf_to_nemo_load.py +++ b/scripts/checkpoint_converters/convert_llama_hf_to_nemo_load.py @@ -15,7 +15,7 @@ r""" Conversion script to convert Huggingface LLaMA checkpoints into nemo checkpoint. Example to run this conversion script: - python convert_llama_hf_to_nemo.py \ + python convert_llama_hf_to_nemo_load.py \ --input_name_or_path \ --input_state_dict \ --output_path \ diff --git a/scripts/checkpoint_converters/convert_llama_hf_to_nemo_save_dict.py b/scripts/checkpoint_converters/convert_llama_hf_to_nemo_save_dict.py index 940a9df5f9a8..f7096996e5b1 100644 --- a/scripts/checkpoint_converters/convert_llama_hf_to_nemo_save_dict.py +++ b/scripts/checkpoint_converters/convert_llama_hf_to_nemo_save_dict.py @@ -15,7 +15,7 @@ r""" Conversion script to convert Huggingface LLaMA checkpoints into nemo checkpoint. Example to run this conversion script: - python convert_llama_hf_to_nemo.py \ + python convert_llama_hf_to_nemo_save_dict.py \ --input_name_or_path \ --output_path --precision bf16 From 70d8cc191b322d25fdb9428396c21a66d19f3ffb Mon Sep 17 00:00:00 2001 From: anteju <108555623+anteju@users.noreply.github.com> Date: Tue, 22 Oct 2024 09:33:55 -0700 Subject: [PATCH 36/37] Fixed sampler override and audio_key in prepare_audio_data (#10980) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Signed-off-by: Ante Jukić --- examples/audio/process_audio.py | 4 ++-- nemo/collections/asr/parts/utils/transcribe_utils.py | 2 +- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/examples/audio/process_audio.py b/examples/audio/process_audio.py index e28fb4e69627..ec88bda34954 100644 --- a/examples/audio/process_audio.py +++ b/examples/audio/process_audio.py @@ -159,8 +159,8 @@ def main(cfg: ProcessConfig) -> ProcessConfig: audio_to_audio_model.set_trainer(trainer) audio_to_audio_model = audio_to_audio_model.eval() - # override sampler - if cfg.sampler is not None: + # override sampler if necessary + if cfg.sampler: logging.info('Overriding sampler with %s', cfg.sampler) if hasattr(audio_to_audio_model, 'sampler'): diff --git a/nemo/collections/asr/parts/utils/transcribe_utils.py b/nemo/collections/asr/parts/utils/transcribe_utils.py index c1e712c44aeb..0d4f4c895bcf 100644 --- a/nemo/collections/asr/parts/utils/transcribe_utils.py +++ b/nemo/collections/asr/parts/utils/transcribe_utils.py @@ -314,7 +314,7 @@ def prepare_audio_data(cfg: DictConfig) -> Tuple[List[str], bool]: with NamedTemporaryFile(mode='w', suffix='.json', delete=False) as f: for item in read_and_maybe_sort_manifest(cfg.dataset_manifest, try_sort=cfg.presort_manifest): audio_file = get_full_path(audio_file=item[audio_key], 
manifest_file=cfg.dataset_manifest) - item[audio_key] = audio_file + item['audio_filepath'] = audio_file filepaths.append(audio_file) f.write(json.dumps(item) + "\n") sorted_manifest_path = f.name From c20e8922c434ccc22b7a8bf62acdb3276bd7a9f7 Mon Sep 17 00:00:00 2001 From: Chen Cui Date: Tue, 22 Oct 2024 14:08:43 -0400 Subject: [PATCH 37/37] Add more recipes (#10957) * add recipes Signed-off-by: Chen Cui * adjust finetuning recipe Signed-off-by: Chen Cui * Apply isort and black reformatting Signed-off-by: cuichenx --------- Signed-off-by: Chen Cui Signed-off-by: cuichenx Co-authored-by: cuichenx --- nemo/collections/llm/gpt/model/baichuan.py | 2 +- nemo/collections/llm/gpt/model/chatglm.py | 4 +- nemo/collections/llm/recipes/__init__.py | 8 + nemo/collections/llm/recipes/baichuan2_7b.py | 285 ++++++++++++++++++ nemo/collections/llm/recipes/chatglm3_6b.py | 283 +++++++++++++++++ .../llm/recipes/finetune_default.py | 8 +- nemo/collections/llm/recipes/gemma_2b.py | 285 ++++++++++++++++++ nemo/collections/llm/recipes/gemma_7b.py | 285 ++++++++++++++++++ nemo/collections/llm/recipes/optim/adam.py | 8 +- 9 files changed, 1158 insertions(+), 10 deletions(-) create mode 100644 nemo/collections/llm/recipes/baichuan2_7b.py create mode 100644 nemo/collections/llm/recipes/chatglm3_6b.py create mode 100644 nemo/collections/llm/recipes/gemma_2b.py create mode 100644 nemo/collections/llm/recipes/gemma_7b.py diff --git a/nemo/collections/llm/gpt/model/baichuan.py b/nemo/collections/llm/gpt/model/baichuan.py index 56231978061f..c283b802a118 100644 --- a/nemo/collections/llm/gpt/model/baichuan.py +++ b/nemo/collections/llm/gpt/model/baichuan.py @@ -215,7 +215,7 @@ def _import_qkv(ctx: io.TransformCTX, qkv_weights): q = qkv_weights[0].squeeze().view(*new_q_tensor_shape) k = qkv_weights[1].squeeze().view(*new_kv_tensor_shape) v = qkv_weights[2].squeeze().view(*new_kv_tensor_shape) - qkv_weights = torch.empty((0, head_size) + old_tensor_shape[1:]) + qkv_weights = torch.empty((0, head_size) + old_tensor_shape[1:]).type_as(qkv_weights) for i in range(num_query_groups): qkv_weights = torch.cat((qkv_weights, q[i * heads_per_group : (i + 1) * heads_per_group, :, :])) qkv_weights = torch.cat((qkv_weights, k[i : i + 1, :, :])) diff --git a/nemo/collections/llm/gpt/model/chatglm.py b/nemo/collections/llm/gpt/model/chatglm.py index 5bd1319102e2..e7450a8db28d 100644 --- a/nemo/collections/llm/gpt/model/chatglm.py +++ b/nemo/collections/llm/gpt/model/chatglm.py @@ -221,7 +221,7 @@ def _import_qkv_weight(ctx: io.TransformCTX, hf_qkv_weights): k = k.view(*new_kv_tensor_shape) v = v.view(*new_kv_tensor_shape) - qkv_weights = torch.empty((0, head_size, old_tensor_shape[1])) + qkv_weights = torch.empty((0, head_size, old_tensor_shape[1])).type_as(hf_qkv_weights) for i in range(num_query_groups): qkv_weights = torch.cat((qkv_weights, q[i * heads_per_group : (i + 1) * heads_per_group, :, :])) qkv_weights = torch.cat((qkv_weights, k[i : i + 1, :, :])) @@ -251,7 +251,7 @@ def _import_qkv_bias(ctx: io.TransformCTX, hf_qkv_bias): q = q.view(*new_q_tensor_shape) k = k.view(*new_kv_tensor_shape) v = v.view(*new_kv_tensor_shape) - qkv_bias = torch.empty((0, head_size)) + qkv_bias = torch.empty((0, head_size)).type_as(hf_qkv_bias) for i in range(num_query_groups): qkv_bias = torch.cat((qkv_bias, q[i * heads_per_group : (i + 1) * heads_per_group, :])) qkv_bias = torch.cat((qkv_bias, k[i : i + 1, :])) diff --git a/nemo/collections/llm/recipes/__init__.py b/nemo/collections/llm/recipes/__init__.py index 47cc4e71448d..ff81c3b383fc 100644 
--- a/nemo/collections/llm/recipes/__init__.py +++ b/nemo/collections/llm/recipes/__init__.py @@ -14,6 +14,10 @@ from nemo.collections.llm.recipes import ( + baichuan2_7b, + chatglm3_6b, + gemma_2b, + gemma_7b, llama3_8b, llama3_8b_16k, llama3_8b_64k, @@ -49,6 +53,10 @@ from nemo.collections.llm.recipes.optim import adam __all__ = [ + "baichuan2_7b", + "chatglm3_6b", + "gemma_2b", + "gemma_7b", "llama3_8b", "llama3_8b_16k", "llama3_8b_64k", diff --git a/nemo/collections/llm/recipes/baichuan2_7b.py b/nemo/collections/llm/recipes/baichuan2_7b.py new file mode 100644 index 000000000000..3ebb643af779 --- /dev/null +++ b/nemo/collections/llm/recipes/baichuan2_7b.py @@ -0,0 +1,285 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + + +from typing import Callable, Optional + +import nemo_run as run +import pytorch_lightning as pl +import torch +from megatron.core.distributed import DistributedDataParallelConfig +from pytorch_lightning.callbacks.callback import Callback + +from nemo import lightning as nl +from nemo.collections.llm import Baichuan2Config7B, Baichuan2Model +from nemo.collections.llm.api import finetune, pretrain +from nemo.collections.llm.gpt.data.mock import MockDataModule +from nemo.collections.llm.peft.lora import LoRA +from nemo.collections.llm.recipes.finetune_default import default_finetune_recipe +from nemo.collections.llm.recipes.log.default import default_log, default_resume, tensorboard_logger +from nemo.collections.llm.recipes.optim.adam import distributed_fused_adam_with_cosine_annealing +from nemo.collections.llm.recipes.precision.mixed_precision import bf16_mixed +from nemo.lightning.pytorch.callbacks.megatron_comm_overlap import MegatronCommOverlapCallback +from nemo.utils.exp_manager import TimingCallback + +NAME = "baichuan2_7b" + + +@run.cli.factory(name=NAME) +def model() -> run.Config[pl.LightningModule]: + """ + Factory function to create a Baichuan2 7B model configuration. + + Returns: + run.Config[pl.LightningModule]: Configuration for the Baichuan2 7B model. + + Examples: + CLI usage: + $ nemo llm pretrain model=baichuan2_7b ... + + Python API usage: + >>> model_config = model() + >>> print(model_config) + """ + return run.Config(Baichuan2Model, config=run.Config(Baichuan2Config7B)) + + +def trainer( + tensor_parallelism: int = 1, + pipeline_parallelism: int = 1, + pipeline_parallelism_type: Optional[torch.dtype] = None, + virtual_pipeline_parallelism: Optional[int] = None, + context_parallelism: int = 2, + sequence_parallelism: bool = False, + num_nodes: int = 1, + num_gpus_per_node: int = 8, + max_steps: int = 1168251, + callbacks: Optional[list[run.Config[Callback]]] = None, +) -> run.Config[nl.Trainer]: + """ + Configure the NeMo Lightning Trainer for Baichuan2 7B model. + + This function sets up the distributed training strategy and other training parameters. + + Args: + tensor_parallelism (int): Degree of tensor model parallelism. 
+ pipeline_parallelism (int): Degree of pipeline model parallelism. + pipeline_parallelism_type (Optional[torch.dtype]): Data type for pipeline parallelism. + virtual_pipeline_parallelism (Optional[int]): Size of virtual pipeline parallelism. + context_parallelism (int): Degree of context parallelism. + sequence_parallelism (bool): Whether to use sequence parallelism. + num_nodes (int): Number of compute nodes to use. + num_gpus_per_node (int): Number of GPUs per node. + max_steps (int): Maximum number of training steps. + callbacks (Optional[list[run.Config[Callback]]]): List of callback configurations. + + Returns: + run.Config[nl.Trainer]: Configuration for the NeMo Lightning Trainer. + + Examples: + CLI usage: + $ nemo llm pretrain trainer=baichuan2_7b ... + + Python API usage: + >>> trainer_config = trainer(num_nodes=2, num_gpus_per_node=8) + >>> print(trainer_config) + + Note: + For more information on distributed training strategies, refer to the + NeMo documentation on multi-GPU and multi-node training. + """ + strategy = run.Config( + nl.MegatronStrategy, + tensor_model_parallel_size=tensor_parallelism, + pipeline_model_parallel_size=pipeline_parallelism, + pipeline_dtype=pipeline_parallelism_type, + virtual_pipeline_model_parallel_size=virtual_pipeline_parallelism, + context_parallel_size=context_parallelism, + sequence_parallel=sequence_parallelism, + gradient_as_bucket_view=True, + ckpt_async_save=True, + ckpt_parallel_load=True, + ddp=run.Config( + DistributedDataParallelConfig, + check_for_nan_in_grad=True, + grad_reduce_in_fp32=True, + overlap_grad_reduce=True, + overlap_param_gather=True, + ), + ) + + trainer = run.Config( + nl.Trainer, + accelerator="gpu", + accumulate_grad_batches=1, + callbacks=callbacks, + devices=num_gpus_per_node, + limit_test_batches=50, + limit_val_batches=32, + log_every_n_steps=10, + max_steps=max_steps, + num_nodes=num_nodes, + plugins=bf16_mixed(), + strategy=strategy, + use_distributed_sampler=False, + val_check_interval=2000, + ) + + return trainer + + +@run.cli.factory(target=pretrain, name=NAME) +def pretrain_recipe( + dir: Optional[str] = None, name: str = "default", num_nodes: int = 1, num_gpus_per_node: int = 8, fn=pretrain +) -> run.Partial: + """ + Create a pre-training recipe for Baichuan2 7B model. + + This function sets up a complete configuration for pre-training, including + model, trainer, data, logging, optimization, and resumption settings. + + Args: + dir (Optional[str]): Directory for saving logs and checkpoints. + name (str): Name of the pre-training run. + num_nodes (int): Number of compute nodes to use. + num_gpus_per_node (int): Number of GPUs per node. + fn (Callable): The pre-training function to use. + + Returns: + run.Partial: Partial configuration for pre-training. + + Examples: + CLI usage: + $ nemo llm pretrain --factory baichuan2_7b + $ nemo llm pretrain --factory "baichuan2_7b(num_nodes=2, name='my_pretrain')" + + Python API usage: + >>> recipe = pretrain_recipe(name="baichuan2_7b_pretrain", num_nodes=2) + >>> print(recipe) + + Note: + For more details on pre-training LLMs with NeMo, see the pre-training + guide in the `examples/llm/pretrain/` directory. 
+ """ + return run.Partial( + fn, + model=model(), + trainer=trainer( + num_nodes=num_nodes, + num_gpus_per_node=num_gpus_per_node, + callbacks=[run.Config(TimingCallback)], + ), + data=run.Config(MockDataModule, seq_length=8192, global_batch_size=512, micro_batch_size=1), + log=default_log(dir=dir, name=name, tensorboard_logger=tensorboard_logger(name=name)), + optim=distributed_fused_adam_with_cosine_annealing(max_lr=3e-4), + resume=default_resume(), + ) + + +@run.cli.factory(target=pretrain, name=NAME + "_optimized") +def pretrain_recipe_performance( + dir: Optional[str] = None, + name: str = "default", + num_nodes: int = 1, + num_gpus_per_node: int = 8, + fn: Callable = pretrain, +) -> run.Partial: + """ + Create a performance-optimized pre-training recipe for Baichuan2 7B model. + + This recipe enables performance optimizations that may not be suitable for all use cases. + It builds upon the standard pre-training recipe and adds additional performance enhancements. + + Args: + dir (Optional[str]): Directory for saving logs and checkpoints. + name (str): Name of the pre-training run. + num_nodes (int): Number of compute nodes to use. + num_gpus_per_node (int): Number of GPUs per node. + fn (Callable): The pre-training function to use. + + Returns: + run.Partial: Partial configuration for performance-optimized pre-training. + + Examples: + $ nemo llm pretrain --factory baichuan2_7b_optimized + + Python API usage: + >>> recipe = pretrain_recipe_performance(name="baichuan2_7b_perf", num_nodes=4) + >>> print(recipe) + + Note: + Use this recipe with caution and only when you need maximum performance. + It may not be suitable for all hardware configurations or use cases. + """ + recipe = pretrain_recipe(name=name, dir=dir, num_nodes=num_nodes, num_gpus_per_node=num_gpus_per_node, fn=fn) + + recipe.trainer.callbacks.append( + run.Config( + MegatronCommOverlapCallback, + tp_comm_overlap=False, + ) + ) + return recipe + + +@run.cli.factory(target=finetune, name=NAME) +def finetune_recipe( + dir: Optional[str] = None, + name: str = "default", + num_nodes: int = 1, + num_gpus_per_node: int = 8, + peft_scheme: Optional[str] = 'lora', +) -> run.Partial: + """ + Create a fine-tuning recipe for Baichuan2 7B model. + + This function sets up a complete configuration for fine-tuning, including + model, trainer, data, logging, optimization, and resumption settings. + The recipe uses LoRA (Low-Rank Adaptation) for efficient fine-tuning, unless peft_scheme is set to None. + + Args: + dir (Optional[str]): Directory for saving logs and checkpoints. + name (str): Name of the fine-tuning run. + num_nodes (int): Number of compute nodes to use. + num_gpus_per_node (int): Number of GPUs per node. + peft_scheme (Optional[str]): Name of the peft scheme to use for fine-tuning. Allowed values: 'lora', 'none'/None. + + Returns: + run.Partial: Partial configuration for fine-tuning. + + Examples: + CLI usage: + $ nemo llm finetune --factory baichuan2_7b + + Python API usage: + >>> recipe = finetune_recipe(name="baichuan2_7b_finetune", num_nodes=2) + >>> print(recipe) + + Note: + This recipe uses the SQuAD dataset for fine-tuning. For more information + on fine-tuning LLMs with NeMo, see the fine-tuning guide in the + `examples/llm/finetune/` directory. 
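+
+        Passing `peft_scheme=None` (or 'none') switches to full-parameter fine-tuning;
+        the recipe then raises tensor model parallelism to 2 and lowers the learning
+        rate to 5e-6. A short sketch (editor's illustration; attribute paths follow
+        this recipe and `default_finetune_recipe`):
+
+            >>> recipe = finetune_recipe(name="baichuan2_7b_sft", peft_scheme=None)
+            >>> recipe.data.global_batch_size = 64  # optional override of the SQuAD data module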
+ """ + recipe = default_finetune_recipe( + model(), "baichuan-inc/Baichuan2-7B-Base", dir, name, num_nodes, num_gpus_per_node + ) + if peft_scheme is None or peft_scheme.lower() == 'none': + recipe.trainer.strategy.tensor_model_parallel_size = 2 + recipe.optim.config.lr = 5e-6 + elif peft_scheme.lower() == 'lora': + recipe.peft = run.Config(LoRA) + recipe.optim.config.lr = 1e-4 + else: + raise ValueError(f"Unrecognized peft scheme: {peft_scheme}") + return recipe diff --git a/nemo/collections/llm/recipes/chatglm3_6b.py b/nemo/collections/llm/recipes/chatglm3_6b.py new file mode 100644 index 000000000000..f5d580a9c6ea --- /dev/null +++ b/nemo/collections/llm/recipes/chatglm3_6b.py @@ -0,0 +1,283 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + + +from typing import Callable, Optional + +import nemo_run as run +import pytorch_lightning as pl +import torch +from megatron.core.distributed import DistributedDataParallelConfig +from pytorch_lightning.callbacks.callback import Callback + +from nemo import lightning as nl +from nemo.collections.llm import ChatGLM3Config6B, ChatGLMModel +from nemo.collections.llm.api import finetune, pretrain +from nemo.collections.llm.gpt.data.mock import MockDataModule +from nemo.collections.llm.peft.lora import LoRA +from nemo.collections.llm.recipes.finetune_default import default_finetune_recipe +from nemo.collections.llm.recipes.log.default import default_log, default_resume, tensorboard_logger +from nemo.collections.llm.recipes.optim.adam import distributed_fused_adam_with_cosine_annealing +from nemo.collections.llm.recipes.precision.mixed_precision import bf16_mixed +from nemo.lightning.pytorch.callbacks.megatron_comm_overlap import MegatronCommOverlapCallback +from nemo.utils.exp_manager import TimingCallback + +NAME = "chatglm3_6b" + + +@run.cli.factory(name=NAME) +def model() -> run.Config[pl.LightningModule]: + """ + Factory function to create a ChatGLM3 6B model configuration. + + Returns: + run.Config[pl.LightningModule]: Configuration for the ChatGLM3 6B model. + + Examples: + CLI usage: + $ nemo llm pretrain model=chatglm3_6b ... + + Python API usage: + >>> model_config = model() + >>> print(model_config) + """ + return run.Config(ChatGLMModel, config=run.Config(ChatGLM3Config6B)) + + +def trainer( + tensor_parallelism: int = 1, + pipeline_parallelism: int = 1, + pipeline_parallelism_type: Optional[torch.dtype] = None, + virtual_pipeline_parallelism: Optional[int] = None, + context_parallelism: int = 2, + sequence_parallelism: bool = False, + num_nodes: int = 1, + num_gpus_per_node: int = 8, + max_steps: int = 1168251, + callbacks: Optional[list[run.Config[Callback]]] = None, +) -> run.Config[nl.Trainer]: + """ + Configure the NeMo Lightning Trainer for ChatGLM3 6B model. + + This function sets up the distributed training strategy and other training parameters. + + Args: + tensor_parallelism (int): Degree of tensor model parallelism. 
+ pipeline_parallelism (int): Degree of pipeline model parallelism. + pipeline_parallelism_type (Optional[torch.dtype]): Data type for pipeline parallelism. + virtual_pipeline_parallelism (Optional[int]): Size of virtual pipeline parallelism. + context_parallelism (int): Degree of context parallelism. + sequence_parallelism (bool): Whether to use sequence parallelism. + num_nodes (int): Number of compute nodes to use. + num_gpus_per_node (int): Number of GPUs per node. + max_steps (int): Maximum number of training steps. + callbacks (Optional[list[run.Config[Callback]]]): List of callback configurations. + + Returns: + run.Config[nl.Trainer]: Configuration for the NeMo Lightning Trainer. + + Examples: + CLI usage: + $ nemo llm pretrain trainer=chatglm3_6b ... + + Python API usage: + >>> trainer_config = trainer(num_nodes=2, num_gpus_per_node=8) + >>> print(trainer_config) + + Note: + For more information on distributed training strategies, refer to the + NeMo documentation on multi-GPU and multi-node training. + """ + strategy = run.Config( + nl.MegatronStrategy, + tensor_model_parallel_size=tensor_parallelism, + pipeline_model_parallel_size=pipeline_parallelism, + pipeline_dtype=pipeline_parallelism_type, + virtual_pipeline_model_parallel_size=virtual_pipeline_parallelism, + context_parallel_size=context_parallelism, + sequence_parallel=sequence_parallelism, + gradient_as_bucket_view=True, + ckpt_async_save=True, + ckpt_parallel_load=True, + ddp=run.Config( + DistributedDataParallelConfig, + check_for_nan_in_grad=True, + grad_reduce_in_fp32=True, + overlap_grad_reduce=True, + overlap_param_gather=True, + ), + ) + + trainer = run.Config( + nl.Trainer, + accelerator="gpu", + accumulate_grad_batches=1, + callbacks=callbacks, + devices=num_gpus_per_node, + limit_test_batches=50, + limit_val_batches=32, + log_every_n_steps=10, + max_steps=max_steps, + num_nodes=num_nodes, + plugins=bf16_mixed(), + strategy=strategy, + use_distributed_sampler=False, + val_check_interval=2000, + ) + + return trainer + + +@run.cli.factory(target=pretrain, name=NAME) +def pretrain_recipe( + dir: Optional[str] = None, name: str = "default", num_nodes: int = 1, num_gpus_per_node: int = 8, fn=pretrain +) -> run.Partial: + """ + Create a pre-training recipe for ChatGLM3 6B model. + + This function sets up a complete configuration for pre-training, including + model, trainer, data, logging, optimization, and resumption settings. + + Args: + dir (Optional[str]): Directory for saving logs and checkpoints. + name (str): Name of the pre-training run. + num_nodes (int): Number of compute nodes to use. + num_gpus_per_node (int): Number of GPUs per node. + fn (Callable): The pre-training function to use. + + Returns: + run.Partial: Partial configuration for pre-training. + + Examples: + CLI usage: + $ nemo llm pretrain --factory chatglm3_6b + $ nemo llm pretrain --factory "chatglm3_6b(num_nodes=2, name='my_pretrain')" + + Python API usage: + >>> recipe = pretrain_recipe(name="chatglm3_6b_pretrain", num_nodes=2) + >>> print(recipe) + + Note: + For more details on pre-training LLMs with NeMo, see the pre-training + guide in the `examples/llm/pretrain/` directory. 
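+
+        Parallelism and data settings can be adjusted on the returned recipe, e.g. to
+        fit a smaller cluster (editor's sketch; the fields mirror this recipe's
+        trainer() and MockDataModule defaults):
+
+            >>> recipe = pretrain_recipe(name="chatglm3_6b_pretrain", num_nodes=1)
+            >>> recipe.trainer.strategy.context_parallel_size = 1
+            >>> recipe.data.seq_length = 4096
+            >>> recipe.data.global_batch_size = 256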
+ """ + return run.Partial( + fn, + model=model(), + trainer=trainer( + num_nodes=num_nodes, + num_gpus_per_node=num_gpus_per_node, + callbacks=[run.Config(TimingCallback)], + ), + data=run.Config(MockDataModule, seq_length=8192, global_batch_size=512, micro_batch_size=1), + log=default_log(dir=dir, name=name, tensorboard_logger=tensorboard_logger(name=name)), + optim=distributed_fused_adam_with_cosine_annealing(max_lr=3e-4), + resume=default_resume(), + ) + + +@run.cli.factory(target=pretrain, name=NAME + "_optimized") +def pretrain_recipe_performance( + dir: Optional[str] = None, + name: str = "default", + num_nodes: int = 1, + num_gpus_per_node: int = 8, + fn: Callable = pretrain, +) -> run.Partial: + """ + Create a performance-optimized pre-training recipe for ChatGLM3 6B model. + + This recipe enables performance optimizations that may not be suitable for all use cases. + It builds upon the standard pre-training recipe and adds additional performance enhancements. + + Args: + dir (Optional[str]): Directory for saving logs and checkpoints. + name (str): Name of the pre-training run. + num_nodes (int): Number of compute nodes to use. + num_gpus_per_node (int): Number of GPUs per node. + fn (Callable): The pre-training function to use. + + Returns: + run.Partial: Partial configuration for performance-optimized pre-training. + + Examples: + $ nemo llm pretrain --factory chatglm3_6b_optimized + + Python API usage: + >>> recipe = pretrain_recipe_performance(name="chatglm3_6b_perf", num_nodes=4) + >>> print(recipe) + + Note: + Use this recipe with caution and only when you need maximum performance. + It may not be suitable for all hardware configurations or use cases. + """ + recipe = pretrain_recipe(name=name, dir=dir, num_nodes=num_nodes, num_gpus_per_node=num_gpus_per_node, fn=fn) + + recipe.trainer.callbacks.append( + run.Config( + MegatronCommOverlapCallback, + tp_comm_overlap=False, + ) + ) + return recipe + + +@run.cli.factory(target=finetune, name=NAME) +def finetune_recipe( + dir: Optional[str] = None, + name: str = "default", + num_nodes: int = 1, + num_gpus_per_node: int = 8, + peft_scheme: Optional[str] = 'lora', +) -> run.Partial: + """ + Create a fine-tuning recipe for ChatGLM3 6B model. + + This function sets up a complete configuration for fine-tuning, including + model, trainer, data, logging, optimization, and resumption settings. + The recipe uses LoRA (Low-Rank Adaptation) for efficient fine-tuning, unless peft_scheme is set to None. + + Args: + dir (Optional[str]): Directory for saving logs and checkpoints. + name (str): Name of the fine-tuning run. + num_nodes (int): Number of compute nodes to use. + num_gpus_per_node (int): Number of GPUs per node. + peft_scheme (Optional[str]): Name of the peft scheme to use for fine-tuning. Allowed values: 'lora', 'none'/None. + + Returns: + run.Partial: Partial configuration for fine-tuning. + + Examples: + CLI usage: + $ nemo llm finetune --factory chatglm3_6b + + Python API usage: + >>> recipe = finetune_recipe(name="chatglm3_6b_finetune", num_nodes=2) + >>> print(recipe) + + Note: + This recipe uses the SQuAD dataset for fine-tuning. For more information + on fine-tuning LLMs with NeMo, see the fine-tuning guide in the + `examples/llm/finetune/` directory. 
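+
+        With the default `peft_scheme='lora'`, the attached adapter configuration can be
+        tuned after the recipe is built (editor's sketch; `dim` and `target_modules` are
+        assumed to be existing fields of nemo.collections.llm.peft.lora.LoRA, not added
+        by this patch):
+
+            >>> recipe = finetune_recipe(name="chatglm3_6b_lora", peft_scheme='lora')
+            >>> recipe.peft.dim = 16  # smaller adapter rank
+            >>> recipe.peft.target_modules = ['linear_qkv', 'linear_proj']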
+ """ + recipe = default_finetune_recipe(model(), "THUDM/chatglm3-6b", dir, name, num_nodes, num_gpus_per_node) + if peft_scheme is None or peft_scheme.lower() == 'none': + recipe.trainer.strategy.tensor_model_parallel_size = 2 + recipe.optim.config.lr = 5e-6 + elif peft_scheme.lower() == 'lora': + recipe.peft = run.Config(LoRA) + recipe.optim.config.lr = 1e-4 + else: + raise ValueError(f"Unrecognized peft scheme: {peft_scheme}") + return recipe diff --git a/nemo/collections/llm/recipes/finetune_default.py b/nemo/collections/llm/recipes/finetune_default.py index 89c982613126..255763abbf50 100644 --- a/nemo/collections/llm/recipes/finetune_default.py +++ b/nemo/collections/llm/recipes/finetune_default.py @@ -60,7 +60,7 @@ def default_finetune_recipe( ), data=run.Config(llm.SquadDataModule, seq_length=2048, global_batch_size=128, micro_batch_size=1), log=llm.default_log(dir=dir, name=name, tensorboard_logger=tensorboard_logger(name=name)), - optim=distributed_fused_adam_with_cosine_annealing(max_lr=1e-4, min_lr=0, warmup_steps=50), + optim=distributed_fused_adam_with_cosine_annealing(max_lr=1e-4, min_lr=0, warmup_steps=50, adam_beta2=0.98), resume=nemo_resume(resume_path), ) @@ -77,9 +77,9 @@ def default_finetune_trainer( num_nodes=1, num_gpus_per_node=8, max_steps=1000, - limit_test_batches=None, - limit_val_batches=None, - val_check_interval=5, + limit_test_batches=1, + limit_val_batches=1, + val_check_interval=30, ): strategy = run.Config( nl.MegatronStrategy, diff --git a/nemo/collections/llm/recipes/gemma_2b.py b/nemo/collections/llm/recipes/gemma_2b.py new file mode 100644 index 000000000000..cbcd340c1e92 --- /dev/null +++ b/nemo/collections/llm/recipes/gemma_2b.py @@ -0,0 +1,285 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +import os +from typing import Callable, Optional + +import nemo_run as run +import pytorch_lightning as pl +import torch +from megatron.core.distributed import DistributedDataParallelConfig +from pytorch_lightning.callbacks.callback import Callback + +from nemo import lightning as nl +from nemo.collections.llm import GemmaConfig2B, GemmaModel +from nemo.collections.llm.api import finetune, pretrain +from nemo.collections.llm.gpt.data.mock import MockDataModule +from nemo.collections.llm.peft.lora import LoRA +from nemo.collections.llm.recipes.finetune_default import default_finetune_recipe +from nemo.collections.llm.recipes.log.default import default_log, default_resume, tensorboard_logger +from nemo.collections.llm.recipes.optim.adam import distributed_fused_adam_with_cosine_annealing +from nemo.collections.llm.recipes.precision.mixed_precision import bf16_mixed +from nemo.lightning.pytorch.callbacks.megatron_comm_overlap import MegatronCommOverlapCallback +from nemo.utils.exp_manager import TimingCallback + +NAME = "gemma_2b" + + +@run.cli.factory(name=NAME) +def model() -> run.Config[pl.LightningModule]: + """ + Factory function to create a Gemma 2B model configuration. 
+ + Returns: + run.Config[pl.LightningModule]: Configuration for the Gemma 2B model. + + Examples: + CLI usage: + $ nemo llm pretrain model=gemma_2b ... + + Python API usage: + >>> model_config = model() + >>> print(model_config) + """ + return run.Config(GemmaModel, config=run.Config(GemmaConfig2B)) + + +def trainer( + tensor_parallelism: int = 1, + pipeline_parallelism: int = 1, + pipeline_parallelism_type: Optional[torch.dtype] = None, + virtual_pipeline_parallelism: Optional[int] = None, + context_parallelism: int = 2, + sequence_parallelism: bool = False, + num_nodes: int = 1, + num_gpus_per_node: int = 8, + max_steps: int = 1168251, + callbacks: Optional[list[run.Config[Callback]]] = None, +) -> run.Config[nl.Trainer]: + """ + Configure the NeMo Lightning Trainer for Gemma 2B model. + + This function sets up the distributed training strategy and other training parameters. + + Args: + tensor_parallelism (int): Degree of tensor model parallelism. + pipeline_parallelism (int): Degree of pipeline model parallelism. + pipeline_parallelism_type (Optional[torch.dtype]): Data type for pipeline parallelism. + virtual_pipeline_parallelism (Optional[int]): Size of virtual pipeline parallelism. + context_parallelism (int): Degree of context parallelism. + sequence_parallelism (bool): Whether to use sequence parallelism. + num_nodes (int): Number of compute nodes to use. + num_gpus_per_node (int): Number of GPUs per node. + max_steps (int): Maximum number of training steps. + callbacks (Optional[list[run.Config[Callback]]]): List of callback configurations. + + Returns: + run.Config[nl.Trainer]: Configuration for the NeMo Lightning Trainer. + + Examples: + CLI usage: + $ nemo llm pretrain trainer=gemma_2b ... + + Python API usage: + >>> trainer_config = trainer(num_nodes=2, num_gpus_per_node=8) + >>> print(trainer_config) + + Note: + For more information on distributed training strategies, refer to the + NeMo documentation on multi-GPU and multi-node training. + """ + strategy = run.Config( + nl.MegatronStrategy, + tensor_model_parallel_size=tensor_parallelism, + pipeline_model_parallel_size=pipeline_parallelism, + pipeline_dtype=pipeline_parallelism_type, + virtual_pipeline_model_parallel_size=virtual_pipeline_parallelism, + context_parallel_size=context_parallelism, + sequence_parallel=sequence_parallelism, + gradient_as_bucket_view=True, + ckpt_async_save=True, + ckpt_parallel_load=True, + ddp=run.Config( + DistributedDataParallelConfig, + check_for_nan_in_grad=True, + grad_reduce_in_fp32=True, + overlap_grad_reduce=True, + overlap_param_gather=True, + ), + ) + + trainer = run.Config( + nl.Trainer, + accelerator="gpu", + accumulate_grad_batches=1, + callbacks=callbacks, + devices=num_gpus_per_node, + limit_test_batches=50, + limit_val_batches=32, + log_every_n_steps=10, + max_steps=max_steps, + num_nodes=num_nodes, + plugins=bf16_mixed(), + strategy=strategy, + use_distributed_sampler=False, + val_check_interval=2000, + ) + + return trainer + + +@run.cli.factory(target=pretrain, name=NAME) +def pretrain_recipe( + dir: Optional[str] = None, name: str = "default", num_nodes: int = 1, num_gpus_per_node: int = 8, fn=pretrain +) -> run.Partial: + """ + Create a pre-training recipe for Gemma 2B model. + + This function sets up a complete configuration for pre-training, including + model, trainer, data, logging, optimization, and resumption settings. + + Args: + dir (Optional[str]): Directory for saving logs and checkpoints. + name (str): Name of the pre-training run. 
+ num_nodes (int): Number of compute nodes to use. + num_gpus_per_node (int): Number of GPUs per node. + fn (Callable): The pre-training function to use. + + Returns: + run.Partial: Partial configuration for pre-training. + + Examples: + CLI usage: + $ nemo llm pretrain --factory gemma_2b + $ nemo llm pretrain --factory "gemma_2b(num_nodes=2, name='my_pretrain')" + + Python API usage: + >>> recipe = pretrain_recipe(name="gemma_2b_pretrain", num_nodes=2) + >>> print(recipe) + + Note: + For more details on pre-training LLMs with NeMo, see the pre-training + guide in the `examples/llm/pretrain/` directory. + """ + return run.Partial( + fn, + model=model(), + trainer=trainer( + num_nodes=num_nodes, + num_gpus_per_node=num_gpus_per_node, + callbacks=[run.Config(TimingCallback)], + ), + data=run.Config(MockDataModule, seq_length=8192, global_batch_size=512, micro_batch_size=1), + log=default_log(dir=dir, name=name, tensorboard_logger=tensorboard_logger(name=name)), + optim=distributed_fused_adam_with_cosine_annealing(max_lr=3e-4), + resume=default_resume(), + ) + + +@run.cli.factory(target=pretrain, name=NAME + "_optimized") +def pretrain_recipe_performance( + dir: Optional[str] = None, + name: str = "default", + num_nodes: int = 1, + num_gpus_per_node: int = 8, + fn: Callable = pretrain, +) -> run.Partial: + """ + Create a performance-optimized pre-training recipe for Gemma 2B model. + + This recipe enables performance optimizations that may not be suitable for all use cases. + It builds upon the standard pre-training recipe and adds additional performance enhancements. + + Args: + dir (Optional[str]): Directory for saving logs and checkpoints. + name (str): Name of the pre-training run. + num_nodes (int): Number of compute nodes to use. + num_gpus_per_node (int): Number of GPUs per node. + fn (Callable): The pre-training function to use. + + Returns: + run.Partial: Partial configuration for performance-optimized pre-training. + + Examples: + $ nemo llm pretrain --factory gemma_2b_optimized + + Python API usage: + >>> recipe = pretrain_recipe_performance(name="gemma_2b_perf", num_nodes=4) + >>> print(recipe) + + Note: + Use this recipe with caution and only when you need maximum performance. + It may not be suitable for all hardware configurations or use cases. + """ + recipe = pretrain_recipe(name=name, dir=dir, num_nodes=num_nodes, num_gpus_per_node=num_gpus_per_node, fn=fn) + + recipe.trainer.callbacks.append( + run.Config( + MegatronCommOverlapCallback, + tp_comm_overlap=False, + ) + ) + return recipe + + +@run.cli.factory(target=finetune, name=NAME) +def finetune_recipe( + dir: Optional[str] = None, + name: str = "default", + num_nodes: int = 1, + num_gpus_per_node: int = 8, + peft_scheme: Optional[str] = 'lora', +) -> run.Partial: + """ + Create a fine-tuning recipe for Gemma 2B model. + + This function sets up a complete configuration for fine-tuning, including + model, trainer, data, logging, optimization, and resumption settings. + The recipe uses LoRA (Low-Rank Adaptation) for efficient fine-tuning, unless peft_scheme is set to None. + + Args: + dir (Optional[str]): Directory for saving logs and checkpoints. + name (str): Name of the fine-tuning run. + num_nodes (int): Number of compute nodes to use. + num_gpus_per_node (int): Number of GPUs per node. + peft_scheme (Optional[str]): Name of the peft scheme to use for fine-tuning. Allowed values: 'lora', 'none'/None. + + Returns: + run.Partial: Partial configuration for fine-tuning. 
+ + Examples: + CLI usage: + $ nemo llm finetune --factory gemma_2b + + Python API usage: + >>> recipe = finetune_recipe(name="gemma_2b_finetune", num_nodes=2) + >>> print(recipe) + + Note: + This recipe uses the SQuAD dataset for fine-tuning. For more information + on fine-tuning LLMs with NeMo, see the fine-tuning guide in the + `examples/llm/finetune/` directory. + """ + # Disable cuDNN attention since TE 1.8 does not support head dim > 128 + os.environ['NVTE_FUSED_ATTN'] = "0" + + recipe = default_finetune_recipe(model(), "google/gemma-2b", dir, name, num_nodes, num_gpus_per_node) + if peft_scheme is None or peft_scheme.lower() == 'none': + recipe.trainer.strategy.tensor_model_parallel_size = 2 + recipe.optim.config.lr = 5e-6 + elif peft_scheme.lower() == 'lora': + recipe.peft = run.Config(LoRA) + recipe.optim.config.lr = 1e-4 + else: + raise ValueError(f"Unrecognized peft scheme: {peft_scheme}") + return recipe diff --git a/nemo/collections/llm/recipes/gemma_7b.py b/nemo/collections/llm/recipes/gemma_7b.py new file mode 100644 index 000000000000..3b0e206d9ce7 --- /dev/null +++ b/nemo/collections/llm/recipes/gemma_7b.py @@ -0,0 +1,285 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +import os +from typing import Callable, Optional + +import nemo_run as run +import pytorch_lightning as pl +import torch +from megatron.core.distributed import DistributedDataParallelConfig +from pytorch_lightning.callbacks.callback import Callback + +from nemo import lightning as nl +from nemo.collections.llm import GemmaConfig7B, GemmaModel +from nemo.collections.llm.api import finetune, pretrain +from nemo.collections.llm.gpt.data.mock import MockDataModule +from nemo.collections.llm.peft.lora import LoRA +from nemo.collections.llm.recipes.finetune_default import default_finetune_recipe +from nemo.collections.llm.recipes.log.default import default_log, default_resume, tensorboard_logger +from nemo.collections.llm.recipes.optim.adam import distributed_fused_adam_with_cosine_annealing +from nemo.collections.llm.recipes.precision.mixed_precision import bf16_mixed +from nemo.lightning.pytorch.callbacks.megatron_comm_overlap import MegatronCommOverlapCallback +from nemo.utils.exp_manager import TimingCallback + +NAME = "gemma_7b" + + +@run.cli.factory(name=NAME) +def model() -> run.Config[pl.LightningModule]: + """ + Factory function to create a Gemma 7B model configuration. + + Returns: + run.Config[pl.LightningModule]: Configuration for the Gemma 7B model. + + Examples: + CLI usage: + $ nemo llm pretrain model=gemma_7b ... 
+ + Python API usage: + >>> model_config = model() + >>> print(model_config) + """ + return run.Config(GemmaModel, config=run.Config(GemmaConfig7B)) + + +def trainer( + tensor_parallelism: int = 1, + pipeline_parallelism: int = 1, + pipeline_parallelism_type: Optional[torch.dtype] = None, + virtual_pipeline_parallelism: Optional[int] = None, + context_parallelism: int = 2, + sequence_parallelism: bool = False, + num_nodes: int = 1, + num_gpus_per_node: int = 8, + max_steps: int = 1168251, + callbacks: Optional[list[run.Config[Callback]]] = None, +) -> run.Config[nl.Trainer]: + """ + Configure the NeMo Lightning Trainer for Gemma 7B model. + + This function sets up the distributed training strategy and other training parameters. + + Args: + tensor_parallelism (int): Degree of tensor model parallelism. + pipeline_parallelism (int): Degree of pipeline model parallelism. + pipeline_parallelism_type (Optional[torch.dtype]): Data type for pipeline parallelism. + virtual_pipeline_parallelism (Optional[int]): Size of virtual pipeline parallelism. + context_parallelism (int): Degree of context parallelism. + sequence_parallelism (bool): Whether to use sequence parallelism. + num_nodes (int): Number of compute nodes to use. + num_gpus_per_node (int): Number of GPUs per node. + max_steps (int): Maximum number of training steps. + callbacks (Optional[list[run.Config[Callback]]]): List of callback configurations. + + Returns: + run.Config[nl.Trainer]: Configuration for the NeMo Lightning Trainer. + + Examples: + CLI usage: + $ nemo llm pretrain trainer=gemma_7b ... + + Python API usage: + >>> trainer_config = trainer(num_nodes=2, num_gpus_per_node=8) + >>> print(trainer_config) + + Note: + For more information on distributed training strategies, refer to the + NeMo documentation on multi-GPU and multi-node training. + """ + strategy = run.Config( + nl.MegatronStrategy, + tensor_model_parallel_size=tensor_parallelism, + pipeline_model_parallel_size=pipeline_parallelism, + pipeline_dtype=pipeline_parallelism_type, + virtual_pipeline_model_parallel_size=virtual_pipeline_parallelism, + context_parallel_size=context_parallelism, + sequence_parallel=sequence_parallelism, + gradient_as_bucket_view=True, + ckpt_async_save=True, + ckpt_parallel_load=True, + ddp=run.Config( + DistributedDataParallelConfig, + check_for_nan_in_grad=True, + grad_reduce_in_fp32=True, + overlap_grad_reduce=True, + overlap_param_gather=True, + ), + ) + + trainer = run.Config( + nl.Trainer, + accelerator="gpu", + accumulate_grad_batches=1, + callbacks=callbacks, + devices=num_gpus_per_node, + limit_test_batches=50, + limit_val_batches=32, + log_every_n_steps=10, + max_steps=max_steps, + num_nodes=num_nodes, + plugins=bf16_mixed(), + strategy=strategy, + use_distributed_sampler=False, + val_check_interval=2000, + ) + + return trainer + + +@run.cli.factory(target=pretrain, name=NAME) +def pretrain_recipe( + dir: Optional[str] = None, name: str = "default", num_nodes: int = 1, num_gpus_per_node: int = 8, fn=pretrain +) -> run.Partial: + """ + Create a pre-training recipe for Gemma 7B model. + + This function sets up a complete configuration for pre-training, including + model, trainer, data, logging, optimization, and resumption settings. + + Args: + dir (Optional[str]): Directory for saving logs and checkpoints. + name (str): Name of the pre-training run. + num_nodes (int): Number of compute nodes to use. + num_gpus_per_node (int): Number of GPUs per node. + fn (Callable): The pre-training function to use. 
+ + Returns: + run.Partial: Partial configuration for pre-training. + + Examples: + CLI usage: + $ nemo llm pretrain --factory gemma_7b + $ nemo llm pretrain --factory "gemma_7b(num_nodes=2, name='my_pretrain')" + + Python API usage: + >>> recipe = pretrain_recipe(name="gemma_7b_pretrain", num_nodes=2) + >>> print(recipe) + + Note: + For more details on pre-training LLMs with NeMo, see the pre-training + guide in the `examples/llm/pretrain/` directory. + """ + return run.Partial( + fn, + model=model(), + trainer=trainer( + num_nodes=num_nodes, + num_gpus_per_node=num_gpus_per_node, + callbacks=[run.Config(TimingCallback)], + ), + data=run.Config(MockDataModule, seq_length=8192, global_batch_size=512, micro_batch_size=1), + log=default_log(dir=dir, name=name, tensorboard_logger=tensorboard_logger(name=name)), + optim=distributed_fused_adam_with_cosine_annealing(max_lr=3e-4), + resume=default_resume(), + ) + + +@run.cli.factory(target=pretrain, name=NAME + "_optimized") +def pretrain_recipe_performance( + dir: Optional[str] = None, + name: str = "default", + num_nodes: int = 1, + num_gpus_per_node: int = 8, + fn: Callable = pretrain, +) -> run.Partial: + """ + Create a performance-optimized pre-training recipe for Gemma 7B model. + + This recipe enables performance optimizations that may not be suitable for all use cases. + It builds upon the standard pre-training recipe and adds additional performance enhancements. + + Args: + dir (Optional[str]): Directory for saving logs and checkpoints. + name (str): Name of the pre-training run. + num_nodes (int): Number of compute nodes to use. + num_gpus_per_node (int): Number of GPUs per node. + fn (Callable): The pre-training function to use. + + Returns: + run.Partial: Partial configuration for performance-optimized pre-training. + + Examples: + $ nemo llm pretrain --factory gemma_7b_optimized + + Python API usage: + >>> recipe = pretrain_recipe_performance(name="gemma_7b_perf", num_nodes=4) + >>> print(recipe) + + Note: + Use this recipe with caution and only when you need maximum performance. + It may not be suitable for all hardware configurations or use cases. + """ + recipe = pretrain_recipe(name=name, dir=dir, num_nodes=num_nodes, num_gpus_per_node=num_gpus_per_node, fn=fn) + + recipe.trainer.callbacks.append( + run.Config( + MegatronCommOverlapCallback, + tp_comm_overlap=False, + ) + ) + return recipe + + +@run.cli.factory(target=finetune, name=NAME) +def finetune_recipe( + dir: Optional[str] = None, + name: str = "default", + num_nodes: int = 1, + num_gpus_per_node: int = 8, + peft_scheme: Optional[str] = 'lora', +) -> run.Partial: + """ + Create a fine-tuning recipe for Gemma 7B model. + + This function sets up a complete configuration for fine-tuning, including + model, trainer, data, logging, optimization, and resumption settings. + The recipe uses LoRA (Low-Rank Adaptation) for efficient fine-tuning, unless peft_scheme is set to None. + + Args: + dir (Optional[str]): Directory for saving logs and checkpoints. + name (str): Name of the fine-tuning run. + num_nodes (int): Number of compute nodes to use. + num_gpus_per_node (int): Number of GPUs per node. + peft_scheme (Optional[str]): Name of the peft scheme to use for fine-tuning. Allowed values: 'lora', 'none'/None. + + Returns: + run.Partial: Partial configuration for fine-tuning. 
+ + Examples: + CLI usage: + $ nemo llm finetune --factory gemma_7b + + Python API usage: + >>> recipe = finetune_recipe(name="gemma_7b_finetune", num_nodes=2) + >>> print(recipe) + + Note: + This recipe uses the SQuAD dataset for fine-tuning. For more information + on fine-tuning LLMs with NeMo, see the fine-tuning guide in the + `examples/llm/finetune/` directory. + """ + # Disable cuDNN attention since TE 1.8 does not support head dim > 128 + os.environ['NVTE_FUSED_ATTN'] = "0" + + recipe = default_finetune_recipe(model(), "google/gemma-7b", dir, name, num_nodes, num_gpus_per_node) + if peft_scheme is None or peft_scheme.lower() == 'none': + recipe.trainer.strategy.tensor_model_parallel_size = 2 + recipe.optim.config.lr = 5e-6 + elif peft_scheme.lower() == 'lora': + recipe.peft = run.Config(LoRA) + recipe.optim.config.lr = 1e-4 + else: + raise ValueError(f"Unrecognized peft scheme: {peft_scheme}") + return recipe diff --git a/nemo/collections/llm/recipes/optim/adam.py b/nemo/collections/llm/recipes/optim/adam.py index 5be87ac71e9d..c6510577711d 100644 --- a/nemo/collections/llm/recipes/optim/adam.py +++ b/nemo/collections/llm/recipes/optim/adam.py @@ -25,6 +25,8 @@ def distributed_fused_adam_with_cosine_annealing( precision: str = "bf16-mixed", # or "16-mixed" warmup_steps: int = 2000, constant_steps: int = 0, + adam_beta1: float = 0.9, + adam_beta2: float = 0.95, max_lr: float = 1e-4, min_lr: Optional[float] = None, clip_grad: float = 1.0, @@ -37,14 +39,14 @@ def distributed_fused_adam_with_cosine_annealing( weight_decay=0.1, bf16=precision == "bf16-mixed", fp16=precision == "16-mixed", - adam_beta1=0.9, - adam_beta2=0.95, + adam_beta1=adam_beta1, + adam_beta2=adam_beta2, adam_eps=1e-5, use_distributed_optimizer=True, clip_grad=clip_grad, ) - min_lr = min_lr or (0.1 * max_lr) + min_lr = min_lr if min_lr is not None else (0.1 * max_lr) sched = run.Config( CosineAnnealingScheduler, warmup_steps=warmup_steps,