From ca6035aa77b96425a5cf40fdc24cd95b233e4947 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?oliver=20k=C3=B6nig?= Date: Wed, 23 Oct 2024 21:02:53 +0200 Subject: [PATCH 1/8] llm.generate fixes (#10983) (#11007) * fix context path, disable optimizer init, add tp * format * address comments, require user to provide trainer * minor fix * minor fixes --------- Signed-off-by: HuiyingLi Co-authored-by: Huiying --- nemo/collections/llm/api.py | 2 +- nemo/collections/llm/inference/base.py | 16 ++++++++++++---- 2 files changed, 13 insertions(+), 5 deletions(-) diff --git a/nemo/collections/llm/api.py b/nemo/collections/llm/api.py index 71e006472db9..a9b3d4361f5b 100644 --- a/nemo/collections/llm/api.py +++ b/nemo/collections/llm/api.py @@ -436,7 +436,7 @@ def export_ckpt( def generate( path: Union[Path, str], prompts: list[str], - trainer: Optional[nl.Trainer] = None, + trainer: nl.Trainer, params_dtype: torch.dtype = torch.bfloat16, max_batch_size: int = 4, random_seed: Optional[int] = None, diff --git a/nemo/collections/llm/inference/base.py b/nemo/collections/llm/inference/base.py index 95da536fde06..0171f1c2dd5c 100644 --- a/nemo/collections/llm/inference/base.py +++ b/nemo/collections/llm/inference/base.py @@ -16,6 +16,7 @@ import nemo.lightning as nl from nemo.lightning import io +from nemo.lightning.ckpt_utils import ckpt_to_context_subdir from nemo.lightning.pytorch.strategies.megatron_strategy import MegatronStrategy from nemo.lightning.pytorch.strategies.utils import RestoreConfig @@ -44,6 +45,7 @@ def _setup_trainer_and_restore_model(path: Path, trainer: nl.Trainer, model: pl. load_optim_state=False, ) trainer.strategy.restore_config = restore_config + trainer.strategy._setup_optimizers = False trainer.ckpt_path = None trainer.strategy.connect(model) if trainer.strategy.launcher is not None: @@ -61,16 +63,22 @@ def _setup_trainer_and_restore_model(path: Path, trainer: nl.Trainer, model: pl. def setup_model_and_tokenizer( path: Path, - trainer: Optional[nl.Trainer] = None, + trainer: nl.Trainer, params_dtype: torch.dtype = torch.bfloat16, inference_batch_times_seqlen_threshold: int = 1000, ) -> tuple[MCoreGPTModel, MCoreTokenizerWrappper]: - model: io.TrainerContext = io.load_context(path=path, subpath="model") - trainer = trainer or io.load_context(path=path, subpath="trainer") + model: io.TrainerContext = io.load_context(path=ckpt_to_context_subdir(path), subpath="model") _setup_trainer_and_restore_model(path=path, trainer=trainer, model=model) # This is to get the MCore model required in GPTInferenceWrapper. 
- mcore_model = model.module.module.module + mcore_model = model + while mcore_model: + if type(mcore_model) is MCoreGPTModel: + break + mcore_model = getattr(mcore_model, "module", None) + if mcore_model is None or type(mcore_model) is not MCoreGPTModel: + raise ValueError("Exact McoreGPTModel instance not found in the model structure.") + inference_wrapped_model = GPTInferenceWrapper( mcore_model, InferenceWrapperConfig( From a8bd34945611586899c699a3c19ed33e885db39e Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?oliver=20k=C3=B6nig?= Date: Thu, 24 Oct 2024 15:48:08 +0200 Subject: [PATCH 2/8] Add a build option to load_context (#10713) (#11023) * Add a build option to load_context * Adding test * Trying to fix failing CPU test * cherry-pick fix --------- Signed-off-by: Marc Romeijn Signed-off-by: Alexandros Koumparoulis Co-authored-by: Marc Romeyn Co-authored-by: Alexandros Koumparoulis --- nemo/lightning/io/api.py | 18 ++++++++++++++---- nemo/lightning/io/mixin.py | 5 ++++- tests/lightning/io/test_api.py | 13 +++++++------ 3 files changed, 25 insertions(+), 11 deletions(-) diff --git a/nemo/lightning/io/api.py b/nemo/lightning/io/api.py index 643b671d1d85..7a702edb7f21 100644 --- a/nemo/lightning/io/api.py +++ b/nemo/lightning/io/api.py @@ -1,5 +1,6 @@ from pathlib import Path -from typing import Callable, Optional, Type +from typing import Callable, Optional, Type, overload +import fiddle as fdl import pytorch_lightning as pl @@ -7,14 +8,23 @@ from nemo.lightning.io.pl import TrainerContext -def load_context(path: Path, subpath: Optional[str] = None) -> TrainerContext: +@overload +def load_context(path: Path, subpath: Optional[str] = None, build: bool = True) -> TrainerContext: ... + + +@overload +def load_context(path: Path, subpath: Optional[str] = None, build: bool = False) -> fdl.Config[TrainerContext]: ... + + +def load_context(path: Path, subpath: Optional[str] = None, build: bool = True): """ Loads a TrainerContext from a json-file or directory. Args: path (Path): The path to the json-file or directory containing 'io.json'. subpath (Optional[str]): Subpath to selectively load only specific objects inside the TrainerContext. Defaults to None. - + build (bool): Whether to build the TrainerContext. Defaults to True. + Otherwise, the TrainerContext is returned as a Config[TrainerContext] object. Returns ------- TrainerContext: The loaded TrainerContext instance. @@ -27,7 +37,7 @@ def load_context(path: Path, subpath: Optional[str] = None) -> TrainerContext: checkpoint: TrainerContext = load_ckpt("/path/to/checkpoint", subpath="model.config") """ - return load(path, output_type=TrainerContext, subpath=subpath) + return load(path, output_type=TrainerContext, subpath=subpath, build=build) def model_importer(target: Type[ConnectorMixin], ext: str) -> Callable[[Type[ConnT]], Type[ConnT]]: diff --git a/nemo/lightning/io/mixin.py b/nemo/lightning/io/mixin.py index ee53fe010145..2d1162bb2156 100644 --- a/nemo/lightning/io/mixin.py +++ b/nemo/lightning/io/mixin.py @@ -634,7 +634,7 @@ def _artifact_transform_load(cfg: fdl.Config, path: Path): pass -def load(path: Path, output_type: Type[CkptType] = Any, subpath: Optional[str] = None) -> CkptType: +def load(path: Path, output_type: Type[CkptType] = Any, subpath: Optional[str] = None, build: bool = True) -> CkptType: """ Loads a configuration from a pickle file and constructs an object of the specified type. 
@@ -698,4 +698,7 @@ def load(path: Path, output_type: Type[CkptType] = Any, subpath: Optional[str] = config = serialization.Deserialization(json_config).result _artifact_transform_load(config, path) + if not build: + return config + return fdl.build(config) diff --git a/tests/lightning/io/test_api.py b/tests/lightning/io/test_api.py index 386bd5b5fdab..a4d458cef17b 100644 --- a/tests/lightning/io/test_api.py +++ b/tests/lightning/io/test_api.py @@ -16,6 +16,7 @@ from functools import partial from pathlib import Path +import fiddle as fdl import pytest import yaml from pytorch_lightning.loggers import TensorBoardLogger @@ -69,9 +70,9 @@ def test_reload_ckpt(self, tmpdir, partial_function_with_pos_and_key_args): loaded_func = loaded.extra["dummy"] assert loaded_func(b=2) == partial_function_with_pos_and_key_args(b=2) - model_yaml = Path(tmpdir) / "model.yaml" - assert model_yaml.exists() - - observed = yaml.safe_load(model_yaml.read_text()) - expected = yaml.safe_load((Path(ARTIFACTS_DIR) / "model.yaml").read_text()) - assert observed.keys() == expected.keys() + config = io.load_context(tmpdir, build=False) + assert isinstance(config, fdl.Config) + assert config.model.config.seq_length == ckpt.model.config.seq_length + assert config.model.tokenizer.vocab_file.startswith(str(tmpdir)) + assert config.model.tokenizer.merges_file.startswith(str(tmpdir)) + assert config.extra["dummy"] == fdl.Partial(dummy_extra, 10, c=15) From 4714421d0c3c76e769bda1680f0999b37c8d8637 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?oliver=20k=C3=B6nig?= Date: Thu, 24 Oct 2024 15:51:04 +0200 Subject: [PATCH 3/8] Change default for always_save_context to True (#11014) (#11020) Signed-off-by: Abhishree Co-authored-by: Abhishree Thittenamane <47577437+athitten@users.noreply.github.com> Co-authored-by: Pablo Garay --- nemo/lightning/pytorch/callbacks/model_checkpoint.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/nemo/lightning/pytorch/callbacks/model_checkpoint.py b/nemo/lightning/pytorch/callbacks/model_checkpoint.py index 97df7bfbcfa5..ee46630791ab 100644 --- a/nemo/lightning/pytorch/callbacks/model_checkpoint.py +++ b/nemo/lightning/pytorch/callbacks/model_checkpoint.py @@ -73,7 +73,7 @@ def __init__( train_time_interval: Optional[timedelta] = None, save_on_train_epoch_end: Optional[bool] = False, # Save after training, not after validation save_optim_on_train_end: Optional[bool] = False, - always_save_context: bool = False, + always_save_context: bool = True, save_context_on_train_end: bool = True, **kwargs, ): From e34e04bd44c81d99c27c5f3ced324baaea3548ec Mon Sep 17 00:00:00 2001 From: malay-nagda <164242706+malay-nagda@users.noreply.github.com> Date: Thu, 24 Oct 2024 23:01:32 +0530 Subject: [PATCH 4/8] Performance mode (#10926) and gpt3 175b cli (#10985) (#11021) * Performance mode (#10926) * llama3 performance mode Signed-off-by: Malay Nagda * llama3 performance mode tests Signed-off-by: Malay Nagda * mixtral performance mode Signed-off-by: Malay Nagda * remove unused Signed-off-by: Malay Nagda * nemotron perf mode Signed-off-by: Malay Nagda * 405b, 174b perf mode Signed-off-by: Malay Nagda * perf mode comment Signed-off-by: Malay Nagda * Apply isort and black reformatting Signed-off-by: malay-nagda --------- Signed-off-by: Malay Nagda Signed-off-by: malay-nagda <164242706+malay-nagda@users.noreply.github.com> Signed-off-by: malay-nagda Co-authored-by: malay-nagda * gpt3 175b cli (#10985) * gpt3 175b cli Signed-off-by: Malay Nagda * Apply isort and black reformatting Signed-off-by: malay-nagda * 
Apply isort and black reformatting Signed-off-by: malay-nagda --------- Signed-off-by: Malay Nagda Signed-off-by: malay-nagda Signed-off-by: malay-nagda <164242706+malay-nagda@users.noreply.github.com> Co-authored-by: malay-nagda --------- Signed-off-by: Malay Nagda Signed-off-by: malay-nagda <164242706+malay-nagda@users.noreply.github.com> Signed-off-by: malay-nagda Co-authored-by: malay-nagda --- nemo/collections/llm/recipes/__init__.py | 2 + nemo/collections/llm/recipes/gpt3_175b.py | 52 ++++++++----------- nemo/collections/llm/recipes/llama31_405b.py | 52 ++++++++----------- nemo/collections/llm/recipes/llama3_70b.py | 51 +++++++++--------- nemo/collections/llm/recipes/llama3_8b.py | 43 ++++++--------- nemo/collections/llm/recipes/mixtral_8x22b.py | 49 ++++++++--------- nemo/collections/llm/recipes/mixtral_8x7b.py | 49 ++++++++--------- nemo/collections/llm/recipes/nemotron3_8b.py | 36 +++++-------- nemo/collections/llm/recipes/nemotron4_15b.py | 37 +++++-------- nemo/collections/llm/recipes/nemotron4_22b.py | 45 ++++++---------- .../collections/llm/recipes/nemotron4_340b.py | 45 ++++++---------- .../llm/recipes/test_llama3_70b.py | 6 +-- .../collections/llm/recipes/test_llama3_8b.py | 6 +-- tests/lightning/test_nemo_run.py | 1 + 14 files changed, 196 insertions(+), 278 deletions(-) diff --git a/nemo/collections/llm/recipes/__init__.py b/nemo/collections/llm/recipes/__init__.py index 555d29f2ceb1..bb5c41bc600c 100644 --- a/nemo/collections/llm/recipes/__init__.py +++ b/nemo/collections/llm/recipes/__init__.py @@ -14,6 +14,7 @@ from nemo.collections.llm.recipes import ( + gpt3_175b, llama3_8b, llama3_8b_16k, llama3_8b_64k, @@ -61,6 +62,7 @@ "nemotron4_22b_16k", "nemotron4_22b_64k", "nemotron4_340b", + "gpt3_175b", "adam", "default_log", "default_resume", diff --git a/nemo/collections/llm/recipes/gpt3_175b.py b/nemo/collections/llm/recipes/gpt3_175b.py index 7e016154aa3e..1abe8a218e82 100644 --- a/nemo/collections/llm/recipes/gpt3_175b.py +++ b/nemo/collections/llm/recipes/gpt3_175b.py @@ -142,7 +142,12 @@ def trainer( @run.cli.factory(target=pretrain, name=NAME) def pretrain_recipe( - dir: Optional[str] = None, name: str = "default", num_nodes: int = 1, num_gpus_per_node: int = 8, fn=pretrain + dir: Optional[str] = None, + name: str = "default", + num_nodes: int = 1, + num_gpus_per_node: int = 8, + performance_mode: bool = False, + fn: Callable = pretrain, ) -> run.Partial: """ Create a pre-training recipe for GPT3 175B model. @@ -155,6 +160,7 @@ def pretrain_recipe( name (str): Name of the pre-training run. num_nodes (int): Number of compute nodes to use. num_gpus_per_node (int): Number of GPUs per node. + performance_mode (bool): If true, enables optimizations for maximum performance. fn (Callable): The pre-training function to use. Returns: @@ -172,7 +178,7 @@ def pretrain_recipe( Note: This recipe is optimized for the large 175B model and requires significant computational resources. 
""" - return run.Partial( + recipe = run.Partial( fn, model=model(), trainer=trainer( @@ -186,49 +192,35 @@ def pretrain_recipe( resume=default_resume(), ) + if performance_mode: + recipe = pretrain_performance_optimizations(recipe) -@run.cli.factory(target=pretrain, name=NAME + "_performance") -def pretrain_recipe_performance( - dir: Optional[str] = None, - name: str = "default", - num_nodes: int = 1, - num_gpus_per_node: int = 8, - fn: Callable = pretrain, -) -> run.Partial: + return recipe + + +def pretrain_performance_optimizations(recipe: run.Partial) -> run.Partial: """ Create a performance-optimized pre-training recipe for GPT3 175B model. - This recipe enables performance optimizations that may not be suitable for all use cases. + This method enables performance optimizations that may not be suitable for all use cases. It builds upon the standard pre-training recipe and adds additional performance enhancements. Args: - dir (Optional[str]): Directory for saving logs and checkpoints. - name (str): Name of the pre-training run. - num_nodes (int): Number of compute nodes to use. - num_gpus_per_node (int): Number of GPUs per node. - fn (Callable): The pre-training function to use. + recipe (run.Partial): Base pre-train recipe to which performance optimizations will be added Returns: run.Partial: Partial configuration for performance-optimized pre-training. - Examples: - CLI usage: - $ nemo llm pretrain --factory "gpt3_175b.pretrain_recipe_performance(num_nodes=64, name='perf_pretrain')" - - Python API usage: - >>> recipe = pretrain_recipe_performance(name="gpt3_175b_perf", num_nodes=64) - >>> print(recipe) - Note: - Use this recipe with caution and only when you need maximum performance. + Use this method with caution and only when you need maximum performance. It may not be suitable for all hardware configurations or use cases. """ - recipe = pretrain_recipe(name=name, dir=dir, num_nodes=num_nodes, num_gpus_per_node=num_gpus_per_node, fn=fn) - # 'overlap_param_gather_with_optimizer_step' and 'align_param_gather' params are set automatically by MegatronCommOverlapCallback - # They are added here for user's knowledge - # overlap_param_gather_with_optimizer_step- If true, overlap param all-gather of first bucket with optimizer step. - # align_param_gather- If true, all PP stages launch param all-gathers simultaneously, else each PP stage launches independently as needed + # 'overlap_param_gather_with_optimizer_step' and 'align_param_gather' params are set automatically + # by MegatronCommOverlapCallback. They are added here for user's knowledge. + # overlap_param_gather_with_optimizer_step- Overlap param all-gather of first bucket with optimizer step. + # align_param_gather- If true, all PP stages launch param all-gathers simultaneously, else + # each PP stage launches independently as needed. 
recipe.trainer.callbacks.append( run.Config( diff --git a/nemo/collections/llm/recipes/llama31_405b.py b/nemo/collections/llm/recipes/llama31_405b.py index 45efedc3cbd6..055e9a06fcba 100644 --- a/nemo/collections/llm/recipes/llama31_405b.py +++ b/nemo/collections/llm/recipes/llama31_405b.py @@ -144,7 +144,12 @@ def trainer( @run.cli.factory(target=pretrain, name=NAME) def pretrain_recipe( - dir: Optional[str] = None, name: str = "default", num_nodes: int = 1, num_gpus_per_node: int = 8, fn=pretrain + dir: Optional[str] = None, + name: str = "default", + num_nodes: int = 1, + num_gpus_per_node: int = 8, + performance_mode: bool = False, + fn: Callable = pretrain, ) -> run.Partial: """ Create a pre-training recipe for Llama3.1 405B model. @@ -157,6 +162,7 @@ def pretrain_recipe( name (str): Name of the pre-training run. num_nodes (int): Number of compute nodes to use. num_gpus_per_node (int): Number of GPUs per node. + performance_mode (bool): If true, enables optimizations for maximum performance. fn (Callable): The pre-training function to use. Returns: @@ -174,7 +180,7 @@ def pretrain_recipe( Note: This recipe is optimized for the large 405B model and requires significant computational resources. """ - return run.Partial( + recipe = run.Partial( fn, model=model(), trainer=trainer( @@ -188,49 +194,35 @@ def pretrain_recipe( resume=default_resume(), ) + if performance_mode: + recipe = pretrain_performance_optimizations(recipe) -@run.cli.factory(target=pretrain, name=NAME + "_performance") -def pretrain_recipe_performance( - dir: Optional[str] = None, - name: str = "default", - num_nodes: int = 1, - num_gpus_per_node: int = 8, - fn: Callable = pretrain, -) -> run.Partial: + return recipe + + +def pretrain_performance_optimizations(recipe: run.Partial) -> run.Partial: """ Create a performance-optimized pre-training recipe for Llama3.1 405B model. - This recipe enables performance optimizations that may not be suitable for all use cases. + This method enables performance optimizations that may not be suitable for all use cases. It builds upon the standard pre-training recipe and adds additional performance enhancements. Args: - dir (Optional[str]): Directory for saving logs and checkpoints. - name (str): Name of the pre-training run. - num_nodes (int): Number of compute nodes to use. - num_gpus_per_node (int): Number of GPUs per node. - fn (Callable): The pre-training function to use. + recipe (run.Partial): Base pre-train recipe to which performance optimizations will be added Returns: run.Partial: Partial configuration for performance-optimized pre-training. - Examples: - CLI usage: - $ nemo llm pretrain --factory "llama31_405b.pretrain_recipe_performance(num_nodes=4, name='perf_pretrain')" - - Python API usage: - >>> recipe = pretrain_recipe_performance(name="llama31_405b_perf", num_nodes=4) - >>> print(recipe) - Note: - Use this recipe with caution and only when you need maximum performance. + Use this method with caution and only when you need maximum performance. It may not be suitable for all hardware configurations or use cases. """ - recipe = pretrain_recipe(name=name, dir=dir, num_nodes=num_nodes, num_gpus_per_node=num_gpus_per_node, fn=fn) - # 'overlap_param_gather_with_optimizer_step' and 'align_param_gather' params are set automatically by MegatronCommOverlapCallback - # They are added here for user's knowledge - # overlap_param_gather_with_optimizer_step- If true, overlap param all-gather of first bucket with optimizer step. 
- # align_param_gather- If true, all PP stages launch param all-gathers simultaneously, else each PP stage launches independently as needed + # 'overlap_param_gather_with_optimizer_step' and 'align_param_gather' params are set automatically + # by MegatronCommOverlapCallback. They are added here for user's knowledge. + # overlap_param_gather_with_optimizer_step- Overlap param all-gather of first bucket with optimizer step. + # align_param_gather- If true, all PP stages launch param all-gathers simultaneously, else + # each PP stage launches independently as needed. recipe.trainer.callbacks.append( run.Config( diff --git a/nemo/collections/llm/recipes/llama3_70b.py b/nemo/collections/llm/recipes/llama3_70b.py index ffd4a833885e..b283c68b222b 100644 --- a/nemo/collections/llm/recipes/llama3_70b.py +++ b/nemo/collections/llm/recipes/llama3_70b.py @@ -13,7 +13,7 @@ # limitations under the License. -from typing import Optional +from typing import Callable, Optional import nemo_run as run import pytorch_lightning as pl @@ -142,7 +142,12 @@ def trainer( @run.cli.factory(target=pretrain, name=NAME) def pretrain_recipe( - dir: Optional[str] = None, name: str = "default", num_nodes: int = 4, num_gpus_per_node: int = 8, fn=pretrain + dir: Optional[str] = None, + name: str = "default", + num_nodes: int = 1, + num_gpus_per_node: int = 8, + performance_mode: bool = False, + fn: Callable = pretrain, ) -> run.Partial: """ Create a pre-training recipe for Llama3 70B model. @@ -155,6 +160,7 @@ def pretrain_recipe( name (str): Name of the pre-training run. num_nodes (int): Number of compute nodes to use. num_gpus_per_node (int): Number of GPUs per node. + performance_mode (bool): If true, enables optimizations for maximum performance. fn (Callable): The pre-training function to use. Returns: @@ -172,7 +178,8 @@ def pretrain_recipe( Note: This recipe is optimized for the large 70B model and requires significant computational resources. """ - return run.Partial( + + recipe = run.Partial( fn, model=model(), trainer=trainer( @@ -186,45 +193,35 @@ def pretrain_recipe( resume=default_resume(), ) + if performance_mode: + recipe = pretrain_performance_optimizations(recipe) -@run.cli.factory(target=pretrain, name=NAME + "_performance") -def pretrain_recipe_performance( - dir: Optional[str] = None, name: str = "default", num_nodes: int = 4, num_gpus_per_node: int = 8, fn=pretrain -) -> run.Partial: + return recipe + + +def pretrain_performance_optimizations(recipe: run.Partial) -> run.Partial: """ Create a performance-optimized pre-training recipe for Llama3 70B model. - This recipe enables performance optimizations that may not be suitable for all use cases. + This method enables performance optimizations that may not be suitable for all use cases. It builds upon the standard pre-training recipe and adds additional performance enhancements. Args: - dir (Optional[str]): Directory for saving logs and checkpoints. - name (str): Name of the pre-training run. - num_nodes (int): Number of compute nodes to use. - num_gpus_per_node (int): Number of GPUs per node. - fn (Callable): The pre-training function to use. + recipe (run.Partial): Base pre-train recipe to which performance optimizations will be added Returns: run.Partial: Partial configuration for performance-optimized pre-training. 
- Examples: - CLI usage: - $ nemo llm pretrain --factory "llama3_70b.pretrain_recipe_performance(num_nodes=4, name='perf_pretrain')" - - Python API usage: - >>> recipe = pretrain_recipe_performance(name="llama3_70b_perf", num_nodes=4) - >>> print(recipe) - Note: - Use this recipe with caution and only when you need maximum performance. + Use this method with caution and only when you need maximum performance. It may not be suitable for all hardware configurations or use cases. """ - recipe = pretrain_recipe(name=name, dir=dir, num_nodes=num_nodes, num_gpus_per_node=num_gpus_per_node, fn=fn) - # 'overlap_param_gather_with_optimizer_step' and 'align_param_gather' params are set automatically by MegatronCommOverlapCallback - # They are added here for user's knowledge - # overlap_param_gather_with_optimizer_step- If true, overlap param all-gather of first bucket with optimizer step. - # align_param_gather- If true, all PP stages launch param all-gathers simultaneously, else each PP stage launches independently as needed + # 'overlap_param_gather_with_optimizer_step' and 'align_param_gather' params are set automatically + # by MegatronCommOverlapCallback. They are added here for user's knowledge. + # overlap_param_gather_with_optimizer_step- Overlap param all-gather of first bucket with optimizer step. + # align_param_gather- If true, all PP stages launch param all-gathers simultaneously, else + # each PP stage launches independently as needed. recipe.trainer.callbacks.append( run.Config( diff --git a/nemo/collections/llm/recipes/llama3_8b.py b/nemo/collections/llm/recipes/llama3_8b.py index dd162ed29914..269eb7865dcf 100644 --- a/nemo/collections/llm/recipes/llama3_8b.py +++ b/nemo/collections/llm/recipes/llama3_8b.py @@ -143,7 +143,12 @@ def trainer( @run.cli.factory(target=pretrain, name=NAME) def pretrain_recipe( - dir: Optional[str] = None, name: str = "default", num_nodes: int = 1, num_gpus_per_node: int = 8, fn=pretrain + dir: Optional[str] = None, + name: str = "default", + num_nodes: int = 1, + num_gpus_per_node: int = 8, + performance_mode: bool = False, + fn: Callable = pretrain, ) -> run.Partial: """ Create a pre-training recipe for Llama3 8B model. @@ -156,6 +161,7 @@ def pretrain_recipe( name (str): Name of the pre-training run. num_nodes (int): Number of compute nodes to use. num_gpus_per_node (int): Number of GPUs per node. + performance_mode (bool): If true, enables optimizations for maximum performance. fn (Callable): The pre-training function to use. Returns: @@ -174,7 +180,7 @@ def pretrain_recipe( For more details on pre-training LLMs with NeMo, see the pre-training guide in the `examples/llm/pretrain/` directory. """ - return run.Partial( + recipe = run.Partial( fn, model=model(), trainer=trainer( @@ -188,44 +194,29 @@ def pretrain_recipe( resume=default_resume(), ) + if performance_mode: + recipe = pretrain_performance_optimizations(recipe) -@run.cli.factory(target=pretrain, name=NAME + "_performance") -def pretrain_recipe_performance( - dir: Optional[str] = None, - name: str = "default", - num_nodes: int = 1, - num_gpus_per_node: int = 8, - fn: Callable = pretrain, -) -> run.Partial: + return recipe + + +def pretrain_performance_optimizations(recipe: run.Partial) -> run.Partial: """ Create a performance-optimized pre-training recipe for Llama3 8B model. - This recipe enables performance optimizations that may not be suitable for all use cases. + This method enables performance optimizations that may not be suitable for all use cases. 
It builds upon the standard pre-training recipe and adds additional performance enhancements. Args: - dir (Optional[str]): Directory for saving logs and checkpoints. - name (str): Name of the pre-training run. - num_nodes (int): Number of compute nodes to use. - num_gpus_per_node (int): Number of GPUs per node. - fn (Callable): The pre-training function to use. + recipe (run.Partial): Base pre-train recipe to which performance optimizations will be added Returns: run.Partial: Partial configuration for performance-optimized pre-training. - Examples: - $ nemo llm pretrain --factory llama3_8b_optimized - - Python API usage: - >>> recipe = pretrain_recipe_performance(name="llama3_8b_perf", num_nodes=4) - >>> print(recipe) - Note: - Use this recipe with caution and only when you need maximum performance. + Use this method with caution and only when you need maximum performance. It may not be suitable for all hardware configurations or use cases. """ - recipe = pretrain_recipe(name=name, dir=dir, num_nodes=num_nodes, num_gpus_per_node=num_gpus_per_node, fn=fn) - recipe.trainer.callbacks.append( run.Config( MegatronCommOverlapCallback, diff --git a/nemo/collections/llm/recipes/mixtral_8x22b.py b/nemo/collections/llm/recipes/mixtral_8x22b.py index 028ad25ad794..fd065a540cbf 100644 --- a/nemo/collections/llm/recipes/mixtral_8x22b.py +++ b/nemo/collections/llm/recipes/mixtral_8x22b.py @@ -13,7 +13,7 @@ # limitations under the License. -from typing import Optional +from typing import Callable, Optional import nemo_run as run import pytorch_lightning as pl @@ -146,7 +146,12 @@ def trainer( @run.cli.factory(target=pretrain, name=NAME) def pretrain_recipe( - dir: Optional[str] = None, name: str = "default", num_nodes: int = 8, num_gpus_per_node: int = 8, fn=pretrain + dir: Optional[str] = None, + name: str = "default", + num_nodes: int = 16, + num_gpus_per_node: int = 8, + performance_mode: bool = False, + fn: Callable = pretrain, ) -> run.Partial: """ Create a pre-training recipe for Mixtral 8x22B model. @@ -159,6 +164,7 @@ def pretrain_recipe( name (str): Name of the pre-training run. num_nodes (int): Number of compute nodes to use. num_gpus_per_node (int): Number of GPUs per node. + performance_mode (bool): If true, enables optimizations for maximum performance. fn (Callable): The pre-training function to use. Returns: @@ -173,7 +179,7 @@ def pretrain_recipe( >>> recipe = pretrain_recipe(name="mixtral_pretrain", num_nodes=2) >>> print(recipe) """ - return run.Partial( + recipe = run.Partial( fn, model=model(), trainer=trainer( @@ -185,44 +191,35 @@ def pretrain_recipe( resume=default_resume(), ) + if performance_mode: + recipe = pretrain_performance_optimizations(recipe) -def pretrain_recipe_performance( - name: str = "default", dir: Optional[str] = None, num_nodes: int = 8, num_gpus_per_node: int = 8, fn=pretrain -) -> Partial: + return recipe + + +def pretrain_performance_optimizations(recipe: run.Partial) -> run.Partial: """ Create a performance-optimized pre-training recipe for Mixtral 8x22B model. - This recipe enables performance optimizations that may not be suitable for all use cases. + This method enables performance optimizations that may not be suitable for all use cases. It builds upon the standard pre-training recipe and adds additional performance enhancements. Args: - dir (Optional[str]): Directory for saving logs and checkpoints. - name (str): Name of the pre-training run. - num_nodes (int): Number of compute nodes to use. - num_gpus_per_node (int): Number of GPUs per node. 
- fn (Callable): The pre-training function to use. + recipe (run.Partial): Base pre-train recipe to which performance optimizations will be added Returns: Partial: Partial configuration for performance-optimized pre-training. - Examples: - CLI usage: - $ nemo llm pretrain --factory "mixtral_8x22b.pretrain_recipe_performance(num_nodes=8, name='perf_pretrain')" - - Python API usage: - >>> recipe = pretrain_recipe_performance(name="mixtral_8x22b_perf", num_nodes=8) - >>> print(recipe) - Note: - Use this recipe with caution and only when you need maximum performance. + Use this method with caution and only when you need maximum performance. It may not be suitable for all hardware configurations or use cases. """ - recipe = pretrain_recipe(name=name, dir=dir, num_nodes=num_nodes, num_gpus_per_node=num_gpus_per_node, fn=fn) - # 'overlap_param_gather_with_optimizer_step' and 'align_param_gather' params are set automatically by MegatronCommOverlapCallback - # They are added here for user's knowledge - # overlap_param_gather_with_optimizer_step- If true, overlap param all-gather of first bucket with optimizer step. - # align_param_gather- If true, all PP stages launch param all-gathers simultaneously, else each PP stage launches independently as needed + # 'overlap_param_gather_with_optimizer_step' and 'align_param_gather' params are set automatically + # by MegatronCommOverlapCallback. They are added here for user's knowledge. + # overlap_param_gather_with_optimizer_step- Overlap param all-gather of first bucket with optimizer step. + # align_param_gather- If true, all PP stages launch param all-gathers simultaneously, else + # each PP stage launches independently as needed. recipe.trainer.callbacks.extend( [ diff --git a/nemo/collections/llm/recipes/mixtral_8x7b.py b/nemo/collections/llm/recipes/mixtral_8x7b.py index ead3d03edeac..1933f7768382 100644 --- a/nemo/collections/llm/recipes/mixtral_8x7b.py +++ b/nemo/collections/llm/recipes/mixtral_8x7b.py @@ -13,7 +13,7 @@ # limitations under the License. -from typing import Optional +from typing import Callable, Optional import nemo_run as run import pytorch_lightning as pl @@ -142,7 +142,12 @@ def trainer( @run.cli.factory(target=pretrain, name=NAME) def pretrain_recipe( - dir: Optional[str] = None, name: str = "default", num_nodes: int = 2, num_gpus_per_node: int = 8, fn=pretrain + dir: Optional[str] = None, + name: str = "default", + num_nodes: int = 8, + num_gpus_per_node: int = 8, + performance_mode: bool = False, + fn: Callable = pretrain, ) -> run.Partial: """ Create a pre-training recipe for Mixtral 8x7B model. @@ -155,6 +160,7 @@ def pretrain_recipe( name (str): Name of the pre-training run. num_nodes (int): Number of compute nodes to use. num_gpus_per_node (int): Number of GPUs per node. + performance_mode (bool): If true, enables optimizations for maximum performance. fn (Callable): The pre-training function to use. 
Returns: @@ -169,7 +175,7 @@ def pretrain_recipe( >>> recipe = pretrain_recipe(name="mixtral_8x7b_pretrain", num_nodes=2) >>> print(recipe) """ - return run.Partial( + recipe = run.Partial( fn, model=model(), trainer=trainer( @@ -181,44 +187,35 @@ def pretrain_recipe( resume=default_resume(), ) + if performance_mode: + recipe = pretrain_performance_optimizations(recipe) -def pretrain_recipe_performance( - dir: Optional[str] = None, name: str = "default", num_nodes: int = 8, num_gpus_per_node: int = 8, fn=pretrain -) -> Partial: + return recipe + + +def pretrain_performance_optimizations(recipe: run.Partial) -> run.Partial: """ Create a performance-optimized pre-training recipe for Mixtral 8x7B model. - This recipe enables performance optimizations that may not be suitable for all use cases. + This method enables performance optimizations that may not be suitable for all use cases. It builds upon the standard pre-training recipe and adds additional performance enhancements. Args: - dir (Optional[str]): Directory for saving logs and checkpoints. - name (str): Name of the pre-training run. - num_nodes (int): Number of compute nodes to use. - num_gpus_per_node (int): Number of GPUs per node. - fn (Callable): The pre-training function to use. + recipe (run.Partial): Base pre-train recipe to which performance optimizations will be added Returns: Partial: Partial configuration for performance-optimized pre-training. - Examples: - CLI usage: - $ nemo llm pretrain --factory "mixtral_8x7b.pretrain_recipe_performance(num_nodes=8, name='perf_pretrain')" - - Python API usage: - >>> recipe = pretrain_recipe_performance(name="mixtral_8x7b_perf", num_nodes=8) - >>> print(recipe) - Note: - Use this recipe with caution and only when you need maximum performance. + Use this method with caution and only when you need maximum performance. It may not be suitable for all hardware configurations or use cases. """ - recipe = pretrain_recipe(name=name, dir=dir, num_nodes=num_nodes, num_gpus_per_node=num_gpus_per_node, fn=fn) - # 'overlap_param_gather_with_optimizer_step' and 'align_param_gather' params are set automatically by MegatronCommOverlapCallback - # They are added here for user's knowledge - # overlap_param_gather_with_optimizer_step- If true, overlap param all-gather of first bucket with optimizer step. - # align_param_gather- If true, all PP stages launch param all-gathers simultaneously, else each PP stage launches independently as needed + # 'overlap_param_gather_with_optimizer_step' and 'align_param_gather' params are set automatically + # by MegatronCommOverlapCallback. They are added here for user's knowledge. + # overlap_param_gather_with_optimizer_step- Overlap param all-gather of first bucket with optimizer step. + # align_param_gather- If true, all PP stages launch param all-gathers simultaneously, else + # each PP stage launches independently as needed. recipe.trainer.callbacks.extend( [ diff --git a/nemo/collections/llm/recipes/nemotron3_8b.py b/nemo/collections/llm/recipes/nemotron3_8b.py index c1563e3beb15..156fc543725b 100644 --- a/nemo/collections/llm/recipes/nemotron3_8b.py +++ b/nemo/collections/llm/recipes/nemotron3_8b.py @@ -83,6 +83,7 @@ def pretrain_recipe( constant_steps=0, min_lr=3.0e-5, max_lr=3e-4, + performance_mode: bool = False, # Training function fn=pretrain, ) -> run.Partial: @@ -118,6 +119,7 @@ def pretrain_recipe( constant_steps (int): Number of constant steps. min_lr (float): Minimum learning rate. max_lr (float): Maximum learning rate. 
+ performance_mode (bool): If true, enables optimizations for maximum performance. fn (Callable): The pre-training function to use. Returns: @@ -135,7 +137,7 @@ def pretrain_recipe( Note: This recipe uses a mock dataset, look for the finetune examples to see how to change the dataset. """ - return run.Partial( + recipe = run.Partial( fn, model=model(), trainer=nemotron_trainer( @@ -174,43 +176,29 @@ def pretrain_recipe( resume=default_resume(), ) + if performance_mode: + recipe = pretrain_performance_optimizations(recipe) -@run.cli.factory(target=pretrain, name=NAME + "_performance") -def pretrain_recipe_performance( - dir: Optional[str] = None, - name: str = "default", - num_nodes: int = 1, - num_gpus_per_node: int = 8, - fn: Callable = pretrain, -) -> run.Partial: + return recipe + + +def pretrain_performance_optimizations(recipe: run.Partial) -> run.Partial: """ Create a performance-optimized pre-training recipe for Nemotron3 8B model. - This recipe enables performance optimizations that may not be suitable for all use cases. + This method enables performance optimizations that may not be suitable for all use cases. It builds upon the standard pre-training recipe and adds additional performance enhancements. Args: - dir (Optional[str]): Directory for saving logs and checkpoints. - name (str): Name of the pre-training run. - num_nodes (int): Number of compute nodes to use. - num_gpus_per_node (int): Number of GPUs per node. - fn (Callable): The pre-training function to use. + recipe (run.Partial): Base pre-train recipe to which performance optimizations will be added Returns: run.Partial: Partial configuration for performance-optimized pre-training. - Examples: - $ nemo llm pretrain --factory nemotron3_8b_optimized - - Python API usage: - >>> recipe = pretrain_recipe_performance(name="nemotron3_8b_perf", num_nodes=4) - >>> print(recipe) - Note: - Use this recipe with caution and only when you need maximum performance. + Use this method with caution and only when you need maximum performance. It may not be suitable for all hardware configurations or use cases. """ - recipe = pretrain_recipe(name=name, dir=dir, num_nodes=num_nodes, num_gpus_per_node=num_gpus_per_node, fn=fn) recipe.trainer.callbacks.append( run.Config( diff --git a/nemo/collections/llm/recipes/nemotron4_15b.py b/nemo/collections/llm/recipes/nemotron4_15b.py index 9f184a92d94b..16ae7b2b1e79 100644 --- a/nemo/collections/llm/recipes/nemotron4_15b.py +++ b/nemo/collections/llm/recipes/nemotron4_15b.py @@ -80,6 +80,7 @@ def pretrain_recipe( constant_steps=0, min_lr=4.5e-5, max_lr=4.5e-5, + performance_mode: bool = False, # Training function fn=pretrain, ) -> run.Partial: @@ -115,6 +116,7 @@ def pretrain_recipe( constant_steps (int): Number of constant steps. min_lr (float): Minimum learning rate. max_lr (float): Maximum learning rate. + performance_mode (bool): If true, enables optimizations for maximum performance. fn (Callable): The pre-training function to use. Returns: @@ -132,7 +134,7 @@ def pretrain_recipe( Note: This recipe uses a mock dataset, look for the finetune examples to see how to change the dataset. 
""" - return run.Partial( + recipe = run.Partial( fn, model=model(), trainer=nemotron_trainer( @@ -171,44 +173,29 @@ def pretrain_recipe( resume=default_resume(), ) + if performance_mode: + recipe = pretrain_performance_optimizations(recipe) -@run.cli.factory(target=pretrain, name=NAME + "_performance") -def pretrain_recipe_performance( - dir: Optional[str] = None, - name: str = "default", - num_nodes: int = 8, - num_gpus_per_node: int = 8, - fn: Callable = pretrain, -) -> run.Partial: + return recipe + + +def pretrain_performance_optimizations(recipe: run.Partial) -> run.Partial: """ Create a performance-optimized pre-training recipe for Nemotron4 15B model. - This recipe enables performance optimizations that may not be suitable for all use cases. + This method enables performance optimizations that may not be suitable for all use cases. It builds upon the standard pre-training recipe and adds additional performance enhancements. Args: - dir (Optional[str]): Directory for saving logs and checkpoints. - name (str): Name of the pre-training run. - num_nodes (int): Number of compute nodes to use. - num_gpus_per_node (int): Number of GPUs per node. - fn (Callable): The pre-training function to use. + recipe (run.Partial): Base pre-train recipe to which performance optimizations will be added Returns: run.Partial: Partial configuration for performance-optimized pre-training. - Examples: - $ nemo llm pretrain --factory nemotron4_15b_optimized - - Python API usage: - >>> recipe = pretrain_recipe_performance(name="nemotron4_15b_perf", num_nodes=4) - >>> print(recipe) - Note: - Use this recipe with caution and only when you need maximum performance. + Use this method with caution and only when you need maximum performance. It may not be suitable for all hardware configurations or use cases. """ - recipe = pretrain_recipe(name=name, dir=dir, num_nodes=num_nodes, num_gpus_per_node=num_gpus_per_node, fn=fn) - recipe.trainer.callbacks.append( run.Config( MegatronCommOverlapCallback, diff --git a/nemo/collections/llm/recipes/nemotron4_22b.py b/nemo/collections/llm/recipes/nemotron4_22b.py index 4fb697c006fc..a20afedfea56 100644 --- a/nemo/collections/llm/recipes/nemotron4_22b.py +++ b/nemo/collections/llm/recipes/nemotron4_22b.py @@ -80,6 +80,7 @@ def pretrain_recipe( constant_steps=0, min_lr=1e-5, max_lr=1e-4, + performance_mode: bool = False, # Training function fn=pretrain, ) -> run.Partial: @@ -115,6 +116,7 @@ def pretrain_recipe( constant_steps (int): Number of constant steps. min_lr (float): Minimum learning rate. max_lr (float): Maximum learning rate. + performance_mode (bool): If true, enables optimizations for maximum performance. fn (Callable): The pre-training function to use. Returns: @@ -132,7 +134,7 @@ def pretrain_recipe( Note: This recipe uses a mock dataset, look for the finetune examples to see how to change the dataset. """ - return run.Partial( + recipe = run.Partial( fn, model=model(), trainer=nemotron_trainer( @@ -171,48 +173,35 @@ def pretrain_recipe( resume=default_resume(), ) + if performance_mode: + recipe = pretrain_performance_optimizations(recipe) -@run.cli.factory(target=pretrain, name=NAME + "_performance") -def pretrain_recipe_performance( - dir: Optional[str] = None, - name: str = "default", - num_nodes: int = 8, - num_gpus_per_node: int = 8, - fn: Callable = pretrain, -) -> run.Partial: + return recipe + + +def pretrain_performance_optimizations(recipe: run.Partial) -> run.Partial: """ Create a performance-optimized pre-training recipe for Nemotron4 22B model. 
- This recipe enables performance optimizations that may not be suitable for all use cases. + This method enables performance optimizations that may not be suitable for all use cases. It builds upon the standard pre-training recipe and adds additional performance enhancements. Args: - dir (Optional[str]): Directory for saving logs and checkpoints. - name (str): Name of the pre-training run. - num_nodes (int): Number of compute nodes to use. - num_gpus_per_node (int): Number of GPUs per node. - fn (Callable): The pre-training function to use. + recipe (run.Partial): Base pre-train recipe to which performance optimizations will be added Returns: run.Partial: Partial configuration for performance-optimized pre-training. - Examples: - $ nemo llm pretrain --factory nemotron4_22b_optimized - - Python API usage: - >>> recipe = pretrain_recipe_performance(name="nemotron4_22b_perf", num_nodes=4) - >>> print(recipe) - Note: - Use this recipe with caution and only when you need maximum performance. + Use this method with caution and only when you need maximum performance. It may not be suitable for all hardware configurations or use cases. """ - recipe = pretrain_recipe(name=name, dir=dir, num_nodes=num_nodes, num_gpus_per_node=num_gpus_per_node, fn=fn) - # 'overlap_param_gather_with_optimizer_step' and 'align_param_gather' params are set automatically by MegatronCommOverlapCallback - # They are added here for user's knowledge - # overlap_param_gather_with_optimizer_step- If true, overlap param all-gather of first bucket with optimizer step. - # align_param_gather- If true, all PP stages launch param all-gathers simultaneously, else each PP stage launches independently as needed + # 'overlap_param_gather_with_optimizer_step' and 'align_param_gather' params are set automatically + # by MegatronCommOverlapCallback. They are added here for user's knowledge. + # overlap_param_gather_with_optimizer_step- Overlap param all-gather of first bucket with optimizer step. + # align_param_gather- If true, all PP stages launch param all-gathers simultaneously, else + # each PP stage launches independently as needed. recipe.trainer.callbacks.append( run.Config( diff --git a/nemo/collections/llm/recipes/nemotron4_340b.py b/nemo/collections/llm/recipes/nemotron4_340b.py index 62d25641f48c..31380f71c939 100644 --- a/nemo/collections/llm/recipes/nemotron4_340b.py +++ b/nemo/collections/llm/recipes/nemotron4_340b.py @@ -83,6 +83,7 @@ def pretrain_recipe( constant_steps=0, min_lr=1.0e-5, max_lr=1.0e-4, + performance_mode: bool = False, # Training function fn=pretrain, ) -> run.Partial: @@ -118,6 +119,7 @@ def pretrain_recipe( constant_steps (int): Number of constant steps. min_lr (float): Minimum learning rate. max_lr (float): Maximum learning rate. + performance_mode (bool): If true, enables optimizations for maximum performance. fn (Callable): The pre-training function to use. Returns: @@ -135,7 +137,7 @@ def pretrain_recipe( Note: This recipe uses a mock dataset, look for the finetune examples to see how to change the dataset. 
""" - return run.Partial( + recipe = run.Partial( fn, model=model(), trainer=nemotron_trainer( @@ -174,48 +176,35 @@ def pretrain_recipe( resume=default_resume(), ) + if performance_mode: + recipe = pretrain_performance_optimizations(recipe) -@run.cli.factory(target=pretrain, name=NAME + "_performance") -def pretrain_recipe_performance( - dir: Optional[str] = None, - name: str = "default", - num_nodes: int = 16, - num_gpus_per_node: int = 8, - fn: Callable = pretrain, -) -> run.Partial: + return recipe + + +def pretrain_performance_optimizations(recipe: run.Partial) -> run.Partial: """ Create a performance-optimized pre-training recipe for Nemotron4 340B model. - This recipe enables performance optimizations that may not be suitable for all use cases. + This method enables performance optimizations that may not be suitable for all use cases. It builds upon the standard pre-training recipe and adds additional performance enhancements. Args: - dir (Optional[str]): Directory for saving logs and checkpoints. - name (str): Name of the pre-training run. - num_nodes (int): Number of compute nodes to use. - num_gpus_per_node (int): Number of GPUs per node. - fn (Callable): The pre-training function to use. + recipe (run.Partial): Base pre-train recipe to which performance optimizations will be added Returns: run.Partial: Partial configuration for performance-optimized pre-training. - Examples: - $ nemo llm pretrain --factory nemotron4_340b_optimized - - Python API usage: - >>> recipe = pretrain_recipe_performance(name="nemotron4_340b_perf", num_nodes=16) - >>> print(recipe) - Note: - Use this recipe with caution and only when you need maximum performance. + Use this method with caution and only when you need maximum performance. It may not be suitable for all hardware configurations or use cases. """ - recipe = pretrain_recipe(name=name, dir=dir, num_nodes=num_nodes, num_gpus_per_node=num_gpus_per_node, fn=fn) - # 'overlap_param_gather_with_optimizer_step' and 'align_param_gather' params are set automatically by MegatronCommOverlapCallback - # They are added here for user's knowledge - # overlap_param_gather_with_optimizer_step- If true, overlap param all-gather of first bucket with optimizer step. - # align_param_gather- If true, all PP stages launch param all-gathers simultaneously, else each PP stage launches independently as needed + # 'overlap_param_gather_with_optimizer_step' and 'align_param_gather' params are set automatically + # by MegatronCommOverlapCallback. They are added here for user's knowledge. + # overlap_param_gather_with_optimizer_step- Overlap param all-gather of first bucket with optimizer step. + # align_param_gather- If true, all PP stages launch param all-gathers simultaneously, else + # each PP stage launches independently as needed. 
recipe.trainer.callbacks.append( run.Config( diff --git a/tests/collections/llm/recipes/test_llama3_70b.py b/tests/collections/llm/recipes/test_llama3_70b.py index cc77ec921de7..d47b674b7b70 100644 --- a/tests/collections/llm/recipes/test_llama3_70b.py +++ b/tests/collections/llm/recipes/test_llama3_70b.py @@ -79,10 +79,8 @@ def test_pretrain_recipe_with_different_configurations(self, recipe_module, num_ assert recipe.trainer.num_nodes == num_nodes assert recipe.trainer.devices == num_gpus_per_node - def test_pretrain_recipe_performance(self, recipe_module): - recipe = recipe_module.pretrain_recipe_performance( - name="test_perf", dir="/tmp", num_nodes=4, num_gpus_per_node=8 - ) + def test_pretrain_performance_optimizations(self, recipe_module): + recipe = recipe_module.pretrain_recipe(performance_mode=True) assert any( isinstance(cb, run.Config) and cb.__fn_or_cls__ == MegatronCommOverlapCallback for cb in recipe.trainer.callbacks diff --git a/tests/collections/llm/recipes/test_llama3_8b.py b/tests/collections/llm/recipes/test_llama3_8b.py index df4f05eec2ae..88fab6d6325a 100644 --- a/tests/collections/llm/recipes/test_llama3_8b.py +++ b/tests/collections/llm/recipes/test_llama3_8b.py @@ -90,10 +90,8 @@ def test_pretrain_recipe_with_different_configurations(self, recipe_module, num_ assert recipe.trainer.num_nodes == num_nodes assert recipe.trainer.devices == num_gpus_per_node - def test_pretrain_recipe_performance(self, recipe_module): - recipe = recipe_module.pretrain_recipe_performance( - name="test_perf", dir="/tmp", num_nodes=1, num_gpus_per_node=8 - ) + def test_pretrain_performance_optimizations(self, recipe_module): + recipe = recipe_module.pretrain_recipe(performance_mode=True) assert any(cb.__fn_or_cls__.__name__ == "MegatronCommOverlapCallback" for cb in recipe.trainer.callbacks) def test_trainer_parallelism_options(self, recipe_module): diff --git a/tests/lightning/test_nemo_run.py b/tests/lightning/test_nemo_run.py index c708c71ea8a1..aa73df50613a 100644 --- a/tests/lightning/test_nemo_run.py +++ b/tests/lightning/test_nemo_run.py @@ -36,6 +36,7 @@ # ("nemotron4_22b_64k", "pretrain_recipe", "nemotron4_22b_64k_pretrain"), # ("nemotron4_340b", "pretrain_recipe", "nemotron4_340b_pretrain"), # ("nemotron4_340b", "finetune_recipe", "nemotron4_340b_finetune"), + ("gpt3_175b", "pretrain_recipe", "gpt3_175b_pretrain"), ], ) def test_recipes_with_nemo_run(module, recipe, name, tmpdir, monkeypatch): From 395c502422198771828f87f73b142b551e69403e Mon Sep 17 00:00:00 2001 From: Pablo Garay Date: Fri, 25 Oct 2024 09:16:15 -0700 Subject: [PATCH 5/8] Fix _strategy_lib tests (#11033) (#11039) * fix world size and don't mock * cleanup global state * check app state instead * fix syntax nemo logger test --------- Signed-off-by: Maanu Grover Co-authored-by: Maanu Grover <109391026+maanug-nv@users.noreply.github.com> --- tests/conftest.py | 8 ++++++ tests/lightning/test_nemo_logger.py | 3 +- tests/lightning/test_strategy_lib.py | 42 +++++++++++++++++----------- 3 files changed, 35 insertions(+), 18 deletions(-) diff --git a/tests/conftest.py b/tests/conftest.py index 6298ed051c68..118e978e63c7 100644 --- a/tests/conftest.py +++ b/tests/conftest.py @@ -25,6 +25,8 @@ import pytest +from nemo.utils.metaclasses import Singleton + # Those variables probably should go to main NeMo configuration file (config.yaml). 
__TEST_DATA_FILENAME = "test_data.tar.gz" __TEST_DATA_URL = "https://github.com/NVIDIA/NeMo/releases/download/v1.0.0rc1/" @@ -115,6 +117,11 @@ def cleanup_local_folder(): rmtree('./nemo_experiments', ignore_errors=True) +@pytest.fixture(autouse=True) +def reset_singletons(): + Singleton._Singleton__instances = {} + + @pytest.fixture(scope="session") def test_data_dir(): """ @@ -173,6 +180,7 @@ def k2_cuda_is_enabled(k2_is_appropriate) -> Tuple[bool, str]: return k2_is_appropriate import torch # noqa: E402 + from nemo.core.utils.k2_guard import k2 # noqa: E402 if torch.cuda.is_available() and k2.with_cuda: diff --git a/tests/lightning/test_nemo_logger.py b/tests/lightning/test_nemo_logger.py index 3f8f7a1e0bb8..a5a5ec32c886 100644 --- a/tests/lightning/test_nemo_logger.py +++ b/tests/lightning/test_nemo_logger.py @@ -115,7 +115,8 @@ def test_resume(self, trainer, tmp_path): resume_ignore_no_checkpoint=True, ).setup(trainer) - path = Path(tmp_path / "test_resume" / "default" / "version_0" / "checkpoints").mkdir(parents=True) + path = Path(tmp_path / "test_resume" / "default" / "version_0" / "checkpoints") + path.mkdir(parents=True) # Error because checkpoints do not exist in folder with pytest.raises(NotFoundError): nl.AutoResume( diff --git a/tests/lightning/test_strategy_lib.py b/tests/lightning/test_strategy_lib.py index 36143cedb8c4..61727d5612e7 100644 --- a/tests/lightning/test_strategy_lib.py +++ b/tests/lightning/test_strategy_lib.py @@ -57,8 +57,10 @@ def configure_model(self): assert model.config.pipeline_dtype == torch.float32 -@patch('nemo.collections.nlp.modules.common.megatron.megatron_init.initialize_model_parallel_for_nemo') -def test_init_parallel_ranks(mock_initialize_model_parallel) -> None: +def test_init_parallel_ranks() -> None: + from megatron.core.num_microbatches_calculator import destroy_num_microbatches_calculator + from megatron.core.parallel_state import destroy_model_parallel + from nemo.utils import AppState app_state = AppState() @@ -80,27 +82,33 @@ def test_init_parallel_ranks(mock_initialize_model_parallel) -> None: mock_parallel_config.pipeline_model_parallel_split_rank = None _strategy_lib.init_parallel_ranks( - world_size=2, + world_size=24, global_rank=1, local_rank=0, parallel_config=mock_parallel_config, seed=1234, fp8=False, ) - mock_initialize_model_parallel.assert_called_once_with( - world_size=2, - global_rank=1, - local_rank=0, - tensor_model_parallel_size=2, - pipeline_model_parallel_size=3, - virtual_pipeline_model_parallel_size=4, - context_parallel_size=2, - expert_model_parallel_size=2, - seed=1234, - pipeline_model_parallel_split_rank=None, - use_fp8=False, - init_mpi_proc_group=False, - ) + expected_app_state = { + "world_size": 24, + "global_rank": 1, + "local_rank": 0, + "tensor_model_parallel_size": 2, + "pipeline_model_parallel_size": 3, + "virtual_pipeline_model_parallel_size": 4, + "context_parallel_size": 2, + "expert_model_parallel_size": 2, + "pipeline_model_parallel_split_rank": None, + "use_fp8": False, + "init_mpi_proc_group": False, + } + for k, v in expected_app_state.items(): + assert hasattr(app_state, k), f"Expected to find {k} in AppState" + app_attr = getattr(app_state, k) + assert app_attr == v, f"{k} in AppState is incorrect, Expected: {v} Actual: {app_attr}" + + destroy_model_parallel() + destroy_num_microbatches_calculator() @patch('torch.distributed.is_initialized', return_value=True) From 4fac15ab3463218966a774204e720fa921f4a3f4 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?oliver=20k=C3=B6nig?= Date: Fri, 25 Oct 2024 
18:22:04 +0200 Subject: [PATCH 6/8] Update `BaseMegatronSampler` for compatibility with PTL's `_BatchProgress` (#11016) (#11034) * Revert "[NeMo-UX] Use custom `BatchProgress` class which does not restore states (#10383)" This reverts commit b5798ded9f27168db9d7d77cbe4f9da80bf49268. * make megatron sampler return the total number of batches in the dataset --------- Signed-off-by: ashors1 Co-authored-by: Anna Shors <71393111+ashors1@users.noreply.github.com> --- nemo/lightning/data.py | 7 +++---- nemo/lightning/pytorch/strategies/fsdp_strategy.py | 8 -------- nemo/lightning/pytorch/strategies/megatron_strategy.py | 7 ------- nemo/lightning/pytorch/strategies/utils.py | 10 ---------- 4 files changed, 3 insertions(+), 29 deletions(-) diff --git a/nemo/lightning/data.py b/nemo/lightning/data.py index 0f30dfe22851..7051e87841ca 100644 --- a/nemo/lightning/data.py +++ b/nemo/lightning/data.py @@ -287,17 +287,16 @@ def __init__( ) def __len__(self): - num_available_samples: int = self.total_samples - self.consumed_samples if self.global_batch_size is not None: if self.drop_last: - num_global_batches = num_available_samples // self.global_batch_size + num_global_batches = self.total_samples // self.global_batch_size else: - num_global_batches = (num_available_samples + self.global_batch_size - 1) // self.global_batch_size + num_global_batches = (self.total_samples + self.global_batch_size - 1) // self.global_batch_size # return len of dataloader in terms of micro batches to avoid discrepancy between len of dataloader and # num of batches fetched (as training step fetches in terms of micro batches) return num_global_batches * (self.global_batch_size // self.micro_batch_times_data_parallel_size) else: - return (num_available_samples - 1) // self.micro_batch_times_data_parallel_size + 1 + return (self.total_samples - 1) // self.micro_batch_times_data_parallel_size + 1 @abc.abstractmethod def __iter__(self): ... 
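The effect of the `__len__` change above is that the sampler now reports the number of batches in the full dataset rather than the batches remaining after `consumed_samples`, which is what PTL's restored `_BatchProgress` expects when resuming. A minimal, self-contained sketch of the new computation follows (attribute names mirror the sampler's; this is an illustration for reference, not code from the patch):

from typing import Optional

def sampler_len(total_samples: int,
                micro_batch_size: int,
                data_parallel_size: int,
                global_batch_size: Optional[int] = None,
                drop_last: bool = True) -> int:
    # Mirrors BaseMegatronSampler.__len__ after this patch: length depends only on
    # total_samples, never on consumed_samples.
    micro_batch_times_data_parallel_size = micro_batch_size * data_parallel_size
    if global_batch_size is not None:
        if drop_last:
            num_global_batches = total_samples // global_batch_size
        else:
            num_global_batches = (total_samples + global_batch_size - 1) // global_batch_size
        # Reported in micro batches so it matches what the training step fetches.
        return num_global_batches * (global_batch_size // micro_batch_times_data_parallel_size)
    return (total_samples - 1) // micro_batch_times_data_parallel_size + 1

# Example: 1000 samples, micro batch 2, data parallel 4, global batch 32
# -> 31 global batches x 4 micro batches each = 124, regardless of consumed_samples.
assert sampler_len(1000, micro_batch_size=2, data_parallel_size=4, global_batch_size=32) == 124
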
diff --git a/nemo/lightning/pytorch/strategies/fsdp_strategy.py b/nemo/lightning/pytorch/strategies/fsdp_strategy.py index 5f24d988396b..d34d1716e6b4 100644 --- a/nemo/lightning/pytorch/strategies/fsdp_strategy.py +++ b/nemo/lightning/pytorch/strategies/fsdp_strategy.py @@ -35,7 +35,6 @@ from nemo.lightning import io from nemo.lightning.pytorch.strategies.utils import ( - _MegatronBatchProgress, ckpt_to_dir, create_checkpoint_io, fix_progress_bar, @@ -74,7 +73,6 @@ def __init__( ckpt_load_optimizer: bool = True, ckpt_save_optimizer: bool = True, data_sampler=None, - overwrite_batch_progress: bool = True, **kwargs, ): super().__init__(auto_wrap_policy=auto_wrap_policy, state_dict_type=state_dict_type, **kwargs) @@ -82,7 +80,6 @@ def __init__( self.data_sampler = data_sampler self.ckpt_load_optimizer = ckpt_load_optimizer self.ckpt_save_optimizer = ckpt_save_optimizer - self.overwrite_batch_progress = overwrite_batch_progress @override def setup_environment(self) -> None: @@ -95,11 +92,6 @@ def setup(self, trainer: pl.Trainer) -> None: self.trainer = trainer setup_data_sampler(self.trainer) fix_progress_bar(trainer) - - trainer_fn = trainer.state.fn - if trainer_fn == TrainerFn.FITTING and self.overwrite_batch_progress: - trainer.fit_loop.epoch_loop.batch_progress = _MegatronBatchProgress() - super().setup(trainer) def _get_loss_reduction(self, step_type: str): diff --git a/nemo/lightning/pytorch/strategies/megatron_strategy.py b/nemo/lightning/pytorch/strategies/megatron_strategy.py index e68b67c86f2d..839f1249cbb1 100644 --- a/nemo/lightning/pytorch/strategies/megatron_strategy.py +++ b/nemo/lightning/pytorch/strategies/megatron_strategy.py @@ -62,7 +62,6 @@ from nemo.lightning.pytorch.callbacks import ModelTransform from nemo.lightning.pytorch.strategies.utils import ( RestoreConfig, - _MegatronBatchProgress, ckpt_to_dir, create_checkpoint_io, fix_progress_bar, @@ -155,8 +154,6 @@ class MegatronStrategy(DDPStrategy, io.IOMixin): that prints the metrics to stdout. Suitable for non-interactive settings. progress_interval (int): How frequently to print progress to stdout. Only used when replace_progress_bar is True. - overwrite_batch_progress (bool): Whether to overwrite _BatchProgress class used in PTL by default with - _MegatronBatchProgress. This should be True whenever you're using a Megatron-based dataset. **kwargs: Additional keyword arguments. 
Note: @@ -199,7 +196,6 @@ def __init__( replace_progress_bar: bool = True, progress_interval: int = 1, restore_config: Optional[RestoreConfig] = None, - overwrite_batch_progress: bool = True, **kwargs, ) -> None: super().__init__( @@ -240,7 +236,6 @@ def __init__( self.replace_progress_bar = replace_progress_bar self.progress_interval = progress_interval - self.overwrite_batch_progress = overwrite_batch_progress self.restore_config = restore_config @@ -338,8 +333,6 @@ def setup(self, trainer: pl.Trainer) -> None: self.configure_ddp() trainer.fit_loop.epoch_loop.automatic_optimization = _MegatronAutomaticOptimization(trainer) - if self.overwrite_batch_progress: - trainer.fit_loop.epoch_loop.batch_progress = _MegatronBatchProgress() import torch.distributed.algorithms.ddp_comm_hooks.post_localSGD_hook as post_localSGD diff --git a/nemo/lightning/pytorch/strategies/utils.py b/nemo/lightning/pytorch/strategies/utils.py index 415392f2bef0..a7f0e7339def 100644 --- a/nemo/lightning/pytorch/strategies/utils.py +++ b/nemo/lightning/pytorch/strategies/utils.py @@ -25,12 +25,10 @@ from megatron.core.dist_checkpointing.strategies.torch import sharded_tensor_to_torch_sharded_tensor from megatron.core.transformer.utils import _get_extra_state_offsets from pytorch_lightning.callbacks import TQDMProgressBar -from pytorch_lightning.loops.progress import _BatchProgress from pytorch_lightning.plugins.io.wrapper import _WrappingCheckpointIO from torch.distributed._sharded_tensor import ShardedTensor as TorchShardedTensor from torch.distributed._tensor import DTensor, Replicate, Shard from torch.distributed.device_mesh import DeviceMesh -from typing_extensions import override from nemo.lightning import _strategy_lib from nemo.lightning.io.pl import MegatronCheckpointIO @@ -48,14 +46,6 @@ class RestoreConfig: load_artifacts: bool = True -class _MegatronBatchProgress(_BatchProgress): - @override - def load_state_dict(self, state_dict: dict) -> None: - ## in megatron, we want to start the batch progress over when - ## restoring from a checkpoint - return - - def setup_parallel_ranks(strategy: pl.strategies.Strategy): from megatron.core.model_parallel_config import ModelParallelConfig From 5dd81f09181298a2dcaa4eba9a0949b5d871e269 Mon Sep 17 00:00:00 2001 From: Shriya Rishab <69161273+ShriyaPalsamudram@users.noreply.github.com> Date: Fri, 25 Oct 2024 13:36:03 -0400 Subject: [PATCH 7/8] Change dist ckpt defaults (#10913) (#11031) * Enable ckpt features by default (async ckpt), ckpt every 15mins and reduce preemption time to 1min * fix ssm tests * Make note that ckpt_async_save is disabled for SSMs * Enable async ckpt for SSMs with fix * Disable async ckpt in the peft test as it is a known bug, add note. 
* Fix failing unit tests * Ashors/peft async ckpt (#11010) * [WIP] prototype for supporting async checkpointing with peft * Enable async ckpt for the peft test * Fix peft setup test --------- Signed-off-by: Shriya Palsamudram Signed-off-by: ashors1 Co-authored-by: ataghibakhsh Co-authored-by: Pablo Garay --- nemo/collections/llm/recipes/log/default.py | 3 +- nemo/lightning/io/connector.py | 2 ++ nemo/lightning/pytorch/callbacks/peft.py | 28 ++++++++++++++++--- .../pytorch/strategies/megatron_strategy.py | 8 +++--- nemo/lightning/pytorch/strategies/utils.py | 4 ++- nemo/lightning/run/plugins.py | 4 +-- .../collections/llm/test_mnist_model_nemo2.py | 1 + .../lightning/pytorch/callbacks/test_peft.py | 4 ++- tests/lightning/test_dist_ckpt.py | 1 + 9 files changed, 42 insertions(+), 13 deletions(-) diff --git a/nemo/collections/llm/recipes/log/default.py b/nemo/collections/llm/recipes/log/default.py index 93bd9f9470fa..d83580a1a543 100644 --- a/nemo/collections/llm/recipes/log/default.py +++ b/nemo/collections/llm/recipes/log/default.py @@ -13,6 +13,7 @@ # limitations under the License. +from datetime import timedelta from typing import Optional from nemo_run import Config, cli @@ -50,7 +51,7 @@ def default_log( nl.ModelCheckpoint, save_last=True, save_top_k=10, - every_n_train_steps=200, + train_time_interval=Config(timedelta, minutes=15), filename="{model_name}--{val_loss:.2f}-{step}-{consumed_samples}", ) diff --git a/nemo/lightning/io/connector.py b/nemo/lightning/io/connector.py index 3ccbef536b99..41ce2d8f1117 100644 --- a/nemo/lightning/io/connector.py +++ b/nemo/lightning/io/connector.py @@ -183,6 +183,8 @@ def nemo_save(self, output_path: Path, trainer: pl.Trainer, dump_io: bool = True output_path = Path(output_path) output_path.mkdir(parents=True, exist_ok=True) trainer.save_checkpoint(ckpt_to_weights_subdir(output_path)) + if getattr(trainer.strategy, "async_save", False): + trainer.strategy.checkpoint_io.maybe_finalize_save_checkpoint(blocking=True) from nemo.lightning.io.pl import TrainerContext from nemo.utils.get_rank import is_global_rank_zero diff --git a/nemo/lightning/pytorch/callbacks/peft.py b/nemo/lightning/pytorch/callbacks/peft.py index 1e3cde0bbcde..15d0dd8ac2ab 100644 --- a/nemo/lightning/pytorch/callbacks/peft.py +++ b/nemo/lightning/pytorch/callbacks/peft.py @@ -14,6 +14,7 @@ import json from abc import ABC, abstractmethod +from functools import partial from pathlib import Path from typing import TYPE_CHECKING, Any, Callable, Dict, Optional, Tuple @@ -27,6 +28,7 @@ from nemo.lightning.io.pl import ckpt_to_dir from nemo.lightning.pytorch.callbacks.model_transform import ModelTransform from nemo.utils import logging +from nemo.utils.callbacks.dist_ckpt_io import AsyncCompatibleCheckpointIO if TYPE_CHECKING: from megatron.core.dist_checkpointing.mapping import ShardedStateDict @@ -97,11 +99,28 @@ def __call__(self, model: nn.Module) -> nn.Module: return model def setup(self, trainer: pl.Trainer, pl_module: pl.LightningModule, stage: str) -> None: + from nemo.lightning.pytorch.strategies.utils import create_checkpoint_io + super().setup(trainer, pl_module, stage=stage) trainer.strategy.trainer = trainer - self.wrapped_io = WrappedAdapterIO(trainer.strategy.checkpoint_io, self) - trainer.strategy._checkpoint_io = self.wrapped_io + wrapped_io = partial(WrappedAdapterIO, peft=self) + ckpt_io_kwargs = { + "save_ckpt_format": trainer.strategy.save_ckpt_format, + "async_save": trainer.strategy.async_save, + "torch_dist_multiproc": trainer.strategy.torch_dist_multiproc, + 
"assume_constant_structure": trainer.strategy.assume_constant_structure, + "parallel_save": trainer.strategy.parallel_save, + "parallel_save_within_dp": trainer.strategy.parallel_save_within_dp, + "parallel_load": trainer.strategy.parallel_load, + "load_directly_on_device": trainer.strategy.load_directly_on_device, + } + trainer.strategy._checkpoint_io = create_checkpoint_io(wrapping_ckpt_io=wrapped_io, **ckpt_io_kwargs) + self.wrapped_io = ( + trainer.strategy._checkpoint_io._checkpoint_io + if trainer.strategy.async_save + else trainer.strategy._checkpoint_io + ) trainer.strategy._init_model_parallel = False trainer.strategy._setup_optimizers = False @@ -257,7 +276,7 @@ def load_state_dict(self, state_dict, strict=True): self.adapter.load_state_dict(adapter_state_dict, strict) -class WrappedAdapterIO(_WrappingCheckpointIO): +class WrappedAdapterIO(_WrappingCheckpointIO, AsyncCompatibleCheckpointIO): peft: Optional[PEFT] = None model_ckpt_path: Optional[Path] = None adapter_ckpt_path: Optional[Path] = None @@ -273,7 +292,7 @@ def save_checkpoint(self, checkpoint: Dict[str, Any], path: _PATH, storage_optio checkpoint['sharded_state_dict'] = dict( filter(lambda item: self.peft.adapter_key_filter(item[0]), checkpoint['sharded_state_dict'].items()) ) - self.checkpoint_io.save_checkpoint(checkpoint, path, storage_options=storage_options) + request = self.checkpoint_io.save_checkpoint(checkpoint, path, storage_options=storage_options) from nemo.utils.get_rank import is_global_rank_zero @@ -282,6 +301,7 @@ def save_checkpoint(self, checkpoint: Dict[str, Any], path: _PATH, storage_optio adapter_meta_path = ckpt_to_dir(path) / _ADAPTER_META_FILENAME with open(adapter_meta_path, "w") as f: json.dump(metadata, f) + return request @override def load_checkpoint( diff --git a/nemo/lightning/pytorch/strategies/megatron_strategy.py b/nemo/lightning/pytorch/strategies/megatron_strategy.py index 839f1249cbb1..342c437f0e32 100644 --- a/nemo/lightning/pytorch/strategies/megatron_strategy.py +++ b/nemo/lightning/pytorch/strategies/megatron_strategy.py @@ -129,7 +129,7 @@ class MegatronStrategy(DDPStrategy, io.IOMixin): save_ckpt_format (str): Distributed checkpoint format to use for checkpoint saving. Should be one of 'torch_dist' or 'zarr'. Defaults to 'torch_dist'. ckpt_async_save (bool): Whether to save checkpoints asynchronously to reduce checkpointing overhead. - Defaults to False. + Defaults to True. ckpt_torch_dist_multiproc (int): Number of extra processes per rank used during ckpt save with PyTorch distributed format. Defaults to None. ckpt_assume_constant_structure (bool): Allows caching some computation across checkpoint saves. @@ -139,7 +139,7 @@ class MegatronStrategy(DDPStrategy, io.IOMixin): ckpt_parallel_save_within_dp (bool): If true, save will be parallelized only within a DP group (whole world otherwise), which might slightly reduce the save overhead. Defaults to False. ckpt_parallel_load (bool): If true, each worker will load part of the dist checkpoint - and exchange with NCCL. Might use some extra GPU memory. Defaults to False. + and exchange with NCCL. Might use some extra GPU memory. Defaults to True. ckpt_parallel_save_optim (bool): Parallel save/load of a DistributedOptimizer. 'True' allows performant save and reshardable checkpoints. Set to 'False' only in order to minimize the number of checkpoint files. 
@@ -183,12 +183,12 @@ def __init__( lazy_init: bool = False, pipeline_dtype: Optional[torch.dtype] = None, save_ckpt_format: str = "torch_dist", - ckpt_async_save: bool = False, + ckpt_async_save: bool = True, ckpt_torch_dist_multiproc: int = None, ## TODO(ashors): put elsewhere? ckpt_assume_constant_structure: bool = False, ckpt_parallel_save: bool = True, ckpt_parallel_save_within_dp: bool = False, - ckpt_parallel_load: bool = False, + ckpt_parallel_load: bool = True, ckpt_parallel_save_optim: bool = True, ckpt_load_directly_on_device: bool = True, setup_optimizers: bool = True, diff --git a/nemo/lightning/pytorch/strategies/utils.py b/nemo/lightning/pytorch/strategies/utils.py index a7f0e7339def..43a5a9243aa5 100644 --- a/nemo/lightning/pytorch/strategies/utils.py +++ b/nemo/lightning/pytorch/strategies/utils.py @@ -117,8 +117,10 @@ def ckpt_to_dir(filepath: Union[str, Path]) -> Path: return filepath -def create_checkpoint_io(**kwargs): +def create_checkpoint_io(wrapping_ckpt_io=None, **kwargs): checkpoint_io = MegatronCheckpointIO(**kwargs) + if wrapping_ckpt_io: + checkpoint_io = wrapping_ckpt_io(checkpoint_io) if kwargs.get("async_save", False): checkpoint_io = AsyncFinalizableCheckpointIO(checkpoint_io) diff --git a/nemo/lightning/run/plugins.py b/nemo/lightning/run/plugins.py index dfcc7c1650ce..c9a38c5979ca 100644 --- a/nemo/lightning/run/plugins.py +++ b/nemo/lightning/run/plugins.py @@ -52,14 +52,14 @@ class PreemptionPlugin(run.Plugin): preempt_time (int): The time, in seconds, before the task's time limit at which the executor will send a SIGTERM preemption signal. This allows tasks to be gracefully stopped before reaching their time limit, reducing waste and - promoting fair resource usage. The default value is 300 seconds (5 minutes). + promoting fair resource usage. The default value is 60 seconds (1 minute). This is only supported for ``run.SlurmExecutor``. callbacks (list[run.Config[Callback]]): A list of callback configurations that the plugin will merge with the task's existing callbacks. By default, the list includes NeMo's preemption callback. 
""" - preempt_time: int = 300 + preempt_time: int = 60 callbacks: list[run.Config[Callback]] = field(default_factory=lambda: [run.Config(PreemptionCallback)]) def setup(self, task: run.Partial | run.Script, executor: run.Executor): diff --git a/tests/collections/llm/test_mnist_model_nemo2.py b/tests/collections/llm/test_mnist_model_nemo2.py index 3f0b804e8bd6..a5c2aa96fc03 100644 --- a/tests/collections/llm/test_mnist_model_nemo2.py +++ b/tests/collections/llm/test_mnist_model_nemo2.py @@ -501,6 +501,7 @@ def run_train_mnist_litautoencoder_with_megatron_strategy_single_gpu(): monitor="val_loss", save_top_k=1, every_n_train_steps=5, + filename="{model_name}--{val_loss:.2f}-{step}-{consumed_samples}", # Enables the .nemo file-like checkpointing where all IOMixins are under SerDe always_save_context=True, ) diff --git a/tests/lightning/pytorch/callbacks/test_peft.py b/tests/lightning/pytorch/callbacks/test_peft.py index 53f9016a3bac..95caca4d2784 100644 --- a/tests/lightning/pytorch/callbacks/test_peft.py +++ b/tests/lightning/pytorch/callbacks/test_peft.py @@ -18,6 +18,7 @@ from pytorch_lightning.trainer.states import TrainerFn from nemo.collections.llm import fn from nemo.lightning.pytorch.callbacks.peft import PEFT, WrappedAdapterIO +from nemo.utils.callbacks.dist_ckpt_io import AsyncFinalizableCheckpointIO class TestPEFT: @@ -48,7 +49,8 @@ def test_peft_setup(self): pl_module.model_transform = peft peft.setup(trainer, pl_module, "fit") - assert isinstance(trainer.strategy._checkpoint_io, WrappedAdapterIO) + assert isinstance(trainer.strategy._checkpoint_io, AsyncFinalizableCheckpointIO) + assert isinstance(trainer.strategy._checkpoint_io._checkpoint_io, WrappedAdapterIO) assert peft.model_transform is not None assert peft._needs_to_call is True diff --git a/tests/lightning/test_dist_ckpt.py b/tests/lightning/test_dist_ckpt.py index e6ea381fdf0b..5deb8085aa30 100644 --- a/tests/lightning/test_dist_ckpt.py +++ b/tests/lightning/test_dist_ckpt.py @@ -35,6 +35,7 @@ def set_env(): def _get_strategy(): strategy = nl.MegatronStrategy( enable_nemo_ckpt_io=False, + ckpt_async_save=False, ) return strategy From fe4d09b63e21be0e935dd6ed5fb9a25b231c3690 Mon Sep 17 00:00:00 2001 From: Pablo Garay Date: Fri, 25 Oct 2024 11:49:05 -0700 Subject: [PATCH 8/8] Fix pip install (#11026) (#11028) * Move AutoTokenizer inline * Move einops to common requirements * Move AutoTokenizer import to top-level again in fine_tuning * Move megatron init inside nemo.lightning * Make megatron_lazy_init_context work when transformer-engine is not installed * Only import get_nmt_tokenizer when needed * Apply isort and black reformatting --------- Signed-off-by: Marc Romeyn Signed-off-by: marcromeyn Co-authored-by: Marc Romeyn Co-authored-by: marcromeyn --- nemo/collections/llm/gpt/data/mock.py | 8 +- nemo/lightning/_strategy_lib.py | 28 +- nemo/lightning/megatron_init.py | 413 ++++++++++++++++++++++++++ requirements/requirements_common.txt | 1 + requirements/requirements_nlp.txt | 1 - 5 files changed, 441 insertions(+), 10 deletions(-) create mode 100644 nemo/lightning/megatron_init.py diff --git a/nemo/collections/llm/gpt/data/mock.py b/nemo/collections/llm/gpt/data/mock.py index 1c5e01c89bbd..5678597eda0b 100644 --- a/nemo/collections/llm/gpt/data/mock.py +++ b/nemo/collections/llm/gpt/data/mock.py @@ -56,9 +56,13 @@ def __init__( self.persistent_workers = persistent_workers self.create_attention_mask = create_attention_mask or not HAVE_TE - from nemo.collections.nlp.modules.common.tokenizer_utils import get_nmt_tokenizer + 
if tokenizer is None: + from nemo.collections.nlp.modules.common.tokenizer_utils import get_nmt_tokenizer + + self.tokenizer = get_nmt_tokenizer("megatron", "GPT2BPETokenizer") + else: + self.tokenizer = tokenizer - self.tokenizer = tokenizer or get_nmt_tokenizer("megatron", "GPT2BPETokenizer") self.data_sampler = MegatronDataSampler( seq_len=self.seq_length, micro_batch_size=micro_batch_size, diff --git a/nemo/lightning/_strategy_lib.py b/nemo/lightning/_strategy_lib.py index d11031feded6..c3adf2a133e5 100644 --- a/nemo/lightning/_strategy_lib.py +++ b/nemo/lightning/_strategy_lib.py @@ -21,6 +21,8 @@ import torch from torch import nn +from nemo.lightning.megatron_init import initialize_model_parallel_for_nemo + NEMO_MEGATRON_MODEL_PARALLEL_APPSTATE_OVERRIDE = "NEMO_MEGATRON_MODEL_PARALLEL_APPSTATE_OVERRIDE" @@ -56,7 +58,6 @@ def init_parallel_ranks( seed (int, optional): The seed for random number generation. Defaults to 1234. fp8 (bool, optional): Whether to use fp8 precision for model parameters. Defaults to False. """ - from nemo.collections.nlp.modules.common.megatron.megatron_init import initialize_model_parallel_for_nemo from nemo.utils import AppState app_state = AppState() @@ -161,13 +162,20 @@ def set_model_parallel_attributes(model, parallelism): @contextmanager def megatron_lazy_init_context(config) -> Generator[None, None, None]: - def monkey_patched(c): - return {"device": "meta"} + try: + from megatron.core.extensions import transformer_engine as _te + + original = _te._get_extra_te_kwargs # noqa: SLF001 - from megatron.core.extensions import transformer_engine as _te + def _get_extra_te_kwargs_meta(c): + """Forces device to meta""" + kwargs = original(c) + kwargs['device'] = 'meta' + return kwargs - original = _te._get_extra_te_kwargs # noqa: SLF001 - _te._get_extra_te_kwargs = monkey_patched # noqa: SLF001 + _te._get_extra_te_kwargs = _get_extra_te_kwargs_meta # noqa: SLF001 + except ImportError: + pass _orig_perform_initialization = config.perform_initialization _orig_use_cpu_initialization = config.use_cpu_initialization @@ -177,7 +185,13 @@ def monkey_patched(c): yield - _te._get_extra_te_kwargs = original # noqa: SLF001 + try: + from megatron.core.extensions import transformer_engine as _te + + _te._get_extra_te_kwargs = original # noqa: SLF001 + except ImportError: + pass + config.perform_initialization = _orig_perform_initialization config.use_cpu_initialization = _orig_use_cpu_initialization diff --git a/nemo/lightning/megatron_init.py b/nemo/lightning/megatron_init.py new file mode 100644 index 000000000000..c060d140cb8c --- /dev/null +++ b/nemo/lightning/megatron_init.py @@ -0,0 +1,413 @@ +# Copyright (c) 2021, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +import random + +import numpy as np +import torch + +from nemo.utils import AppState, logging + +try: + from apex.transformer.log_util import set_logging_level + + HAVE_APEX = True + +except (ImportError, ModuleNotFoundError): + + HAVE_APEX = False + +try: + from megatron.core import tensor_parallel + from megatron.core.parallel_state import ( + RankGenerator, + get_pipeline_model_parallel_rank, + set_expert_model_parallel_rank, + set_expert_model_parallel_world_size, + set_pipeline_model_parallel_rank, + set_pipeline_model_parallel_split_rank, + set_pipeline_model_parallel_world_size, + set_tensor_model_parallel_rank, + set_tensor_model_parallel_world_size, + set_virtual_pipeline_model_parallel_rank, + ) + + HAVE_MEGATRON_CORE = True + +except (ImportError, ModuleNotFoundError): + + HAVE_MEGATRON_CORE = False + +try: + from megatron.core.num_microbatches_calculator import ( + ConstantNumMicroBatchesCalculator, + get_current_global_batch_size, + get_micro_batch_size, + get_num_microbatches, + init_num_microbatches_calculator, + ) + + MCORE_MB_CALCULATOR = True + +except (ImportError, ModuleNotFoundError): + logging.warning("Megatron num_microbatches_calculator not found, using Apex version.") + from apex.transformer.microbatches import ConstantNumMicroBatches as ConstantNumMicroBatchesCalculator + from apex.transformer.pipeline_parallel.utils import ( + get_current_global_batch_size, + get_micro_batch_size, + get_num_microbatches, + ) + from apex.transformer.pipeline_parallel.utils import ( + setup_microbatch_calculator as init_num_microbatches_calculator, + ) + + MCORE_MB_CALCULATOR = False + + +try: + from megatron.core.parallel_state import set_virtual_pipeline_model_parallel_world_size + + HAVE_INTERLEAVED = True + +except: + + HAVE_INTERLEAVED = False + + +def initialize_model_parallel_for_nemo( + world_size, + global_rank, + local_rank, + tensor_model_parallel_size=1, + expert_model_parallel_size=1, + pipeline_model_parallel_size=1, + virtual_pipeline_model_parallel_size=None, + pipeline_model_parallel_split_rank=None, + context_parallel_size=1, + micro_batch_size=None, + global_batch_size=None, + rampup_batch_size=None, + use_fp8=False, + init_mpi_proc_group=False, + seed=1234, + apex_transformer_log_level=30, + use_tp_pp_dp_mapping=False, + use_te_rng_tracker=False, +): + + if virtual_pipeline_model_parallel_size is not None and not HAVE_INTERLEAVED: + raise ValueError("set_virtual_pipeline_model_parallel_world_size is needed in megatron-core for interleaved.") + + # updating NeMo globals + app_state = AppState() + app_state.global_rank = global_rank + app_state.world_size = world_size + app_state.local_rank = local_rank + app_state.use_tp_pp_dp_mapping = use_tp_pp_dp_mapping + app_state.expert_model_parallel_size = expert_model_parallel_size + app_state.tensor_model_parallel_size = tensor_model_parallel_size + app_state.pipeline_model_parallel_size = pipeline_model_parallel_size + app_state.virtual_pipeline_model_parallel_size = virtual_pipeline_model_parallel_size + app_state.context_parallel_size = context_parallel_size + app_state.use_fp8 = use_fp8 + app_state.init_mpi_proc_group = init_mpi_proc_group + ( + app_state.tensor_model_parallel_rank, + app_state.pipeline_model_parallel_rank, + app_state.expert_model_parallel_rank, + app_state.model_parallel_size, + app_state.data_parallel_size, + app_state.pipeline_model_parallel_split_rank, + app_state.virtual_pipeline_model_parallel_rank, + ) = fake_initialize_model_parallel( + world_size=world_size, + rank=global_rank, + 
tensor_model_parallel_size_=tensor_model_parallel_size, + pipeline_model_parallel_size_=pipeline_model_parallel_size, + virtual_pipeline_model_parallel_size_=virtual_pipeline_model_parallel_size, + pipeline_model_parallel_split_rank_=pipeline_model_parallel_split_rank, + context_parallel_size_=context_parallel_size, + expert_model_parallel_size_=expert_model_parallel_size, + use_tp_pp_dp_mapping=use_tp_pp_dp_mapping, + ) + + # update apex.transformer globals + set_tensor_model_parallel_world_size(app_state.tensor_model_parallel_size) + set_tensor_model_parallel_rank(app_state.tensor_model_parallel_rank) + + set_expert_model_parallel_world_size(app_state.expert_model_parallel_size) + set_expert_model_parallel_rank(app_state.expert_model_parallel_rank) + + set_pipeline_model_parallel_rank(app_state.pipeline_model_parallel_rank) + if HAVE_INTERLEAVED: + set_virtual_pipeline_model_parallel_world_size(app_state.virtual_pipeline_model_parallel_size) + set_virtual_pipeline_model_parallel_rank(app_state.virtual_pipeline_model_parallel_rank) + set_pipeline_model_parallel_world_size(app_state.pipeline_model_parallel_size) + set_pipeline_model_parallel_split_rank(app_state.pipeline_model_parallel_split_rank) + + tensor_parallel.random.initialize_rng_tracker(use_te_rng_tracker=use_te_rng_tracker) + if seed is not None: + # @chcui not setting seed is for model conversion. always set seed for training/inference. + _set_random_seed(seed) + + if global_batch_size and micro_batch_size is not None: + # TODO: add rampup_batch_size here when we have it implemented + if MCORE_MB_CALCULATOR: + from megatron.core.num_microbatches_calculator import _GLOBAL_NUM_MICROBATCHES_CALCULATOR + + if _GLOBAL_NUM_MICROBATCHES_CALCULATOR is None: + init_num_microbatches_calculator( + rank=global_rank, + global_batch_size=global_batch_size, + micro_batch_size=micro_batch_size, + data_parallel_size=app_state.data_parallel_size, + rampup_batch_size=rampup_batch_size, + ) + else: + if isinstance(_GLOBAL_NUM_MICROBATCHES_CALCULATOR, ConstantNumMicroBatchesCalculator): + assert get_current_global_batch_size() == global_batch_size + assert get_micro_batch_size() == micro_batch_size + assert get_num_microbatches() == global_batch_size // ( + micro_batch_size * app_state.data_parallel_size + ) + else: + raise Exception("Microbatch calculator already initialized.") + else: + from apex.transformer.pipeline_parallel.utils import _GLOBAL_NUM_MICROBATCHES_CALCULATOR + + if _GLOBAL_NUM_MICROBATCHES_CALCULATOR is None: + init_num_microbatches_calculator( + rank=global_rank, + global_batch_size=global_batch_size, + micro_batch_size=micro_batch_size, + data_parallel_size=app_state.data_parallel_size, + rampup_batch_size=rampup_batch_size, + ) + else: + if isinstance(_GLOBAL_NUM_MICROBATCHES_CALCULATOR, ConstantNumMicroBatchesCalculator): + assert get_current_global_batch_size() == global_batch_size + assert get_micro_batch_size() == micro_batch_size + assert get_num_microbatches() == global_batch_size // ( + micro_batch_size * app_state.data_parallel_size + ) + else: + raise Exception("Microbatch calculator already initialized.") + + app_state._is_megatron_initialized = True + + if HAVE_APEX: + set_logging_level(apex_transformer_log_level) + + +def _set_random_seed(seed_): + """Set random seed for reproducability.""" + if seed_ is not None and seed_ > 0: + # Ensure that different pipeline MP stages get different seeds. 
+ seed = seed_ + (100 * get_pipeline_model_parallel_rank()) + random.seed(seed) + np.random.seed(seed) + torch.manual_seed(seed) + if torch.cuda.device_count() > 0: + tensor_parallel.model_parallel_cuda_manual_seed(seed) + else: + raise ValueError('Seed ({}) should be a positive integer.'.format(seed_)) + + +def set_jit_fusion_options(): + """Set PyTorch JIT layer fusion options.""" + # set flags if we are using the 21.10 container + if torch.__version__ == "1.10.0a0+0aef44c": + # nvfuser + torch._C._jit_set_profiling_executor(True) + torch._C._jit_set_profiling_mode(True) + torch._C._jit_override_can_fuse_on_cpu(False) + torch._C._jit_override_can_fuse_on_gpu(False) + torch._C._jit_set_texpr_fuser_enabled(False) + torch._C._jit_set_nvfuser_enabled(True) + torch._C._debug_set_autodiff_subgraph_inlining(False) + + +def fake_initialize_model_parallel( + world_size, + rank, + tensor_model_parallel_size_, + pipeline_model_parallel_size_, + pipeline_model_parallel_split_rank_=None, + virtual_pipeline_model_parallel_size_=None, + expert_model_parallel_size_=1, + context_parallel_size_=1, + use_tp_pp_dp_mapping=False, +): + """ + Fake initialize model data parallel groups so that we can instantiate model parallel models before DDP is initialized. + This is needed because PTL execution flow is init model, init trainer -> call trainer.fit(model). DDP is initialized during .fit. + This function is taken from megatron.core.parallel_state and modified so that the distributed groups are not created. + We only need the tensor parallel and pipeline parallel ranks to instantiate the model. + + Arguments: + tensor_model_parallel_size: number of GPUs used to parallelize model tensor. + pipeline_model_parallel_size: number of GPUs used to parallelize model pipeline. + context_parallel_size: number of GPUs used to parallelize tokens of each input. + + Let's say we have a total of 16 GPUs denoted by g0 ... g15 and we + use 2 GPUs to parallelize the model tensor, and 4 GPUs to parallelize + the model pipeline. The present function will + create 8 tensor model-parallel groups, 4 pipeline model-parallel groups + and 8 data-parallel groups as: + 8 data_parallel groups: + [g0, g2], [g1, g3], [g4, g6], [g5, g7], [g8, g10], [g9, g11], [g12, g14], [g13, g15] + 8 tensor model-parallel groups: + [g0, g1], [g2, g3], [g4, g5], [g6, g7], [g8, g9], [g10, g11], [g12, g13], [g14, g15] + 4 pipeline model-parallel groups: + [g0, g4, g8, g12], [g1, g5, g9, g13], [g2, g6, g10, g14], [g3, g7, g11, g15] + Note that for efficiency, the caller should make sure adjacent ranks + are on the same DGX box. For example if we are using 2 DGX-1 boxes + with a total of 16 GPUs, rank 0 to 7 belong to the first box and + ranks 8 to 15 belong to the second box. + """ + + # Get world size and rank. Ensure some consistencies. 
+    tensor_model_parallel_size = min(tensor_model_parallel_size_, world_size)
+    pipeline_model_parallel_size = min(pipeline_model_parallel_size_, world_size)
+    model_parallel_size = tensor_model_parallel_size * pipeline_model_parallel_size
+    context_parallel_size = min(context_parallel_size_, world_size)
+
+    assert (
+        world_size % (tensor_model_parallel_size * pipeline_model_parallel_size * context_parallel_size) == 0
+    ), f'world_size: {world_size} must be divisible by tensor_model_parallel_size: {tensor_model_parallel_size} times pipeline_model_parallel_size {pipeline_model_parallel_size} times context_parallel_size {context_parallel_size}'
+    data_parallel_size = world_size // (
+        tensor_model_parallel_size * pipeline_model_parallel_size * context_parallel_size
+    )
+
+    num_tensor_model_parallel_groups = world_size // tensor_model_parallel_size
+    num_pipeline_model_parallel_groups = world_size // pipeline_model_parallel_size
+
+    virtual_pipeline_model_parallel_rank = None
+    if virtual_pipeline_model_parallel_size_ is not None:
+        virtual_pipeline_model_parallel_rank = 0
+
+    rank_generator = RankGenerator(
+        tp=tensor_model_parallel_size,
+        ep=expert_model_parallel_size_,
+        dp=data_parallel_size,
+        pp=pipeline_model_parallel_size,
+        cp=context_parallel_size,
+        order='tp-pp-dp' if use_tp_pp_dp_mapping else 'tp-cp-ep-dp-pp',
+    )
+
+    # Build the data-parallel groups.
+    all_data_parallel_group_ranks_with_cp = []
+    for ranks in rank_generator.get_ranks('dp'):
+        if rank in ranks:
+            data_parallel_group = list(ranks)
+            logging.info(f'Rank {rank} has data parallel group : {data_parallel_group}')
+
+    for ranks_with_cp in rank_generator.get_ranks('dp-cp'):
+        all_data_parallel_group_ranks_with_cp.append(ranks_with_cp)
+        if rank in ranks_with_cp:
+            data_parallel_group_with_cp = ranks_with_cp
+            logging.info(
+                f'Rank {rank} has combined group of data parallel and context parallel : {data_parallel_group_with_cp}'
+            )
+
+    data_parallel_rank = data_parallel_group.index(rank)
+    logging.info(
+        f'All data parallel group ranks with context parallel combined: {all_data_parallel_group_ranks_with_cp}'
+    )
+    logging.info(f'Rank {rank} has data parallel rank: {data_parallel_rank}')
+
+    # Build the context-parallel groups.
+    all_context_parallel_group_ranks = []
+    for ranks in rank_generator.get_ranks('cp'):
+        all_context_parallel_group_ranks.append(ranks)
+        if rank in ranks:
+            context_parallel_group = ranks
+            logging.info(f'Rank {rank} has context parallel group: {context_parallel_group}')
+
+    context_parallel_rank = context_parallel_group.index(rank)
+    logging.info(f'All context parallel group ranks: {all_context_parallel_group_ranks}')
+    logging.info(f'Rank {rank} has context parallel rank: {context_parallel_rank}')
+
+    # Build the model-parallel groups.
+    all_model_parallel_group_ranks = []
+    for ranks in rank_generator.get_ranks('tp-pp'):
+        all_model_parallel_group_ranks.append(ranks)
+        if rank in ranks:
+            logging.info(f'Rank {rank} has model parallel group: {list(ranks)}')
+    logging.info(f'All model parallel group ranks: {all_model_parallel_group_ranks}')
+
+    # Build the tensor model-parallel groups.
+    all_tensor_model_parallel_group_ranks = []
+    tensor_model_parallel_group = None
+    for ranks in rank_generator.get_ranks('tp'):
+        all_tensor_model_parallel_group_ranks.append(ranks)
+        if rank in ranks:
+            tensor_model_parallel_group = ranks
+            logging.info(f'Rank {rank} has tensor model parallel group: {tensor_model_parallel_group}')
+
+    tensor_model_parallel_rank = tensor_model_parallel_group.index(rank)
+
+    logging.info(f'All tensor model parallel group ranks: {all_tensor_model_parallel_group_ranks}')
+    logging.info(f'Rank {rank} has tensor model parallel rank: {tensor_model_parallel_rank}')
+
+    # EP rank
+    expert_model_parallel_rank = 0
+    if expert_model_parallel_size_ is not None and expert_model_parallel_size_ > 1:
+        for ranks in rank_generator.get_ranks('ep', independent_ep=True):
+            if rank in ranks:
+                expert_model_parallel_rank = list(ranks).index(rank)
+
+    # Build the pipeline model-parallel groups and embedding groups
+    # (first and last rank in each pipeline model-parallel group).
+    all_pipeline_model_parallel_group_ranks = []
+    all_embedding_group_ranks = []
+    pipeline_model_parallel_group = None
+    embedding_group = None
+    embedding_rank = None
+    for ranks in rank_generator.get_ranks('pp'):
+        all_pipeline_model_parallel_group_ranks.append(ranks)
+        if rank in ranks:
+            pipeline_model_parallel_group = ranks
+            logging.info(f'Rank {rank} has pipeline model parallel group: {pipeline_model_parallel_group}')
+
+        # Setup embedding group (to exchange gradients between
+        # first and last stages).
+        if len(ranks) > 1:
+            embedding_ranks = [ranks[0], ranks[-1]]
+            all_embedding_group_ranks.append(embedding_ranks)
+        else:
+            embedding_ranks = ranks
+            all_embedding_group_ranks.append(list(embedding_ranks))
+        if rank in embedding_ranks:
+            embedding_group = list(embedding_ranks)
+            logging.info(f'Rank {rank} has embedding group: {embedding_group}')
+
+    pipeline_model_parallel_rank = pipeline_model_parallel_group.index(rank)
+    if embedding_group is not None:
+        embedding_rank = embedding_group.index(rank)
+
+    logging.info(f'All pipeline model parallel group ranks: {all_pipeline_model_parallel_group_ranks}')
+    logging.info(f'Rank {rank} has pipeline model parallel rank {pipeline_model_parallel_rank}')
+    logging.info(f'All embedding group ranks: {all_embedding_group_ranks}')
+    logging.info(f'Rank {rank} has embedding rank: {embedding_rank}')
+
+    return (
+        tensor_model_parallel_rank,
+        pipeline_model_parallel_rank,
+        expert_model_parallel_rank,
+        model_parallel_size,
+        data_parallel_size,
+        pipeline_model_parallel_split_rank_,
+        virtual_pipeline_model_parallel_rank,
+    )
diff --git a/requirements/requirements_common.txt b/requirements/requirements_common.txt
index 616381ed5933..d8ad52452c7c 100644
--- a/requirements/requirements_common.txt
+++ b/requirements/requirements_common.txt
@@ -1,4 +1,5 @@
 datasets
+einops
 inflect
 pandas
 sacremoses>=0.0.43
diff --git a/requirements/requirements_nlp.txt b/requirements/requirements_nlp.txt
index f2b074a5975b..b7cac6a3b827 100644
--- a/requirements/requirements_nlp.txt
+++ b/requirements/requirements_nlp.txt
@@ -1,6 +1,5 @@
 accelerated-scan
 boto3
-einops
 faiss-cpu
 fasttext
 flask_restful
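To make the parallel-layout bookkeeping in the new nemo/lightning/megatron_init.py easier to follow, here is a small self-contained sketch, not part of the patch, of the divisibility check and group counts that `fake_initialize_model_parallel` computes, using the 16-GPU example from its docstring (tensor parallel 2, pipeline parallel 4, context parallel 1):

world_size = 16
tensor_model_parallel_size = 2
pipeline_model_parallel_size = 4
context_parallel_size = 1

# world_size must be divisible by tp * pp * cp, otherwise the layout is invalid.
assert world_size % (
    tensor_model_parallel_size * pipeline_model_parallel_size * context_parallel_size
) == 0

data_parallel_size = world_size // (
    tensor_model_parallel_size * pipeline_model_parallel_size * context_parallel_size
)
num_tensor_model_parallel_groups = world_size // tensor_model_parallel_size
num_pipeline_model_parallel_groups = world_size // pipeline_model_parallel_size
num_data_parallel_groups = world_size // data_parallel_size

print(data_parallel_size)                  # 2 ranks per data-parallel group
print(num_tensor_model_parallel_groups)    # 8 tensor model-parallel groups
print(num_pipeline_model_parallel_groups)  # 4 pipeline model-parallel groups
print(num_data_parallel_groups)            # 8 data-parallel groups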