From 58da88610f73a712684429b8207a5d9039924869 Mon Sep 17 00:00:00 2001 From: Marc Romeyn Date: Tue, 22 Oct 2024 17:58:21 +0200 Subject: [PATCH 01/12] Reflect CLI change nemorun -> nemo (#10443) Signed-off-by: Marc Romeijn Co-authored-by: Alexandros Koumparoulis <153118171+akoumpa@users.noreply.github.com> --- examples/llm/pretrain/README.md | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/examples/llm/pretrain/README.md b/examples/llm/pretrain/README.md index c9bb7331f972..61f64d7792bb 100644 --- a/examples/llm/pretrain/README.md +++ b/examples/llm/pretrain/README.md @@ -3,7 +3,7 @@ ### Listing the available recipes for pretraining ```bash -nemorun llm pretrain --help +nemo llm pretrain --help ``` ![recipe-listing](https://github.com/NVIDIA/NeMo/releases/download/v2.0.0rc0/list-recipes.png) @@ -12,7 +12,7 @@ nemorun llm pretrain --help ### Run pre-training with a default recipe ```bash -nemorun llm pretrain --factory llama3_8b +nemo llm pretrain --factory llama3_8b ``` ![llama3_70b](https://github.com/NVIDIA/NeMo/releases/download/v2.0.0rc0/llama3_70b.png) @@ -20,7 +20,7 @@ nemorun llm pretrain --factory llama3_8b We can also call the factory function with custom parameters: ```bash -nemorun llm pretrain --factory "llama3_70b(num_nodes=128)" +nemo llm pretrain --factory "llama3_70b(num_nodes=128)" ``` ![llama3_70b-128-nodes](https://github.com/NVIDIA/NeMo/releases/download/v2.0.0rc0/llama3_70b_128nodes.png) @@ -29,13 +29,13 @@ nemorun llm pretrain --factory "llama3_70b(num_nodes=128)" The CLI allows you to overwrite any parameter. For example, to run the recipe with 2000 steps: ```bash -nemorun llm pretrain --factory llama3_70b trainer.max_steps=2000 +nemo llm pretrain --factory llama3_70b trainer.max_steps=2000 ``` The syntax of the CLI is the same as the Python code. Which is great but in some cases you might want to inspect & edit a recipe interactively. An easy way to do this using the cli is the use the `--repl` flag. ```bash -nemorun llm pretrain --factory llama3_70b --repl +nemo llm pretrain --factory llama3_70b --repl ``` ![repl](https://github.com/NVIDIA/NeMo/releases/download/v2.0.0rc0/repl.gif) From 746203add92094e385a97bfe54f819b1dd45146e Mon Sep 17 00:00:00 2001 From: Ali Taghibakhshi <71892896+JRD971000@users.noreply.github.com> Date: Tue, 22 Oct 2024 19:02:32 +0300 Subject: [PATCH 02/12] minor fix (#10990) Co-authored-by: Ali Taghibakhshi --- scripts/checkpoint_converters/convert_llama_hf_to_nemo_load.py | 2 +- .../checkpoint_converters/convert_llama_hf_to_nemo_save_dict.py | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/scripts/checkpoint_converters/convert_llama_hf_to_nemo_load.py b/scripts/checkpoint_converters/convert_llama_hf_to_nemo_load.py index f395e34765d0..42d3e77ce4c8 100644 --- a/scripts/checkpoint_converters/convert_llama_hf_to_nemo_load.py +++ b/scripts/checkpoint_converters/convert_llama_hf_to_nemo_load.py @@ -15,7 +15,7 @@ r""" Conversion script to convert Huggingface LLaMA checkpoints into nemo checkpoint. 
Example to run this conversion script: - python convert_llama_hf_to_nemo.py \ + python convert_llama_hf_to_nemo_load.py \ --input_name_or_path \ --input_state_dict \ --output_path \ diff --git a/scripts/checkpoint_converters/convert_llama_hf_to_nemo_save_dict.py b/scripts/checkpoint_converters/convert_llama_hf_to_nemo_save_dict.py index 940a9df5f9a8..f7096996e5b1 100644 --- a/scripts/checkpoint_converters/convert_llama_hf_to_nemo_save_dict.py +++ b/scripts/checkpoint_converters/convert_llama_hf_to_nemo_save_dict.py @@ -15,7 +15,7 @@ r""" Conversion script to convert Huggingface LLaMA checkpoints into nemo checkpoint. Example to run this conversion script: - python convert_llama_hf_to_nemo.py \ + python convert_llama_hf_to_nemo_save_dict.py \ --input_name_or_path \ --output_path --precision bf16 From 70d8cc191b322d25fdb9428396c21a66d19f3ffb Mon Sep 17 00:00:00 2001 From: anteju <108555623+anteju@users.noreply.github.com> Date: Tue, 22 Oct 2024 09:33:55 -0700 Subject: [PATCH 03/12] Fixed sampler override and audio_key in prepare_audio_data (#10980) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Signed-off-by: Ante Jukić --- examples/audio/process_audio.py | 4 ++-- nemo/collections/asr/parts/utils/transcribe_utils.py | 2 +- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/examples/audio/process_audio.py b/examples/audio/process_audio.py index e28fb4e69627..ec88bda34954 100644 --- a/examples/audio/process_audio.py +++ b/examples/audio/process_audio.py @@ -159,8 +159,8 @@ def main(cfg: ProcessConfig) -> ProcessConfig: audio_to_audio_model.set_trainer(trainer) audio_to_audio_model = audio_to_audio_model.eval() - # override sampler - if cfg.sampler is not None: + # override sampler if necessary + if cfg.sampler: logging.info('Overriding sampler with %s', cfg.sampler) if hasattr(audio_to_audio_model, 'sampler'): diff --git a/nemo/collections/asr/parts/utils/transcribe_utils.py b/nemo/collections/asr/parts/utils/transcribe_utils.py index c1e712c44aeb..0d4f4c895bcf 100644 --- a/nemo/collections/asr/parts/utils/transcribe_utils.py +++ b/nemo/collections/asr/parts/utils/transcribe_utils.py @@ -314,7 +314,7 @@ def prepare_audio_data(cfg: DictConfig) -> Tuple[List[str], bool]: with NamedTemporaryFile(mode='w', suffix='.json', delete=False) as f: for item in read_and_maybe_sort_manifest(cfg.dataset_manifest, try_sort=cfg.presort_manifest): audio_file = get_full_path(audio_file=item[audio_key], manifest_file=cfg.dataset_manifest) - item[audio_key] = audio_file + item['audio_filepath'] = audio_file filepaths.append(audio_file) f.write(json.dumps(item) + "\n") sorted_manifest_path = f.name From c20e8922c434ccc22b7a8bf62acdb3276bd7a9f7 Mon Sep 17 00:00:00 2001 From: Chen Cui Date: Tue, 22 Oct 2024 14:08:43 -0400 Subject: [PATCH 04/12] Add more recipes (#10957) * add recipes Signed-off-by: Chen Cui * adjust finetuning recipe Signed-off-by: Chen Cui * Apply isort and black reformatting Signed-off-by: cuichenx --------- Signed-off-by: Chen Cui Signed-off-by: cuichenx Co-authored-by: cuichenx --- nemo/collections/llm/gpt/model/baichuan.py | 2 +- nemo/collections/llm/gpt/model/chatglm.py | 4 +- nemo/collections/llm/recipes/__init__.py | 8 + nemo/collections/llm/recipes/baichuan2_7b.py | 285 ++++++++++++++++++ nemo/collections/llm/recipes/chatglm3_6b.py | 283 +++++++++++++++++ .../llm/recipes/finetune_default.py | 8 +- nemo/collections/llm/recipes/gemma_2b.py | 285 ++++++++++++++++++ nemo/collections/llm/recipes/gemma_7b.py | 285 ++++++++++++++++++ 
nemo/collections/llm/recipes/optim/adam.py | 8 +- 9 files changed, 1158 insertions(+), 10 deletions(-) create mode 100644 nemo/collections/llm/recipes/baichuan2_7b.py create mode 100644 nemo/collections/llm/recipes/chatglm3_6b.py create mode 100644 nemo/collections/llm/recipes/gemma_2b.py create mode 100644 nemo/collections/llm/recipes/gemma_7b.py diff --git a/nemo/collections/llm/gpt/model/baichuan.py b/nemo/collections/llm/gpt/model/baichuan.py index 56231978061f..c283b802a118 100644 --- a/nemo/collections/llm/gpt/model/baichuan.py +++ b/nemo/collections/llm/gpt/model/baichuan.py @@ -215,7 +215,7 @@ def _import_qkv(ctx: io.TransformCTX, qkv_weights): q = qkv_weights[0].squeeze().view(*new_q_tensor_shape) k = qkv_weights[1].squeeze().view(*new_kv_tensor_shape) v = qkv_weights[2].squeeze().view(*new_kv_tensor_shape) - qkv_weights = torch.empty((0, head_size) + old_tensor_shape[1:]) + qkv_weights = torch.empty((0, head_size) + old_tensor_shape[1:]).type_as(qkv_weights) for i in range(num_query_groups): qkv_weights = torch.cat((qkv_weights, q[i * heads_per_group : (i + 1) * heads_per_group, :, :])) qkv_weights = torch.cat((qkv_weights, k[i : i + 1, :, :])) diff --git a/nemo/collections/llm/gpt/model/chatglm.py b/nemo/collections/llm/gpt/model/chatglm.py index 5bd1319102e2..e7450a8db28d 100644 --- a/nemo/collections/llm/gpt/model/chatglm.py +++ b/nemo/collections/llm/gpt/model/chatglm.py @@ -221,7 +221,7 @@ def _import_qkv_weight(ctx: io.TransformCTX, hf_qkv_weights): k = k.view(*new_kv_tensor_shape) v = v.view(*new_kv_tensor_shape) - qkv_weights = torch.empty((0, head_size, old_tensor_shape[1])) + qkv_weights = torch.empty((0, head_size, old_tensor_shape[1])).type_as(hf_qkv_weights) for i in range(num_query_groups): qkv_weights = torch.cat((qkv_weights, q[i * heads_per_group : (i + 1) * heads_per_group, :, :])) qkv_weights = torch.cat((qkv_weights, k[i : i + 1, :, :])) @@ -251,7 +251,7 @@ def _import_qkv_bias(ctx: io.TransformCTX, hf_qkv_bias): q = q.view(*new_q_tensor_shape) k = k.view(*new_kv_tensor_shape) v = v.view(*new_kv_tensor_shape) - qkv_bias = torch.empty((0, head_size)) + qkv_bias = torch.empty((0, head_size)).type_as(hf_qkv_bias) for i in range(num_query_groups): qkv_bias = torch.cat((qkv_bias, q[i * heads_per_group : (i + 1) * heads_per_group, :])) qkv_bias = torch.cat((qkv_bias, k[i : i + 1, :])) diff --git a/nemo/collections/llm/recipes/__init__.py b/nemo/collections/llm/recipes/__init__.py index 47cc4e71448d..ff81c3b383fc 100644 --- a/nemo/collections/llm/recipes/__init__.py +++ b/nemo/collections/llm/recipes/__init__.py @@ -14,6 +14,10 @@ from nemo.collections.llm.recipes import ( + baichuan2_7b, + chatglm3_6b, + gemma_2b, + gemma_7b, llama3_8b, llama3_8b_16k, llama3_8b_64k, @@ -49,6 +53,10 @@ from nemo.collections.llm.recipes.optim import adam __all__ = [ + "baichuan2_7b", + "chatglm3_6b", + "gemma_2b", + "gemma_7b", "llama3_8b", "llama3_8b_16k", "llama3_8b_64k", diff --git a/nemo/collections/llm/recipes/baichuan2_7b.py b/nemo/collections/llm/recipes/baichuan2_7b.py new file mode 100644 index 000000000000..3ebb643af779 --- /dev/null +++ b/nemo/collections/llm/recipes/baichuan2_7b.py @@ -0,0 +1,285 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + + +from typing import Callable, Optional + +import nemo_run as run +import pytorch_lightning as pl +import torch +from megatron.core.distributed import DistributedDataParallelConfig +from pytorch_lightning.callbacks.callback import Callback + +from nemo import lightning as nl +from nemo.collections.llm import Baichuan2Config7B, Baichuan2Model +from nemo.collections.llm.api import finetune, pretrain +from nemo.collections.llm.gpt.data.mock import MockDataModule +from nemo.collections.llm.peft.lora import LoRA +from nemo.collections.llm.recipes.finetune_default import default_finetune_recipe +from nemo.collections.llm.recipes.log.default import default_log, default_resume, tensorboard_logger +from nemo.collections.llm.recipes.optim.adam import distributed_fused_adam_with_cosine_annealing +from nemo.collections.llm.recipes.precision.mixed_precision import bf16_mixed +from nemo.lightning.pytorch.callbacks.megatron_comm_overlap import MegatronCommOverlapCallback +from nemo.utils.exp_manager import TimingCallback + +NAME = "baichuan2_7b" + + +@run.cli.factory(name=NAME) +def model() -> run.Config[pl.LightningModule]: + """ + Factory function to create a Baichuan2 7B model configuration. + + Returns: + run.Config[pl.LightningModule]: Configuration for the Baichuan2 7B model. + + Examples: + CLI usage: + $ nemo llm pretrain model=baichuan2_7b ... + + Python API usage: + >>> model_config = model() + >>> print(model_config) + """ + return run.Config(Baichuan2Model, config=run.Config(Baichuan2Config7B)) + + +def trainer( + tensor_parallelism: int = 1, + pipeline_parallelism: int = 1, + pipeline_parallelism_type: Optional[torch.dtype] = None, + virtual_pipeline_parallelism: Optional[int] = None, + context_parallelism: int = 2, + sequence_parallelism: bool = False, + num_nodes: int = 1, + num_gpus_per_node: int = 8, + max_steps: int = 1168251, + callbacks: Optional[list[run.Config[Callback]]] = None, +) -> run.Config[nl.Trainer]: + """ + Configure the NeMo Lightning Trainer for Baichuan2 7B model. + + This function sets up the distributed training strategy and other training parameters. + + Args: + tensor_parallelism (int): Degree of tensor model parallelism. + pipeline_parallelism (int): Degree of pipeline model parallelism. + pipeline_parallelism_type (Optional[torch.dtype]): Data type for pipeline parallelism. + virtual_pipeline_parallelism (Optional[int]): Size of virtual pipeline parallelism. + context_parallelism (int): Degree of context parallelism. + sequence_parallelism (bool): Whether to use sequence parallelism. + num_nodes (int): Number of compute nodes to use. + num_gpus_per_node (int): Number of GPUs per node. + max_steps (int): Maximum number of training steps. + callbacks (Optional[list[run.Config[Callback]]]): List of callback configurations. + + Returns: + run.Config[nl.Trainer]: Configuration for the NeMo Lightning Trainer. + + Examples: + CLI usage: + $ nemo llm pretrain trainer=baichuan2_7b ... 
+ + Python API usage: + >>> trainer_config = trainer(num_nodes=2, num_gpus_per_node=8) + >>> print(trainer_config) + + Note: + For more information on distributed training strategies, refer to the + NeMo documentation on multi-GPU and multi-node training. + """ + strategy = run.Config( + nl.MegatronStrategy, + tensor_model_parallel_size=tensor_parallelism, + pipeline_model_parallel_size=pipeline_parallelism, + pipeline_dtype=pipeline_parallelism_type, + virtual_pipeline_model_parallel_size=virtual_pipeline_parallelism, + context_parallel_size=context_parallelism, + sequence_parallel=sequence_parallelism, + gradient_as_bucket_view=True, + ckpt_async_save=True, + ckpt_parallel_load=True, + ddp=run.Config( + DistributedDataParallelConfig, + check_for_nan_in_grad=True, + grad_reduce_in_fp32=True, + overlap_grad_reduce=True, + overlap_param_gather=True, + ), + ) + + trainer = run.Config( + nl.Trainer, + accelerator="gpu", + accumulate_grad_batches=1, + callbacks=callbacks, + devices=num_gpus_per_node, + limit_test_batches=50, + limit_val_batches=32, + log_every_n_steps=10, + max_steps=max_steps, + num_nodes=num_nodes, + plugins=bf16_mixed(), + strategy=strategy, + use_distributed_sampler=False, + val_check_interval=2000, + ) + + return trainer + + +@run.cli.factory(target=pretrain, name=NAME) +def pretrain_recipe( + dir: Optional[str] = None, name: str = "default", num_nodes: int = 1, num_gpus_per_node: int = 8, fn=pretrain +) -> run.Partial: + """ + Create a pre-training recipe for Baichuan2 7B model. + + This function sets up a complete configuration for pre-training, including + model, trainer, data, logging, optimization, and resumption settings. + + Args: + dir (Optional[str]): Directory for saving logs and checkpoints. + name (str): Name of the pre-training run. + num_nodes (int): Number of compute nodes to use. + num_gpus_per_node (int): Number of GPUs per node. + fn (Callable): The pre-training function to use. + + Returns: + run.Partial: Partial configuration for pre-training. + + Examples: + CLI usage: + $ nemo llm pretrain --factory baichuan2_7b + $ nemo llm pretrain --factory "baichuan2_7b(num_nodes=2, name='my_pretrain')" + + Python API usage: + >>> recipe = pretrain_recipe(name="baichuan2_7b_pretrain", num_nodes=2) + >>> print(recipe) + + Note: + For more details on pre-training LLMs with NeMo, see the pre-training + guide in the `examples/llm/pretrain/` directory. + """ + return run.Partial( + fn, + model=model(), + trainer=trainer( + num_nodes=num_nodes, + num_gpus_per_node=num_gpus_per_node, + callbacks=[run.Config(TimingCallback)], + ), + data=run.Config(MockDataModule, seq_length=8192, global_batch_size=512, micro_batch_size=1), + log=default_log(dir=dir, name=name, tensorboard_logger=tensorboard_logger(name=name)), + optim=distributed_fused_adam_with_cosine_annealing(max_lr=3e-4), + resume=default_resume(), + ) + + +@run.cli.factory(target=pretrain, name=NAME + "_optimized") +def pretrain_recipe_performance( + dir: Optional[str] = None, + name: str = "default", + num_nodes: int = 1, + num_gpus_per_node: int = 8, + fn: Callable = pretrain, +) -> run.Partial: + """ + Create a performance-optimized pre-training recipe for Baichuan2 7B model. + + This recipe enables performance optimizations that may not be suitable for all use cases. + It builds upon the standard pre-training recipe and adds additional performance enhancements. + + Args: + dir (Optional[str]): Directory for saving logs and checkpoints. + name (str): Name of the pre-training run. 
+ num_nodes (int): Number of compute nodes to use. + num_gpus_per_node (int): Number of GPUs per node. + fn (Callable): The pre-training function to use. + + Returns: + run.Partial: Partial configuration for performance-optimized pre-training. + + Examples: + $ nemo llm pretrain --factory baichuan2_7b_optimized + + Python API usage: + >>> recipe = pretrain_recipe_performance(name="baichuan2_7b_perf", num_nodes=4) + >>> print(recipe) + + Note: + Use this recipe with caution and only when you need maximum performance. + It may not be suitable for all hardware configurations or use cases. + """ + recipe = pretrain_recipe(name=name, dir=dir, num_nodes=num_nodes, num_gpus_per_node=num_gpus_per_node, fn=fn) + + recipe.trainer.callbacks.append( + run.Config( + MegatronCommOverlapCallback, + tp_comm_overlap=False, + ) + ) + return recipe + + +@run.cli.factory(target=finetune, name=NAME) +def finetune_recipe( + dir: Optional[str] = None, + name: str = "default", + num_nodes: int = 1, + num_gpus_per_node: int = 8, + peft_scheme: Optional[str] = 'lora', +) -> run.Partial: + """ + Create a fine-tuning recipe for Baichuan2 7B model. + + This function sets up a complete configuration for fine-tuning, including + model, trainer, data, logging, optimization, and resumption settings. + The recipe uses LoRA (Low-Rank Adaptation) for efficient fine-tuning, unless peft_scheme is set to None. + + Args: + dir (Optional[str]): Directory for saving logs and checkpoints. + name (str): Name of the fine-tuning run. + num_nodes (int): Number of compute nodes to use. + num_gpus_per_node (int): Number of GPUs per node. + peft_scheme (Optional[str]): Name of the peft scheme to use for fine-tuning. Allowed values: 'lora', 'none'/None. + + Returns: + run.Partial: Partial configuration for fine-tuning. + + Examples: + CLI usage: + $ nemo llm finetune --factory baichuan2_7b + + Python API usage: + >>> recipe = finetune_recipe(name="baichuan2_7b_finetune", num_nodes=2) + >>> print(recipe) + + Note: + This recipe uses the SQuAD dataset for fine-tuning. For more information + on fine-tuning LLMs with NeMo, see the fine-tuning guide in the + `examples/llm/finetune/` directory. + """ + recipe = default_finetune_recipe( + model(), "baichuan-inc/Baichuan2-7B-Base", dir, name, num_nodes, num_gpus_per_node + ) + if peft_scheme is None or peft_scheme.lower() == 'none': + recipe.trainer.strategy.tensor_model_parallel_size = 2 + recipe.optim.config.lr = 5e-6 + elif peft_scheme.lower() == 'lora': + recipe.peft = run.Config(LoRA) + recipe.optim.config.lr = 1e-4 + else: + raise ValueError(f"Unrecognized peft scheme: {peft_scheme}") + return recipe diff --git a/nemo/collections/llm/recipes/chatglm3_6b.py b/nemo/collections/llm/recipes/chatglm3_6b.py new file mode 100644 index 000000000000..f5d580a9c6ea --- /dev/null +++ b/nemo/collections/llm/recipes/chatglm3_6b.py @@ -0,0 +1,283 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ + +from typing import Callable, Optional + +import nemo_run as run +import pytorch_lightning as pl +import torch +from megatron.core.distributed import DistributedDataParallelConfig +from pytorch_lightning.callbacks.callback import Callback + +from nemo import lightning as nl +from nemo.collections.llm import ChatGLM3Config6B, ChatGLMModel +from nemo.collections.llm.api import finetune, pretrain +from nemo.collections.llm.gpt.data.mock import MockDataModule +from nemo.collections.llm.peft.lora import LoRA +from nemo.collections.llm.recipes.finetune_default import default_finetune_recipe +from nemo.collections.llm.recipes.log.default import default_log, default_resume, tensorboard_logger +from nemo.collections.llm.recipes.optim.adam import distributed_fused_adam_with_cosine_annealing +from nemo.collections.llm.recipes.precision.mixed_precision import bf16_mixed +from nemo.lightning.pytorch.callbacks.megatron_comm_overlap import MegatronCommOverlapCallback +from nemo.utils.exp_manager import TimingCallback + +NAME = "chatglm3_6b" + + +@run.cli.factory(name=NAME) +def model() -> run.Config[pl.LightningModule]: + """ + Factory function to create a ChatGLM3 6B model configuration. + + Returns: + run.Config[pl.LightningModule]: Configuration for the ChatGLM3 6B model. + + Examples: + CLI usage: + $ nemo llm pretrain model=chatglm3_6b ... + + Python API usage: + >>> model_config = model() + >>> print(model_config) + """ + return run.Config(ChatGLMModel, config=run.Config(ChatGLM3Config6B)) + + +def trainer( + tensor_parallelism: int = 1, + pipeline_parallelism: int = 1, + pipeline_parallelism_type: Optional[torch.dtype] = None, + virtual_pipeline_parallelism: Optional[int] = None, + context_parallelism: int = 2, + sequence_parallelism: bool = False, + num_nodes: int = 1, + num_gpus_per_node: int = 8, + max_steps: int = 1168251, + callbacks: Optional[list[run.Config[Callback]]] = None, +) -> run.Config[nl.Trainer]: + """ + Configure the NeMo Lightning Trainer for ChatGLM3 6B model. + + This function sets up the distributed training strategy and other training parameters. + + Args: + tensor_parallelism (int): Degree of tensor model parallelism. + pipeline_parallelism (int): Degree of pipeline model parallelism. + pipeline_parallelism_type (Optional[torch.dtype]): Data type for pipeline parallelism. + virtual_pipeline_parallelism (Optional[int]): Size of virtual pipeline parallelism. + context_parallelism (int): Degree of context parallelism. + sequence_parallelism (bool): Whether to use sequence parallelism. + num_nodes (int): Number of compute nodes to use. + num_gpus_per_node (int): Number of GPUs per node. + max_steps (int): Maximum number of training steps. + callbacks (Optional[list[run.Config[Callback]]]): List of callback configurations. + + Returns: + run.Config[nl.Trainer]: Configuration for the NeMo Lightning Trainer. + + Examples: + CLI usage: + $ nemo llm pretrain trainer=chatglm3_6b ... + + Python API usage: + >>> trainer_config = trainer(num_nodes=2, num_gpus_per_node=8) + >>> print(trainer_config) + + Note: + For more information on distributed training strategies, refer to the + NeMo documentation on multi-GPU and multi-node training. 
+ """ + strategy = run.Config( + nl.MegatronStrategy, + tensor_model_parallel_size=tensor_parallelism, + pipeline_model_parallel_size=pipeline_parallelism, + pipeline_dtype=pipeline_parallelism_type, + virtual_pipeline_model_parallel_size=virtual_pipeline_parallelism, + context_parallel_size=context_parallelism, + sequence_parallel=sequence_parallelism, + gradient_as_bucket_view=True, + ckpt_async_save=True, + ckpt_parallel_load=True, + ddp=run.Config( + DistributedDataParallelConfig, + check_for_nan_in_grad=True, + grad_reduce_in_fp32=True, + overlap_grad_reduce=True, + overlap_param_gather=True, + ), + ) + + trainer = run.Config( + nl.Trainer, + accelerator="gpu", + accumulate_grad_batches=1, + callbacks=callbacks, + devices=num_gpus_per_node, + limit_test_batches=50, + limit_val_batches=32, + log_every_n_steps=10, + max_steps=max_steps, + num_nodes=num_nodes, + plugins=bf16_mixed(), + strategy=strategy, + use_distributed_sampler=False, + val_check_interval=2000, + ) + + return trainer + + +@run.cli.factory(target=pretrain, name=NAME) +def pretrain_recipe( + dir: Optional[str] = None, name: str = "default", num_nodes: int = 1, num_gpus_per_node: int = 8, fn=pretrain +) -> run.Partial: + """ + Create a pre-training recipe for ChatGLM3 6B model. + + This function sets up a complete configuration for pre-training, including + model, trainer, data, logging, optimization, and resumption settings. + + Args: + dir (Optional[str]): Directory for saving logs and checkpoints. + name (str): Name of the pre-training run. + num_nodes (int): Number of compute nodes to use. + num_gpus_per_node (int): Number of GPUs per node. + fn (Callable): The pre-training function to use. + + Returns: + run.Partial: Partial configuration for pre-training. + + Examples: + CLI usage: + $ nemo llm pretrain --factory chatglm3_6b + $ nemo llm pretrain --factory "chatglm3_6b(num_nodes=2, name='my_pretrain')" + + Python API usage: + >>> recipe = pretrain_recipe(name="chatglm3_6b_pretrain", num_nodes=2) + >>> print(recipe) + + Note: + For more details on pre-training LLMs with NeMo, see the pre-training + guide in the `examples/llm/pretrain/` directory. + """ + return run.Partial( + fn, + model=model(), + trainer=trainer( + num_nodes=num_nodes, + num_gpus_per_node=num_gpus_per_node, + callbacks=[run.Config(TimingCallback)], + ), + data=run.Config(MockDataModule, seq_length=8192, global_batch_size=512, micro_batch_size=1), + log=default_log(dir=dir, name=name, tensorboard_logger=tensorboard_logger(name=name)), + optim=distributed_fused_adam_with_cosine_annealing(max_lr=3e-4), + resume=default_resume(), + ) + + +@run.cli.factory(target=pretrain, name=NAME + "_optimized") +def pretrain_recipe_performance( + dir: Optional[str] = None, + name: str = "default", + num_nodes: int = 1, + num_gpus_per_node: int = 8, + fn: Callable = pretrain, +) -> run.Partial: + """ + Create a performance-optimized pre-training recipe for ChatGLM3 6B model. + + This recipe enables performance optimizations that may not be suitable for all use cases. + It builds upon the standard pre-training recipe and adds additional performance enhancements. + + Args: + dir (Optional[str]): Directory for saving logs and checkpoints. + name (str): Name of the pre-training run. + num_nodes (int): Number of compute nodes to use. + num_gpus_per_node (int): Number of GPUs per node. + fn (Callable): The pre-training function to use. + + Returns: + run.Partial: Partial configuration for performance-optimized pre-training. 
+ + Examples: + $ nemo llm pretrain --factory chatglm3_6b_optimized + + Python API usage: + >>> recipe = pretrain_recipe_performance(name="chatglm3_6b_perf", num_nodes=4) + >>> print(recipe) + + Note: + Use this recipe with caution and only when you need maximum performance. + It may not be suitable for all hardware configurations or use cases. + """ + recipe = pretrain_recipe(name=name, dir=dir, num_nodes=num_nodes, num_gpus_per_node=num_gpus_per_node, fn=fn) + + recipe.trainer.callbacks.append( + run.Config( + MegatronCommOverlapCallback, + tp_comm_overlap=False, + ) + ) + return recipe + + +@run.cli.factory(target=finetune, name=NAME) +def finetune_recipe( + dir: Optional[str] = None, + name: str = "default", + num_nodes: int = 1, + num_gpus_per_node: int = 8, + peft_scheme: Optional[str] = 'lora', +) -> run.Partial: + """ + Create a fine-tuning recipe for ChatGLM3 6B model. + + This function sets up a complete configuration for fine-tuning, including + model, trainer, data, logging, optimization, and resumption settings. + The recipe uses LoRA (Low-Rank Adaptation) for efficient fine-tuning, unless peft_scheme is set to None. + + Args: + dir (Optional[str]): Directory for saving logs and checkpoints. + name (str): Name of the fine-tuning run. + num_nodes (int): Number of compute nodes to use. + num_gpus_per_node (int): Number of GPUs per node. + peft_scheme (Optional[str]): Name of the peft scheme to use for fine-tuning. Allowed values: 'lora', 'none'/None. + + Returns: + run.Partial: Partial configuration for fine-tuning. + + Examples: + CLI usage: + $ nemo llm finetune --factory chatglm3_6b + + Python API usage: + >>> recipe = finetune_recipe(name="chatglm3_6b_finetune", num_nodes=2) + >>> print(recipe) + + Note: + This recipe uses the SQuAD dataset for fine-tuning. For more information + on fine-tuning LLMs with NeMo, see the fine-tuning guide in the + `examples/llm/finetune/` directory. 
+ """ + recipe = default_finetune_recipe(model(), "THUDM/chatglm3-6b", dir, name, num_nodes, num_gpus_per_node) + if peft_scheme is None or peft_scheme.lower() == 'none': + recipe.trainer.strategy.tensor_model_parallel_size = 2 + recipe.optim.config.lr = 5e-6 + elif peft_scheme.lower() == 'lora': + recipe.peft = run.Config(LoRA) + recipe.optim.config.lr = 1e-4 + else: + raise ValueError(f"Unrecognized peft scheme: {peft_scheme}") + return recipe diff --git a/nemo/collections/llm/recipes/finetune_default.py b/nemo/collections/llm/recipes/finetune_default.py index 89c982613126..255763abbf50 100644 --- a/nemo/collections/llm/recipes/finetune_default.py +++ b/nemo/collections/llm/recipes/finetune_default.py @@ -60,7 +60,7 @@ def default_finetune_recipe( ), data=run.Config(llm.SquadDataModule, seq_length=2048, global_batch_size=128, micro_batch_size=1), log=llm.default_log(dir=dir, name=name, tensorboard_logger=tensorboard_logger(name=name)), - optim=distributed_fused_adam_with_cosine_annealing(max_lr=1e-4, min_lr=0, warmup_steps=50), + optim=distributed_fused_adam_with_cosine_annealing(max_lr=1e-4, min_lr=0, warmup_steps=50, adam_beta2=0.98), resume=nemo_resume(resume_path), ) @@ -77,9 +77,9 @@ def default_finetune_trainer( num_nodes=1, num_gpus_per_node=8, max_steps=1000, - limit_test_batches=None, - limit_val_batches=None, - val_check_interval=5, + limit_test_batches=1, + limit_val_batches=1, + val_check_interval=30, ): strategy = run.Config( nl.MegatronStrategy, diff --git a/nemo/collections/llm/recipes/gemma_2b.py b/nemo/collections/llm/recipes/gemma_2b.py new file mode 100644 index 000000000000..cbcd340c1e92 --- /dev/null +++ b/nemo/collections/llm/recipes/gemma_2b.py @@ -0,0 +1,285 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +import os +from typing import Callable, Optional + +import nemo_run as run +import pytorch_lightning as pl +import torch +from megatron.core.distributed import DistributedDataParallelConfig +from pytorch_lightning.callbacks.callback import Callback + +from nemo import lightning as nl +from nemo.collections.llm import GemmaConfig2B, GemmaModel +from nemo.collections.llm.api import finetune, pretrain +from nemo.collections.llm.gpt.data.mock import MockDataModule +from nemo.collections.llm.peft.lora import LoRA +from nemo.collections.llm.recipes.finetune_default import default_finetune_recipe +from nemo.collections.llm.recipes.log.default import default_log, default_resume, tensorboard_logger +from nemo.collections.llm.recipes.optim.adam import distributed_fused_adam_with_cosine_annealing +from nemo.collections.llm.recipes.precision.mixed_precision import bf16_mixed +from nemo.lightning.pytorch.callbacks.megatron_comm_overlap import MegatronCommOverlapCallback +from nemo.utils.exp_manager import TimingCallback + +NAME = "gemma_2b" + + +@run.cli.factory(name=NAME) +def model() -> run.Config[pl.LightningModule]: + """ + Factory function to create a Gemma 2B model configuration. 
+ + Returns: + run.Config[pl.LightningModule]: Configuration for the Gemma 2B model. + + Examples: + CLI usage: + $ nemo llm pretrain model=gemma_2b ... + + Python API usage: + >>> model_config = model() + >>> print(model_config) + """ + return run.Config(GemmaModel, config=run.Config(GemmaConfig2B)) + + +def trainer( + tensor_parallelism: int = 1, + pipeline_parallelism: int = 1, + pipeline_parallelism_type: Optional[torch.dtype] = None, + virtual_pipeline_parallelism: Optional[int] = None, + context_parallelism: int = 2, + sequence_parallelism: bool = False, + num_nodes: int = 1, + num_gpus_per_node: int = 8, + max_steps: int = 1168251, + callbacks: Optional[list[run.Config[Callback]]] = None, +) -> run.Config[nl.Trainer]: + """ + Configure the NeMo Lightning Trainer for Gemma 2B model. + + This function sets up the distributed training strategy and other training parameters. + + Args: + tensor_parallelism (int): Degree of tensor model parallelism. + pipeline_parallelism (int): Degree of pipeline model parallelism. + pipeline_parallelism_type (Optional[torch.dtype]): Data type for pipeline parallelism. + virtual_pipeline_parallelism (Optional[int]): Size of virtual pipeline parallelism. + context_parallelism (int): Degree of context parallelism. + sequence_parallelism (bool): Whether to use sequence parallelism. + num_nodes (int): Number of compute nodes to use. + num_gpus_per_node (int): Number of GPUs per node. + max_steps (int): Maximum number of training steps. + callbacks (Optional[list[run.Config[Callback]]]): List of callback configurations. + + Returns: + run.Config[nl.Trainer]: Configuration for the NeMo Lightning Trainer. + + Examples: + CLI usage: + $ nemo llm pretrain trainer=gemma_2b ... + + Python API usage: + >>> trainer_config = trainer(num_nodes=2, num_gpus_per_node=8) + >>> print(trainer_config) + + Note: + For more information on distributed training strategies, refer to the + NeMo documentation on multi-GPU and multi-node training. + """ + strategy = run.Config( + nl.MegatronStrategy, + tensor_model_parallel_size=tensor_parallelism, + pipeline_model_parallel_size=pipeline_parallelism, + pipeline_dtype=pipeline_parallelism_type, + virtual_pipeline_model_parallel_size=virtual_pipeline_parallelism, + context_parallel_size=context_parallelism, + sequence_parallel=sequence_parallelism, + gradient_as_bucket_view=True, + ckpt_async_save=True, + ckpt_parallel_load=True, + ddp=run.Config( + DistributedDataParallelConfig, + check_for_nan_in_grad=True, + grad_reduce_in_fp32=True, + overlap_grad_reduce=True, + overlap_param_gather=True, + ), + ) + + trainer = run.Config( + nl.Trainer, + accelerator="gpu", + accumulate_grad_batches=1, + callbacks=callbacks, + devices=num_gpus_per_node, + limit_test_batches=50, + limit_val_batches=32, + log_every_n_steps=10, + max_steps=max_steps, + num_nodes=num_nodes, + plugins=bf16_mixed(), + strategy=strategy, + use_distributed_sampler=False, + val_check_interval=2000, + ) + + return trainer + + +@run.cli.factory(target=pretrain, name=NAME) +def pretrain_recipe( + dir: Optional[str] = None, name: str = "default", num_nodes: int = 1, num_gpus_per_node: int = 8, fn=pretrain +) -> run.Partial: + """ + Create a pre-training recipe for Gemma 2B model. + + This function sets up a complete configuration for pre-training, including + model, trainer, data, logging, optimization, and resumption settings. + + Args: + dir (Optional[str]): Directory for saving logs and checkpoints. + name (str): Name of the pre-training run. 
+ num_nodes (int): Number of compute nodes to use. + num_gpus_per_node (int): Number of GPUs per node. + fn (Callable): The pre-training function to use. + + Returns: + run.Partial: Partial configuration for pre-training. + + Examples: + CLI usage: + $ nemo llm pretrain --factory gemma_2b + $ nemo llm pretrain --factory "gemma_2b(num_nodes=2, name='my_pretrain')" + + Python API usage: + >>> recipe = pretrain_recipe(name="gemma_2b_pretrain", num_nodes=2) + >>> print(recipe) + + Note: + For more details on pre-training LLMs with NeMo, see the pre-training + guide in the `examples/llm/pretrain/` directory. + """ + return run.Partial( + fn, + model=model(), + trainer=trainer( + num_nodes=num_nodes, + num_gpus_per_node=num_gpus_per_node, + callbacks=[run.Config(TimingCallback)], + ), + data=run.Config(MockDataModule, seq_length=8192, global_batch_size=512, micro_batch_size=1), + log=default_log(dir=dir, name=name, tensorboard_logger=tensorboard_logger(name=name)), + optim=distributed_fused_adam_with_cosine_annealing(max_lr=3e-4), + resume=default_resume(), + ) + + +@run.cli.factory(target=pretrain, name=NAME + "_optimized") +def pretrain_recipe_performance( + dir: Optional[str] = None, + name: str = "default", + num_nodes: int = 1, + num_gpus_per_node: int = 8, + fn: Callable = pretrain, +) -> run.Partial: + """ + Create a performance-optimized pre-training recipe for Gemma 2B model. + + This recipe enables performance optimizations that may not be suitable for all use cases. + It builds upon the standard pre-training recipe and adds additional performance enhancements. + + Args: + dir (Optional[str]): Directory for saving logs and checkpoints. + name (str): Name of the pre-training run. + num_nodes (int): Number of compute nodes to use. + num_gpus_per_node (int): Number of GPUs per node. + fn (Callable): The pre-training function to use. + + Returns: + run.Partial: Partial configuration for performance-optimized pre-training. + + Examples: + $ nemo llm pretrain --factory gemma_2b_optimized + + Python API usage: + >>> recipe = pretrain_recipe_performance(name="gemma_2b_perf", num_nodes=4) + >>> print(recipe) + + Note: + Use this recipe with caution and only when you need maximum performance. + It may not be suitable for all hardware configurations or use cases. + """ + recipe = pretrain_recipe(name=name, dir=dir, num_nodes=num_nodes, num_gpus_per_node=num_gpus_per_node, fn=fn) + + recipe.trainer.callbacks.append( + run.Config( + MegatronCommOverlapCallback, + tp_comm_overlap=False, + ) + ) + return recipe + + +@run.cli.factory(target=finetune, name=NAME) +def finetune_recipe( + dir: Optional[str] = None, + name: str = "default", + num_nodes: int = 1, + num_gpus_per_node: int = 8, + peft_scheme: Optional[str] = 'lora', +) -> run.Partial: + """ + Create a fine-tuning recipe for Gemma 2B model. + + This function sets up a complete configuration for fine-tuning, including + model, trainer, data, logging, optimization, and resumption settings. + The recipe uses LoRA (Low-Rank Adaptation) for efficient fine-tuning, unless peft_scheme is set to None. + + Args: + dir (Optional[str]): Directory for saving logs and checkpoints. + name (str): Name of the fine-tuning run. + num_nodes (int): Number of compute nodes to use. + num_gpus_per_node (int): Number of GPUs per node. + peft_scheme (Optional[str]): Name of the peft scheme to use for fine-tuning. Allowed values: 'lora', 'none'/None. + + Returns: + run.Partial: Partial configuration for fine-tuning. 
+ + Examples: + CLI usage: + $ nemo llm finetune --factory gemma_2b + + Python API usage: + >>> recipe = finetune_recipe(name="gemma_2b_finetune", num_nodes=2) + >>> print(recipe) + + Note: + This recipe uses the SQuAD dataset for fine-tuning. For more information + on fine-tuning LLMs with NeMo, see the fine-tuning guide in the + `examples/llm/finetune/` directory. + """ + # Disable cuDNN attention since TE 1.8 does not support head dim > 128 + os.environ['NVTE_FUSED_ATTN'] = "0" + + recipe = default_finetune_recipe(model(), "google/gemma-2b", dir, name, num_nodes, num_gpus_per_node) + if peft_scheme is None or peft_scheme.lower() == 'none': + recipe.trainer.strategy.tensor_model_parallel_size = 2 + recipe.optim.config.lr = 5e-6 + elif peft_scheme.lower() == 'lora': + recipe.peft = run.Config(LoRA) + recipe.optim.config.lr = 1e-4 + else: + raise ValueError(f"Unrecognized peft scheme: {peft_scheme}") + return recipe diff --git a/nemo/collections/llm/recipes/gemma_7b.py b/nemo/collections/llm/recipes/gemma_7b.py new file mode 100644 index 000000000000..3b0e206d9ce7 --- /dev/null +++ b/nemo/collections/llm/recipes/gemma_7b.py @@ -0,0 +1,285 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +import os +from typing import Callable, Optional + +import nemo_run as run +import pytorch_lightning as pl +import torch +from megatron.core.distributed import DistributedDataParallelConfig +from pytorch_lightning.callbacks.callback import Callback + +from nemo import lightning as nl +from nemo.collections.llm import GemmaConfig7B, GemmaModel +from nemo.collections.llm.api import finetune, pretrain +from nemo.collections.llm.gpt.data.mock import MockDataModule +from nemo.collections.llm.peft.lora import LoRA +from nemo.collections.llm.recipes.finetune_default import default_finetune_recipe +from nemo.collections.llm.recipes.log.default import default_log, default_resume, tensorboard_logger +from nemo.collections.llm.recipes.optim.adam import distributed_fused_adam_with_cosine_annealing +from nemo.collections.llm.recipes.precision.mixed_precision import bf16_mixed +from nemo.lightning.pytorch.callbacks.megatron_comm_overlap import MegatronCommOverlapCallback +from nemo.utils.exp_manager import TimingCallback + +NAME = "gemma_7b" + + +@run.cli.factory(name=NAME) +def model() -> run.Config[pl.LightningModule]: + """ + Factory function to create a Gemma 7B model configuration. + + Returns: + run.Config[pl.LightningModule]: Configuration for the Gemma 7B model. + + Examples: + CLI usage: + $ nemo llm pretrain model=gemma_7b ... 
+ + Python API usage: + >>> model_config = model() + >>> print(model_config) + """ + return run.Config(GemmaModel, config=run.Config(GemmaConfig7B)) + + +def trainer( + tensor_parallelism: int = 1, + pipeline_parallelism: int = 1, + pipeline_parallelism_type: Optional[torch.dtype] = None, + virtual_pipeline_parallelism: Optional[int] = None, + context_parallelism: int = 2, + sequence_parallelism: bool = False, + num_nodes: int = 1, + num_gpus_per_node: int = 8, + max_steps: int = 1168251, + callbacks: Optional[list[run.Config[Callback]]] = None, +) -> run.Config[nl.Trainer]: + """ + Configure the NeMo Lightning Trainer for Gemma 7B model. + + This function sets up the distributed training strategy and other training parameters. + + Args: + tensor_parallelism (int): Degree of tensor model parallelism. + pipeline_parallelism (int): Degree of pipeline model parallelism. + pipeline_parallelism_type (Optional[torch.dtype]): Data type for pipeline parallelism. + virtual_pipeline_parallelism (Optional[int]): Size of virtual pipeline parallelism. + context_parallelism (int): Degree of context parallelism. + sequence_parallelism (bool): Whether to use sequence parallelism. + num_nodes (int): Number of compute nodes to use. + num_gpus_per_node (int): Number of GPUs per node. + max_steps (int): Maximum number of training steps. + callbacks (Optional[list[run.Config[Callback]]]): List of callback configurations. + + Returns: + run.Config[nl.Trainer]: Configuration for the NeMo Lightning Trainer. + + Examples: + CLI usage: + $ nemo llm pretrain trainer=gemma_7b ... + + Python API usage: + >>> trainer_config = trainer(num_nodes=2, num_gpus_per_node=8) + >>> print(trainer_config) + + Note: + For more information on distributed training strategies, refer to the + NeMo documentation on multi-GPU and multi-node training. + """ + strategy = run.Config( + nl.MegatronStrategy, + tensor_model_parallel_size=tensor_parallelism, + pipeline_model_parallel_size=pipeline_parallelism, + pipeline_dtype=pipeline_parallelism_type, + virtual_pipeline_model_parallel_size=virtual_pipeline_parallelism, + context_parallel_size=context_parallelism, + sequence_parallel=sequence_parallelism, + gradient_as_bucket_view=True, + ckpt_async_save=True, + ckpt_parallel_load=True, + ddp=run.Config( + DistributedDataParallelConfig, + check_for_nan_in_grad=True, + grad_reduce_in_fp32=True, + overlap_grad_reduce=True, + overlap_param_gather=True, + ), + ) + + trainer = run.Config( + nl.Trainer, + accelerator="gpu", + accumulate_grad_batches=1, + callbacks=callbacks, + devices=num_gpus_per_node, + limit_test_batches=50, + limit_val_batches=32, + log_every_n_steps=10, + max_steps=max_steps, + num_nodes=num_nodes, + plugins=bf16_mixed(), + strategy=strategy, + use_distributed_sampler=False, + val_check_interval=2000, + ) + + return trainer + + +@run.cli.factory(target=pretrain, name=NAME) +def pretrain_recipe( + dir: Optional[str] = None, name: str = "default", num_nodes: int = 1, num_gpus_per_node: int = 8, fn=pretrain +) -> run.Partial: + """ + Create a pre-training recipe for Gemma 7B model. + + This function sets up a complete configuration for pre-training, including + model, trainer, data, logging, optimization, and resumption settings. + + Args: + dir (Optional[str]): Directory for saving logs and checkpoints. + name (str): Name of the pre-training run. + num_nodes (int): Number of compute nodes to use. + num_gpus_per_node (int): Number of GPUs per node. + fn (Callable): The pre-training function to use. 
+ + Returns: + run.Partial: Partial configuration for pre-training. + + Examples: + CLI usage: + $ nemo llm pretrain --factory gemma_7b + $ nemo llm pretrain --factory "gemma_7b(num_nodes=2, name='my_pretrain')" + + Python API usage: + >>> recipe = pretrain_recipe(name="gemma_7b_pretrain", num_nodes=2) + >>> print(recipe) + + Note: + For more details on pre-training LLMs with NeMo, see the pre-training + guide in the `examples/llm/pretrain/` directory. + """ + return run.Partial( + fn, + model=model(), + trainer=trainer( + num_nodes=num_nodes, + num_gpus_per_node=num_gpus_per_node, + callbacks=[run.Config(TimingCallback)], + ), + data=run.Config(MockDataModule, seq_length=8192, global_batch_size=512, micro_batch_size=1), + log=default_log(dir=dir, name=name, tensorboard_logger=tensorboard_logger(name=name)), + optim=distributed_fused_adam_with_cosine_annealing(max_lr=3e-4), + resume=default_resume(), + ) + + +@run.cli.factory(target=pretrain, name=NAME + "_optimized") +def pretrain_recipe_performance( + dir: Optional[str] = None, + name: str = "default", + num_nodes: int = 1, + num_gpus_per_node: int = 8, + fn: Callable = pretrain, +) -> run.Partial: + """ + Create a performance-optimized pre-training recipe for Gemma 7B model. + + This recipe enables performance optimizations that may not be suitable for all use cases. + It builds upon the standard pre-training recipe and adds additional performance enhancements. + + Args: + dir (Optional[str]): Directory for saving logs and checkpoints. + name (str): Name of the pre-training run. + num_nodes (int): Number of compute nodes to use. + num_gpus_per_node (int): Number of GPUs per node. + fn (Callable): The pre-training function to use. + + Returns: + run.Partial: Partial configuration for performance-optimized pre-training. + + Examples: + $ nemo llm pretrain --factory gemma_7b_optimized + + Python API usage: + >>> recipe = pretrain_recipe_performance(name="gemma_7b_perf", num_nodes=4) + >>> print(recipe) + + Note: + Use this recipe with caution and only when you need maximum performance. + It may not be suitable for all hardware configurations or use cases. + """ + recipe = pretrain_recipe(name=name, dir=dir, num_nodes=num_nodes, num_gpus_per_node=num_gpus_per_node, fn=fn) + + recipe.trainer.callbacks.append( + run.Config( + MegatronCommOverlapCallback, + tp_comm_overlap=False, + ) + ) + return recipe + + +@run.cli.factory(target=finetune, name=NAME) +def finetune_recipe( + dir: Optional[str] = None, + name: str = "default", + num_nodes: int = 1, + num_gpus_per_node: int = 8, + peft_scheme: Optional[str] = 'lora', +) -> run.Partial: + """ + Create a fine-tuning recipe for Gemma 7B model. + + This function sets up a complete configuration for fine-tuning, including + model, trainer, data, logging, optimization, and resumption settings. + The recipe uses LoRA (Low-Rank Adaptation) for efficient fine-tuning, unless peft_scheme is set to None. + + Args: + dir (Optional[str]): Directory for saving logs and checkpoints. + name (str): Name of the fine-tuning run. + num_nodes (int): Number of compute nodes to use. + num_gpus_per_node (int): Number of GPUs per node. + peft_scheme (Optional[str]): Name of the peft scheme to use for fine-tuning. Allowed values: 'lora', 'none'/None. + + Returns: + run.Partial: Partial configuration for fine-tuning. 
+ + Examples: + CLI usage: + $ nemo llm finetune --factory gemma_7b + + Python API usage: + >>> recipe = finetune_recipe(name="gemma_7b_finetune", num_nodes=2) + >>> print(recipe) + + Note: + This recipe uses the SQuAD dataset for fine-tuning. For more information + on fine-tuning LLMs with NeMo, see the fine-tuning guide in the + `examples/llm/finetune/` directory. + """ + # Disable cuDNN attention since TE 1.8 does not support head dim > 128 + os.environ['NVTE_FUSED_ATTN'] = "0" + + recipe = default_finetune_recipe(model(), "google/gemma-7b", dir, name, num_nodes, num_gpus_per_node) + if peft_scheme is None or peft_scheme.lower() == 'none': + recipe.trainer.strategy.tensor_model_parallel_size = 2 + recipe.optim.config.lr = 5e-6 + elif peft_scheme.lower() == 'lora': + recipe.peft = run.Config(LoRA) + recipe.optim.config.lr = 1e-4 + else: + raise ValueError(f"Unrecognized peft scheme: {peft_scheme}") + return recipe diff --git a/nemo/collections/llm/recipes/optim/adam.py b/nemo/collections/llm/recipes/optim/adam.py index 5be87ac71e9d..c6510577711d 100644 --- a/nemo/collections/llm/recipes/optim/adam.py +++ b/nemo/collections/llm/recipes/optim/adam.py @@ -25,6 +25,8 @@ def distributed_fused_adam_with_cosine_annealing( precision: str = "bf16-mixed", # or "16-mixed" warmup_steps: int = 2000, constant_steps: int = 0, + adam_beta1: float = 0.9, + adam_beta2: float = 0.95, max_lr: float = 1e-4, min_lr: Optional[float] = None, clip_grad: float = 1.0, @@ -37,14 +39,14 @@ def distributed_fused_adam_with_cosine_annealing( weight_decay=0.1, bf16=precision == "bf16-mixed", fp16=precision == "16-mixed", - adam_beta1=0.9, - adam_beta2=0.95, + adam_beta1=adam_beta1, + adam_beta2=adam_beta2, adam_eps=1e-5, use_distributed_optimizer=True, clip_grad=clip_grad, ) - min_lr = min_lr or (0.1 * max_lr) + min_lr = min_lr if min_lr is not None else (0.1 * max_lr) sched = run.Config( CosineAnnealingScheduler, warmup_steps=warmup_steps, From 1ba8bb1c623c2b7e938549002f5b77ab02fbe4bc Mon Sep 17 00:00:00 2001 From: meatybobby Date: Tue, 22 Oct 2024 11:48:51 -0700 Subject: [PATCH 05/12] Fix parallel_embedding (#10975) --- nemo/export/trt_llm/converter/model_converter.py | 8 ++------ 1 file changed, 2 insertions(+), 6 deletions(-) diff --git a/nemo/export/trt_llm/converter/model_converter.py b/nemo/export/trt_llm/converter/model_converter.py index 366206c948eb..e5e9f8154d24 100755 --- a/nemo/export/trt_llm/converter/model_converter.py +++ b/nemo/export/trt_llm/converter/model_converter.py @@ -260,9 +260,7 @@ def model_to_trtllm_ckpt( if mapping.is_first_pp_rank(): embedding_weight = ( - np.ascontiguousarray( - split(weights_dict["transformer.vocab_embedding.weight"], mapping.tp_size, mapping.tp_rank) - ) + split(weights_dict["transformer.vocab_embedding.weight"], mapping.tp_size, mapping.tp_rank) if use_parallel_embedding else weights_dict["transformer.vocab_embedding.weight"] ) @@ -272,9 +270,7 @@ def model_to_trtllm_ckpt( pos_embedding_weight = weights_dict.get("transformer.position_embedding.weight") if pos_embedding_weight is not None: if use_parallel_embedding: - pos_embedding_weight = np.ascontiguousarray( - split(pos_embedding_weight, mapping.tp_size, mapping.tp_rank) - ) + pos_embedding_weight = split(pos_embedding_weight, mapping.tp_size, mapping.tp_rank) weights_dict_local["transformer.position_embedding.weight"] = pos_embedding_weight if mapping.is_last_pp_rank(): From 69e3c3f31ce2c6ba9b7f29c5101426d49b63bd63 Mon Sep 17 00:00:00 2001 From: Chen Cui Date: Tue, 22 Oct 2024 18:15:00 -0400 Subject: [PATCH 06/12] 
Upgrade transformers (#10854) Signed-off-by: Chen Cui --- requirements/requirements_lightning.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/requirements/requirements_lightning.txt b/requirements/requirements_lightning.txt index 171abce41f37..e8020f244821 100644 --- a/requirements/requirements_lightning.txt +++ b/requirements/requirements_lightning.txt @@ -4,6 +4,6 @@ hydra-core>1.3,<=1.3.2 omegaconf<=2.3 pytorch-lightning>2.2.1 torchmetrics>=0.11.0 -transformers>=4.44.0 +transformers>=4.45.0 wandb webdataset>=0.2.86 From 8f26236a6af9e963b214130a481a9cb67b941943 Mon Sep 17 00:00:00 2001 From: Alexandros Koumparoulis <153118171+akoumpa@users.noreply.github.com> Date: Tue, 22 Oct 2024 22:11:32 -0700 Subject: [PATCH 07/12] Add support and recipes for HF models via AutoModelForCausalLM (#10962) * initial hf_lit_module Signed-off-by: Alexandros Koumparoulis * make sft gpt dataset sanity check optional Signed-off-by: Alexandros Koumparoulis * HF sft example Signed-off-by: Alexandros Koumparoulis * Rename HfLitModule to HfAutoModel Signed-off-by: Alexandros Koumparoulis * update default model id Signed-off-by: Alexandros Koumparoulis * move rank&world_size as params Signed-off-by: Alexandros Koumparoulis * fix mbs in example Signed-off-by: Alexandros Koumparoulis * fix for fsdp and logger Signed-off-by: Alexandros Koumparoulis * make loss_fn configurable Signed-off-by: Alexandros Koumparoulis * Apply isort and black reformatting Signed-off-by: akoumpa * remove optim from HfAutoModel Signed-off-by: Alexandros Koumparoulis * add pytorch native optim Signed-off-by: Alexandros Koumparoulis * add hfAutoModel pretrain nemorun recipe Signed-off-by: Alexandros Koumparoulis * remove debug Signed-off-by: Alexandros Koumparoulis * remove stale imports Signed-off-by: Alexandros Koumparoulis * remove stale import Signed-off-by: Alexandros Koumparoulis * rm stale imports Signed-off-by: Alexandros Koumparoulis * rm stale imports Signed-off-by: Alexandros Koumparoulis * tokenizer fix Signed-off-by: Alexandros Koumparoulis * update example Signed-off-by: Alexandros Koumparoulis * rename pytorch_adam to pytorch_adam_with_cosine_annealing Signed-off-by: Alexandros Koumparoulis * small refactor Signed-off-by: Alexandros Koumparoulis * fix no_weight_decay_cond Signed-off-by: Alexandros Koumparoulis * fix Signed-off-by: Alexandros Koumparoulis * fix Signed-off-by: Alexandros Koumparoulis * switch to flat_lr optim for example Signed-off-by: Alexandros Koumparoulis * Apply isort and black reformatting Signed-off-by: akoumpa * remove imports & update docstrings Signed-off-by: Alexandros Koumparoulis * add a tokenizer setter to allow it to work with nemo/collections/llm/api.py::_use_tokenizer Signed-off-by: Alexandros Koumparoulis * remove unused import Signed-off-by: Alexandros Koumparoulis * allow loss_mask to be none Signed-off-by: Alexandros Koumparoulis * Add HF-dataset lightning module Signed-off-by: Alexandros Koumparoulis * check if pad_token_id is None Signed-off-by: Alexandros Koumparoulis * rename hf_lit_module.py to hf_auto_model.py Signed-off-by: Alexandros Koumparoulis * class rename Signed-off-by: Alexandros Koumparoulis * rename Signed-off-by: Alexandros Koumparoulis * update example Signed-off-by: Alexandros Koumparoulis * HfAutoModelForCausalLM Signed-off-by: Alexandros Koumparoulis * rm stale import Signed-off-by: Alexandros Koumparoulis * add option to start with random weights Signed-off-by: Alexandros Koumparoulis * add check in megatron-strategy Signed-off-by: Alexandros 
Koumparoulis * rename param Signed-off-by: Alexandros Koumparoulis * drop mcore sampler from squadmodule Signed-off-by: Alexandros Koumparoulis * make megatron_sampler optional in HfDatasetDataModule Signed-off-by: Alexandros Koumparoulis * copyright Signed-off-by: Alexandros Koumparoulis * use is_hf_model to mark hf classes Signed-off-by: Alexandros Koumparoulis * Apply isort and black reformatting Signed-off-by: akoumpa --------- Signed-off-by: Alexandros Koumparoulis Signed-off-by: akoumpa Co-authored-by: akoumpa --- examples/llm/sft/hf.py | 91 ++++++++++ nemo/collections/llm/__init__.py | 3 + nemo/collections/llm/gpt/data/__init__.py | 10 +- nemo/collections/llm/gpt/data/fine_tuning.py | 5 + nemo/collections/llm/gpt/data/hf_dataset.py | 103 +++++++++++ nemo/collections/llm/gpt/data/squad.py | 2 + nemo/collections/llm/gpt/model/__init__.py | 2 + .../gpt/model/hf_auto_model_for_causal_lm.py | 108 +++++++++++ nemo/collections/llm/recipes/__init__.py | 2 + .../recipes/hf_auto_model_for_causal_lm.py | 168 ++++++++++++++++++ nemo/collections/llm/recipes/optim/adam.py | 59 +++++- .../megatron/dataset_utils.py | 39 ++-- .../megatron/gpt_sft_dataset.py | 4 + nemo/lightning/data.py | 17 +- nemo/lightning/pytorch/optim/__init__.py | 2 + nemo/lightning/pytorch/optim/pytorch.py | 132 ++++++++++++++ .../lightning/pytorch/plugins/data_sampler.py | 7 + .../pytorch/strategies/megatron_strategy.py | 2 + 18 files changed, 733 insertions(+), 23 deletions(-) create mode 100644 examples/llm/sft/hf.py create mode 100644 nemo/collections/llm/gpt/data/hf_dataset.py create mode 100644 nemo/collections/llm/gpt/model/hf_auto_model_for_causal_lm.py create mode 100644 nemo/collections/llm/recipes/hf_auto_model_for_causal_lm.py create mode 100644 nemo/lightning/pytorch/optim/pytorch.py diff --git a/examples/llm/sft/hf.py b/examples/llm/sft/hf.py new file mode 100644 index 000000000000..b7e12d8fb2de --- /dev/null +++ b/examples/llm/sft/hf.py @@ -0,0 +1,91 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +import fiddle as fdl +import pytorch_lightning as pl +from pytorch_lightning.loggers import WandbLogger +from torch.utils.data import DataLoader + +from nemo import lightning as nl +from nemo.collections import llm + + +class SquadDataModuleWithPthDataloader(llm.SquadDataModule): + def _create_dataloader(self, dataset, **kwargs) -> DataLoader: + return DataLoader( + dataset, + num_workers=self.num_workers, + pin_memory=self.pin_memory, + persistent_workers=self.persistent_workers, + collate_fn=dataset.collate_fn, + batch_size=self.micro_batch_size, + **kwargs, + ) + + +def squad(tokenizer) -> pl.LightningDataModule: + return SquadDataModuleWithPthDataloader( + tokenizer=tokenizer, + seq_length=2048, + micro_batch_size=2, + global_batch_size=128, # assert gbs == mbs * accumulate_grad_batches + num_workers=0, + sanity_check_dist_workers=False, + ) + + +if __name__ == '__main__': + import argparse + + parser = argparse.ArgumentParser() + parser.add_argument('--model', default='meta-llama/Llama-3.2-1B') + parser.add_argument('--strategy', type=str, default='auto', choices=['auto', 'ddp', 'fsdp']) + parser.add_argument('--devices', default=1) + parser.add_argument('--accelerator', default='gpu', choices=['gpu']) + parser.add_argument('--max-steps', type=int, default=100) + parser.add_argument('--wandb-project', type=str, default=None) + args = parser.parse_args() + + wandb = None + if args.wandb_project is not None: + model = '_'.join(args.model.split('/')[-2:]) + wandb = WandbLogger( + project=args.wandb_project, + name=f'{model}_dev{args.devices}_strat_{args.strategy}', + ) + grad_clip = 0.5 + if args.strategy == 'fsdp': + # See: https://github.com/Lightning-AI/pytorch-lightning/blob/8ad3e29816a63d8ce5c00ac104b14729a4176f4f/src/lightning/pytorch/plugins/precision/fsdp.py#L81 + grad_clip = None + use_dist_samp = False + + llm.api.finetune( + model=llm.HfAutoModelForCausalLM(args.model), + data=squad(llm.HfAutoModelForCausalLM.configure_tokenizer(args.model)), + trainer=nl.Trainer( + devices=args.devices, + max_steps=args.max_steps, + accelerator=args.accelerator, + strategy=args.strategy, + log_every_n_steps=1, + limit_val_batches=0.0, + num_sanity_val_steps=0, + accumulate_grad_batches=10, + gradient_clip_val=grad_clip, + use_distributed_sampler=use_dist_samp, + logger=wandb, + ), + optim=fdl.build(llm.adam.pytorch_adam_with_flat_lr(max_lr=1e-5, clip_grad=0.5)), + log=None, + ) diff --git a/nemo/collections/llm/__init__.py b/nemo/collections/llm/__init__.py index 4205c401eea8..6dde88079567 100644 --- a/nemo/collections/llm/__init__.py +++ b/nemo/collections/llm/__init__.py @@ -21,6 +21,7 @@ from nemo.collections.llm.gpt.data import ( DollyDataModule, FineTuningDataModule, + HfDatasetDataModule, MockDataModule, PreTrainingDataModule, SquadDataModule, @@ -57,6 +58,7 @@ GPTConfig126M, GPTConfig175B, GPTModel, + HfAutoModelForCausalLM, Llama2Config7B, Llama2Config13B, Llama2Config70B, @@ -182,6 +184,7 @@ "squad", "dolly", "peft", + "HfAutoModelForCausalLM", ] diff --git a/nemo/collections/llm/gpt/data/__init__.py b/nemo/collections/llm/gpt/data/__init__.py index 45ca0788874f..f4e97d91e5cd 100644 --- a/nemo/collections/llm/gpt/data/__init__.py +++ b/nemo/collections/llm/gpt/data/__init__.py @@ -14,8 +14,16 @@ from nemo.collections.llm.gpt.data.dolly import DollyDataModule from nemo.collections.llm.gpt.data.fine_tuning import FineTuningDataModule +from nemo.collections.llm.gpt.data.hf_dataset import HfDatasetDataModule from nemo.collections.llm.gpt.data.mock import MockDataModule from 
nemo.collections.llm.gpt.data.pre_training import PreTrainingDataModule from nemo.collections.llm.gpt.data.squad import SquadDataModule -__all__ = ["FineTuningDataModule", "SquadDataModule", "DollyDataModule", "MockDataModule", "PreTrainingDataModule"] +__all__ = [ + "FineTuningDataModule", + "SquadDataModule", + "DollyDataModule", + "MockDataModule", + "PreTrainingDataModule", + "HfDatasetDataModule", +] diff --git a/nemo/collections/llm/gpt/data/fine_tuning.py b/nemo/collections/llm/gpt/data/fine_tuning.py index 01cf617a094d..2545bbc93f1d 100644 --- a/nemo/collections/llm/gpt/data/fine_tuning.py +++ b/nemo/collections/llm/gpt/data/fine_tuning.py @@ -70,6 +70,7 @@ def __init__( persistent_workers: bool = False, pad_to_max_length: bool = False, packed_sequence_specs: Optional["PackedSequenceSpecs"] = None, + sanity_check_dist_workers: bool = True, ): super().__init__() self.seq_length = seq_length @@ -89,6 +90,7 @@ def __init__( self.packed_sequence_specs = packed_sequence_specs self.packed_sequence_size = -1 if not packed_sequence_specs else packed_sequence_specs.packed_sequence_size self.validate_batch_size_for_packed_sequence() + self._sanity_check_dist_workers = sanity_check_dist_workers def validate_batch_size_for_packed_sequence(self): if self.packed_sequence_size > 0 and self.micro_batch_size > 1: @@ -134,6 +136,7 @@ def train_dataloader(self) -> DataLoader: self.train_path if self.packed_sequence_size <= 0 else self.train_path_packed, max_num_samples=self.max_train_samples, pad_to_max_length=self.pad_to_max_length, + sanity_check_dist_workers=self._sanity_check_dist_workers, ) ) @@ -143,6 +146,7 @@ def val_dataloader(self) -> DataLoader: self.validation_path, is_test=True, pad_to_max_length=self.pad_to_max_length, + sanity_check_dist_workers=self._sanity_check_dist_workers, ), ) @@ -153,6 +157,7 @@ def test_dataloader(self) -> DataLoader: tokens_to_generate=32, is_test=True, pad_to_max_length=self.pad_to_max_length, + sanity_check_dist_workers=self._sanity_check_dist_workers, ) ) diff --git a/nemo/collections/llm/gpt/data/hf_dataset.py b/nemo/collections/llm/gpt/data/hf_dataset.py new file mode 100644 index 000000000000..7e70a970913e --- /dev/null +++ b/nemo/collections/llm/gpt/data/hf_dataset.py @@ -0,0 +1,103 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
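+
+# HfDatasetDataModule wraps an already-tokenized dataset (for example one built with
+# Hugging Face `datasets`) in a LightningDataModule: its collate_fn pads every sample in a
+# micro-batch to the longest sequence in that batch and stacks the 'tokens' and 'labels'
+# keys into LongTensors, and the train dataloader can optionally be wrapped with a
+# Megatron sampler via use_mcore_sampler=True. A minimal usage sketch (the dataset name
+# below is hypothetical):
+#
+#   datamodule = HfDatasetDataModule(my_tokenized_dataset, micro_batch_size=2, pad_token_id=0)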
+ +import pytorch_lightning as pl +import torch +from torch.utils.data import DataLoader + + +class HfDatasetDataModule(pl.LightningDataModule): + def __init__( + self, + dataset, + num_workers=2, + pin_memory=True, + persistent_workers=True, + micro_batch_size=2, + global_batch_size=2, + pad_token_id=0, + use_mcore_sampler=False, + mcore_dataloader_type='cyclic', + ) -> None: + super().__init__() + assert pad_token_id is not None + + self.dataset = dataset + self.num_workers = num_workers + self.pin_memory = pin_memory + self.persistent_workers = persistent_workers + self.micro_batch_size = micro_batch_size + self.global_batch_size = global_batch_size + self.pad_token_id = pad_token_id + + self.use_mcore_sampler = use_mcore_sampler + self.mcore_dataloader_type = mcore_dataloader_type + + @staticmethod + def collate_fn(batch, pad_token_id=0): + def batchify(tensor): + if tensor.ndim == 1: + return tensor.unsqueeze_(0) + return tensor + + def extract_key_from_dicts(batch, key): + return list(map(lambda x: x[key], batch)) + + def pad_within_micro(batch, pad_token_id): + max_len = max(map(len, batch)) + return [item + [pad_token_id] * (max_len - len(item)) for item in batch] + + return { + key: batchify( + torch.LongTensor( + pad_within_micro( + extract_key_from_dicts(batch, key), + pad_token_id, + ) + ) + ) + for key in ['tokens', 'labels'] + } + + def train_dataloader(self, collate_fn=None): + from nemo.lightning.data import add_megatron_sampler + + if collate_fn is None: + collate_fn = lambda x: HfDatasetDataModule.collate_fn(x, pad_token_id=self.pad_token_id) + + dataloader = DataLoader( + self.dataset, + num_workers=self.num_workers, + pin_memory=self.pin_memory, + persistent_workers=self.persistent_workers, + collate_fn=collate_fn, + batch_size=self.micro_batch_size, + ) + if not self.use_mcore_sampler: + return dataloader + + rank = 0 + world_size = 1 + if torch.distributed.is_initialized(): + rank = torch.distributed.get_rank() + world_size = torch.distributed.get_world_size() + + return add_megatron_sampler( + dataloader, + self.micro_batch_size, + self.global_batch_size, + dataloader_type=self.mcore_dataloader_type, + rank=rank, + world_size=world_size, + ) diff --git a/nemo/collections/llm/gpt/data/squad.py b/nemo/collections/llm/gpt/data/squad.py index f872db94077d..cabbd444c0cf 100644 --- a/nemo/collections/llm/gpt/data/squad.py +++ b/nemo/collections/llm/gpt/data/squad.py @@ -56,6 +56,7 @@ def __init__( persistent_workers: bool = False, pad_to_max_length: bool = False, packed_sequence_specs: Optional["PackedSequenceSpecs"] = None, + sanity_check_dist_workers: bool = True, ): self.force_redownload = force_redownload self.delete_raw = delete_raw @@ -74,6 +75,7 @@ def __init__( persistent_workers=persistent_workers, pad_to_max_length=pad_to_max_length, packed_sequence_specs=packed_sequence_specs, + sanity_check_dist_workers=sanity_check_dist_workers, ) def prepare_data(self) -> None: diff --git a/nemo/collections/llm/gpt/model/__init__.py b/nemo/collections/llm/gpt/model/__init__.py index ebecc06140fe..26b8d67cb53d 100644 --- a/nemo/collections/llm/gpt/model/__init__.py +++ b/nemo/collections/llm/gpt/model/__init__.py @@ -37,6 +37,7 @@ GemmaConfig7B, GemmaModel, ) +from nemo.collections.llm.gpt.model.hf_auto_model_for_causal_lm import HfAutoModelForCausalLM from nemo.collections.llm.gpt.model.llama import ( CodeLlamaConfig7B, CodeLlamaConfig13B, @@ -166,4 +167,5 @@ "gpt_forward_step", "transformer_engine_layer_spec", "local_layer_spec", + "HfAutoModelForCausalLM", ] diff --git 
a/nemo/collections/llm/gpt/model/hf_auto_model_for_causal_lm.py b/nemo/collections/llm/gpt/model/hf_auto_model_for_causal_lm.py new file mode 100644 index 000000000000..794c39738dbe --- /dev/null +++ b/nemo/collections/llm/gpt/model/hf_auto_model_for_causal_lm.py @@ -0,0 +1,108 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import pytorch_lightning as pl +import torch +import torch.nn.functional as F +from transformers import AutoModelForCausalLM + +from nemo.collections.common.tokenizers.huggingface.auto_tokenizer import AutoTokenizer +from nemo.lightning import io + + +def _extract_non_bias_params(model): + return list(map(lambda x: x[1], filter(lambda x: not 'bias' in x[0], model.named_parameters()))) + + +def masked_cross_entropy(logits, targets, mask=None): + if mask is not None: + loss = F.cross_entropy(logits, targets, reduction='none') + return torch.mean(loss[mask == 1]) + else: + return F.cross_entropy(logits, targets) + + +class HfAutoModelForCausalLM(pl.LightningModule, io.IOMixin): + def __init__(self, model_name='gpt2', load_pretrained_weights=True, tokenizer=None, loss_fn=masked_cross_entropy): + super().__init__() + self.save_hyperparameters() + self.model_name = model_name + self._tokenizer = None + self.model = None + self.loss_fn = loss_fn + self.load_pretrained_weights = load_pretrained_weights + self.is_hf_model = True + + @property + def tokenizer(self): + if self._tokenizer is None: + self._tokenizer = HfAutoModelForCausalLM.configure_tokenizer(self.model_name) + return self._tokenizer + + @tokenizer.setter + def tokenizer(self, value): + assert self._tokenizer is None + self._tokenizer = value + + @staticmethod + def configure_tokenizer(model_name): + return AutoTokenizer(model_name) + + def configure_model(self): + # create all your layers here + if self.load_pretrained_weights: + self.model = AutoModelForCausalLM.from_pretrained(self.model_name, torch_dtype='auto') + else: + from transformers import AutoConfig + + config = AutoConfig.from_pretrained(self.model_name) + self.model = AutoModelForCausalLM.from_config(config) + self.model.train() + + def forward(self, input_ids, attention_mask=None, labels=None, loss_mask=None): + outputs = self.model( + input_ids=input_ids.to(self.model.device), + attention_mask=attention_mask, + ) + labels = labels.to(self.model.device) + if loss_mask is not None: + loss_mask = loss_mask.to(self.model.device).view(-1) + n_cls = outputs.logits.shape[-1] + outputs.loss = self.loss_fn(outputs.logits.view(-1, n_cls), labels.view(-1), loss_mask) + return outputs + + def training_step(self, batch): + tokens = batch['tokens'] + labels = batch['labels'] + loss_mask = batch.get('loss_mask', None) + output = self.forward( + input_ids=tokens, + labels=labels, + loss_mask=loss_mask, + ) + + loss = output.loss + self.log('train_log', loss, on_step=True, on_epoch=True, prog_bar=True) + return loss + + def validation_step(self, batch, batch_idx): + tokens = batch['tokens'] + labels =
batch['labels'] + output = self.forward( + input_ids=tokens, + labels=labels, + ) + + loss = output.loss + self.log('val_loss', loss, on_step=True, on_epoch=True, prog_bar=True) diff --git a/nemo/collections/llm/recipes/__init__.py b/nemo/collections/llm/recipes/__init__.py index ff81c3b383fc..b1fc15aee07c 100644 --- a/nemo/collections/llm/recipes/__init__.py +++ b/nemo/collections/llm/recipes/__init__.py @@ -18,6 +18,7 @@ chatglm3_6b, gemma_2b, gemma_7b, + hf_auto_model_for_causal_lm, llama3_8b, llama3_8b_16k, llama3_8b_64k, @@ -73,6 +74,7 @@ "mamba2_hybrid_8b", "mistral_7b", "mistral_nemo_12b", + "hf_auto_model_for_causal_lm", "mixtral_8x7b", "mixtral_8x7b_16k", "mixtral_8x7b_64k", diff --git a/nemo/collections/llm/recipes/hf_auto_model_for_causal_lm.py b/nemo/collections/llm/recipes/hf_auto_model_for_causal_lm.py new file mode 100644 index 000000000000..6c81bf922152 --- /dev/null +++ b/nemo/collections/llm/recipes/hf_auto_model_for_causal_lm.py @@ -0,0 +1,168 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + + +from typing import Optional + +import nemo_run as run +import pytorch_lightning as pl +import torch +from pytorch_lightning.callbacks.callback import Callback + +from nemo import lightning as nl +from nemo.collections.llm.api import finetune, pretrain +from nemo.collections.llm.gpt.data.mock import MockDataModule +from nemo.collections.llm.gpt.model.hf_auto_model_for_causal_lm import HfAutoModelForCausalLM +from nemo.collections.llm.recipes.log.default import default_log, default_resume, tensorboard_logger +from nemo.collections.llm.recipes.optim.adam import pytorch_adam_with_cosine_annealing +from nemo.utils.exp_manager import TimingCallback + +NAME = "hf_auto_model_for_causal_lm" + + +@run.cli.factory(name=NAME) +def model(model_name) -> run.Config[pl.LightningModule]: + """ + Factory function to create HfAutoModelForCausalLM model configurations. + + Args: + model_name (str): Model id on HF. + + Returns: + run.Config[pl.LightningModule]: Configuration for the HfAutoModelForCausalLM. + + Examples: + CLI usage: + $ nemo llm pretrain --factory 'HfAutoModelForCausalLM(model_name="mistralai/Mistral-Nemo-Instruct-2407")' + + Python API usage: + >>> model_config = model(model_name="mistralai/Mistral-Nemo-Instruct-2407") + >>> print(model_config) + """ + return run.Config(HfAutoModelForCausalLM, model_name=model_name) + + +def trainer( + tensor_parallelism: int = 1, + pipeline_parallelism: int = 1, + pipeline_parallelism_type: Optional[torch.dtype] = None, + virtual_pipeline_parallelism: Optional[int] = None, + context_parallelism: int = 2, + sequence_parallelism: bool = False, + num_nodes: int = 1, + num_gpus_per_node: int = 8, + max_steps: int = 100, + callbacks: Optional[list[run.Config[Callback]]] = None, + strategy: Optional[str] = 'ddp', + gradient_clip_val: float = 1.0, +) -> run.Config[nl.Trainer]: + """ + Configure the NeMo Lightning Trainer for HfAutoModelForCausalLM. 
+ + This function sets up the distributed training strategy and other training parameters. + + Args: + tensor_parallelism (int): Degree of tensor model parallelism. + pipeline_parallelism (int): Degree of pipeline model parallelism. + pipeline_parallelism_type (Optional[torch.dtype]): Data type for pipeline parallelism. + virtual_pipeline_parallelism (Optional[int]): Size of virtual pipeline parallelism. + context_parallelism (int): Degree of context parallelism. + sequence_parallelism (bool): Whether to use sequence parallelism. + num_nodes (int): Number of compute nodes to use. + num_gpus_per_node (int): Number of GPUs per node. + max_steps (int): Maximum number of training steps. + callbacks (Optional[list[run.Config[Callback]]]): List of callback configurations. + strategy: Optional[str] = 'ddp': Parallelism strategy. + gradient_clip_val: float = 1.0: gradient-clip value. + Returns: + run.Config[nl.Trainer]: Configuration for the NeMo Lightning Trainer. + + Examples: + CLI usage: + $ nemo llm pretrain trainer=HfAutoModelForCausalLM ... + + Python API usage: + >>> trainer_config = trainer(num_nodes=2, num_gpus_per_node=8) + >>> print(trainer_config) + """ + strategy = str(strategy).lower() + assert strategy in ['', 'ddp', 'fsdp'], strategy + if strategy == 'fsdp': + # See: https://github.com/Lightning-AI/pytorch-lightning/blob/8ad3e29816a63d8ce5c00ac104b14729a4176f4f/src/lightning/pytorch/plugins/precision/fsdp.py#L81 + gradient_clip_val = None + + trainer = run.Config( + nl.Trainer, + devices=num_gpus_per_node, + max_steps=max_steps, + accelerator='gpu', + strategy=strategy, + log_every_n_steps=1, + limit_val_batches=0.0, + num_sanity_val_steps=0, + accumulate_grad_batches=10, + callbacks=callbacks, + gradient_clip_val=gradient_clip_val, + use_distributed_sampler=False, + ) + + return trainer + + +@run.cli.factory(target=pretrain, name=NAME) +def pretrain_recipe( + dir: Optional[str] = None, + name: str = "default", + num_nodes: int = 1, + num_gpus_per_node: int = 8, + fn=pretrain, + model_name: str = '', +) -> run.Partial: + """ + Create a pre-training recipe for Mistral 7B model. + + This function sets up a complete configuration for pre-training, including + model, trainer, data, logging, optimization, and resumption settings. + + Args: + dir (Optional[str]): Directory for saving logs and checkpoints. + name (str): Name of the pre-training run. + num_nodes (int): Number of compute nodes to use. + num_gpus_per_node (int): Number of GPUs per node. + fn (Callable): The pre-training function to use. + + Returns: + run.Partial: Partial configuration for pre-training. 
+ + Examples: + CLI usage: + $ nemo llm pretrain --factory 'HfAutoModelForCausalLM(model_name="mistralai/Mistral-Nemo-Instruct-2407")' + + Python API usage: + >>> recipe = pretrain_recipe(name="auto_pretrain", num_nodes=2, model_name="mistralai/Mistral-Nemo-Instruct-2407") + >>> print(recipe) + """ + return run.Partial( + fn, + model=model(model_name), + trainer=trainer( + num_nodes=num_nodes, + num_gpus_per_node=num_gpus_per_node, + callbacks=[run.Config(TimingCallback)], + ), + data=run.Config(MockDataModule, seq_length=4096, global_batch_size=512, micro_batch_size=1), + log=default_log(dir=dir, name=name, tensorboard_logger=tensorboard_logger(name=name)), + optim=pytorch_adam_with_cosine_annealing(max_lr=3e-4), + resume=default_resume(), + ) diff --git a/nemo/collections/llm/recipes/optim/adam.py b/nemo/collections/llm/recipes/optim/adam.py index c6510577711d..4148d19c6635 100644 --- a/nemo/collections/llm/recipes/optim/adam.py +++ b/nemo/collections/llm/recipes/optim/adam.py @@ -17,7 +17,12 @@ import nemo_run as run from megatron.core.optimizer import OptimizerConfig -from nemo.lightning.pytorch.optim import CosineAnnealingScheduler, MegatronOptimizerModule, OptimizerModule +from nemo.lightning.pytorch.optim import ( + CosineAnnealingScheduler, + MegatronOptimizerModule, + OptimizerModule, + PytorchOptimizerModule, +) @run.cli.factory @@ -59,3 +64,55 @@ def distributed_fused_adam_with_cosine_annealing( config=opt_cfg, lr_scheduler=sched, ) + + +@run.cli.factory +def pytorch_adam_with_cosine_annealing( + precision: str = "bf16-mixed", # or "16-mixed" + warmup_steps: int = 2000, + constant_steps: int = 0, + max_lr: float = 1e-5, + min_lr: Optional[float] = None, + clip_grad: float = 1.0, +) -> run.Config[OptimizerModule]: + from torch.optim import Adam + + return run.Config( + PytorchOptimizerModule, + optim_cls=Adam, + config=dict( + lr=max_lr, + weight_decay=0.1, + betas=(0.9, 0.95), + eps=1e-8, + ), + lr_scheduler=run.Config( + CosineAnnealingScheduler, + warmup_steps=warmup_steps, + constant_steps=constant_steps, + min_lr=min_lr or (0.1 * max_lr), + ), + ) + + +@run.cli.factory +def pytorch_adam_with_flat_lr( + precision: str = "bf16-mixed", # or "16-mixed" + warmup_steps: int = 2000, + constant_steps: int = 0, + max_lr: float = 1e-5, + min_lr: Optional[float] = None, + clip_grad: float = 1.0, +) -> run.Config[OptimizerModule]: + from torch.optim import Adam + + return run.Config( + PytorchOptimizerModule, + optim_cls=Adam, + config=dict( + lr=max_lr, + weight_decay=0.1, + betas=(0.9, 0.95), + eps=1e-8, + ), + ) diff --git a/nemo/collections/nlp/data/language_modeling/megatron/dataset_utils.py b/nemo/collections/nlp/data/language_modeling/megatron/dataset_utils.py index 17ffc01fb7f4..4ce9701e76b4 100644 --- a/nemo/collections/nlp/data/language_modeling/megatron/dataset_utils.py +++ b/nemo/collections/nlp/data/language_modeling/megatron/dataset_utils.py @@ -209,7 +209,7 @@ def create_masked_lm_predictions( # on-the-fly whole word masking is possible. token_boundary = [0] * len(tokens) skip_mask_idx = None # Store the index of token that cannot be masked. - for (i, token) in enumerate(tokens): + for i, token in enumerate(tokens): if token == skip_masking_id: skip_mask_idx = i if token == cls_id or token == sep_id: @@ -285,7 +285,10 @@ def create_masked_lm_predictions( available_ngrams = list(cand_index_set.keys()) # n - 1 because pvals is 0-indexed and available ngrams are 1-indexed. 
pvals_current = np.array([pvals[n - 1] for n in available_ngrams]) - n = np_rng.choice(available_ngrams, p=pvals_current / pvals_current.sum(keepdims=True),) + n = np_rng.choice( + available_ngrams, + p=pvals_current / pvals_current.sum(keepdims=True), + ) else: # Sampling "n" from the geometric distribution and clipping it to # the max_ngrams. Using p=0.2 default from the SpanBERT paper @@ -488,7 +491,10 @@ def create_extreme_masked_lm_predictions( if span_length_distribution == LengthDistribution.uniform: available_ngrams = list(cand_index_set.keys()) pvals_current = np.array([pvals[n] for n in available_ngrams]) - n = np_rng.choice(available_ngrams, p=pvals_current / pvals_current.sum(keepdims=True),) + n = np_rng.choice( + available_ngrams, + p=pvals_current / pvals_current.sum(keepdims=True), + ) elif span_length_distribution == LengthDistribution.geometric: # Sampling "n" from the geometric distribution and clipping it to # the max_ngrams. Using p=0.2 default from the SpanBERT paper @@ -914,7 +920,13 @@ def build_train_valid_test_datasets( seed, ) test_ds = MockT5Dataset( - cfg, tokenizer, "test", int(train_valid_test_num_samples[2]), max_seq_length, max_seq_length_dec, seed, + cfg, + tokenizer, + "test", + int(train_valid_test_num_samples[2]), + max_seq_length, + max_seq_length_dec, + seed, ) return train_ds, valid_ds, test_ds else: @@ -1257,6 +1269,7 @@ def get_samples_mapping( binary_head, index_mapping_dir: str = None, samples_mapping: Any = None, + sanity_check_dist_workers: bool = True, ): """Get a list that maps a sample index to a starting sentence index, end sentence index, and length""" @@ -1328,14 +1341,16 @@ def get_samples_mapping( logging.info( ' > elasped time to build and save samples mapping ' '(seconds): {:4f}'.format(time.time() - start_time) ) - torch.distributed.barrier() - counts = torch.cuda.LongTensor([1]) - torch.distributed.all_reduce(counts, group=parallel_state.get_data_parallel_group(with_context_parallel=True)) - torch.distributed.all_reduce(counts, group=parallel_state.get_pipeline_model_parallel_group()) - assert counts[0].item() == ( - torch.distributed.get_world_size() - // torch.distributed.get_world_size(group=parallel_state.get_tensor_model_parallel_group()) - ) + + if sanity_check_dist_workers: + torch.distributed.barrier() + counts = torch.cuda.LongTensor([1]) + torch.distributed.all_reduce(counts, group=parallel_state.get_data_parallel_group(with_context_parallel=True)) + torch.distributed.all_reduce(counts, group=parallel_state.get_pipeline_model_parallel_group()) + assert counts[0].item() == ( + torch.distributed.get_world_size() + // torch.distributed.get_world_size(group=parallel_state.get_tensor_model_parallel_group()) + ) # Load indexed dataset if not given externally. if samples_mapping is None: logging.info(' > loading indexed mapping from {}'.format(indexmap_filename)) diff --git a/nemo/collections/nlp/data/language_modeling/megatron/gpt_sft_dataset.py b/nemo/collections/nlp/data/language_modeling/megatron/gpt_sft_dataset.py index c42249cec2f2..898ddb7d716b 100644 --- a/nemo/collections/nlp/data/language_modeling/megatron/gpt_sft_dataset.py +++ b/nemo/collections/nlp/data/language_modeling/megatron/gpt_sft_dataset.py @@ -64,6 +64,7 @@ def __init__( output_original_text: bool = False, ceil_to_power_2: bool = False, get_attention_mask_from_fusion: bool = False, + sanity_check_dist_workers: bool = True, ): """ file_path: Path to a JSONL GPT supervised fine-tuning dataset. 
Data is formatted as multiple JSON lines with each line formatted as follows. {'input': 'John von Neumann\nVon Neumann made fundamental contributions .... Q: What did the math of artificial viscosity do?', 'output': 'smoothed the shock transition without sacrificing basic physics'} @@ -89,6 +90,7 @@ def __init__( special_tokens: special tokens for the chat prompts, a dictionary of {token_type: token}. Default: {'system_turn_start': '', 'turn_start': '', 'label_start': '', 'end_of_turn': '\n', "end_of_name": "\n"} is_test: Whether this dataset is the test split. output_original_text (bool): if true, will keep the original text in the output alongside the tokenized ids. + sanity_check_dist_workers (bool): if true, will run sanity check across workers when making mapping. """ self.tokenizer = tokenizer self.file_path = file_path @@ -117,6 +119,7 @@ def __init__( self.output_original_text = output_original_text self.ceil_to_power_2 = ceil_to_power_2 self.get_attention_mask_from_fusion = get_attention_mask_from_fusion + self.sanity_check_dist_workers = sanity_check_dist_workers if special_tokens is None: self.special_tokens = { @@ -196,6 +199,7 @@ def _build_samples_mapping(self): binary_head=False, index_mapping_dir=self.index_mapping_dir, samples_mapping=osm, + sanity_check_dist_workers=self.sanity_check_dist_workers, ) else: self.samples_mapping = None diff --git a/nemo/lightning/data.py b/nemo/lightning/data.py index 0f30dfe22851..ea7d91b37214 100644 --- a/nemo/lightning/data.py +++ b/nemo/lightning/data.py @@ -139,6 +139,8 @@ def add_megatron_sampler( dataloader_type: Literal["single", "cyclic", "batch"] = "single", drop_last: bool = True, pad_samples_to_global_batch_size: bool = False, + rank: int = 0, + world_size: int = 1, # data_sharding: bool = False ) -> DataLoader: """ @@ -172,9 +174,6 @@ def add_megatron_sampler( Returns: DataLoader: A new DataLoader instance with the configured Megatron sampler. 
""" - - from megatron.core import parallel_state - if dataloader_type == 'single': batch_sampler = MegatronPretrainingSampler( total_samples=len(dataloader.dataset), @@ -182,8 +181,8 @@ def add_megatron_sampler( micro_batch_size=micro_batch_size, global_batch_size=global_batch_size, rampup_batch_size=rampup_batch_size, - data_parallel_rank=parallel_state.get_data_parallel_rank(), - data_parallel_size=parallel_state.get_data_parallel_world_size(), + data_parallel_rank=rank, + data_parallel_size=world_size, drop_last=drop_last, pad_samples_to_global_batch_size=pad_samples_to_global_batch_size, ) @@ -192,8 +191,8 @@ def add_megatron_sampler( total_samples=len(dataloader.dataset), consumed_samples=consumed_samples, micro_batch_size=micro_batch_size, - data_parallel_rank=parallel_state.get_data_parallel_rank(), - data_parallel_size=parallel_state.get_data_parallel_world_size(), + data_parallel_rank=rank, + data_parallel_size=world_size, drop_last=drop_last, # data_sharding=data_sharding ) @@ -207,8 +206,8 @@ def add_megatron_sampler( consumed_samples=consumed_samples, micro_batch_size=micro_batch_size, global_batch_size=global_batch_size, - data_parallel_rank=parallel_state.get_data_parallel_rank(), - data_parallel_size=parallel_state.get_data_parallel_world_size(), + data_parallel_rank=rank, + data_parallel_size=world_size, drop_last=drop_last, pad_samples_to_global_batch_size=not drop_last, ) diff --git a/nemo/lightning/pytorch/optim/__init__.py b/nemo/lightning/pytorch/optim/__init__.py index 1572e95e136a..db40e5c48c1b 100644 --- a/nemo/lightning/pytorch/optim/__init__.py +++ b/nemo/lightning/pytorch/optim/__init__.py @@ -28,6 +28,7 @@ WarmupPolicyScheduler, ) from nemo.lightning.pytorch.optim.megatron import MegatronOptimizerModule +from nemo.lightning.pytorch.optim.pytorch import PytorchOptimizerModule __all__ = [ "OptimizerModule", @@ -45,4 +46,5 @@ "PolynomialDecayAnnealingScheduler", "PolynomialHoldDecayAnnealingScheduler", "CosineAnnealingScheduler", + "PytorchOptimizerModule", ] diff --git a/nemo/lightning/pytorch/optim/pytorch.py b/nemo/lightning/pytorch/optim/pytorch.py new file mode 100644 index 000000000000..6600fc0cf0a4 --- /dev/null +++ b/nemo/lightning/pytorch/optim/pytorch.py @@ -0,0 +1,132 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from typing import Callable, List, Optional + +import pytorch_lightning as pl +from torch.optim import Optimizer + +from nemo.lightning.megatron_parallel import MegatronParallel +from nemo.lightning.pytorch.optim.base import LRSchedulerModule, OptimizerModule + + +def _param_does_not_have_wd(param_name, param): + return 'bias' in param_name + + +class PytorchOptimizerModule(OptimizerModule): + """A OptimizerModule for pytorch optimizers. + + Attributes: + config (OptimizerConfig): Configuration for the optimizer. + no_weight_decay_cond (Optional[Callable]): Condition for no weight decay. + scale_lr_cond (Optional[Callable]): Condition for scaling learning rate. 
lr_mult (float): Learning rate multiplier. + + Example:: + + config = OptimizerConfig(...) + lr_scheduler = MyLRSchedulerModule(...) + optimizer_module = PytorchOptimizerModule(config, lr_scheduler) + + Methods: + setup(model): Sets up the optimizer. + optimizers(model): Defines the optimizers. + """ + + def __init__( + self, + optim_cls, + config: dict = {'lr': 3e-4}, + lr_scheduler: Optional[LRSchedulerModule] = None, + no_weight_decay_cond: Optional[Callable] = _param_does_not_have_wd, + scale_lr_cond: Optional[Callable] = None, + lr_mult: float = 1.0, + ): + """Initializes the PytorchOptimizerModule. + + Args: + config (OptimizerConfig): Configuration for the optimizer. + lr_scheduler (Optional[LRSchedulerModule]): The learning rate scheduler module. + no_weight_decay_cond (Optional[Callable]): Condition for no weight decay. + scale_lr_cond (Optional[Callable]): Condition for scaling learning rate. + lr_mult (float): Learning rate multiplier. + """ + + super().__init__(lr_scheduler=lr_scheduler) + self.optim_cls = optim_cls + self.config = config + self.no_weight_decay_cond = no_weight_decay_cond + self.scale_lr_cond = scale_lr_cond + self.lr_mult = lr_mult + self.optim_cls = optim_cls + + def on_fit_start(self, trainer: "pl.Trainer", pl_module: "pl.LightningModule"): + # Noop + pass + + def optimizers(self, model) -> List[Optimizer]: + """Defines the optimizers. + + Args: + model (nn.Module): The model for which the optimizers are being defined. + + Returns: + List[Optimizer]: The list of optimizers. + + Raises: + ValueError: If the model is an instance of MegatronParallel. + """ + + if isinstance(model, MegatronParallel): + raise ValueError("Model cannot be an instance of MegatronParallel") + + params_with_wd, params_without_wd = [], [] + if self.no_weight_decay_cond is not None: + for name, param in model.named_parameters(): + if self.no_weight_decay_cond(name, param): + params_without_wd.append(param) + else: + params_with_wd.append(param) + else: + params_with_wd = list(model.parameters()) + + optimizers = [] + if len(params_with_wd) > 0: + optimizers.append( + self.optim_cls( + params_with_wd, + **self.config, + ) + ) + + if len(params_without_wd) > 0: + kwargs = dict(self.config) + kwargs['weight_decay'] = 0 + optimizers.append( + self.optim_cls( + params_without_wd, + **kwargs, + ) + ) + # kwargs is a copy, so self.config keeps its original weight_decay value + + assert len(optimizers) > 0, "Expected at least one optimizer with params" + return optimizers + + def finalize_model_grads(self, *args, **kwargs): + # Noop + pass diff --git a/nemo/lightning/pytorch/plugins/data_sampler.py b/nemo/lightning/pytorch/plugins/data_sampler.py index 55bafce5f71e..52ba9e3220ac 100644 --- a/nemo/lightning/pytorch/plugins/data_sampler.py +++ b/nemo/lightning/pytorch/plugins/data_sampler.py @@ -65,9 +65,14 @@ def setup(self, global_rank: int) -> None: setup_microbatch_calculator(global_rank, self.micro_batch_size, self.global_batch_size, self.rampup_batch_size) def transform_dataloader(self, dataloader: DataLoader, consumed_samples: int = 0) -> DataLoader: + from megatron.core import parallel_state + from nemo.lightning.data import add_megatron_sampler mode = getattr(dataloader, 'mode', 'train') + + data_parallel_rank = parallel_state.get_data_parallel_rank() + data_parallel_size = parallel_state.get_data_parallel_world_size() return add_megatron_sampler( dataloader, micro_batch_size=self.micro_batch_size, @@ -76,6 +81,8 @@ def transform_dataloader(self, dataloader: DataLoader,
consumed_samples: int = 0 consumed_samples=self.init_consumed_samples if mode == 'train' else 0, dataloader_type=self.dataloader_type, drop_last=self.drop_last, + rank=data_parallel_rank, + world_size=data_parallel_size, ) def compute_consumed_samples(self, steps_since_resume=0) -> int: diff --git a/nemo/lightning/pytorch/strategies/megatron_strategy.py b/nemo/lightning/pytorch/strategies/megatron_strategy.py index c5195511c522..b045804044ec 100644 --- a/nemo/lightning/pytorch/strategies/megatron_strategy.py +++ b/nemo/lightning/pytorch/strategies/megatron_strategy.py @@ -267,6 +267,8 @@ def __init__( def connect(self, model: pl.LightningModule) -> None: super().connect(model) + assert not hasattr(model, 'is_hf_model'), "Cannot use HfAutoModelForCausalLM with MegatronParallel" + _maybe_mcore_config = _strategy_lib.set_model_parallel_attributes(model, self.parallelism) if _maybe_mcore_config: self._mcore_config = _maybe_mcore_config From 51e9b7cb543773c26f02233a696551acc7aae727 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?oliver=20k=C3=B6nig?= Date: Wed, 23 Oct 2024 11:20:32 +0200 Subject: [PATCH 08/12] ci: Update tests (#10987) * ci: Re-enable `L0_Unit_Tests_GPU_Lightning` Signed-off-by: Oliver Koenig * ci: Disable `L2_Megatron_GPT_Pretraining_and_Resume_Training_PP2` Signed-off-by: Oliver Koenig --------- Signed-off-by: Oliver Koenig --- .github/workflows/cicd-main.yml | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) diff --git a/.github/workflows/cicd-main.yml b/.github/workflows/cicd-main.yml index 345482e9a1a8..55a952c21eb6 100644 --- a/.github/workflows/cicd-main.yml +++ b/.github/workflows/cicd-main.yml @@ -217,15 +217,14 @@ jobs: SCRIPT: | NEMO_NUMBA_MINVER=0.53 pytest tests/hydra -m "not pleasefixme" --with_downloads - OPTIONAL_L0_Unit_Tests_GPU_Lightning: + L0_Unit_Tests_GPU_Lightning: needs: [cicd-test-container-setup] uses: ./.github/workflows/_test_template.yml - if: contains(fromJSON(needs.cicd-test-container-setup.outputs.test_to_run), 'OPTIONAL_L0_Unit_Tests_GPU_Lightning') || needs.cicd-test-container-setup.outputs.all == 'true' + if: contains(fromJSON(needs.cicd-test-container-setup.outputs.test_to_run), 'L0_Unit_Tests_GPU_Lightning') || needs.cicd-test-container-setup.outputs.all == 'true' with: RUNNER: self-hosted-azure SCRIPT: | NEMO_NUMBA_MINVER=0.53 pytest tests/lightning -m "not pleasefixme" --with_downloads - IS_OPTIONAL: true L0_Unit_Tests_GPU_Others: needs: [cicd-test-container-setup] @@ -2468,10 +2467,10 @@ jobs: rm -rf examples/nlp/language_modeling/gpt_pretrain_results rm -rf examples/nlp/language_modeling/gpt_index_mappings - L2_Megatron_GPT_Pretraining_and_Resume_Training_PP2: + Optional_L2_Megatron_GPT_Pretraining_and_Resume_Training_PP2: needs: [cicd-test-container-setup] uses: ./.github/workflows/_test_template.yml - if: contains(fromJSON(needs.cicd-test-container-setup.outputs.test_to_run), 'L2_Megatron_GPT_Pretraining_and_Resume_Training_PP2') || needs.cicd-test-container-setup.outputs.all == 'true' + if: contains(fromJSON(needs.cicd-test-container-setup.outputs.test_to_run), 'Optional_L2_Megatron_GPT_Pretraining_and_Resume_Training_PP2') || needs.cicd-test-container-setup.outputs.all == 'true' with: RUNNER: self-hosted-azure-gpus-2-h100 SCRIPT: | @@ -2578,6 +2577,7 @@ jobs: AFTER_SCRIPT: | rm -rf examples/nlp/language_modeling/gpt_pretrain_results rm -rf examples/nlp/language_modeling/gpt_index_mappings + IS_OPTIONAL: true OPTIONAL_L2_Megatron_GPT_Auto_Configurator_TP1_PP1_MBS124: needs: [cicd-test-container-setup] @@ -4323,7 +4323,7 @@ 
jobs: - L0_Unit_Tests_GPU_TTS #- OPTIONAL_L0_Unit_Tests_GPU_Core - L0_Unit_Tests_GPU_Hydra - #- OPTIONAL_L0_Unit_Tests_GPU_Lightning + - L0_Unit_Tests_GPU_Lightning - L0_Unit_Tests_GPU_Others - L0_Unit_Tests_CPU_ASR @@ -4390,7 +4390,7 @@ jobs: - L2_Megatron_GPT_with_Drop_Optimizer_States_TP2 - L2_Megatron_GPT_with_ALiBi_Pretraining_and_Resume_Training_TP2 - L2_Megatron_GPT_with_KERPLE_Pretraining_and_Resume_Training_TP2 - - L2_Megatron_GPT_Pretraining_and_Resume_Training_PP2 + # - Optional_L2_Megatron_GPT_Pretraining_and_Resume_Training_PP2 #- OPTIONAL_L2_Megatron_GPT_Auto_Configurator_TP1_PP1_MBS124 - L2_Megatron_GPT_Finetuning_PP2 - L2_Megatron_GPT_Finetuning_StarCoder_PP1 From 05f75862169b1dc7f2641c54ebe5ab3f6f8451cc Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?oliver=20k=C3=B6nig?= Date: Wed, 23 Oct 2024 11:20:54 +0200 Subject: [PATCH 09/12] =?UTF-8?q?[=F0=9F=A4=A0]:=20Howdy=20folks,=20let's?= =?UTF-8?q?=20bump=20`Dockerfile.ci`=20to=20425cdd4=20!=20(#11001)?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Signed-off-by: github-actions[bot] <41898282+github-actions[bot]@users.noreply.github.com> Co-authored-by: pablo-garay <7166088+pablo-garay@users.noreply.github.com> --- Dockerfile.ci | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Dockerfile.ci b/Dockerfile.ci index 09ffe9674e5d..6ef99a35ae82 100644 --- a/Dockerfile.ci +++ b/Dockerfile.ci @@ -53,7 +53,7 @@ RUN pip install nemo_run@git+https://github.com/NVIDIA/NeMo-Run.git@${NEMO_RUN_T # Install NeMo requirements ARG TE_TAG=7d576ed25266a17a7b651f2c12e8498f67e0baea ARG MODELOPT_VERSION=0.17.0 -ARG MCORE_TAG=563d5d1726012e8077895b732d5bc81b6e975e8d +ARG MCORE_TAG=425cdd48d5ef5d360d8033288ff7cb0d378f535f ARG APEX_TAG=810ffae374a2b9cb4b5c5e28eaeca7d7998fca0c RUN \ From eeb861b89f12ab233a6b9ec5a10d83f29febf94e Mon Sep 17 00:00:00 2001 From: malay-nagda <164242706+malay-nagda@users.noreply.github.com> Date: Wed, 23 Oct 2024 20:45:02 +0530 Subject: [PATCH 10/12] gpt3 175b cli (#10985) * gpt3 175b cli Signed-off-by: Malay Nagda * Apply isort and black reformatting Signed-off-by: malay-nagda * Apply isort and black reformatting Signed-off-by: malay-nagda --------- Signed-off-by: Malay Nagda Signed-off-by: malay-nagda Signed-off-by: malay-nagda <164242706+malay-nagda@users.noreply.github.com> Co-authored-by: malay-nagda --- nemo/collections/llm/recipes/__init__.py | 2 ++ tests/lightning/test_nemo_run.py | 1 + 2 files changed, 3 insertions(+) diff --git a/nemo/collections/llm/recipes/__init__.py b/nemo/collections/llm/recipes/__init__.py index b1fc15aee07c..21994b75f60d 100644 --- a/nemo/collections/llm/recipes/__init__.py +++ b/nemo/collections/llm/recipes/__init__.py @@ -18,6 +18,7 @@ chatglm3_6b, gemma_2b, gemma_7b, + gpt3_175b, hf_auto_model_for_causal_lm, llama3_8b, llama3_8b_16k, @@ -89,6 +90,7 @@ "nemotron4_22b_16k", "nemotron4_22b_64k", "nemotron4_340b", + "gpt3_175b", "adam", "default_log", "default_resume", diff --git a/tests/lightning/test_nemo_run.py b/tests/lightning/test_nemo_run.py index 947930c84847..934eaa853bf0 100644 --- a/tests/lightning/test_nemo_run.py +++ b/tests/lightning/test_nemo_run.py @@ -36,6 +36,7 @@ ("nemotron4_22b_64k", "pretrain_recipe", "nemotron4_22b_64k_pretrain"), ("nemotron4_340b", "pretrain_recipe", "nemotron4_340b_pretrain"), ("nemotron4_340b", "finetune_recipe", "nemotron4_340b_finetune"), + ("gpt3_175b", "pretrain_recipe", "gpt3_175b_pretrain"), ], ) def test_recipes_with_nemo_run(module, recipe, name, tmpdir, monkeypatch): From 
9251d1c5b1c0b58c32dcae76b076e734467596f0 Mon Sep 17 00:00:00 2001 From: Valerie Sarge Date: Wed, 23 Oct 2024 08:19:58 -0700 Subject: [PATCH 11/12] Fix for crash with LoRA + tp_overlap_comm=false + sequence_parallel=true (#10920) * Add fusion defaults for llama2 Signed-off-by: Valerie Sarge * Alter ParallelLinearAdapter condition to account for tp_comm_overlap=false Signed-off-by: Valerie Sarge * Apply isort and black reformatting Signed-off-by: vysarge * Clean up unneeded defaults Signed-off-by: Valerie Sarge * gpt3 175b cli Signed-off-by: Malay Nagda * Apply isort and black reformatting Signed-off-by: malay-nagda * Apply isort and black reformatting Signed-off-by: vysarge --------- Signed-off-by: Valerie Sarge Signed-off-by: vysarge Signed-off-by: Malay Nagda Signed-off-by: malay-nagda Signed-off-by: Eric Harper Co-authored-by: vysarge Co-authored-by: Malay Nagda Co-authored-by: malay-nagda Co-authored-by: Eric Harper --- nemo/collections/llm/gpt/model/llama.py | 7 +++++++ .../modules/common/megatron/adapters/parallel_adapters.py | 6 +++++- 2 files changed, 12 insertions(+), 1 deletion(-) diff --git a/nemo/collections/llm/gpt/model/llama.py b/nemo/collections/llm/gpt/model/llama.py index 0ec13a3d91e8..b48f99e061c9 100644 --- a/nemo/collections/llm/gpt/model/llama.py +++ b/nemo/collections/llm/gpt/model/llama.py @@ -50,6 +50,13 @@ class LlamaConfig(GPTConfig): attention_dropout: float = 0.0 hidden_dropout: float = 0.0 share_embeddings_and_output_weights: bool = False + # Fusions + bias_activation_fusion: bool = True + masked_softmax_fusion: bool = True + persist_layer_norm: bool = True + bias_dropout_fusion: bool = True + apply_rope_fusion: bool = True + cross_entropy_loss_fusion: bool = False @dataclass diff --git a/nemo/collections/nlp/modules/common/megatron/adapters/parallel_adapters.py b/nemo/collections/nlp/modules/common/megatron/adapters/parallel_adapters.py index a547d593d6d7..042dbb95979e 100644 --- a/nemo/collections/nlp/modules/common/megatron/adapters/parallel_adapters.py +++ b/nemo/collections/nlp/modules/common/megatron/adapters/parallel_adapters.py @@ -256,7 +256,11 @@ def __init__( te_version = packaging.version.Version(version("transformer-engine")) if te_version >= packaging.version.Version("1.5.0dev") and ( - not self.input_is_parallel and getattr(model_parallel_config, "tp_comm_overlap_disable_qkv", False) + not self.input_is_parallel + and ( + not getattr(model_parallel_config, "tp_comm_overlap", False) + or getattr(model_parallel_config, "tp_comm_overlap_disable_qkv", False) + ) ): # TE 1.5 introduces the option `return_layernorm_output_gathered`, so the all gather # in the forward method is not needed, so set self._sequence_parallel to False From ed37d19d51229ad01b6c4e43eeb0f1bd1f3216d3 Mon Sep 17 00:00:00 2001 From: Huiying Date: Wed, 23 Oct 2024 10:02:43 -0700 Subject: [PATCH 12/12] llm.generate fixes (#10983) * fix context path, disable optimizer init, add tp Signed-off-by: HuiyingLi * format Signed-off-by: HuiyingLi * address comments, require user to provide trainer Signed-off-by: HuiyingLi * minor fix Signed-off-by: HuiyingLi * minor fixes Signed-off-by: HuiyingLi --------- Signed-off-by: HuiyingLi --- nemo/collections/llm/api.py | 2 +- nemo/collections/llm/inference/base.py | 16 ++++++++++++---- 2 files changed, 13 insertions(+), 5 deletions(-) diff --git a/nemo/collections/llm/api.py b/nemo/collections/llm/api.py index 71e006472db9..a9b3d4361f5b 100644 --- a/nemo/collections/llm/api.py +++ b/nemo/collections/llm/api.py @@ -436,7 +436,7 @@ def export_ckpt( 
def generate( path: Union[Path, str], prompts: list[str], - trainer: Optional[nl.Trainer] = None, + trainer: nl.Trainer, params_dtype: torch.dtype = torch.bfloat16, max_batch_size: int = 4, random_seed: Optional[int] = None, diff --git a/nemo/collections/llm/inference/base.py b/nemo/collections/llm/inference/base.py index 95da536fde06..0171f1c2dd5c 100644 --- a/nemo/collections/llm/inference/base.py +++ b/nemo/collections/llm/inference/base.py @@ -16,6 +16,7 @@ import nemo.lightning as nl from nemo.lightning import io +from nemo.lightning.ckpt_utils import ckpt_to_context_subdir from nemo.lightning.pytorch.strategies.megatron_strategy import MegatronStrategy from nemo.lightning.pytorch.strategies.utils import RestoreConfig @@ -44,6 +45,7 @@ def _setup_trainer_and_restore_model(path: Path, trainer: nl.Trainer, model: pl. load_optim_state=False, ) trainer.strategy.restore_config = restore_config + trainer.strategy._setup_optimizers = False trainer.ckpt_path = None trainer.strategy.connect(model) if trainer.strategy.launcher is not None: @@ -61,16 +63,22 @@ def _setup_trainer_and_restore_model(path: Path, trainer: nl.Trainer, model: pl. def setup_model_and_tokenizer( path: Path, - trainer: Optional[nl.Trainer] = None, + trainer: nl.Trainer, params_dtype: torch.dtype = torch.bfloat16, inference_batch_times_seqlen_threshold: int = 1000, ) -> tuple[MCoreGPTModel, MCoreTokenizerWrappper]: - model: io.TrainerContext = io.load_context(path=path, subpath="model") - trainer = trainer or io.load_context(path=path, subpath="trainer") + model: io.TrainerContext = io.load_context(path=ckpt_to_context_subdir(path), subpath="model") _setup_trainer_and_restore_model(path=path, trainer=trainer, model=model) # This is to get the MCore model required in GPTInferenceWrapper. - mcore_model = model.module.module.module + mcore_model = model + while mcore_model: + if type(mcore_model) is MCoreGPTModel: + break + mcore_model = getattr(mcore_model, "module", None) + if mcore_model is None or type(mcore_model) is not MCoreGPTModel: + raise ValueError("Exact McoreGPTModel instance not found in the model structure.") + inference_wrapped_model = GPTInferenceWrapper( mcore_model, InferenceWrapperConfig(