Merge branch 'main' into online_augmentation_tutorial
Signed-off-by: Rauf <[email protected]>
nasretdinovr committed Oct 23, 2024
2 parents 3b1bfe9 + ed37d19 commit 2647467
Showing 39 changed files with 1,940 additions and 64 deletions.
14 changes: 7 additions & 7 deletions .github/workflows/cicd-main.yml
@@ -217,15 +217,14 @@ jobs:
SCRIPT: |
NEMO_NUMBA_MINVER=0.53 pytest tests/hydra -m "not pleasefixme" --with_downloads
OPTIONAL_L0_Unit_Tests_GPU_Lightning:
L0_Unit_Tests_GPU_Lightning:
needs: [cicd-test-container-setup]
uses: ./.github/workflows/_test_template.yml
if: contains(fromJSON(needs.cicd-test-container-setup.outputs.test_to_run), 'OPTIONAL_L0_Unit_Tests_GPU_Lightning') || needs.cicd-test-container-setup.outputs.all == 'true'
if: contains(fromJSON(needs.cicd-test-container-setup.outputs.test_to_run), 'L0_Unit_Tests_GPU_Lightning') || needs.cicd-test-container-setup.outputs.all == 'true'
with:
RUNNER: self-hosted-azure
SCRIPT: |
NEMO_NUMBA_MINVER=0.53 pytest tests/lightning -m "not pleasefixme" --with_downloads
IS_OPTIONAL: true
L0_Unit_Tests_GPU_Others:
needs: [cicd-test-container-setup]
@@ -2468,10 +2467,10 @@ jobs:
rm -rf examples/nlp/language_modeling/gpt_pretrain_results
rm -rf examples/nlp/language_modeling/gpt_index_mappings
L2_Megatron_GPT_Pretraining_and_Resume_Training_PP2:
Optional_L2_Megatron_GPT_Pretraining_and_Resume_Training_PP2:
needs: [cicd-test-container-setup]
uses: ./.github/workflows/_test_template.yml
if: contains(fromJSON(needs.cicd-test-container-setup.outputs.test_to_run), 'L2_Megatron_GPT_Pretraining_and_Resume_Training_PP2') || needs.cicd-test-container-setup.outputs.all == 'true'
if: contains(fromJSON(needs.cicd-test-container-setup.outputs.test_to_run), 'Optional_L2_Megatron_GPT_Pretraining_and_Resume_Training_PP2') || needs.cicd-test-container-setup.outputs.all == 'true'
with:
RUNNER: self-hosted-azure-gpus-2-h100
SCRIPT: |
@@ -2578,6 +2577,7 @@ jobs:
AFTER_SCRIPT: |
rm -rf examples/nlp/language_modeling/gpt_pretrain_results
rm -rf examples/nlp/language_modeling/gpt_index_mappings
IS_OPTIONAL: true

OPTIONAL_L2_Megatron_GPT_Auto_Configurator_TP1_PP1_MBS124:
needs: [cicd-test-container-setup]
@@ -4323,7 +4323,7 @@ jobs:
- L0_Unit_Tests_GPU_TTS
#- OPTIONAL_L0_Unit_Tests_GPU_Core
- L0_Unit_Tests_GPU_Hydra
#- OPTIONAL_L0_Unit_Tests_GPU_Lightning
- L0_Unit_Tests_GPU_Lightning
- L0_Unit_Tests_GPU_Others

- L0_Unit_Tests_CPU_ASR
@@ -4390,7 +4390,7 @@ jobs:
- L2_Megatron_GPT_with_Drop_Optimizer_States_TP2
- L2_Megatron_GPT_with_ALiBi_Pretraining_and_Resume_Training_TP2
- L2_Megatron_GPT_with_KERPLE_Pretraining_and_Resume_Training_TP2
- L2_Megatron_GPT_Pretraining_and_Resume_Training_PP2
# - Optional_L2_Megatron_GPT_Pretraining_and_Resume_Training_PP2
#- OPTIONAL_L2_Megatron_GPT_Auto_Configurator_TP1_PP1_MBS124
- L2_Megatron_GPT_Finetuning_PP2
- L2_Megatron_GPT_Finetuning_StarCoder_PP1
2 changes: 1 addition & 1 deletion Dockerfile.ci
@@ -53,7 +53,7 @@ RUN pip install nemo_run@git+https://github.com/NVIDIA/NeMo-Run.git@${NEMO_RUN_T
# Install NeMo requirements
ARG TE_TAG=7d576ed25266a17a7b651f2c12e8498f67e0baea
ARG MODELOPT_VERSION=0.17.0
ARG MCORE_TAG=563d5d1726012e8077895b732d5bc81b6e975e8d
ARG MCORE_TAG=425cdd48d5ef5d360d8033288ff7cb0d378f535f

ARG APEX_TAG=810ffae374a2b9cb4b5c5e28eaeca7d7998fca0c
RUN \
4 changes: 2 additions & 2 deletions examples/audio/process_audio.py
@@ -159,8 +159,8 @@ def main(cfg: ProcessConfig) -> ProcessConfig:
audio_to_audio_model.set_trainer(trainer)
audio_to_audio_model = audio_to_audio_model.eval()

# override sampler
if cfg.sampler is not None:
# override sampler if necessary
if cfg.sampler:
logging.info('Overriding sampler with %s', cfg.sampler)

if hasattr(audio_to_audio_model, 'sampler'):
10 changes: 5 additions & 5 deletions examples/llm/pretrain/README.md
@@ -3,7 +3,7 @@
### Listing the available recipes for pretraining

```bash
nemorun llm pretrain --help
nemo llm pretrain --help
```

![recipe-listing](https://github.com/NVIDIA/NeMo/releases/download/v2.0.0rc0/list-recipes.png)
@@ -12,15 +12,15 @@ nemorun llm pretrain --help
### Run pre-training with a default recipe

```bash
nemorun llm pretrain --factory llama3_8b
nemo llm pretrain --factory llama3_8b
```

![llama3_70b](https://github.com/NVIDIA/NeMo/releases/download/v2.0.0rc0/llama3_70b.png)

We can also call the factory function with custom parameters:

```bash
nemorun llm pretrain --factory "llama3_70b(num_nodes=128)"
nemo llm pretrain --factory "llama3_70b(num_nodes=128)"
```

![llama3_70b-128-nodes](https://github.com/NVIDIA/NeMo/releases/download/v2.0.0rc0/llama3_70b_128nodes.png)
@@ -29,13 +29,13 @@ nemorun llm pretrain --factory "llama3_70b(num_nodes=128)"
The CLI allows you to override any parameter. For example, to run the recipe with 2000 steps:

```bash
nemorun llm pretrain --factory llama3_70b trainer.max_steps=2000
nemo llm pretrain --factory llama3_70b trainer.max_steps=2000
```

The CLI syntax mirrors the Python code, which is convenient, but in some cases you may want to inspect and edit a recipe interactively. An easy way to do this from the CLI is to use the `--repl` flag.

```bash
nemorun llm pretrain --factory llama3_70b --repl
nemo llm pretrain --factory llama3_70b --repl
```

![repl](https://github.com/NVIDIA/NeMo/releases/download/v2.0.0rc0/repl.gif)
91 changes: 91 additions & 0 deletions examples/llm/sft/hf.py
@@ -0,0 +1,91 @@
# Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

import fiddle as fdl
import pytorch_lightning as pl
from pytorch_lightning.loggers import WandbLogger
from torch.utils.data import DataLoader

from nemo import lightning as nl
from nemo.collections import llm


class SquadDataModuleWithPthDataloader(llm.SquadDataModule):
def _create_dataloader(self, dataset, **kwargs) -> DataLoader:
return DataLoader(
dataset,
num_workers=self.num_workers,
pin_memory=self.pin_memory,
persistent_workers=self.persistent_workers,
collate_fn=dataset.collate_fn,
batch_size=self.micro_batch_size,
**kwargs,
)


def squad(tokenizer) -> pl.LightningDataModule:
return SquadDataModuleWithPthDataloader(
tokenizer=tokenizer,
seq_length=2048,
micro_batch_size=2,
global_batch_size=128, # assert gbs == mbs * accumulate_grad_batches
num_workers=0,
sanity_check_dist_workers=False,
)


if __name__ == '__main__':
import argparse

parser = argparse.ArgumentParser()
parser.add_argument('--model', default='meta-llama/Llama-3.2-1B')
parser.add_argument('--strategy', type=str, default='auto', choices=['auto', 'ddp', 'fsdp'])
parser.add_argument('--devices', default=1)
parser.add_argument('--accelerator', default='gpu', choices=['gpu'])
parser.add_argument('--max-steps', type=int, default=100)
parser.add_argument('--wandb-project', type=str, default=None)
args = parser.parse_args()

wandb = None
if args.wandb_project is not None:
model = '_'.join(args.model.split('/')[-2:])
wandb = WandbLogger(
project=args.wandb_project,
name=f'{model}_dev{args.devices}_strat_{args.strategy}',
)
grad_clip = 0.5
if args.strategy == 'fsdp':
# See: https://github.com/Lightning-AI/pytorch-lightning/blob/8ad3e29816a63d8ce5c00ac104b14729a4176f4f/src/lightning/pytorch/plugins/precision/fsdp.py#L81
grad_clip = None
use_dist_samp = False

llm.api.finetune(
model=llm.HfAutoModelForCausalLM(args.model),
data=squad(llm.HfAutoModelForCausalLM.configure_tokenizer(args.model)),
trainer=nl.Trainer(
devices=args.devices,
max_steps=args.max_steps,
accelerator=args.accelerator,
strategy=args.strategy,
log_every_n_steps=1,
limit_val_batches=0.0,
num_sanity_val_steps=0,
accumulate_grad_batches=10,
gradient_clip_val=grad_clip,
use_distributed_sampler=use_dist_samp,
logger=wandb,
),
optim=fdl.build(llm.adam.pytorch_adam_with_flat_lr(max_lr=1e-5, clip_grad=0.5)),
log=None,
)
2 changes: 1 addition & 1 deletion nemo/collections/asr/parts/utils/transcribe_utils.py
@@ -314,7 +314,7 @@ def prepare_audio_data(cfg: DictConfig) -> Tuple[List[str], bool]:
with NamedTemporaryFile(mode='w', suffix='.json', delete=False) as f:
for item in read_and_maybe_sort_manifest(cfg.dataset_manifest, try_sort=cfg.presort_manifest):
audio_file = get_full_path(audio_file=item[audio_key], manifest_file=cfg.dataset_manifest)
item[audio_key] = audio_file
item['audio_filepath'] = audio_file
filepaths.append(audio_file)
f.write(json.dumps(item) + "\n")
sorted_manifest_path = f.name
3 changes: 3 additions & 0 deletions nemo/collections/llm/__init__.py
@@ -21,6 +21,7 @@
from nemo.collections.llm.gpt.data import (
DollyDataModule,
FineTuningDataModule,
HfDatasetDataModule,
MockDataModule,
PreTrainingDataModule,
SquadDataModule,
@@ -57,6 +58,7 @@
GPTConfig126M,
GPTConfig175B,
GPTModel,
HfAutoModelForCausalLM,
Llama2Config7B,
Llama2Config13B,
Llama2Config70B,
@@ -182,6 +184,7 @@
"squad",
"dolly",
"peft",
"HfAutoModelForCausalLM",
]


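For illustration, a minimal sketch of using the newly exported `HfAutoModelForCausalLM` directly from the collections namespace, following the pattern of `examples/llm/sft/hf.py` above; the model id is a placeholder, not a value from this commit.

```python
# Hypothetical usage sketch, not part of this commit: construct the newly
# exported Hugging Face wrapper and its tokenizer, as done in examples/llm/sft/hf.py.
from nemo.collections import llm

model_id = "meta-llama/Llama-3.2-1B"  # placeholder Hugging Face model id
model = llm.HfAutoModelForCausalLM(model_id)
tokenizer = llm.HfAutoModelForCausalLM.configure_tokenizer(model_id)
```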
2 changes: 1 addition & 1 deletion nemo/collections/llm/api.py
@@ -436,7 +436,7 @@ def export_ckpt(
def generate(
path: Union[Path, str],
prompts: list[str],
trainer: Optional[nl.Trainer] = None,
trainer: nl.Trainer,
params_dtype: torch.dtype = torch.bfloat16,
max_batch_size: int = 4,
random_seed: Optional[int] = None,
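For illustration, a minimal sketch of calling `generate` now that `trainer` is a required argument rather than an optional one; the checkpoint path and trainer configuration are placeholders, and parallelism options are omitted.

```python
# Hypothetical usage sketch, not part of this commit: trainer must now be
# passed explicitly to generate(); it no longer defaults to None.
from nemo import lightning as nl
from nemo.collections import llm

trainer = nl.Trainer(devices=1, accelerator="gpu")  # minimal placeholder configuration
results = llm.api.generate(
    path="/path/to/nemo2_checkpoint",            # placeholder checkpoint directory
    prompts=["What is the capital of France?"],
    trainer=trainer,                              # required argument after this change
    max_batch_size=4,
)
```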
10 changes: 9 additions & 1 deletion nemo/collections/llm/gpt/data/__init__.py
@@ -14,8 +14,16 @@

from nemo.collections.llm.gpt.data.dolly import DollyDataModule
from nemo.collections.llm.gpt.data.fine_tuning import FineTuningDataModule
from nemo.collections.llm.gpt.data.hf_dataset import HfDatasetDataModule
from nemo.collections.llm.gpt.data.mock import MockDataModule
from nemo.collections.llm.gpt.data.pre_training import PreTrainingDataModule
from nemo.collections.llm.gpt.data.squad import SquadDataModule

__all__ = ["FineTuningDataModule", "SquadDataModule", "DollyDataModule", "MockDataModule", "PreTrainingDataModule"]
__all__ = [
"FineTuningDataModule",
"SquadDataModule",
"DollyDataModule",
"MockDataModule",
"PreTrainingDataModule",
"HfDatasetDataModule",
]
5 changes: 5 additions & 0 deletions nemo/collections/llm/gpt/data/fine_tuning.py
@@ -70,6 +70,7 @@ def __init__(
persistent_workers: bool = False,
pad_to_max_length: bool = False,
packed_sequence_specs: Optional["PackedSequenceSpecs"] = None,
sanity_check_dist_workers: bool = True,
):
super().__init__()
self.seq_length = seq_length
@@ -89,6 +90,7 @@ def __init__(
self.packed_sequence_specs = packed_sequence_specs
self.packed_sequence_size = -1 if not packed_sequence_specs else packed_sequence_specs.packed_sequence_size
self.validate_batch_size_for_packed_sequence()
self._sanity_check_dist_workers = sanity_check_dist_workers

def validate_batch_size_for_packed_sequence(self):
if self.packed_sequence_size > 0 and self.micro_batch_size > 1:
@@ -134,6 +136,7 @@ def train_dataloader(self) -> DataLoader:
self.train_path if self.packed_sequence_size <= 0 else self.train_path_packed,
max_num_samples=self.max_train_samples,
pad_to_max_length=self.pad_to_max_length,
sanity_check_dist_workers=self._sanity_check_dist_workers,
)
)

@@ -143,6 +146,7 @@ def val_dataloader(self) -> DataLoader:
self.validation_path,
is_test=True,
pad_to_max_length=self.pad_to_max_length,
sanity_check_dist_workers=self._sanity_check_dist_workers,
),
)

@@ -153,6 +157,7 @@ def test_dataloader(self) -> DataLoader:
tokens_to_generate=32,
is_test=True,
pad_to_max_length=self.pad_to_max_length,
sanity_check_dist_workers=self._sanity_check_dist_workers,
)
)

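For illustration, a minimal sketch of passing the new `sanity_check_dist_workers` flag through `SquadDataModule`, mirroring its use in `examples/llm/sft/hf.py` above; the model id and tokenizer construction are placeholders.

```python
# Hypothetical usage sketch, not part of this commit: disable the
# distributed-worker sanity check when the data module is driven by a plain
# PyTorch/Lightning setup, as examples/llm/sft/hf.py does.
from nemo.collections import llm

tokenizer = llm.HfAutoModelForCausalLM.configure_tokenizer("meta-llama/Llama-3.2-1B")  # placeholder
datamodule = llm.SquadDataModule(
    tokenizer=tokenizer,
    seq_length=2048,
    micro_batch_size=2,
    global_batch_size=128,
    num_workers=0,
    sanity_check_dist_workers=False,  # new keyword added in this commit; defaults to True
)
```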