Merge branch 'main' into aot/qwen-recipe

NVIDIA · Oct 27, 2024 · 4d893ff · 4d893ff
2 parents a4aafc8 + e13466f
commit 4d893ff
Show file tree

Hide file tree

Showing 28 changed files with 1,329 additions and 59 deletions.
diff --git a/.github/workflows/_test_template.yml b/.github/workflows/_test_template.yml
@@ -33,13 +33,17 @@ on:
       log:
         description: Last 2000 characters of the test step's log
         value: ${{ jobs.main.outputs.log }} 
+      potential_infra_failure:
+        description: Boolean flag when infra-related keyword spotted in logs.
+        value: ${{ jobs.main.outputs.potential_infra_failure }}
 jobs:
 
   main:
     runs-on: ${{ inputs.RUNNER }} 
     outputs:
       conclusion: ${{ steps.main.conclusion }}
       log: ${{ steps.main.outputs.log }}
+      potential_infra_failure: ${{ steps.main.outputs.potential_infra_failure }}
     steps:
         - name: Docker system cleanup
           run: |
@@ -51,7 +55,12 @@ jobs:
 
         - name: Start container
           run: |
-            docker run --rm -d --name nemo_container_${{ github.run_id }} --runtime=nvidia --gpus all --shm-size=64g --env TRANSFORMERS_OFFLINE=0 --env HYDRA_FULL_ERROR=1 --volume /mnt/datadrive/TestData:/home/TestData nemoci.azurecr.io/nemo_container_${{ github.run_id }} bash -c "sleep $(( ${{ inputs.TIMEOUT }} * 60 + 60 ))"
+            ARG=("")
+            if [[ "${{ inputs.RUNNER }}" != *cpu* ]]; then
+              ARG=("--runtime=nvidia --gpus all")
+            fi
+
+            docker run --rm -d --name nemo_container_${{ github.run_id }} ${ARG[@]} --shm-size=64g --env TRANSFORMERS_OFFLINE=0 --env HYDRA_FULL_ERROR=1 --volume /mnt/datadrive/TestData:/home/TestData nemoci.azurecr.io/nemo_container_${{ github.run_id }} bash -c "sleep $(( ${{ inputs.TIMEOUT }} * 60 + 60 ))"
 
         - id: main
           name: Run main script
@@ -70,6 +79,9 @@ jobs:
             
             echo "log=$(tail -c 2000 err.log |  base64 -w 0)" >> "$GITHUB_OUTPUT"
             
+            potential_infra_failure=$(cat err.log | grep -Eqi "gpu|cuda|device" && echo true || echo false)
+            echo "potential_infra_failure=$potential_infra_failure" >> "$GITHUB_OUTPUT"
+
             exit $EXIT_CODE
             
         - uses: "NVIDIA/NeMo/.github/actions/cancel-workflow@main"

diff --git a/.github/workflows/cicd-main.yml b/.github/workflows/cicd-main.yml
@@ -300,7 +300,7 @@ jobs:
      uses: ./.github/workflows/_test_template.yml
      if: contains(fromJSON(needs.cicd-test-container-setup.outputs.test_to_run), 'L0_Unit_Tests_CPU_NLP') || needs.cicd-test-container-setup.outputs.all == 'true'
      with:
-       RUNNER: self-hosted-azure-cpu
+       RUNNER: self-hosted-azure
        TIMEOUT: 20
        SCRIPT: |
          CUDA_VISIBLE_DEVICES="" NEMO_NUMBA_MINVER=0.53 pytest tests/collections/nlp -m "not pleasefixme" --cpu --with_downloads --relax_numba_compat
@@ -4515,7 +4515,10 @@ jobs:
         if: ${{ always() && steps.pipeline-conclusion.outputs.FAILED == 'true' && env.SLACK_WEBHOOK != '' }}
         env: 
           SLACK_WEBHOOK: ${{ secrets.SLACK_WEBHOOK }}
+          SLACK_WEBHOOK_ADMIN: <!subteam^${{ secrets.SLACK_WEBHOOK_ADMIN }}>
           GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }}
+          GITHUB_ACTOR: ${{ github.actor }}
+          BRANCH: ${{ github.head_ref || github.ref_name }}
           REPOSITORY: ${{ github.repository }}
           RUN_ID: ${{ github.run_id }}
           PR_NUMBER: ${{ github.event.number }}
@@ -4571,13 +4574,15 @@ jobs:
             echo "* [$JOB_NAME]($JOB_URL)" | tee -a $GITHUB_STEP_SUMMARY
 
             LOGS=$(echo $JOB | yq '(.value.outputs.log | @base64d)' | tr -d '"')
+            LOGS=$([[ -n "$LOGS" ]] && echo -E "\`\`\`\n$LOGS\n\`\`\`" || echo "")  # fence only non-empty logs; `echo | wc -c` is never 0 because echo appends a newline
+            LOGS=$([[ $(echo $JOB | yq '.value.outputs.potential_infra_failure') == "true" ]] && echo -E "$LOGS\n\ncc: $SLACK_WEBHOOK_ADMIN" || echo -E "$LOGS")
             
             SUMMARY=$(echo "$SUMMARY" | jq \
               --arg pr "<$PR_URL|$PR_TITLE>" \
               --arg job "<$JOB_URL|$JOB_NAME>" \
-              --arg logs "$LOGS" \
-              --arg author "<https://github.com/${{ github.actor }}|${{ github.actor }}>" \
-              --arg branch "<https://github.com/$REPOSITORY/tree/${{ github.head_ref || github.ref_name }}|${{ github.head_ref || github.ref_name }}>"\
+              --arg logs "$(echo -e "$LOGS")" \
+              --arg author "<https://github.com/$GITHUB_ACTOR|$GITHUB_ACTOR>" \
+              --arg branch "<https://github.com/$REPOSITORY/tree/$BRANCH|$BRANCH>"\
               '. += [
               {
                 "type": "section",
@@ -4588,8 +4593,7 @@ jobs:
                     + "\nJob: " + $job
                     + "\nAuthor: " + $author
                     + "\nBranch: " + $branch
-                    + "\nLogs:" 
-                    + "```\n" + $logs + "\n```" 
+                    + "\nLogs:" + $logs
                   )
                 }
               }

diff --git a/Dockerfile.ci b/Dockerfile.ci
@@ -53,7 +53,7 @@ RUN pip install nemo_run@git+https://github.com/NVIDIA/NeMo-Run.git@${NEMO_RUN_T
 # Install NeMo requirements
 ARG TE_TAG=7d576ed25266a17a7b651f2c12e8498f67e0baea
 ARG MODELOPT_VERSION=0.17.0
-ARG MCORE_TAG=425cdd48d5ef5d360d8033288ff7cb0d378f535f
+ARG MCORE_TAG=397e9da9511a09ae8badba30129c7e4934b06118
 
 ARG APEX_TAG=810ffae374a2b9cb4b5c5e28eaeca7d7998fca0c
 RUN \

diff --git a/examples/audio/conf/masking_with_online_augmentation.yaml b/examples/audio/conf/masking_with_online_augmentation.yaml
@@ -0,0 +1,119 @@
+name: "masking_with_online_augmenatation"
+
+model:
+  sample_rate: 16000
+  skip_nan_grad: false
+  num_outputs: 1
+
+  train_ds:
+    use_lhotse: true # enable Lhotse data loader
+    cuts_path: ??? # path to Lhotse cuts manifest with speech signals for augmentation (including custom "target_recording" field with the same signals)
+    truncate_duration: 4.0 # Number of STFT time frames = 1 + (truncate_duration * sample_rate) // encoder.hop_length = 1 + 64000 // 256 = 251
+    truncate_offset_type: random # if the file is longer than truncate_duration, use random offset to select a subsegment
+    batch_size: 64 # batch size may be increased based on the available memory
+    shuffle: true
+    num_workers: 8
+    pin_memory: true
+    rir_enabled: true # enable room impulse response augmentation
+    rir_path: ??? # path to Lhotse recordings manifest with room impulse response signals
+    noise_path: ??? # path to Lhotse cuts manifest with noise signals
+
+  validation_ds:
+    use_lhotse: true # enable Lhotse data loader
+    cuts_path: ??? # path to Lhotse cuts manifest with noisy speech signals (including custom "target_recording" field with the clean signals)
+    batch_size: 64 # batch size may be increased based on the available memory
+    shuffle: false
+    num_workers: 4
+    pin_memory: true
+
+  test_ds:
+    use_lhotse: true # enable Lhotse data loader
+    cuts_path: ??? # path to Lhotse cuts manifest with noisy speech signals (including custom "target_recording" field with the clean signals)
+    batch_size: 1 # batch size may be increased based on the available memory
+    shuffle: false
+    num_workers: 4
+    pin_memory: true
+
+  encoder:
+    _target_: nemo.collections.audio.modules.transforms.AudioToSpectrogram
+    fft_length: 512 # Length of the window and FFT for calculating spectrogram
+    hop_length: 256 # Hop length for calculating spectrogram
+
+  decoder:
+    _target_: nemo.collections.audio.modules.transforms.SpectrogramToAudio
+    fft_length: 512 # Length of the window and FFT for calculating spectrogram
+    hop_length: 256 # Hop length for calculating spectrogram
+
+  mask_estimator:
+    _target_: nemo.collections.audio.modules.masking.MaskEstimatorRNN
+    num_outputs: ${model.num_outputs}
+    num_subbands: 257 # Number of subbands of the input spectrogram
+    num_features: 256 # Number of features at RNN input
+    num_layers: 5 # Number of RNN layers
+    bidirectional: true # Use bi-directional RNN
+
+  mask_processor:
+    _target_: nemo.collections.audio.modules.masking.MaskReferenceChannel # Apply mask on the reference channel
+    ref_channel: 0 # Reference channel for the output
+
+  loss:
+    _target_: nemo.collections.audio.losses.SDRLoss
+    scale_invariant: true # Use scale-invariant SDR
+
+  metrics:
+    val:
+      sdr: # output SDR
+        _target_: torchmetrics.audio.SignalDistortionRatio
+    test:
+      sdr_ch0: # SDR on output channel 0
+        _target_: torchmetrics.audio.SignalDistortionRatio
+        channel: 0
+
+  optim:
+    name: adamw
+    lr: 1e-4
+    # optimizer arguments
+    betas: [0.9, 0.98]
+    weight_decay: 1e-3
+
+trainer:
+  devices: -1 # number of GPUs, -1 would use all available GPUs
+  num_nodes: 1
+  max_epochs: -1
+  max_steps: -1 # computed at runtime if not set
+  val_check_interval: 1.0 # Set to 0.25 to check 4 times per epoch, or an int for number of iterations
+  accelerator: auto
+  strategy: ddp
+  accumulate_grad_batches: 1
+  gradient_clip_val: null
+  precision: 32 # Should be set to 16 for O1 and O2 to enable the AMP.
+  log_every_n_steps: 25  # Interval of logging.
+  enable_progress_bar: true
+  num_sanity_val_steps: 0 # number of validation steps to run as a sanity check before training starts; setting it to 0 disables the check
+  check_val_every_n_epoch: 1 # run validation once every n training epochs
+  sync_batchnorm: true
+  enable_checkpointing: False  # Provided by exp_manager
+  logger: false  # Provided by exp_manager
+
+exp_manager:
+  exp_dir: null
+  name: ${name}
+  create_tensorboard_logger: true
+  create_checkpoint_callback: true
+  checkpoint_callback_params:
+    # in case of multiple validation sets, first one is used
+    monitor: "val_loss"
+    mode: "min"
+    save_top_k: 5
+    always_save_nemo: true # saves the checkpoints as nemo files instead of PTL checkpoints
+
+  resume_from_checkpoint: null # The path to a checkpoint file to continue the training, restores the whole state including the epoch, step, LR schedulers, apex, etc.
+  # you need to set these two to true to continue the training
+  resume_if_exists: false
+  resume_ignore_no_checkpoint: false
+
+  # You may use this section to create a W&B logger
+  create_wandb_logger: false
+  wandb_logger_kwargs:
+    name: null
+    project: null
diff --git a/nemo/collections/asr/parts/submodules/rnnt_greedy_decoding.py b/nemo/collections/asr/parts/submodules/rnnt_greedy_decoding.py
@@ -646,9 +646,9 @@ def __init__(
                             )
 
                             self._greedy_decode = RNNTGreedyDecodeCudaGraph(max_symbols_per_step, self)
-                        except (ImportError, ModuleNotFoundError, ValueError) as e:
+                        except (ImportError, ModuleNotFoundError, ValueError, EnvironmentError) as e:
                             self.use_cuda_graph_decoder = False
-                            logging.warning(f"Cannot use decoder with CUDA graphs, reason: {e.msg}")
+                            logging.warning(f"Cannot use decoder with CUDA graphs, reason: {e}")
                             self._greedy_decode = self._greedy_decode_blank_as_pad_loop_frames
                     else:
                         self._greedy_decode = self._greedy_decode_blank_as_pad_loop_frames

diff --git a/nemo/collections/asr/parts/submodules/rnnt_loop_labels_computer.py b/nemo/collections/asr/parts/submodules/rnnt_loop_labels_computer.py
@@ -266,11 +266,11 @@ def maybe_enable_cuda_graphs(self):
             try:
                 check_cuda_python_cuda_graphs_conditional_nodes_supported()
                 self.cuda_graphs_mode = self.CudaGraphsMode.FULL_GRAPH
-            except (ImportError, ModuleNotFoundError) as e:
+            except (ImportError, ModuleNotFoundError, EnvironmentError) as e:
                 logging.warning(
                     "No conditional node support for Cuda.\n"
                     "Cuda graphs with while loops are disabled, decoding speed will be slower\n"
-                    f"Reason: {e.msg}"
+                    f"Reason: {e}"
                 )
                 self.cuda_graphs_mode = self.CudaGraphsMode.NO_WHILE_LOOPS
         self.reset_cuda_graphs_state()

diff --git a/nemo/collections/asr/parts/submodules/tdt_loop_labels_computer.py b/nemo/collections/asr/parts/submodules/tdt_loop_labels_computer.py
@@ -277,11 +277,11 @@ def maybe_enable_cuda_graphs(self):
             try:
                 check_cuda_python_cuda_graphs_conditional_nodes_supported()
                 self.cuda_graphs_mode = self.CudaGraphsMode.FULL_GRAPH
-            except (ImportError, ModuleNotFoundError) as e:
+            except (ImportError, ModuleNotFoundError, EnvironmentError) as e:
                 logging.warning(
                     "No conditional node support for Cuda.\n"
                     "Cuda graphs with while loops are disabled, decoding speed will be slower\n"
-                    f"Reason: {e.msg}"
+                    f"Reason: {e}"
                 )
                 self.cuda_graphs_mode = self.CudaGraphsMode.NO_WHILE_LOOPS
         self.reset_cuda_graphs_state()

diff --git a/nemo/collections/audio/data/audio_to_audio_lhotse.py b/nemo/collections/audio/data/audio_to_audio_lhotse.py
@@ -55,6 +55,13 @@ def __getitem__(self, cuts: CutSet) -> dict[str, torch.Tensor]:
         retained_cuts = [
             cut._first_non_padding_cut if isinstance(cut, MixedCut) else cut for cut in retained_padded_cuts
         ]
+
+        # if online augmentation is applied, some retained cuts still may be MixedCuts (including the original speech, noise, and augmentation)
+        # get the first non-padding cut from there, which is supposed to be the clean speech signal
+        for n, cut in enumerate(retained_cuts):
+            if isinstance(cut, MixedCut):
+                retained_cuts[n] = cut._first_non_padding_cut
+        # create cutset
         retained_cuts = CutSet.from_cuts(retained_cuts)
 
         if _key_available(retained_cuts, self.TARGET_KEY):

diff --git a/nemo/collections/llm/inference/base.py b/nemo/collections/llm/inference/base.py
@@ -1,5 +1,20 @@
+# Copyright (c) 2024, NVIDIA CORPORATION.  All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import json
 from pathlib import Path
-from typing import Optional
+from typing import Optional, Union
 
 import pytorch_lightning as pl
 import torch
@@ -15,8 +30,9 @@
 from pytorch_lightning.trainer.states import TrainerFn
 
 import nemo.lightning as nl
+from nemo.collections.llm.peft import LoRA
 from nemo.lightning import io
-from nemo.lightning.ckpt_utils import ckpt_to_context_subdir
+from nemo.lightning.ckpt_utils import ADAPTER_META_FILENAME, ckpt_to_context_subdir, ckpt_to_weights_subdir
 from nemo.lightning.pytorch.strategies.megatron_strategy import MegatronStrategy
 from nemo.lightning.pytorch.strategies.utils import RestoreConfig
 
@@ -39,11 +55,21 @@ def tokenize(self, prompt):
 def _setup_trainer_and_restore_model(path: Path, trainer: nl.Trainer, model: pl.LightningModule):
     assert isinstance(trainer.strategy, MegatronStrategy), "Only MegatronStrategy is supported for trainer.strategy."
     assert trainer.strategy.context_parallel_size <= 1, "Context parallelism is not supported for inference."
-    restore_config = RestoreConfig(
-        path=path,
-        load_model_state=True,
-        load_optim_state=False,
-    )
+    if (adapter_meta_path := ckpt_to_weights_subdir(path) / ADAPTER_META_FILENAME).exists():
+        with open(adapter_meta_path, "r") as f:
+            metadata = json.load(f)
+        restore_config = RestoreConfig(
+            path=metadata['model_ckpt_path'],
+            load_model_state=True,
+            load_optim_state=False,
+        )
+    else:
+        restore_config = RestoreConfig(
+            path=path,
+            load_model_state=True,
+            load_optim_state=False,
+        )
+
     trainer.strategy.restore_config = restore_config
     trainer.strategy._setup_optimizers = False
     trainer.ckpt_path = None
@@ -60,6 +86,15 @@ def _setup_trainer_and_restore_model(path: Path, trainer: nl.Trainer, model: pl.
     trainer.strategy.trainer = trainer
     trainer.strategy.selective_restore()
 
+    lora: Union[io.TrainerContext, LoRA] = io.load_context(ckpt_to_context_subdir(path), "model.model_transform")
+    if isinstance(lora, LoRA):
+        model = lora(model)
+        adapter_sharded_state_dict = {k: v for k, v in model.sharded_state_dict().items() if ".adapter." in k}
+        adapter_state = trainer.strategy.checkpoint_io.load_checkpoint(
+            ckpt_to_weights_subdir(path), sharded_state_dict=adapter_sharded_state_dict
+        )
+        trainer.strategy.load_model_state_dict(adapter_state, strict=False)
+
 
 def setup_model_and_tokenizer(
     path: Path,

diff --git a/nemo/collections/llm/recipes/baichuan2_7b.py b/nemo/collections/llm/recipes/baichuan2_7b.py
@@ -240,6 +240,7 @@ def finetune_recipe(
     num_nodes: int = 1,
     num_gpus_per_node: int = 8,
     peft_scheme: Optional[str] = 'lora',
+    packed_sequence: bool = False,
 ) -> run.Partial:
     """
     Create a fine-tuning recipe for Baichuan2 7B model.
@@ -272,7 +273,7 @@ def finetune_recipe(
         `examples/llm/finetune/` directory.
     """
     recipe = default_finetune_recipe(
-        model(), "baichuan-inc/Baichuan2-7B-Base", dir, name, num_nodes, num_gpus_per_node
+        model(), "baichuan-inc/Baichuan2-7B-Base", dir, name, num_nodes, num_gpus_per_node, packed_sequence
     )
     if peft_scheme is None or peft_scheme.lower() == 'none':
         recipe.trainer.strategy.tensor_model_parallel_size = 2

diff --git a/nemo/collections/llm/recipes/chatglm3_6b.py b/nemo/collections/llm/recipes/chatglm3_6b.py
@@ -240,6 +240,7 @@ def finetune_recipe(
     num_nodes: int = 1,
     num_gpus_per_node: int = 8,
     peft_scheme: Optional[str] = 'lora',
+    packed_sequence: bool = False,
 ) -> run.Partial:
     """
     Create a fine-tuning recipe for ChatGLM3 6B model.
@@ -271,7 +272,9 @@ def finetune_recipe(
         on fine-tuning LLMs with NeMo, see the fine-tuning guide in the
         `examples/llm/finetune/` directory.
     """
-    recipe = default_finetune_recipe(model(), "THUDM/chatglm3-6b", dir, name, num_nodes, num_gpus_per_node)
+    recipe = default_finetune_recipe(
+        model(), "THUDM/chatglm3-6b", dir, name, num_nodes, num_gpus_per_node, packed_sequence
+    )
     if peft_scheme is None or peft_scheme.lower() == 'none':
         recipe.trainer.strategy.tensor_model_parallel_size = 2
         recipe.optim.config.lr = 5e-6