Merge branch 'main' into vsarge/ft_recipes

NVIDIA · Oct 28, 2024 · c5b39e8 · c5b39e8
2 parents 5906fc9 + 869625e
commit c5b39e8
Show file tree

Hide file tree

Showing 136 changed files with 4,019 additions and 71 deletions.
diff --git a/.github/workflows/_test_template.yml b/.github/workflows/_test_template.yml
@@ -33,13 +33,17 @@ on:
       log:
         description: Last 2000 characters of the test step's log
         value: ${{ jobs.main.outputs.log }} 
+      potential_infra_failure:
+        description: Boolean flag when infra-related keyword spotted in logs.
+        value: ${{ jobs.main.outputs.potential_infra_failure }}
 jobs:
 
   main:
     runs-on: ${{ inputs.RUNNER }} 
     outputs:
       conclusion: ${{ steps.main.conclusion }}
       log: ${{ steps.main.outputs.log }}
+      potential_infra_failure: ${{ steps.main.outputs.potential_infra_failure }}
     steps:
         - name: Docker system cleanup
           run: |
@@ -75,6 +79,9 @@ jobs:
             
             echo "log=$(tail -c 2000 err.log |  base64 -w 0)" >> "$GITHUB_OUTPUT"
             
+            potential_infra_failure=$(cat err.log | grep -Eqi "gpu|cuda|device" && echo true || echo false)
+            echo "potential_infra_failure=$potential_infra_failure" >> "$GITHUB_OUTPUT"
+
             exit $EXIT_CODE
             
         - uses: "NVIDIA/NeMo/.github/actions/cancel-workflow@main"

diff --git a/.github/workflows/cicd-main.yml b/.github/workflows/cicd-main.yml
@@ -4515,7 +4515,10 @@ jobs:
         if: ${{ always() && steps.pipeline-conclusion.outputs.FAILED == 'true' && env.SLACK_WEBHOOK != '' }}
         env: 
           SLACK_WEBHOOK: ${{ secrets.SLACK_WEBHOOK }}
+          SLACK_WEBHOOK_ADMIN: <!subteam^${{ secrets.SLACK_WEBHOOK_ADMIN }}>
           GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }}
+          GITHUB_ACTOR: ${{ github.actor }}
+          BRANCH: ${{ github.head_ref || github.ref_name }}
           REPOSITORY: ${{ github.repository }}
           RUN_ID: ${{ github.run_id }}
           PR_NUMBER: ${{ github.event.number }}
@@ -4571,13 +4574,15 @@ jobs:
             echo "* [$JOB_NAME]($JOB_URL)" | tee -a $GITHUB_STEP_SUMMARY
 
             LOGS=$(echo $JOB | yq '(.value.outputs.log | @base64d)' | tr -d '"')
+            LOGS=$([[ $(echo $LOGS | wc -c) -gt 0 ]] && echo -E "\`\`\`\n$LOGS\n\`\`\`" || echo "")
+            LOGS=$([[ $(echo $JOB | yq '.value.outputs.potential_infra_failure') == "true" ]] && echo -E "$LOGS\n\ncc: $SLACK_WEBHOOK_ADMIN" || echo -E "$LOGS")
             
             SUMMARY=$(echo "$SUMMARY" | jq \
               --arg pr "<$PR_URL|$PR_TITLE>" \
               --arg job "<$JOB_URL|$JOB_NAME>" \
-              --arg logs "$LOGS" \
-              --arg author "<https://github.com/${{ github.actor }}|${{ github.actor }}>" \
-              --arg branch "<https://github.com/$REPOSITORY/tree/${{ github.head_ref || github.ref_name }}|${{ github.head_ref || github.ref_name }}>"\
+              --arg logs "$(echo -e "$LOGS")" \
+              --arg author "<https://github.com/$GITHUB_ACTOR|$GITHUB_ACTOR>" \
+              --arg branch "<https://github.com/$REPOSITORY/tree/$BRANCH|$BRANCH>"\
               '. += [
               {
                 "type": "section",
@@ -4588,8 +4593,7 @@ jobs:
                     + "\nJob: " + $job
                     + "\nAuthor: " + $author
                     + "\nBranch: " + $branch
-                    + "\nLogs:" 
-                    + "```\n" + $logs + "\n```" 
+                    + "\nLogs:" + $logs
                   )
                 }
               }

diff --git a/Dockerfile.ci b/Dockerfile.ci
@@ -53,7 +53,7 @@ RUN pip install nemo_run@git+https://github.com/NVIDIA/NeMo-Run.git@${NEMO_RUN_T
 # Install NeMo requirements
 ARG TE_TAG=7d576ed25266a17a7b651f2c12e8498f67e0baea
 ARG MODELOPT_VERSION=0.17.0
-ARG MCORE_TAG=425cdd48d5ef5d360d8033288ff7cb0d378f535f
+ARG MCORE_TAG=d357c188323b6928cbcbd6f7e06af04c1694382f
 
 ARG APEX_TAG=810ffae374a2b9cb4b5c5e28eaeca7d7998fca0c
 RUN \

diff --git a/examples/audio/conf/masking_with_online_augmentation.yaml b/examples/audio/conf/masking_with_online_augmentation.yaml
@@ -0,0 +1,119 @@
+name: "masking_with_online_augmenatation"
+
+model:
+  sample_rate: 16000
+  skip_nan_grad: false
+  num_outputs: 1
+
+  train_ds:
+    use_lhotse: true # enable Lhotse data loader
+    cuts_path: ??? # path to Lhotse cuts manifest with speech signals for augmentation (including custom "target_recording" field with the same signals)
+    truncate_duration: 4.0 # duration in seconds; number of STFT time frames = 1 + (truncate_duration * sample_rate) // encoder.hop_length = 251
+    truncate_offset_type: random # if the file is longer than truncate_duration, use random offset to select a subsegment
+    batch_size: 64 # batch size may be increased based on the available memory
+    shuffle: true
+    num_workers: 8
+    pin_memory: true
+    rir_enabled: true # enable room impulse response augmentation
+    rir_path: ??? # path to Lhotse recordings manifest with room impulse response signals
+    noise_path: ??? # path to Lhotse cuts manifest with noise signals
+
+  validation_ds:
+    use_lhotse: true # enable Lhotse data loader
+    cuts_path: ??? # path to Lhotse cuts manifest with noisy speech signals (including custom "target_recording" field with the clean signals)
+    batch_size: 64 # batch size may be increased based on the available memory
+    shuffle: false
+    num_workers: 4
+    pin_memory: true
+
+  test_ds:
+    use_lhotse: true # enable Lhotse data loader
+    cuts_path: ??? # path to Lhotse cuts manifest with noisy speech signals (including custom "target_recording" field with the clean signals)
+    batch_size: 1 # batch size may be increased based on the available memory
+    shuffle: false
+    num_workers: 4
+    pin_memory: true
+
+  encoder:
+    _target_: nemo.collections.audio.modules.transforms.AudioToSpectrogram
+    fft_length: 512 # Length of the window and FFT for calculating spectrogram
+    hop_length: 256 # Hop length for calculating spectrogram
+
+  decoder:
+    _target_: nemo.collections.audio.modules.transforms.SpectrogramToAudio
+    fft_length: 512 # Length of the window and FFT for calculating spectrogram
+    hop_length: 256 # Hop length for calculating spectrogram
+
+  mask_estimator:
+    _target_: nemo.collections.audio.modules.masking.MaskEstimatorRNN
+    num_outputs: ${model.num_outputs}
+    num_subbands: 257 # Number of subbands of the input spectrogram
+    num_features: 256 # Number of features at RNN input
+    num_layers: 5 # Number of RNN layers
+    bidirectional: true # Use bi-directional RNN
+
+  mask_processor:
+    _target_: nemo.collections.audio.modules.masking.MaskReferenceChannel # Apply mask on the reference channel
+    ref_channel: 0 # Reference channel for the output
+
+  loss:
+    _target_: nemo.collections.audio.losses.SDRLoss
+    scale_invariant: true # Use scale-invariant SDR
+
+  metrics:
+    val:
+      sdr: # output SDR
+        _target_: torchmetrics.audio.SignalDistortionRatio
+    test:
+      sdr_ch0: # SDR on output channel 0
+        _target_: torchmetrics.audio.SignalDistortionRatio
+        channel: 0
+
+  optim:
+    name: adamw
+    lr: 1e-4
+    # optimizer arguments
+    betas: [0.9, 0.98]
+    weight_decay: 1e-3
+
+trainer:
+  devices: -1 # number of GPUs, -1 would use all available GPUs
+  num_nodes: 1
+  max_epochs: -1
+  max_steps: -1 # computed at runtime if not set
+  val_check_interval: 1.0 # Set to 0.25 to check 4 times per epoch, or an int for number of iterations
+  accelerator: auto
+  strategy: ddp
+  accumulate_grad_batches: 1
+  gradient_clip_val: null
+  precision: 32 # Should be set to 16 for O1 and O2 to enable the AMP.
+  log_every_n_steps: 25  # Interval of logging.
+  enable_progress_bar: true
+  num_sanity_val_steps: 0 # number of validation steps to run as a sanity check before training starts; setting it to 0 disables the check
+  check_val_every_n_epoch: 1 # run validation once every n epochs
+  sync_batchnorm: true
+  enable_checkpointing: False  # Provided by exp_manager
+  logger: false  # Provided by exp_manager
+
+exp_manager:
+  exp_dir: null
+  name: ${name}
+  create_tensorboard_logger: true
+  create_checkpoint_callback: true
+  checkpoint_callback_params:
+    # in case of multiple validation sets, first one is used
+    monitor: "val_loss"
+    mode: "min"
+    save_top_k: 5
+    always_save_nemo: true # saves the checkpoints as nemo files instead of PTL checkpoints
+
+  resume_from_checkpoint: null # The path to a checkpoint file to continue the training, restores the whole state including the epoch, step, LR schedulers, apex, etc.
+  # you need to set these two to true to continue the training
+  resume_if_exists: false
+  resume_ignore_no_checkpoint: false
+
+  # You may use this section to create a W&B logger
+  create_wandb_logger: false
+  wandb_logger_kwargs:
+    name: null
+    project: null
diff --git a/examples/multimodal/multimodal_llm/neva/eval/mixtral_eval.py b/examples/multimodal/multimodal_llm/neva/eval/mixtral_eval.py
@@ -1,3 +1,17 @@
+# Copyright (c) 2024, NVIDIA CORPORATION.  All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
 """Script to query Mixtral-8x7B as a judge via NGC API for evaluation"""
 import argparse
 import json
@@ -61,7 +75,10 @@ def get_eval(content: str, max_tokens: int):
                 'role': 'system',
                 'content': 'You are a helpful and precise assistant for checking the quality of the answer.',
             },
-            {'role': 'user', 'content': content,},
+            {
+                'role': 'user',
+                'content': content,
+            },
         ],
         "temperature": 0.2,
         "top_p": 0.7,

diff --git a/examples/nlp/dialogue/dialogue.py b/examples/nlp/dialogue/dialogue.py
@@ -63,10 +63,14 @@
 @hydra_runner(config_path="conf", config_name="dialogue_config")
 def main(cfg: DictConfig) -> None:
     pl.seed_everything(42)
+    logging.warning('This script is no longer supported in NeMo and is scheduled for removal in the 23.11 release.')
     logging.info(f'Config: {OmegaConf.to_yaml(cfg)}')
 
     try:
-        strategy = NLPDDPStrategy(no_ddp_communication_hook=True, find_unused_parameters=True,)
+        strategy = NLPDDPStrategy(
+            no_ddp_communication_hook=True,
+            find_unused_parameters=True,
+        )
     except (ImportError, ModuleNotFoundError):
         strategy = 'auto'
 

diff --git a/examples/nlp/rag/rag_generating.py b/examples/nlp/rag/rag_generating.py
@@ -1,3 +1,17 @@
+# Copyright (c) 2024, NVIDIA CORPORATION.  All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
 from llama_index.core import Settings, StorageContext, load_index_from_storage
 
 from nemo.collections.nlp.models.rag.custom_bert_embedder import NeMoBertEmbeddings

diff --git a/examples/nlp/rag/rag_indexing.py b/examples/nlp/rag/rag_indexing.py
@@ -1,3 +1,17 @@
+# Copyright (c) 2024, NVIDIA CORPORATION.  All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
 from llama_index.core import Settings, SimpleDirectoryReader, VectorStoreIndex
 from llama_index.core.node_parser import SentenceSplitter
 

diff --git a/nemo/collections/asr/parts/submodules/adapters/attention_adapter_mixin.py b/nemo/collections/asr/parts/submodules/adapters/attention_adapter_mixin.py
@@ -1,3 +1,17 @@
+# Copyright (c) 2024, NVIDIA CORPORATION.  All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
 import torch
 
 from nemo.core.classes.mixins import adapter_mixins

diff --git a/nemo/collections/audio/data/audio_to_audio_lhotse.py b/nemo/collections/audio/data/audio_to_audio_lhotse.py
@@ -55,6 +55,13 @@ def __getitem__(self, cuts: CutSet) -> dict[str, torch.Tensor]:
         retained_cuts = [
             cut._first_non_padding_cut if isinstance(cut, MixedCut) else cut for cut in retained_padded_cuts
         ]
+
+        # If online augmentation is applied, some retained cuts may still be MixedCuts (containing the original speech, noise, and augmentation).
+        # Take the first non-padding cut from each of them, which is expected to be the clean speech signal.
+        for n, cut in enumerate(retained_cuts):
+            if isinstance(cut, MixedCut):
+                retained_cuts[n] = cut._first_non_padding_cut
+        # create cutset
         retained_cuts = CutSet.from_cuts(retained_cuts)
 
         if _key_available(retained_cuts, self.TARGET_KEY):

diff --git a/nemo/collections/common/metrics/perf_metrics.py b/nemo/collections/common/metrics/perf_metrics.py
@@ -1,3 +1,17 @@
+# Copyright (c) 2024, NVIDIA CORPORATION.  All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
 from typing import Any, Dict, List, Optional
 
 import numpy as np

diff --git a/nemo/collections/common/parts/perf_metrics_utils.py b/nemo/collections/common/parts/perf_metrics_utils.py
@@ -1,3 +1,17 @@
+# Copyright (c) 2024, NVIDIA CORPORATION.  All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
 import glob
 import os
 from typing import List

diff --git a/nemo/collections/common/prompts/canary.py b/nemo/collections/common/prompts/canary.py
@@ -1,3 +1,17 @@
+# Copyright (c) 2024, NVIDIA CORPORATION.  All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
 from typing import Any
 
 import torch

diff --git a/nemo/collections/common/prompts/example.py b/nemo/collections/common/prompts/example.py
@@ -1,3 +1,17 @@
+# Copyright (c) 2024, NVIDIA CORPORATION.  All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
 """
 Implemented following the guide at https://www.promptingguide.ai/models/phi-2#phi-2-usage
 """