Merge branch 'main' of github.com:NVIDIA/NeMo into ashors/ckpt-subdirs

NVIDIA · Oct 25, 2024 · 3271f5c · 3271f5c
2 parents e15cafa + 90d82dc
commit 3271f5c
Show file tree

Hide file tree

Showing 208 changed files with 12,226 additions and 3,058 deletions.
diff --git a/.github/workflows/_test_template.yml b/.github/workflows/_test_template.yml
@@ -51,7 +51,12 @@ jobs:
 
         - name: Start container
           run: |
-            docker run --rm -d --name nemo_container_${{ github.run_id }} --runtime=nvidia --gpus all --shm-size=64g --env TRANSFORMERS_OFFLINE=0 --env HYDRA_FULL_ERROR=1 --volume /mnt/datadrive/TestData:/home/TestData nemoci.azurecr.io/nemo_container_${{ github.run_id }} bash -c "sleep $(( ${{ inputs.TIMEOUT }} * 60 + 60 ))"
+            ARG=("")
+            if [[ "${{ inputs.RUNNER }}" != *cpu* ]]; then
+              ARG=("--runtime=nvidia --gpus all")
+            fi
+
+            docker run --rm -d --name nemo_container_${{ github.run_id }} ${ARG[@]} --shm-size=64g --env TRANSFORMERS_OFFLINE=0 --env HYDRA_FULL_ERROR=1 --volume /mnt/datadrive/TestData:/home/TestData nemoci.azurecr.io/nemo_container_${{ github.run_id }} bash -c "sleep $(( ${{ inputs.TIMEOUT }} * 60 + 60 ))"
 
         - id: main
           name: Run main script

diff --git a/.github/workflows/cherry-pick-release-commit.yml b/.github/workflows/cherry-pick-release-commit.yml
@@ -120,7 +120,7 @@ jobs:
                     "type": "section",
                     "text": {
                       "type": "mrkdwn",
-                      "text": ":alert: Cherrypick bot 🤖: Hey <@'$USERNAME'>: Cherry-pick of <'$URL'|#'$PR_ID'> failed (3-way merge impossible). Please resolve manually and create a PR.\n\ncc: <!subteam^{{ secrets.SLACK_WEBHOOK_ADMIN }}>"
+                      "text": ":alert: Cherrypick bot 🤖: Hey <@'$USERNAME'>: Cherry-pick of <'$URL'|#'$PR_ID'> failed (3-way merge impossible). Please resolve manually and create a PR.\n\ncc: <!subteam^${{ secrets.SLACK_WEBHOOK_ADMIN }}>"
                     }
                   }
                 ]

diff --git a/.github/workflows/cicd-main.yml b/.github/workflows/cicd-main.yml
diff --git a/.github/workflows/monitor-single-vm.yml b/.github/workflows/monitor-single-vm.yml
@@ -0,0 +1,67 @@
+# Reusable workflow: runs a GPU health check on one self-hosted runner VM and,
+# if the check fails or the visible GPU count differs from `n_gpus`, posts a
+# Slack alert and stops the runner service on that VM.
+name: ~shut down a single VM
+
+on:
+  workflow_call:
+    inputs:
+      vm:
+        type: string
+        description: Name of VM
+        required: true
+      n_gpus:
+        type: string
+        description: Number of GPUs this VM has
+        required: true
+
+jobs:
+  check-status-and-maybe-shutdown:
+    environment: main
+    runs-on: ${{ inputs.vm }}
+    outputs:
+      status: ${{ steps.status.outputs.main }}
+    steps:
+      - name: Check status
+        id: status
+        run: |
+          # Smoke test: start a container with the NVIDIA runtime. If this
+          # command fails, the alert step below still runs via `failure()`.
+          docker run --rm --runtime=nvidia --gpus all ubuntu nvidia-smi
+
+          # nvidia-smi prints one line per GPU here, so `wc -l` is the GPU count.
+          NUM_GPUS=$(nvidia-smi --query-gpu=name --format=csv,noheader | wc -l)
+
+          if [[ $NUM_GPUS -ne ${{ inputs.n_gpus }} ]]; then
+            echo "Issues with GPU detected, will take this runner offline."
+            echo "main=degraded" >> "$GITHUB_OUTPUT"
+          else
+            echo "main=healthy" >> "$GITHUB_OUTPUT"
+          fi
+
+      - name: Send Slack message & Disconnect runner from GitHub
+        if: ${{ steps.status.outputs.main == 'degraded' || failure() }}
+        # These secrets are handed to the script through the environment rather
+        # than interpolated into the script text, so shell metacharacters in
+        # their values cannot break or alter the commands below.
+        env:
+          SLACK_WEBHOOK: ${{ secrets.SLACK_WEBHOOK }}
+          VM_KEY: ${{ secrets.VM_KEY }}
+        run: |
+          MESSAGE='{
+            "blocks": [
+              {
+                "type": "section",
+                "text": {
+                  "type": "mrkdwn",
+                  "text": ":alert: VM bot 🤖: Hey <!subteam^${{ secrets.SLACK_WEBHOOK_ADMIN }}>: VM `${{ inputs.vm }}` is having not the best day of their life, maybe bring them an apple or so."
+                }
+              }
+            ]
+          }'
+
+          curl -X POST -H "Content-type: application/json" --data "$MESSAGE" "$SLACK_WEBHOOK"
+
+          cd /home/azureuser/actions-runner
+          # `sudo -S` reads the password from stdin.
+          printf '%s\n' "$VM_KEY" | sudo -S ./svc.sh stop
diff --git a/.github/workflows/monitor-vms.yml b/.github/workflows/monitor-vms.yml
@@ -0,0 +1,61 @@
+# Periodically lists the online GPU runner VMs and hands each one to the
+# single-VM monitor workflow (monitor-single-vm.yml) for a health check.
+name: Reboots VMs in a controlled way
+on:
+  schedule:
+    # Every 15 minutes. Quoted because a leading `*` is YAML alias syntax.
+    - cron: "*/15 * * * *"
+  workflow_dispatch:
+  # NOTE(review): also triggers on every pull request — confirm this is intended.
+  pull_request:
+
+jobs:
+  pre-flight:
+    runs-on: ubuntu-latest
+    outputs:
+      list-of-vms: ${{ steps.main.outputs.main }}
+    environment: main
+    steps:
+      - name: Get list of VMs
+        id: main
+        env:
+          GITHUB_TOKEN: ${{ secrets.PAT }}
+        run: |
+          RUNNERS=$(curl -L \
+            -H "Accept: application/vnd.github+json" \
+            -H "Authorization: Bearer $GITHUB_TOKEN" \
+            -H "X-GitHub-Api-Version: 2022-11-28" \
+            https://api.github.com/repos/NVIDIA/NeMo/actions/runners)
+
+          # Build the job matrix: one {vm, n_gpus} entry per online runner
+          # whose name contains "gpu". n_gpus is the first character of the
+          # first label ending in "gpu", so only single-digit counts work.
+          MATRIX=$(echo "$RUNNERS" \
+            | jq -c '[
+                .runners[]
+                | select(.status == "online")
+                | select(.name | contains("gpu"))
+                | {
+                  "vm": .name,
+                  "n_gpus": [
+                    .labels[]
+                    | select(.name | endswith("gpu")) | .name
+                  ][0][:1]
+                }
+              ]
+            '
+          )
+          echo "main=$MATRIX" | tee -a "$GITHUB_OUTPUT"
+
+  maintenance:
+    needs: pre-flight
+    strategy:
+      fail-fast: false
+      matrix:
+        include: ${{ fromJSON(needs.pre-flight.outputs.list-of-vms) }}
+    # Local reusable workflows must be referenced with a leading `./`.
+    uses: ./.github/workflows/monitor-single-vm.yml
+    with:
+      vm: ${{ matrix.vm }}
+      n_gpus: ${{ matrix.n_gpus }}
+    secrets: inherit
diff --git a/.github/workflows/node-reboot.yml b/.github/workflows/node-reboot.yml
diff --git a/Dockerfile.ci b/Dockerfile.ci
@@ -53,7 +53,7 @@ RUN pip install nemo_run@git+https://github.com/NVIDIA/NeMo-Run.git@${NEMO_RUN_T
 # Install NeMo requirements
 ARG TE_TAG=7d576ed25266a17a7b651f2c12e8498f67e0baea
 ARG MODELOPT_VERSION=0.17.0
-ARG MCORE_TAG=0d89fc4c0d4394f915fffff11212d6957652337f
+ARG MCORE_TAG=425cdd48d5ef5d360d8033288ff7cb0d378f535f
 
 ARG APEX_TAG=810ffae374a2b9cb4b5c5e28eaeca7d7998fca0c
 RUN \

diff --git a/docs/source/asr/intro.rst b/docs/source/asr/intro.rst
@@ -16,10 +16,10 @@ After :ref:`installing NeMo<installation>`, you can transcribe an audio file as
     asr_model = nemo_asr.models.ASRModel.from_pretrained("stt_en_fastconformer_transducer_large")
     transcript = asr_model.transcribe(["path/to/audio_file.wav"])
 
-Obtain word timestamps
-^^^^^^^^^^^^^^^^^^^^^^^^^
+Obtain word/segment timestamps
+^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
 
-You can also obtain timestamps for each word in the transcription as follows:
+You can also obtain timestamps for each word or segment in the transcription as follows:
 
 .. code-block:: python
 
@@ -28,11 +28,14 @@ You can also obtain timestamps for each word in the transcription as follows:
     asr_model = nemo_asr.models.ASRModel.from_pretrained("stt_en_fastconformer_transducer_large")
 
     # update decoding config to preserve alignments and compute timestamps
+    # if necessary, also update the segment separators or word separator used for segment- and word-level timestamps
     from omegaconf import OmegaConf, open_dict
     decoding_cfg = asr_model.cfg.decoding
     with open_dict(decoding_cfg):
         decoding_cfg.preserve_alignments = True
         decoding_cfg.compute_timestamps = True
+        decoding_cfg.segment_seperators = [".", "?", "!"]
+        decoding_cfg.word_seperator = " "
         asr_model.change_decoding_strategy(decoding_cfg)
 
     # specify flag `return_hypotheses=True``
@@ -50,6 +53,7 @@ You can also obtain timestamps for each word in the transcription as follows:
     time_stride = 8 * asr_model.cfg.preprocessor.window_stride
 
     word_timestamps = timestamp_dict['word']
+    segment_timestamps = timestamp_dict['segment']
 
     for stamp in word_timestamps:
         start = stamp['start_offset'] * time_stride
@@ -58,6 +62,13 @@ You can also obtain timestamps for each word in the transcription as follows:
 
         print(f"Time : {start:0.2f} - {end:0.2f} - {word}")
 
+    for stamp in segment_timestamps:
+        start = stamp['start_offset'] * time_stride
+        end = stamp['end_offset'] * time_stride
+        segment = stamp['segment']
+
+        print(f"Time : {start:0.2f} - {end:0.2f} - {segment}")
+
 Transcribe speech via command line
 ----------------------------------
 You can also transcribe speech via the command line using the following `script <https://github.com/NVIDIA/NeMo/blob/main/examples/asr/transcribe_speech.py>`_, for example:

diff --git a/docs/source/asr/ssl/api.rst b/docs/source/asr/ssl/api.rst
@@ -4,6 +4,10 @@ NeMo SSL collection API
 
 Model Classes
 -------------
+.. autoclass:: nemo.collections.asr.models.EncDecDenoiseMaskedTokenPredModel
+    :show-inheritance:
+    :members:
+
 .. autoclass:: nemo.collections.asr.models.SpeechEncDecSelfSupervisedModel
     :show-inheritance:
     :members: 

diff --git a/docs/source/asr/ssl/intro.rst b/docs/source/asr/ssl/intro.rst
@@ -19,6 +19,10 @@ encoder module of neural ASR models. Here too, majority of SSL effort is focused
 While it is common that AM is the focus of SSL in ASR, it can also be utilized in improving other parts of 
 ASR models (e.g., predictor module in transducer based ASR models).
 
+In NeMo, we provide two types of SSL models, `Wav2Vec-BERT <https://arxiv.org/abs/2108.06209>`_ and `NEST <https://arxiv.org/abs/2408.13106>`_. 
+The training scripts for them can be found in the `speech_pretraining examples directory <https://github.com/NVIDIA/NeMo/tree/main/examples/asr/speech_pretraining>`_.
+
+
 The full documentation tree is as follows:
 
 .. toctree::