Merge branch 'r2.0.0rc1' into dpykhtar/torch_dist_as_default

NVIDIA · Jul 25, 2024 · 52b4a27
2 parents 4e4901a + be21e95
commit 52b4a27
Show file tree

Hide file tree

Showing 7 changed files with 90 additions and 18 deletions.
diff --git a/.github/workflows/cicd-main.yml b/.github/workflows/cicd-main.yml
@@ -810,7 +810,7 @@ jobs:
     with:
       RUNNER: self-hosted-azure
       SCRIPT: |
-        pytest tests/collections/asr/decoding/rnnt_alignments_check.py --durations=-1
+        pytest tests/collections/asr/decoding/rnnt_alignments_check.py --durations=-1 --with_downloads
 
   # L2: Segmentation Tool
   L2_Segmentation_Tool_Parallel_ctc_segmentation_test_L2_Eng_CitriNet_with_wav:

diff --git a/Dockerfile b/Dockerfile
@@ -14,7 +14,7 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 
-ARG BASE_IMAGE=nvcr.io/nvidia/pytorch:24.01-py3
+ARG BASE_IMAGE=nvcr.io/nvidia/pytorch:24.02-py3
 
 # build an image that includes only the nemo dependencies, ensures that dependencies
 # are included first for optimal caching, and useful for building a development

diff --git a/nemo/collections/nlp/models/language_modeling/megatron_base_model.py b/nemo/collections/nlp/models/language_modeling/megatron_base_model.py
@@ -399,7 +399,9 @@ def _enable_nvidia_optimizations(self):
                 self.cfg.persist_layer_norm = False
 
             # NVFUSER available starting with 21.11
-            if NVIDIA_TORCH_MAJOR >= 21 or (NVIDIA_TORCH_MAJOR == 21 and NVIDIA_TORCH_MINOR >= 11):
+            if (NVIDIA_TORCH_MAJOR >= 21 or (NVIDIA_TORCH_MAJOR == 21 and NVIDIA_TORCH_MINOR >= 11)) and (
+                NVIDIA_TORCH_MAJOR < 23 or (NVIDIA_TORCH_MAJOR == 23 and NVIDIA_TORCH_MINOR < 11)
+            ):
 
                 # NVFUSER
                 torch._C._jit_set_profiling_executor(True)

diff --git a/nemo/lightning/pytorch/callbacks/model_checkpoint.py b/nemo/lightning/pytorch/callbacks/model_checkpoint.py
@@ -380,6 +380,8 @@ def _save_checkpoint(self, trainer: 'pytorch_lightning.Trainer', filepath: str)
         self.set_checkpoint_unfinished_marker(filepath, barrier_after=True)
         ema_callback = self._ema_callback(trainer)
 
+        self._last_global_step_saved = trainer.global_step
+
         if ema_callback is not None:
             if self.async_save:
                 raise ValueError('async_save with EMA not supported')
@@ -422,7 +424,6 @@ def _get_finalize_save_checkpoint_callback(
 
         def _cb():
             logging.debug(f'Finalize callback called for step {global_step}, filepath {filepath}')
-            self._last_global_step_saved = global_step
             self._last_checkpoint_saved = filepath
 
             from nemo.utils.get_rank import is_global_rank_zero

diff --git a/tests/collections/asr/decoding/rnnt_alignments_check.py b/tests/collections/asr/decoding/rnnt_alignments_check.py
@@ -17,29 +17,66 @@
 #       these tests outside of the CI machines environment, where test data is
 #       stored
 
-import os
+from pathlib import Path
+from typing import Union
+
 import pytest
+import torch.cuda
 from examples.asr.transcribe_speech import TranscriptionConfig
 from omegaconf import OmegaConf
 
-from nemo.collections.asr.parts.utils.transcribe_utils import prepare_audio_data, setup_model
+from nemo.collections.asr.models import EncDecRNNTBPEModel
+from nemo.collections.asr.parts.utils.manifest_utils import read_manifest, write_manifest
+from nemo.collections.asr.parts.utils.transcribe_utils import prepare_audio_data
+
+DEVICES = []
+
+if torch.cuda.is_available():
+    DEVICES.append('cuda')
 
-TEST_DATA_PATH = "/home/TestData/an4_dataset/an4_val.json"
-PRETRAINED_MODEL_NAME = "stt_en_conformer_transducer_small"
 
+@pytest.fixture(scope="module")
+def stt_en_conformer_transducer_small_model():
+    model = EncDecRNNTBPEModel.from_pretrained(model_name="stt_en_conformer_transducer_small", map_location="cpu")
+    return model
 
-def get_rnnt_alignments(strategy: str, loop_labels: bool = True, use_cuda_graph_decoder=False, location="cuda"):
-    cfg = OmegaConf.structured(TranscriptionConfig(pretrained_name=PRETRAINED_MODEL_NAME))
+
+@pytest.fixture(scope="module")
+def an4_val_manifest_corrected(tmp_path_factory, test_data_dir):
+    """
+    Correct an4_val manifest audio filepaths, e.g.,
+    "tests/data/asr/test/an4/wav/an440-mjgm-b.wav" -> test_data_dir / "test/an4/wav/an440-mjgm-b.wav"
+    """
+    an4_val_manifest_orig_path = Path(test_data_dir) / "asr/an4_val.json"
+    an4_val_manifest_corrected_path = tmp_path_factory.mktemp("manifests") / "an4_val_corrected.json"
+    an4_val_records = read_manifest(an4_val_manifest_orig_path)
+    for record in an4_val_records:
+        record["audio_filepath"] = record["audio_filepath"].replace(
+            "tests/data/asr", str(an4_val_manifest_orig_path.resolve().parent)
+        )
+    write_manifest(an4_val_manifest_corrected_path, an4_val_records)
+    return an4_val_manifest_corrected_path
+
+
+def get_rnnt_alignments(
+    strategy: str,
+    manifest_path: Union[Path, str],
+    model: EncDecRNNTBPEModel,
+    loop_labels: bool = True,
+    use_cuda_graph_decoder=False,
+    device="cuda",
+):
+    cfg = OmegaConf.structured(TranscriptionConfig())
     cfg.rnnt_decoding.confidence_cfg.preserve_frame_confidence = True
     cfg.rnnt_decoding.preserve_alignments = True
     cfg.rnnt_decoding.strategy = strategy
     if cfg.rnnt_decoding.strategy == "greedy_batch":
         cfg.rnnt_decoding.greedy.loop_labels = loop_labels
         cfg.rnnt_decoding.greedy.use_cuda_graph_decoder = use_cuda_graph_decoder
-    cfg.dataset_manifest = TEST_DATA_PATH
+    cfg.dataset_manifest = str(manifest_path)
     filepaths = prepare_audio_data(cfg)[0][:10]  # selecting 10 files only
 
-    model = setup_model(cfg, map_location=location)[0]
+    model = model.to(device)
     model.change_decoding_strategy(cfg.rnnt_decoding)
 
     transcriptions = model.transcribe(
@@ -72,16 +109,35 @@ def cleanup_local_folder():
 
 
 # TODO: add the same tests for multi-blank RNNT decoding
-@pytest.mark.skipif(not os.path.exists('/home/TestData'), reason='Not a Jenkins machine')
+@pytest.mark.parametrize("device", DEVICES)
 @pytest.mark.parametrize("loop_labels", [True, False])
 @pytest.mark.parametrize("use_cuda_graph_decoder", [True, False])
-def test_rnnt_alignments(loop_labels: bool, use_cuda_graph_decoder: bool):
+@pytest.mark.with_downloads
+def test_rnnt_alignments(
+    loop_labels: bool,
+    use_cuda_graph_decoder: bool,
+    device: str,
+    an4_val_manifest_corrected,
+    stt_en_conformer_transducer_small_model,
+):
+    if use_cuda_graph_decoder and device != "cuda":
+        pytest.skip("CUDA decoder works only with CUDA")
     if not loop_labels and use_cuda_graph_decoder:
         pytest.skip("Frame-Looping algorithm with CUDA graphs does not yet support alignments")
     # using greedy as baseline and comparing all other configurations to it
-    ref_transcriptions = get_rnnt_alignments("greedy")
+    ref_transcriptions = get_rnnt_alignments(
+        "greedy",
+        manifest_path=an4_val_manifest_corrected,
+        model=stt_en_conformer_transducer_small_model,
+        device=device,
+    )
     transcriptions = get_rnnt_alignments(
-        "greedy_batch", loop_labels=loop_labels, use_cuda_graph_decoder=use_cuda_graph_decoder
+        "greedy_batch",
+        loop_labels=loop_labels,
+        use_cuda_graph_decoder=use_cuda_graph_decoder,
+        manifest_path=an4_val_manifest_corrected,
+        model=stt_en_conformer_transducer_small_model,
+        device=device,
     )
     # comparing that label sequence in alignments is exactly the same
     # we can't compare logits as well, because they are expected to be

diff --git a/tests/collections/common/test_metrics.py b/tests/collections/common/test_metrics.py
@@ -28,7 +28,9 @@
 
 
 class TestCommonMetrics:
-    top_k_logits = torch.tensor([[0.1, 0.3, 0.2, 0.0], [0.9, 0.6, 0.2, 0.3], [0.2, 0.1, 0.4, 0.3]],)  # 1  # 0  # 2
+    top_k_logits = torch.tensor(
+        [[0.1, 0.3, 0.2, 0.0], [0.9, 0.6, 0.2, 0.3], [0.2, 0.1, 0.4, 0.3]],
+    )  # 1  # 0  # 2
 
     @pytest.mark.unit
     def test_top_1_accuracy(self):
@@ -130,7 +132,10 @@ def test_top_1_accuracy_distributed_uneven_batch(self):
 class TestPerplexity(PerplexityTester):
     def test_perplexity(self, ddp, dist_sync_on_step, probs, logits):
         self.run_class_perplexity_test(
-            ddp=ddp, probs=probs, logits=logits, dist_sync_on_step=dist_sync_on_step,
+            ddp=ddp,
+            probs=probs,
+            logits=logits,
+            dist_sync_on_step=dist_sync_on_step,
         )
 
 

diff --git a/tutorials/nlp/Joint_Intent_and_Slot_Classification.ipynb b/tutorials/nlp/Joint_Intent_and_Slot_Classification.ipynb
@@ -1,5 +1,13 @@
 {
     "cells": [
+        {
+            "cell_type": "markdown",
+            "metadata": {},
+            "source": [
+                "# Deprecation Warning\n",
+                "This tutorial is deprecated and no longer supported in NeMo. The notebook will be removed in the 24.09 release."
+            ]
+        },
         {
             "cell_type": "code",
             "execution_count": null,