From 5aed2c042310d8e3c03ff288749554bdb3726d0a Mon Sep 17 00:00:00 2001
From: Vladimir Bataev
Date: Wed, 24 Jul 2024 18:30:18 +0400
Subject: [PATCH 1/4] Fix RNNT alignments test (#9770)

* Make alignments tests work on any machine

Signed-off-by: Vladimir Bataev

---------

Signed-off-by: Vladimir Bataev
Signed-off-by: artbataev
Co-authored-by: artbataev
---
 .github/workflows/cicd-main.yml           |  2 +-
 Dockerfile                                |  2 +-
 .../asr/decoding/rnnt_alignments_check.py | 80 ++++++++++++++++---
 tests/collections/common/test_metrics.py  |  9 ++-
 4 files changed, 77 insertions(+), 16 deletions(-)

diff --git a/.github/workflows/cicd-main.yml b/.github/workflows/cicd-main.yml
index bd4d131af1c2..e44378eb10b8 100644
--- a/.github/workflows/cicd-main.yml
+++ b/.github/workflows/cicd-main.yml
@@ -810,7 +810,7 @@ jobs:
     with:
       RUNNER: self-hosted-azure
       SCRIPT: |
-        pytest tests/collections/asr/decoding/rnnt_alignments_check.py --durations=-1
+        pytest tests/collections/asr/decoding/rnnt_alignments_check.py --durations=-1 --with_downloads
 
   # L2: Segmentation Tool
   L2_Segmentation_Tool_Parallel_ctc_segmentation_test_L2_Eng_CitriNet_with_wav:

diff --git a/Dockerfile b/Dockerfile
index a42ae592a9bd..0c3ea70537e1 100644
--- a/Dockerfile
+++ b/Dockerfile
@@ -14,7 +14,7 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 
-ARG BASE_IMAGE=nvcr.io/nvidia/pytorch:24.01-py3
+ARG BASE_IMAGE=nvcr.io/nvidia/pytorch:24.02-py3
 
 # build an image that includes only the nemo dependencies, ensures that dependencies
 # are included first for optimal caching, and useful for building a development

diff --git a/tests/collections/asr/decoding/rnnt_alignments_check.py b/tests/collections/asr/decoding/rnnt_alignments_check.py
index d44f7f8fd985..ec0656cbce49 100644
--- a/tests/collections/asr/decoding/rnnt_alignments_check.py
+++ b/tests/collections/asr/decoding/rnnt_alignments_check.py
@@ -17,29 +17,66 @@
 # these tests outside of the CI machines environment, where test data is
 # stored
 
-import os
+from pathlib import Path
+from typing import Union
+
 import pytest
+import torch.cuda
 from examples.asr.transcribe_speech import TranscriptionConfig
 from omegaconf import OmegaConf
 
-from nemo.collections.asr.parts.utils.transcribe_utils import prepare_audio_data, setup_model
+from nemo.collections.asr.models import EncDecRNNTBPEModel
+from nemo.collections.asr.parts.utils.manifest_utils import read_manifest, write_manifest
+from nemo.collections.asr.parts.utils.transcribe_utils import prepare_audio_data
+
+DEVICES = []
+
+if torch.cuda.is_available():
+    DEVICES.append('cuda')
 
-TEST_DATA_PATH = "/home/TestData/an4_dataset/an4_val.json"
-PRETRAINED_MODEL_NAME = "stt_en_conformer_transducer_small"
 
+@pytest.fixture(scope="module")
+def stt_en_conformer_transducer_small_model():
+    model = EncDecRNNTBPEModel.from_pretrained(model_name="stt_en_conformer_transducer_small", map_location="cpu")
+    return model
 
-def get_rnnt_alignments(strategy: str, loop_labels: bool = True, use_cuda_graph_decoder=False, location="cuda"):
-    cfg = OmegaConf.structured(TranscriptionConfig(pretrained_name=PRETRAINED_MODEL_NAME))
+
+@pytest.fixture(scope="module")
+def an4_val_manifest_corrected(tmp_path_factory, test_data_dir):
+    """
+    Correct an4_val manifest audio filepaths, e.g.,
+    "tests/data/asr/test/an4/wav/an440-mjgm-b.wav" -> test_data_dir / "test/an4/wav/an440-mjgm-b.wav"
+    """
+    an4_val_manifest_orig_path = Path(test_data_dir) / "asr/an4_val.json"
+    an4_val_manifest_corrected_path = tmp_path_factory.mktemp("manifests") / "an4_val_corrected.json"
+    an4_val_records = read_manifest(an4_val_manifest_orig_path)
+    for record in an4_val_records:
+        record["audio_filepath"] = record["audio_filepath"].replace(
+            "tests/data/asr", str(an4_val_manifest_orig_path.resolve().parent)
+        )
+    write_manifest(an4_val_manifest_corrected_path, an4_val_records)
+    return an4_val_manifest_corrected_path
+
+
+def get_rnnt_alignments(
+    strategy: str,
+    manifest_path: Union[Path, str],
+    model: EncDecRNNTBPEModel,
+    loop_labels: bool = True,
+    use_cuda_graph_decoder=False,
+    device="cuda",
+):
+    cfg = OmegaConf.structured(TranscriptionConfig())
     cfg.rnnt_decoding.confidence_cfg.preserve_frame_confidence = True
     cfg.rnnt_decoding.preserve_alignments = True
     cfg.rnnt_decoding.strategy = strategy
     if cfg.rnnt_decoding.strategy == "greedy_batch":
         cfg.rnnt_decoding.greedy.loop_labels = loop_labels
         cfg.rnnt_decoding.greedy.use_cuda_graph_decoder = use_cuda_graph_decoder
-    cfg.dataset_manifest = TEST_DATA_PATH
+    cfg.dataset_manifest = str(manifest_path)
     filepaths = prepare_audio_data(cfg)[0][:10]  # selecting 10 files only
 
-    model = setup_model(cfg, map_location=location)[0]
+    model = model.to(device)
     model.change_decoding_strategy(cfg.rnnt_decoding)
 
     transcriptions = model.transcribe(
@@ -72,16 +109,35 @@ def cleanup_local_folder():
 
 
 # TODO: add the same tests for multi-blank RNNT decoding
-@pytest.mark.skipif(not os.path.exists('/home/TestData'), reason='Not a Jenkins machine')
+@pytest.mark.parametrize("device", DEVICES)
 @pytest.mark.parametrize("loop_labels", [True, False])
 @pytest.mark.parametrize("use_cuda_graph_decoder", [True, False])
-def test_rnnt_alignments(loop_labels: bool, use_cuda_graph_decoder: bool):
+@pytest.mark.with_downloads
+def test_rnnt_alignments(
+    loop_labels: bool,
+    use_cuda_graph_decoder: bool,
+    device: str,
+    an4_val_manifest_corrected,
+    stt_en_conformer_transducer_small_model,
+):
+    if use_cuda_graph_decoder and device != "cuda":
+        pytest.skip("CUDA decoder works only with CUDA")
     if not loop_labels and use_cuda_graph_decoder:
         pytest.skip("Frame-Looping algorithm with CUDA graphs does not yet support alignments")
     # using greedy as baseline and comparing all other configurations to it
-    ref_transcriptions = get_rnnt_alignments("greedy")
+    ref_transcriptions = get_rnnt_alignments(
+        "greedy",
+        manifest_path=an4_val_manifest_corrected,
+        model=stt_en_conformer_transducer_small_model,
+        device=device,
+    )
     transcriptions = get_rnnt_alignments(
-        "greedy_batch", loop_labels=loop_labels, use_cuda_graph_decoder=use_cuda_graph_decoder
+        "greedy_batch",
+        loop_labels=loop_labels,
+        use_cuda_graph_decoder=use_cuda_graph_decoder,
+        manifest_path=an4_val_manifest_corrected,
+        model=stt_en_conformer_transducer_small_model,
+        device=device,
     )
     # comparing that label sequence in alignments is exactly the same
     # we can't compare logits as well, because they are expected to be

diff --git a/tests/collections/common/test_metrics.py b/tests/collections/common/test_metrics.py
index f9005232a017..931a3a2f2497 100644
--- a/tests/collections/common/test_metrics.py
+++ b/tests/collections/common/test_metrics.py
@@ -28,7 +28,9 @@
 
 
 class TestCommonMetrics:
-    top_k_logits = torch.tensor([[0.1, 0.3, 0.2, 0.0], [0.9, 0.6, 0.2, 0.3], [0.2, 0.1, 0.4, 0.3]],)  # 1  # 0  # 2
+    top_k_logits = torch.tensor(
+        [[0.1, 0.3, 0.2, 0.0], [0.9, 0.6, 0.2, 0.3], [0.2, 0.1, 0.4, 0.3]],
+    )  # 1  # 0  # 2
 
     @pytest.mark.unit
     def test_top_1_accuracy(self):
@@ -130,7 +132,10 @@ def test_top_1_accuracy_distributed_uneven_batch(self):
 
 
 class TestPerplexity(PerplexityTester):
     def test_perplexity(self, ddp, dist_sync_on_step, probs, logits):
         self.run_class_perplexity_test(
-            ddp=ddp, probs=probs, logits=logits, dist_sync_on_step=dist_sync_on_step,
+            ddp=ddp,
+            probs=probs,
+            logits=logits,
+            dist_sync_on_step=dist_sync_on_step,
         )

From 148f279469499feb81c327b444a1b6a4f1e50ea0 Mon Sep 17 00:00:00 2001
From: Dmytro Pykhtar <37850217+dimapihtar@users.noreply.github.com>
Date: Wed, 24 Jul 2024 19:49:47 +0300
Subject: [PATCH 2/4] deprecate tutorial (#9864)

Signed-off-by: dimapihtar
---
 tutorials/nlp/Joint_Intent_and_Slot_Classification.ipynb | 8 ++++++++
 1 file changed, 8 insertions(+)

diff --git a/tutorials/nlp/Joint_Intent_and_Slot_Classification.ipynb b/tutorials/nlp/Joint_Intent_and_Slot_Classification.ipynb
index 4f66d5ee7b52..60f6b9c47552 100644
--- a/tutorials/nlp/Joint_Intent_and_Slot_Classification.ipynb
+++ b/tutorials/nlp/Joint_Intent_and_Slot_Classification.ipynb
@@ -1,5 +1,13 @@
 {
  "cells": [
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "# Deprecation Warning\n",
+    "This tutorial is deprecated and no longer supported in NeMo. The notebook will be removed in the 24.09 release."
+   ]
+  },
   {
    "cell_type": "code",
    "execution_count": null,

From d161a0396d833df85e598f074b937edd8edc8dc3 Mon Sep 17 00:00:00 2001
From: Anna Shors <71393111+ashors1@users.noreply.github.com>
Date: Wed, 24 Jul 2024 11:13:24 -0700
Subject: [PATCH 3/4] fix a minor bug with async checkpointing where a
 checkpoint would get saved on_train_batch_end and on_validation_end within
 the same step (#9856)

Signed-off-by: ashors1
---
 nemo/lightning/pytorch/callbacks/model_checkpoint.py | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/nemo/lightning/pytorch/callbacks/model_checkpoint.py b/nemo/lightning/pytorch/callbacks/model_checkpoint.py
index eee3850dfb37..f3e8f7e6b40b 100644
--- a/nemo/lightning/pytorch/callbacks/model_checkpoint.py
+++ b/nemo/lightning/pytorch/callbacks/model_checkpoint.py
@@ -380,6 +380,8 @@ def _save_checkpoint(self, trainer: 'pytorch_lightning.Trainer', filepath: str)
         self.set_checkpoint_unfinished_marker(filepath, barrier_after=True)
         ema_callback = self._ema_callback(trainer)
 
+        self._last_global_step_saved = trainer.global_step
+
         if ema_callback is not None:
             if self.async_save:
                 raise ValueError('async_save with EMA not supported')
@@ -422,7 +424,6 @@ def _get_finalize_save_checkpoint_callback(
 
         def _cb():
             logging.debug(f'Finalize callback called for step {global_step}, filepath {filepath}')
-            self._last_global_step_saved = global_step
             self._last_checkpoint_saved = filepath
 
             from nemo.utils.get_rank import is_global_rank_zero

From be21e95e52c3f08522d23fa1f3d4dd0c001334b0 Mon Sep 17 00:00:00 2001
From: Abhishree Thittenamane <47577437+athitten@users.noreply.github.com>
Date: Wed, 24 Jul 2024 12:20:04 -0700
Subject: [PATCH 4/4] Disable nvFuser setup with PyTorch 23.11 and later
 (#9837)

* Disable nvFuser setup with PyTorch 23.11 and later

Signed-off-by: Abhishree

* Apply isort and black reformatting

Signed-off-by: athitten

---------

Signed-off-by: Abhishree
Signed-off-by: athitten
Co-authored-by: athitten
---
 .../nlp/models/language_modeling/megatron_base_model.py | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

diff --git a/nemo/collections/nlp/models/language_modeling/megatron_base_model.py b/nemo/collections/nlp/models/language_modeling/megatron_base_model.py
index 20d532d4764a..a615dcef5051 100644
--- a/nemo/collections/nlp/models/language_modeling/megatron_base_model.py
+++ b/nemo/collections/nlp/models/language_modeling/megatron_base_model.py
@@ -399,7 +399,9 @@ def _enable_nvidia_optimizations(self):
             self.cfg.persist_layer_norm = False
 
         # NVFUSER available starting with 21.11
-        if NVIDIA_TORCH_MAJOR >= 21 or (NVIDIA_TORCH_MAJOR == 21 and NVIDIA_TORCH_MINOR >= 11):
+        if (NVIDIA_TORCH_MAJOR >= 21 or (NVIDIA_TORCH_MAJOR == 21 and NVIDIA_TORCH_MINOR >= 11)) and (
+            NVIDIA_TORCH_MAJOR < 23 or (NVIDIA_TORCH_MAJOR == 23 and NVIDIA_TORCH_MINOR < 11)
+        ):
             # NVFUSER
             torch._C._jit_set_profiling_executor(True)
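
A note on the version gate in PATCH 4/4: the new condition enables nvFuser only for
NGC PyTorch containers from 21.11 up to, but not including, 23.11. Below is a minimal
standalone sketch of that predicate, not part of the patch itself; it assumes
NVIDIA_TORCH_MAJOR and NVIDIA_TORCH_MINOR are the year/month integers parsed from the
container tag (e.g. the 24.02 base image from PATCH 1/4 gives 24 and 2), and the
helper name nvfuser_supported is hypothetical:

    def nvfuser_supported(major: int, minor: int) -> bool:
        """True only for NGC PyTorch containers in [21.11, 23.11)."""
        # Lower bound, copied from the patched code; as written, the first
        # clause reduces to major >= 21.
        at_least_21_11 = major >= 21 or (major == 21 and minor >= 11)
        # Upper bound introduced by this patch: skip nvFuser setup from 23.11 on.
        below_23_11 = major < 23 or (major == 23 and minor < 11)
        return at_least_21_11 and below_23_11

    assert nvfuser_supported(21, 11)       # first container that shipped nvFuser
    assert nvfuser_supported(22, 12)
    assert not nvfuser_supported(23, 11)   # disabled starting with 23.11
    assert not nvfuser_supported(24, 2)    # matches the new 24.02 base image

Under the 24.02 base image bumped in PATCH 1/4, the gate is therefore false and the
torch._C JIT profiling setup is skipped entirely.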