Merge branch 'r2.0.0rc0' into dpykhtar/deprecate_non_mcore

NVIDIA · May 25, 2024 · b2e6b88 · b2e6b88
2 parents 8700c6d + 0411b7c
commit b2e6b88
Show file tree

Hide file tree

Showing 6 changed files with 41 additions and 15 deletions.
diff --git a/nemo/collections/common/data/lhotse/dataloader.py b/nemo/collections/common/data/lhotse/dataloader.py
@@ -95,7 +95,9 @@ class LhotseDataLoadingConfig:
 
     # 4. Optional Lhotse data augmentation.
     #   a. On-the-fly noise/audio mixing.
-    noise_path: Any | None = None  # str | dict where dict can have any of keys: manifest_filepath, tarred_audio_filepaths, cuts_path, shar_path
+    noise_path: Any | None = (
+        None  # str | dict where dict can have any of keys: manifest_filepath, tarred_audio_filepaths, cuts_path, shar_path
+    )
     noise_snr: tuple[float, float] = (10.0, 20.0)
     noise_mix_prob: float = 0.5
     #   b. On-the-fly 3-way speed perturbation.
@@ -114,7 +116,9 @@ class LhotseDataLoadingConfig:
     cut_into_windows_duration: Optional[float] = None  # set this to enable
     cut_into_windows_hop: Optional[float] = None
     #       III) common options
-    keep_excessive_supervisions: bool = True  # when a cut is truncated in the middle of a supervision, should we keep them.
+    keep_excessive_supervisions: bool = (
+        True  # when a cut is truncated in the middle of a supervision, should we keep them.
+    )
     #   e. RIR augmentation (synthetic RIR if rir_path is None)
     #   at the moment supports only Lhotse recording manifests, e.g. https://github.com/lhotse-speech/lhotse/blob/master/lhotse/recipes/rir_noise.py
     rir_enabled: bool = False
@@ -130,7 +134,11 @@ class LhotseDataLoadingConfig:
 
 
 def get_lhotse_dataloader_from_config(
-    config: DictConfig, global_rank: int, world_size: int, dataset: torch.utils.data.Dataset, tokenizer=None,
+    config: DictConfig,
+    global_rank: int,
+    world_size: int,
+    dataset: torch.utils.data.Dataset,
+    tokenizer=None,
 ) -> torch.utils.data.DataLoader:
     """
     Set up a Lhotse training dataloder.
@@ -205,7 +213,11 @@ def get_lhotse_dataloader_from_config(
     #    and applying it here (before sampler/dataset) ensures optimal
     #    bucket allocation.
     if config.perturb_speed:
-        cuts = CutSet.mux(cuts, cuts.perturb_speed(0.9), cuts.perturb_speed(1.1),)
+        cuts = CutSet.mux(
+            cuts,
+            cuts.perturb_speed(0.9),
+            cuts.perturb_speed(1.1),
+        )
 
     # 2.d: truncation/slicing
     if config.truncate_duration is not None:
@@ -291,7 +303,10 @@ def get_lhotse_dataloader_from_config(
         # object with texts joined by a whitespace so that "regular" dataset classes don't
         # have to add a special support for multi-supervision cuts.
         sampler = sampler.map(
-            CutConcatenate(gap=config.concatenate_gap_seconds, duration_factor=config.concatenate_duration_factor,)
+            CutConcatenate(
+                gap=config.concatenate_gap_seconds,
+                duration_factor=config.concatenate_duration_factor,
+            )
         )
         if config.db_norm is not None:
             sampler = sampler.map(partial(_normalize_loudness, db_norm=config.db_norm))
@@ -326,7 +341,10 @@ def get_lhotse_dataloader_from_config(
         # the meta-data to Dataset, which performs the actual I/O inside its __getitem__ method.
         dloader_kwargs = dict(dataset=dataset, sampler=sampler)
     dloader = torch.utils.data.DataLoader(
-        **dloader_kwargs, batch_size=None, num_workers=config.num_workers, pin_memory=config.pin_memory,
+        **dloader_kwargs,
+        batch_size=None,
+        num_workers=config.num_workers,
+        pin_memory=config.pin_memory,
     )
 
     return dloader
@@ -377,7 +395,9 @@ class MultimodalSamplingConstraint(SamplingConstraint):
 
     def __post_init__(self):
         self._internal = TokenConstraint(
-            max_tokens=self.batch_tokens, max_examples=self.batch_size, quadratic_length=self.quadratic_factor,
+            max_tokens=self.batch_tokens,
+            max_examples=self.batch_size,
+            quadratic_length=self.quadratic_factor,
         )
 
     def add(self, example: Any) -> None:
@@ -487,7 +507,13 @@ def maybe_set_cuda_expandable_segments(enabled: bool):
             warnings.warn(
                 "You have set PYTORCH_CUDA_ALLOC_CONF without expandable_segments:True option. We're setting that option anyway. To disable it, set cuda_expandable_segments=False in NeMo dataloader configuration."
             )
-        torch.cuda.memory._set_allocator_settings("expandable_segments:True")
+
+        try:
+            torch.cuda.memory._set_allocator_settings("expandable_segments:True")
+        except RuntimeError:
+            logging.info(
+                "Failed to set expandable_segments:True for PyTorch CUDA allocator. You may get training speed improvements if you enable this"
+            )
 
 
 def _select_channel(cut, channel_selector: int | str) -> list:

diff --git a/tutorials/00_NeMo_Primer.ipynb b/tutorials/00_NeMo_Primer.ipynb
@@ -588,7 +588,7 @@
         "id": "U7Eezf_sAVS0"
       },
       "source": [
-        "You might wonder why we didnt explicitly set `citrinet.cfg.optim = cfg.optim`. \n",
+        "You might wonder why we didn't explicitly set `citrinet.cfg.optim = cfg.optim`. \n",
         "\n",
         "This is because the `setup_optimization()` method does it for you! You can still update the config manually."
       ]

diff --git a/tutorials/asr/ASR_Confidence_Estimation.ipynb b/tutorials/asr/ASR_Confidence_Estimation.ipynb
@@ -284,7 +284,7 @@
     "            eps_padded_hyp, labels, padded_labels, fill_confidence_deletions(confidence_scores, labels)\n",
     "        ):\n",
     "            word_len = len(word)\n",
-    "            # shield angle brakets for <eps>\n",
+    "            # shield angle brackets for <eps>\n",
     "            if html and word == \"<eps>\":\n",
     "                word = \"&lt;eps&gt;\"\n",
     "            if current_line_len + word_len + 1 <= terminal_width:\n",
@@ -307,7 +307,7 @@
     "        current_word_line = \"\"\n",
     "        for word, score in zip(transcript_list, confidence_scores):\n",
     "            word_len = len(word)\n",
-    "            # shield angle brakets for <eps>\n",
+    "            # shield angle brackets for <eps>\n",
     "            if html and word == \"<eps>\":\n",
     "                word = \"&lt;eps&gt;\"\n",
     "            if current_line_len + word_len + 1 <= terminal_width:\n",

diff --git a/tutorials/asr/ASR_Context_Biasing.ipynb b/tutorials/asr/ASR_Context_Biasing.ipynb
@@ -361,7 +361,7 @@
    "source": [
     "## Create a context-biasing list\n",
     "\n",
-    "Now, we need to select the words, recognition of wich we want to improve by CTC-WS context-biasing.\n",
+    "Now, we need to select the words, recognition of which we want to improve by CTC-WS context-biasing.\n",
     "Usually, we select only nontrivial words with the lowest recognition accuracy.\n",
     "Such words should have a character length >= 3 because short words in a context-biasing list may produce high false-positive recognition.\n",
     "In this toy example, we will select all the words that look like names with a recognition accuracy less than 1.0.\n",

diff --git a/tutorials/asr/Speech_Commands.ipynb b/tutorials/asr/Speech_Commands.ipynb
@@ -1431,10 +1431,10 @@
                 "# Lets change the scheduler\n",
                 "optim_sched_cfg.sched.name = \"CosineAnnealing\"\n",
                 "\n",
-                "# \"power\" isnt applicable to CosineAnnealing so let's remove it\n",
+                "# \"power\" isn't applicable to CosineAnnealing so let's remove it\n",
                 "optim_sched_cfg.sched.pop('power')\n",
                 "\n",
-                "# \"hold_ratio\" isnt applicable to CosineAnnealing, so let's remove it\n",
+                "# \"hold_ratio\" isn't applicable to CosineAnnealing, so let's remove it\n",
                 "optim_sched_cfg.sched.pop('hold_ratio')\n",
                 "\n",
                 "# Set \"min_lr\" to lower value\n",

diff --git a/tutorials/nlp/Joint_Intent_and_Slot_Classification.ipynb b/tutorials/nlp/Joint_Intent_and_Slot_Classification.ipynb
@@ -749,7 +749,7 @@
             "source": [
                 "### Optimizing Threshold\n",
                 "\n",
-                "As mentioned above, when classifiying a given query such as `show all flights and fares from denver to san francisco`, our model checks whether each individual intent would be suitable. Before assigning the final labels for a query, the model assigns a probability an intent matches the query. For example, if our `dict.intents.csv` had 5 different intents, then the model could output for a given query \\[0.52, 0.38, 0.21, 0.67. 0.80\\] where each value represents the probability that query matches that particular intent. \n",
+                "As mentioned above, when classifying a given query such as `show all flights and fares from denver to san francisco`, our model checks whether each individual intent would be suitable. Before assigning the final labels for a query, the model assigns each intent a probability that it matches the query. For example, if our `dict.intents.csv` had 5 different intents, then the model could output for a given query \\[0.52, 0.38, 0.21, 0.67, 0.80\\] where each value represents the probability that the query matches that particular intent. \n",
                 "\n",
                 "We need to use these probabilities to generate final label predictions of 0 or 1 for each label. While we can use 0.5 as the probability threshold, it is usually the case that there is a better threshold to use depending on the metric we want to optimize. For this tutorial, we will be finding the threshold that gives us the best micro-F1 score on the validation set. After running the `optimize_threshold` method, the threshold attribute for our model will be updated."
             ]