Skip to content

Commit

Permalink
Merge branch 'r2.0.0rc0' into dpykhtar/deprecate_non_mcore
Browse files Browse the repository at this point in the history
  • Loading branch information
dimapihtar authored May 25, 2024
2 parents 8700c6d + 0411b7c commit b2e6b88
Show file tree
Hide file tree
Showing 6 changed files with 41 additions and 15 deletions.
42 changes: 34 additions & 8 deletions nemo/collections/common/data/lhotse/dataloader.py
Original file line number Diff line number Diff line change
Expand Up @@ -95,7 +95,9 @@ class LhotseDataLoadingConfig:

# 4. Optional Lhotse data augmentation.
# a. On-the-fly noise/audio mixing.
noise_path: Any | None = None # str | dict where dict can have any of keys: manifest_filepath, tarred_audio_filepaths, cuts_path, shar_path
noise_path: Any | None = (
None # str | dict where dict can have any of keys: manifest_filepath, tarred_audio_filepaths, cuts_path, shar_path
)
noise_snr: tuple[float, float] = (10.0, 20.0)
noise_mix_prob: float = 0.5
# b. On-the-fly 3-way speed perturbation.
Expand All @@ -114,7 +116,9 @@ class LhotseDataLoadingConfig:
cut_into_windows_duration: Optional[float] = None # set this to enable
cut_into_windows_hop: Optional[float] = None
# III) common options
keep_excessive_supervisions: bool = True # when a cut is truncated in the middle of a supervision, should we keep them.
keep_excessive_supervisions: bool = (
True # when a cut is truncated in the middle of a supervision, should we keep them.
)
# e. RIR augmentation (synthetic RIR if rir_path is None)
# at the moment supports only Lhotse recording manifests, e.g. https://github.com/lhotse-speech/lhotse/blob/master/lhotse/recipes/rir_noise.py
rir_enabled: bool = False
Expand All @@ -130,7 +134,11 @@ class LhotseDataLoadingConfig:


def get_lhotse_dataloader_from_config(
config: DictConfig, global_rank: int, world_size: int, dataset: torch.utils.data.Dataset, tokenizer=None,
config: DictConfig,
global_rank: int,
world_size: int,
dataset: torch.utils.data.Dataset,
tokenizer=None,
) -> torch.utils.data.DataLoader:
"""
Set up a Lhotse training dataloader.
Expand Down Expand Up @@ -205,7 +213,11 @@ def get_lhotse_dataloader_from_config(
# and applying it here (before sampler/dataset) ensures optimal
# bucket allocation.
if config.perturb_speed:
cuts = CutSet.mux(cuts, cuts.perturb_speed(0.9), cuts.perturb_speed(1.1),)
cuts = CutSet.mux(
cuts,
cuts.perturb_speed(0.9),
cuts.perturb_speed(1.1),
)

# 2.d: truncation/slicing
if config.truncate_duration is not None:
Expand Down Expand Up @@ -291,7 +303,10 @@ def get_lhotse_dataloader_from_config(
# object with texts joined by a whitespace so that "regular" dataset classes don't
# have to add a special support for multi-supervision cuts.
sampler = sampler.map(
CutConcatenate(gap=config.concatenate_gap_seconds, duration_factor=config.concatenate_duration_factor,)
CutConcatenate(
gap=config.concatenate_gap_seconds,
duration_factor=config.concatenate_duration_factor,
)
)
if config.db_norm is not None:
sampler = sampler.map(partial(_normalize_loudness, db_norm=config.db_norm))
Expand Down Expand Up @@ -326,7 +341,10 @@ def get_lhotse_dataloader_from_config(
# the meta-data to Dataset, which performs the actual I/O inside its __getitem__ method.
dloader_kwargs = dict(dataset=dataset, sampler=sampler)
dloader = torch.utils.data.DataLoader(
**dloader_kwargs, batch_size=None, num_workers=config.num_workers, pin_memory=config.pin_memory,
**dloader_kwargs,
batch_size=None,
num_workers=config.num_workers,
pin_memory=config.pin_memory,
)

return dloader
Expand Down Expand Up @@ -377,7 +395,9 @@ class MultimodalSamplingConstraint(SamplingConstraint):

def __post_init__(self):
self._internal = TokenConstraint(
max_tokens=self.batch_tokens, max_examples=self.batch_size, quadratic_length=self.quadratic_factor,
max_tokens=self.batch_tokens,
max_examples=self.batch_size,
quadratic_length=self.quadratic_factor,
)

def add(self, example: Any) -> None:
Expand Down Expand Up @@ -487,7 +507,13 @@ def maybe_set_cuda_expandable_segments(enabled: bool):
warnings.warn(
"You have set PYTORCH_CUDA_ALLOC_CONF without expandable_segments:True option. We're setting that option anyway. To disable it, set cuda_expandable_segments=False in NeMo dataloader configuration."
)
torch.cuda.memory._set_allocator_settings("expandable_segments:True")

try:
torch.cuda.memory._set_allocator_settings("expandable_segments:True")
except RuntimeError:
logging.info(
"Failed to set expandable_segments:True for PyTorch CUDA allocator. You may get training speed improvements if you enable this"
)


def _select_channel(cut, channel_selector: int | str) -> list:
Expand Down
2 changes: 1 addition & 1 deletion tutorials/00_NeMo_Primer.ipynb
Original file line number Diff line number Diff line change
Expand Up @@ -588,7 +588,7 @@
"id": "U7Eezf_sAVS0"
},
"source": [
"You might wonder why we didnt explicitly set `citrinet.cfg.optim = cfg.optim`. \n",
"You might wonder why we didn't explicitly set `citrinet.cfg.optim = cfg.optim`. \n",
"\n",
"This is because the `setup_optimization()` method does it for you! You can still update the config manually."
]
Expand Down
4 changes: 2 additions & 2 deletions tutorials/asr/ASR_Confidence_Estimation.ipynb
Original file line number Diff line number Diff line change
Expand Up @@ -284,7 +284,7 @@
" eps_padded_hyp, labels, padded_labels, fill_confidence_deletions(confidence_scores, labels)\n",
" ):\n",
" word_len = len(word)\n",
" # shield angle brakets for <eps>\n",
" # shield angle brackets for <eps>\n",
" if html and word == \"<eps>\":\n",
" word = \"&lt;eps&gt;\"\n",
" if current_line_len + word_len + 1 <= terminal_width:\n",
Expand All @@ -307,7 +307,7 @@
" current_word_line = \"\"\n",
" for word, score in zip(transcript_list, confidence_scores):\n",
" word_len = len(word)\n",
" # shield angle brakets for <eps>\n",
" # shield angle brackets for <eps>\n",
" if html and word == \"<eps>\":\n",
" word = \"&lt;eps&gt;\"\n",
" if current_line_len + word_len + 1 <= terminal_width:\n",
Expand Down
2 changes: 1 addition & 1 deletion tutorials/asr/ASR_Context_Biasing.ipynb
Original file line number Diff line number Diff line change
Expand Up @@ -361,7 +361,7 @@
"source": [
"## Create a context-biasing list\n",
"\n",
"Now, we need to select the words, recognition of wich we want to improve by CTC-WS context-biasing.\n",
"Now, we need to select the words, recognition of which we want to improve by CTC-WS context-biasing.\n",
"Usually, we select only nontrivial words with the lowest recognition accuracy.\n",
"Such words should have a character length >= 3 because short words in a context-biasing list may produce high false-positive recognition.\n",
"In this toy example, we will select all the words that look like names with a recognition accuracy less than 1.0.\n",
Expand Down
4 changes: 2 additions & 2 deletions tutorials/asr/Speech_Commands.ipynb
Original file line number Diff line number Diff line change
Expand Up @@ -1431,10 +1431,10 @@
"# Let's change the scheduler\n",
"optim_sched_cfg.sched.name = \"CosineAnnealing\"\n",
"\n",
"# \"power\" isnt applicable to CosineAnnealing so let's remove it\n",
"# \"power\" isn't applicable to CosineAnnealing so let's remove it\n",
"optim_sched_cfg.sched.pop('power')\n",
"\n",
"# \"hold_ratio\" isnt applicable to CosineAnnealing, so let's remove it\n",
"# \"hold_ratio\" isn't applicable to CosineAnnealing, so let's remove it\n",
"optim_sched_cfg.sched.pop('hold_ratio')\n",
"\n",
"# Set \"min_lr\" to lower value\n",
Expand Down
2 changes: 1 addition & 1 deletion tutorials/nlp/Joint_Intent_and_Slot_Classification.ipynb
Original file line number Diff line number Diff line change
Expand Up @@ -749,7 +749,7 @@
"source": [
"### Optimizing Threshold\n",
"\n",
"As mentioned above, when classifiying a given query such as `show all flights and fares from denver to san francisco`, our model checks whether each individual intent would be suitable. Before assigning the final labels for a query, the model assigns a probability an intent matches the query. For example, if our `dict.intents.csv` had 5 different intents, then the model could output for a given query \\[0.52, 0.38, 0.21, 0.67. 0.80\\] where each value represents the probability that query matches that particular intent. \n",
"As mentioned above, when classifying a given query such as `show all flights and fares from denver to san francisco`, our model checks whether each individual intent would be suitable. Before assigning the final labels for a query, the model assigns a probability that each intent matches the query. For example, if our `dict.intents.csv` had 5 different intents, then the model could output for a given query \\[0.52, 0.38, 0.21, 0.67, 0.80\\] where each value represents the probability that the query matches that particular intent. \n",
"\n",
"We need to use these probabilities to generate final label predictions of 0 or 1 for each label. While we can use 0.5 as the probability threshold, it is usually the case that there is a better threshold to use depending on the metric we want to optimize. For this tutorial, we will be finding the threshold that gives us the best micro-F1 score on the validation set. After running the `optimize_threshold` method, the threshold attribute for our model will be updated."
]
Expand Down

0 comments on commit b2e6b88

Please sign in to comment.