-
Notifications
You must be signed in to change notification settings - Fork 2.5k
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
Merge branch 'main' into aot/qwen-recipe
- Loading branch information
Showing
28 changed files
with
1,329 additions
and
59 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
119 changes: 119 additions & 0 deletions
119
examples/audio/conf/masking_with_online_augmentation.yaml
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,119 @@ | ||
name: "masking_with_online_augmenatation" | ||
|
||
model: | ||
sample_rate: 16000 | ||
skip_nan_grad: false | ||
num_outputs: 1 | ||
|
||
train_ds: | ||
use_lhotse: true # enable Lhotse data loader | ||
cuts_path: ??? # path to Lhotse cuts manifest with speech signals for augmentation (including custom "target_recording" field with the same signals) | ||
truncate_duration: 4.0 # Number of STFT time frames = 1 + truncate_duration // encoder.hop_length = 256 | ||
truncate_offset_type: random # if the file is longer than truncate_duration, use random offset to select a subsegment | ||
batch_size: 64 # batch size may be increased based on the available memory | ||
shuffle: true | ||
num_workers: 8 | ||
pin_memory: true | ||
rir_enabled: true # enable room impulse response augmentation | ||
rir_path: ??? # path to Lhotse recordings manifest with room impulse response signals | ||
noise_path: ??? # path to Lhotse cuts manifest with noise signals | ||
|
||
validation_ds: | ||
use_lhotse: true # enable Lhotse data loader | ||
cuts_path: ??? # path to Lhotse cuts manifest with noisy speech signals (including custom "target_recording" field with the clean signals) | ||
batch_size: 64 # batch size may be increased based on the available memory | ||
shuffle: false | ||
num_workers: 4 | ||
pin_memory: true | ||
|
||
test_ds: | ||
use_lhotse: true # enable Lhotse data loader | ||
cuts_path: ??? # path to Lhotse cuts manifest with noisy speech signals (including custom "target_recording" field with the clean signals) | ||
batch_size: 1 # batch size may be increased based on the available memory | ||
shuffle: false | ||
num_workers: 4 | ||
pin_memory: true | ||
|
||
encoder: | ||
_target_: nemo.collections.audio.modules.transforms.AudioToSpectrogram | ||
fft_length: 512 # Length of the window and FFT for calculating spectrogram | ||
hop_length: 256 # Hop length for calculating spectrogram | ||
|
||
decoder: | ||
_target_: nemo.collections.audio.modules.transforms.SpectrogramToAudio | ||
fft_length: 512 # Length of the window and FFT for calculating spectrogram | ||
hop_length: 256 # Hop length for calculating spectrogram | ||
|
||
mask_estimator: | ||
_target_: nemo.collections.audio.modules.masking.MaskEstimatorRNN | ||
num_outputs: ${model.num_outputs} | ||
num_subbands: 257 # Number of subbands of the input spectrogram | ||
num_features: 256 # Number of features at RNN input | ||
num_layers: 5 # Number of RNN layers | ||
bidirectional: true # Use bi-directional RNN | ||
|
||
mask_processor: | ||
_target_: nemo.collections.audio.modules.masking.MaskReferenceChannel # Apply mask on the reference channel | ||
ref_channel: 0 # Reference channel for the output | ||
|
||
loss: | ||
_target_: nemo.collections.audio.losses.SDRLoss | ||
scale_invariant: true # Use scale-invariant SDR | ||
|
||
metrics: | ||
val: | ||
sdr: # output SDR | ||
_target_: torchmetrics.audio.SignalDistortionRatio | ||
test: | ||
sdr_ch0: # SDR on output channel 0 | ||
_target_: torchmetrics.audio.SignalDistortionRatio | ||
channel: 0 | ||
|
||
optim: | ||
name: adamw | ||
lr: 1e-4 | ||
# optimizer arguments | ||
betas: [0.9, 0.98] | ||
weight_decay: 1e-3 | ||
|
||
trainer: | ||
devices: -1 # number of GPUs, -1 would use all available GPUs | ||
num_nodes: 1 | ||
max_epochs: -1 | ||
max_steps: -1 # computed at runtime if not set | ||
val_check_interval: 1.0 # Set to 0.25 to check 4 times per epoch, or an int for number of iterations | ||
accelerator: auto | ||
strategy: ddp | ||
accumulate_grad_batches: 1 | ||
gradient_clip_val: null | ||
precision: 32 # Should be set to 16 for O1 and O2 to enable the AMP. | ||
log_every_n_steps: 25 # Interval of logging. | ||
enable_progress_bar: true | ||
num_sanity_val_steps: 0 # number of steps to perform validation steps for sanity check the validation process before starting the training, setting to 0 disables it | ||
check_val_every_n_epoch: 1 # number of evaluations on validation every n epochs | ||
sync_batchnorm: true | ||
enable_checkpointing: False # Provided by exp_manager | ||
logger: false # Provided by exp_manager | ||
|
||
exp_manager: | ||
exp_dir: null | ||
name: ${name} | ||
create_tensorboard_logger: true | ||
create_checkpoint_callback: true | ||
checkpoint_callback_params: | ||
# in case of multiple validation sets, first one is used | ||
monitor: "val_loss" | ||
mode: "min" | ||
save_top_k: 5 | ||
always_save_nemo: true # saves the checkpoints as nemo files instead of PTL checkpoints | ||
|
||
resume_from_checkpoint: null # The path to a checkpoint file to continue the training, restores the whole state including the epoch, step, LR schedulers, apex, etc. | ||
# you need to set these two to true to continue the training | ||
resume_if_exists: false | ||
resume_ignore_no_checkpoint: false | ||
|
||
# You may use this section to create a W&B logger | ||
create_wandb_logger: false | ||
wandb_logger_kwargs: | ||
name: null | ||
project: null |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Oops, something went wrong.