-
Notifications
You must be signed in to change notification settings - Fork 2.6k
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
Signed-off-by: Ante Jukić <[email protected]>
- Loading branch information
Showing
15 changed files
with
890 additions
and
0 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,318 @@ | ||
NeMo Audio API | ||
============== | ||
|
||
Model Classes | ||
------------- | ||
Base Classes | ||
~~~~~~~~~~~~ | ||
.. autoclass:: nemo.collections.audio.models.AudioToAudioModel | ||
:show-inheritance: | ||
:members: | ||
:exclude-members: setup_training_data, setup_validation_data, training_step, on_validation_epoch_end, validation_step, setup_test_data, on_train_epoch_start | ||
|
||
|
||
Processing Models | ||
~~~~~~~~~~~~~~~~~ | ||
.. autoclass:: nemo.collections.audio.models.EncMaskDecAudioToAudioModel | ||
:show-inheritance: | ||
:members: | ||
:exclude-members: setup_training_data, setup_validation_data, training_step, on_validation_epoch_end, validation_step, setup_test_data, on_train_epoch_start | ||
|
||
.. autoclass:: nemo.collections.audio.models.FlowMatchingAudioToAudioModel | ||
:show-inheritance: | ||
:members: | ||
:exclude-members: setup_training_data, setup_validation_data, training_step, on_validation_epoch_end, validation_step, setup_test_data, on_train_epoch_start | ||
|
||
.. autoclass:: nemo.collections.audio.models.PredictiveAudioToAudioModel | ||
:show-inheritance: | ||
:members: | ||
:exclude-members: setup_training_data, setup_validation_data, training_step, on_validation_epoch_end, validation_step, setup_test_data, on_train_epoch_start | ||
|
||
.. autoclass:: nemo.collections.audio.models.ScoreBasedGenerativeAudioToAudioModel | ||
:show-inheritance: | ||
:members: | ||
:exclude-members: setup_training_data, setup_validation_data, training_step, on_validation_epoch_end, validation_step, setup_test_data, on_train_epoch_start | ||
|
||
.. autoclass:: nemo.collections.audio.models.SchroedingerBridgeAudioToAudioModel | ||
:show-inheritance: | ||
:members: | ||
:exclude-members: setup_training_data, setup_validation_data, training_step, on_validation_epoch_end, validation_step, setup_test_data, on_train_epoch_start | ||
|
||
|
||
Modules | ||
------- | ||
|
||
Features | ||
~~~~~~~~ | ||
.. autoclass:: nemo.collections.audio.modules.features.SpectrogramToMultichannelFeatures | ||
:show-inheritance: | ||
:members: | ||
|
||
|
||
Masking | ||
~~~~~~~ | ||
.. autoclass:: nemo.collections.audio.modules.masking.MaskEstimatorRNN | ||
:show-inheritance: | ||
:members: | ||
|
||
.. autoclass:: nemo.collections.audio.modules.masking.MaskEstimatorFlexChannels | ||
:show-inheritance: | ||
:members: | ||
|
||
.. autoclass:: nemo.collections.audio.modules.masking.MaskEstimatorGSS | ||
:show-inheritance: | ||
:members: | ||
|
||
.. autoclass:: nemo.collections.audio.modules.masking.MaskReferenceChannel | ||
:show-inheritance: | ||
:members: | ||
|
||
.. autoclass:: nemo.collections.audio.modules.masking.MaskBasedBeamformer | ||
:show-inheritance: | ||
:members: | ||
|
||
.. autoclass:: nemo.collections.audio.modules.masking.MaskBasedDereverbWPE | ||
:show-inheritance: | ||
:members: | ||
|
||
|
||
Projections | ||
~~~~~~~~~~~ | ||
|
||
.. autoclass:: nemo.collections.audio.modules.projections.MixtureConsistencyProjection | ||
:show-inheritance: | ||
:members: | ||
|
||
|
||
SSL Pretraining | ||
~~~~~~~~~~~~~~~ | ||
|
||
.. autoclass:: nemo.collections.audio.modules.ssl_pretrain_masking.SSLPretrainWithMaskedPatch | ||
:show-inheritance: | ||
:members: | ||
|
||
|
||
Transforms | ||
~~~~~~~~~~ | ||
|
||
.. autoclass:: nemo.collections.audio.modules.transforms.AudioToSpectrogram | ||
:show-inheritance: | ||
:members: | ||
|
||
.. autoclass:: nemo.collections.audio.modules.transforms.SpectrogramToAudio | ||
:show-inheritance: | ||
:members: | ||
|
||
|
||
Parts | ||
----- | ||
|
||
Submodules: Diffusion | ||
~~~~~~~~~~~~~~~~~~~~~ | ||
.. autoclass:: nemo.collections.audio.parts.submodules.diffusion.StochasticDifferentialEquation | ||
:show-inheritance: | ||
:members: | ||
|
||
.. autoclass:: nemo.collections.audio.parts.submodules.diffusion.OrnsteinUhlenbeckVarianceExplodingSDE | ||
:show-inheritance: | ||
:members: | ||
|
||
.. autoclass:: nemo.collections.audio.parts.submodules.diffusion.ReverseStochasticDifferentialEquation | ||
:show-inheritance: | ||
:members: | ||
|
||
.. autoclass:: nemo.collections.audio.parts.submodules.diffusion.PredictorCorrectorSampler | ||
:show-inheritance: | ||
:members: | ||
|
||
.. autoclass:: nemo.collections.audio.parts.submodules.diffusion.Predictor | ||
:show-inheritance: | ||
:members: | ||
|
||
.. autoclass:: nemo.collections.audio.parts.submodules.diffusion.ReverseDiffusionPredictor | ||
:show-inheritance: | ||
:members: | ||
|
||
.. autoclass:: nemo.collections.audio.parts.submodules.diffusion.Corrector | ||
:show-inheritance: | ||
:members: | ||
|
||
.. autoclass:: nemo.collections.audio.parts.submodules.diffusion.AnnealedLangevinDynamics | ||
:show-inheritance: | ||
:members: | ||
|
||
|
||
Submodules: Flow | ||
~~~~~~~~~~~~~~~~ | ||
.. autoclass:: nemo.collections.audio.parts.submodules.flow.ConditionalFlow | ||
:show-inheritance: | ||
:members: | ||
|
||
.. autoclass:: nemo.collections.audio.parts.submodules.flow.OptimalTransportFlow | ||
:show-inheritance: | ||
:members: | ||
|
||
.. autoclass:: nemo.collections.audio.parts.submodules.flow.ConditionalFlowMatchingSampler | ||
:show-inheritance: | ||
:members: | ||
|
||
.. autoclass:: nemo.collections.audio.parts.submodules.flow.ConditionalFlowMatchingEulerSampler | ||
:show-inheritance: | ||
:members: | ||
|
||
Submodules: Multichannel | ||
~~~~~~~~~~~~~~~~~~~~~~~~ | ||
|
||
.. autoclass:: nemo.collections.audio.parts.submodules.multichannel.ChannelAugment | ||
:show-inheritance: | ||
:members: | ||
|
||
.. autoclass:: nemo.collections.audio.parts.submodules.multichannel.TransformAverageConcatenate | ||
:show-inheritance: | ||
:members: | ||
|
||
.. autoclass:: nemo.collections.audio.parts.submodules.multichannel.TransformAttendConcatenate | ||
:show-inheritance: | ||
:members: | ||
|
||
.. autoclass:: nemo.collections.audio.parts.submodules.multichannel.ChannelAveragePool | ||
:show-inheritance: | ||
:members: | ||
|
||
.. autoclass:: nemo.collections.audio.parts.submodules.multichannel.ChannelAttentionPool | ||
:show-inheritance: | ||
:members: | ||
|
||
.. autoclass:: nemo.collections.audio.parts.submodules.multichannel.ParametricMultichannelWienerFilter | ||
:show-inheritance: | ||
:members: | ||
|
||
.. autoclass:: nemo.collections.audio.parts.submodules.multichannel.ReferenceChannelEstimatorSNR | ||
:show-inheritance: | ||
:members: | ||
|
||
.. autoclass:: nemo.collections.audio.parts.submodules.multichannel.WPEFilter | ||
:show-inheritance: | ||
:members: | ||
|
||
|
||
Submodules: NCSN++ | ||
~~~~~~~~~~~~~~~~~~ | ||
|
||
.. autoclass:: nemo.collections.audio.parts.submodules.ncsnpp.SpectrogramNoiseConditionalScoreNetworkPlusPlus | ||
:show-inheritance: | ||
:members: | ||
|
||
.. autoclass:: nemo.collections.audio.parts.submodules.ncsnpp.NoiseConditionalScoreNetworkPlusPlus | ||
:show-inheritance: | ||
:members: | ||
|
||
.. autoclass:: nemo.collections.audio.parts.submodules.ncsnpp.GaussianFourierProjection | ||
:show-inheritance: | ||
:members: | ||
|
||
.. autoclass:: nemo.collections.audio.parts.submodules.ncsnpp.ResnetBlockBigGANPlusPlus | ||
:show-inheritance: | ||
:members: | ||
|
||
|
||
Submodules: Schrödinger Bridge | ||
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ | ||
|
||
.. autoclass:: nemo.collections.audio.parts.submodules.schroedinger_bridge.SBNoiseSchedule | ||
:show-inheritance: | ||
:members: | ||
|
||
.. autoclass:: nemo.collections.audio.parts.submodules.schroedinger_bridge.SBNoiseScheduleVE | ||
:show-inheritance: | ||
:members: | ||
|
||
.. autoclass:: nemo.collections.audio.parts.submodules.schroedinger_bridge.SBNoiseScheduleVP | ||
:show-inheritance: | ||
:members: | ||
|
||
.. autoclass:: nemo.collections.audio.parts.submodules.schroedinger_bridge.SBSampler | ||
:show-inheritance: | ||
:members: | ||
|
||
|
||
Submodules: TransformerUNet | ||
~~~~~~~~~~~~~~~~~~~~~~~~~~~ | ||
|
||
.. autoclass:: nemo.collections.audio.parts.submodules.transformerunet.LearnedSinusoidalPosEmb | ||
:show-inheritance: | ||
:members: | ||
|
||
.. autoclass:: nemo.collections.audio.parts.submodules.transformerunet.ConvPositionEmbed | ||
:show-inheritance: | ||
:members: | ||
|
||
.. autoclass:: nemo.collections.audio.parts.submodules.transformerunet.RMSNorm | ||
:show-inheritance: | ||
:members: | ||
|
||
.. autoclass:: nemo.collections.audio.parts.submodules.transformerunet.AdaptiveRMSNorm | ||
:show-inheritance: | ||
:members: | ||
|
||
.. autoclass:: nemo.collections.audio.parts.submodules.transformerunet.GEGLU | ||
:show-inheritance: | ||
:members: | ||
|
||
.. autoclass:: nemo.collections.audio.parts.submodules.transformerunet.TransformerUNet | ||
:show-inheritance: | ||
:members: | ||
|
||
.. autoclass:: nemo.collections.audio.parts.submodules.transformerunet.SpectrogramTransformerUNet | ||
:show-inheritance: | ||
:members: | ||
|
||
|
||
Losses | ||
------ | ||
|
||
.. autoclass:: nemo.collections.audio.losses.MAELoss | ||
:show-inheritance: | ||
:members: | ||
|
||
.. autoclass:: nemo.collections.audio.losses.MSELoss | ||
:show-inheritance: | ||
:members: | ||
|
||
.. autoclass:: nemo.collections.audio.losses.SDRLoss | ||
:show-inheritance: | ||
:members: | ||
|
||
|
||
Datasets | ||
-------- | ||
|
||
NeMo Format | ||
~~~~~~~~~~~ | ||
|
||
.. autoclass:: nemo.collections.audio.data.audio_to_audio.BaseAudioDataset | ||
:show-inheritance: | ||
:members: | ||
|
||
.. autoclass:: nemo.collections.audio.data.audio_to_audio.AudioToTargetDataset | ||
:show-inheritance: | ||
:members: | ||
|
||
.. autoclass:: nemo.collections.audio.data.audio_to_audio.AudioToTargetWithReferenceDataset | ||
:show-inheritance: | ||
:members: | ||
|
||
.. autoclass:: nemo.collections.audio.data.audio_to_audio.AudioToTargetWithEmbeddingDataset | ||
:show-inheritance: | ||
:members: | ||
|
||
|
||
Lhotse Format | ||
~~~~~~~~~~~~~ | ||
|
||
.. autoclass:: nemo.collections.audio.data.audio_to_audio_lhotse.LhotseAudioToTargetDataset | ||
:show-inheritance: | ||
:members: | ||
|
||
|
||
|
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,64 @@ | ||
@inproceedings{jukic2023flexible, | ||
title={Flexible multichannel speech enhancement for noise-robust frontend}, | ||
author={Jukić, Ante and Balam, Jagadeesh and Ginsburg, Boris}, | ||
booktitle={Proc. WASPAA}, | ||
year={2023} | ||
} | ||
|
||
@inproceedings{ito2016directional, | ||
title={Complex angular central Gaussian mixture model for directional statistics in mask-based microphone array signal processing}, | ||
author={Ito, Nobutaka and Araki, Shoko and Nakatani, Tomohiro}, | ||
booktitle={Proc. EUSIPCO}, | ||
year={2016} | ||
} | ||
|
||
@inproceedings{jukic2024sb, | ||
title={Schrödinger Bridge for Generative Speech Enhancement}, | ||
author={Ante Juki\'{c} and Roman Korostik and Jagadeesh Balam and Boris Ginsburg}, | ||
year={2024}, | ||
pages={1175-1179}, | ||
booktitle={Proc. Interspeech} | ||
} | ||
|
||
@inproceedings{welker2022speech, | ||
author={Simon Welker and Julius Richter and Timo Gerkmann}, | ||
title={Speech Enhancement with Score-Based Generative Models in the Complex {STFT} Domain}, | ||
year={2022}, | ||
pages={2928-2932}, | ||
booktitle={Proc. Interspeech} | ||
} | ||
|
||
@article{richter2023sgmse, | ||
author = {Richter, Julius and Welker, Simon and Lemercier, Jean-Marie and Lay, Bunlong and Gerkmann, Timo}, | ||
title = {{Speech Enhancement and Dereverberation with Diffusion-Based Generative Models}}, | ||
journal = {IEEE/ACM Trans. on Audio, Speech, and Language Process.}, | ||
volume = {31}, | ||
pages = {2351-2364}, | ||
year = {2023} | ||
} | ||
|
||
@article{ku2024generative, | ||
title={Generative Speech Foundation Model Pretraining for High-Quality Speech Extraction and Restoration}, | ||
author={Pin-Jui Ku and Alexander H. Liu and Roman Korostik and Sung-Feng Huang and Szu-Wei Fu and Ante Jukić}, | ||
journal={arXiv preprint arXiv:2409.16117}, | ||
year={2024}, | ||
} | ||
|
||
@article{souden2010, | ||
author={Souden, Mehrez and Benesty, Jacob and Affes, Sofiène}, | ||
journal={IEEE Transactions on Audio, Speech, and Language Processing}, | ||
title={On Optimal Frequency-Domain Multichannel Linear Filtering for Noise Reduction}, | ||
year={2010}, | ||
volume={18}, | ||
number={2}, | ||
pages={260-276} | ||
} | ||
|
||
@inproceedings{ | ||
lipman2023flow, | ||
title={Flow Matching for Generative Modeling}, | ||
author={Yaron Lipman and Ricky T. Q. Chen and Heli Ben-Hamu and Maximilian Nickel and Matthew Le}, | ||
booktitle={Proc. ICLR}, | ||
year={2023}, | ||
url={https://openreview.net/forum?id=PqvMRDCJT9t} | ||
} |
Oops, something went wrong.