From 1ada304cf9708dbbfef3e67cc951221608b85712 Mon Sep 17 00:00:00 2001 From: Taejin Park Date: Tue, 4 Jun 2024 16:34:26 -0700 Subject: [PATCH 1/3] Fixed clustering diarizer to load MSDD to GPU by default if CUDA is available Signed-off-by: Taejin Park --- .../diarization/neural_diarizer/multiscale_diar_decoder_infer.py | 1 + 1 file changed, 1 insertion(+) diff --git a/examples/speaker_tasks/diarization/neural_diarizer/multiscale_diar_decoder_infer.py b/examples/speaker_tasks/diarization/neural_diarizer/multiscale_diar_decoder_infer.py index 05d1b3cd1304..a876c156f701 100644 --- a/examples/speaker_tasks/diarization/neural_diarizer/multiscale_diar_decoder_infer.py +++ b/examples/speaker_tasks/diarization/neural_diarizer/multiscale_diar_decoder_infer.py @@ -29,6 +29,7 @@ @hydra_runner(config_path="../conf/inference", config_name="diar_infer_telephonic.yaml") def main(cfg): + import torch; torch.backends.cudnn.enabled =False diarizer_model = NeuralDiarizer(cfg=cfg).to(cfg.device) diarizer_model.diarize() From df63e95d4705040aec90294dc4b3dda95c91cdb2 Mon Sep 17 00:00:00 2001 From: Taejin Park Date: Tue, 4 Jun 2024 16:37:38 -0700 Subject: [PATCH 2/3] Fixed clustering diarizer to load MSDD to GPU by default if CUDA is available Signed-off-by: Taejin Park --- .../neural_diarizer/multiscale_diar_decoder_infer.py | 1 - nemo/collections/asr/models/clustering_diarizer.py | 6 ++++-- 2 files changed, 4 insertions(+), 3 deletions(-) diff --git a/examples/speaker_tasks/diarization/neural_diarizer/multiscale_diar_decoder_infer.py b/examples/speaker_tasks/diarization/neural_diarizer/multiscale_diar_decoder_infer.py index a876c156f701..05d1b3cd1304 100644 --- a/examples/speaker_tasks/diarization/neural_diarizer/multiscale_diar_decoder_infer.py +++ b/examples/speaker_tasks/diarization/neural_diarizer/multiscale_diar_decoder_infer.py @@ -29,7 +29,6 @@ @hydra_runner(config_path="../conf/inference", config_name="diar_infer_telephonic.yaml") def main(cfg): - import torch; torch.backends.cudnn.enabled =False 
diarizer_model = NeuralDiarizer(cfg=cfg).to(cfg.device) diarizer_model.diarize() diff --git a/nemo/collections/asr/models/clustering_diarizer.py b/nemo/collections/asr/models/clustering_diarizer.py index 533f276c0018..f73fd61e61d2 100644 --- a/nemo/collections/asr/models/clustering_diarizer.py +++ b/nemo/collections/asr/models/clustering_diarizer.py @@ -137,7 +137,10 @@ def _init_speaker_model(self, speaker_model=None): Initialize speaker embedding model with model name or path passed through config """ if speaker_model is not None: - self._speaker_model = speaker_model + if self._cfg.device is None and torch.cuda.is_available(): + self._speaker_model = speaker_model.to(torch.device('cuda')) + else: + self._speaker_model = speaker_model else: model_path = self._cfg.diarizer.speaker_embeddings.model_path if model_path is not None and model_path.endswith('.nemo'): @@ -158,7 +161,6 @@ def _init_speaker_model(self, speaker_model=None): self._speaker_model = EncDecSpeakerLabelModel.from_pretrained( model_name=model_path, map_location=self._cfg.device ) - self.multiscale_args_dict = parse_scale_configs( self._diarizer_params.speaker_embeddings.parameters.window_length_in_sec, self._diarizer_params.speaker_embeddings.parameters.shift_length_in_sec, From fad0dfbdf84feafcaafc45c5b8bcc9acd2ccce6b Mon Sep 17 00:00:00 2001 From: tango4j Date: Tue, 4 Jun 2024 23:41:37 +0000 Subject: [PATCH 3/3] Apply isort and black reformatting Signed-off-by: tango4j --- .../asr/models/clustering_diarizer.py | 18 ++++++++++-------- 1 file changed, 10 insertions(+), 8 deletions(-) diff --git a/nemo/collections/asr/models/clustering_diarizer.py b/nemo/collections/asr/models/clustering_diarizer.py index f73fd61e61d2..93913a43c1b5 100644 --- a/nemo/collections/asr/models/clustering_diarizer.py +++ b/nemo/collections/asr/models/clustering_diarizer.py @@ -74,10 +74,10 @@ def get_available_model_names(class_name): class ClusteringDiarizer(torch.nn.Module, Model, DiarizationMixin): """ - Inference 
model Class for offline speaker diarization. - This class handles required functionality for diarization : Speech Activity Detection, Segmentation, - Extract Embeddings, Clustering, Resegmentation and Scoring. - All the parameters are passed through config file + Inference model Class for offline speaker diarization. + This class handles required functionality for diarization : Speech Activity Detection, Segmentation, + Extract Embeddings, Clustering, Resegmentation and Scoring. + All the parameters are passed through config file """ def __init__(self, cfg: Union[DictConfig, Any], speaker_model=None): @@ -173,7 +173,9 @@ def _setup_vad_test_data(self, manifest_vad_input): 'sample_rate': self._cfg.sample_rate, 'batch_size': self._cfg.get('batch_size'), 'vad_stream': True, - 'labels': ['infer',], + 'labels': [ + 'infer', + ], 'window_length_in_sec': self._vad_window_length_in_sec, 'shift_length_in_sec': self._vad_shift_length_in_sec, 'trim_silence': False, @@ -194,8 +196,8 @@ def _setup_spkr_test_data(self, manifest_file): def _run_vad(self, manifest_file): """ - Run voice activity detection. - Get log probability of voice activity detection and smoothes using the post processing parameters. + Run voice activity detection. + Get log probability of voice activity detection and smoothes using the post processing parameters. Using generated frame level predictions generated manifest file for later speaker embedding extraction. input: manifest_file (str) : Manifest file containing path to audio file and label as infer @@ -340,7 +342,7 @@ def _perform_speech_activity_detection(self): def _extract_embeddings(self, manifest_file: str, scale_idx: int, num_scales: int): """ This method extracts speaker embeddings from segments passed through manifest_file - Optionally you may save the intermediate speaker embeddings for debugging or any use. + Optionally you may save the intermediate speaker embeddings for debugging or any use. 
""" logging.info("Extracting embeddings for Diarization") self._setup_spkr_test_data(manifest_file)