NVIDIA · tango4j · Nov 14, 2024 · Nov 14, 2024 · Nov 14, 2024 · Nov 14, 2024
@@ -816,6 +816,33 @@ jobs:
         +trainer.fast_dev_run=True \
         exp_manager.exp_dir=/tmp/speaker_diarization_results
 
+  L2_Speaker_dev_run_EndtoEnd_Speaker_Diarization_Sortformer:  # runs sortformer_diar_train.py with +trainer.fast_dev_run=True on the tiny an4_diarizer manifests
+    needs: [cicd-test-container-setup]
+    uses: ./.github/workflows/_test_template.yml
+    if: contains(fromJSON(needs.cicd-test-container-setup.outputs.test_to_run), 'L2_Speaker_dev_run_EndtoEnd_Speaker_Diarization_Sortformer') || needs.cicd-test-container-setup.outputs.all == 'true'
+    with:
+      RUNNER: self-hosted-azure-gpus-1
+      SCRIPT: |
+        python examples/speaker_tasks/diarization/neural_diarizer/sortformer_diar_train.py \
+        trainer.devices="[0]" \
+        batch_size=3 \
+        model.train_ds.manifest_filepath=/home/TestData/an4_diarizer/simulated_train/eesd_train_tiny.json \
+        model.validation_ds.manifest_filepath=/home/TestData/an4_diarizer/simulated_valid/eesd_valid_tiny.json \
+        exp_manager.exp_dir=/tmp/speaker_diarization_results \
+        +trainer.fast_dev_run=True 
+
+  L2_Speaker_dev_run_EndtoEnd_Diarizer_Inference:  # runs e2e_diarize_speech.py on the tiny validation manifest with a prebuilt .nemo checkpoint
+    needs: [cicd-test-container-setup]
+    uses: ./.github/workflows/_test_template.yml
+    if: contains(fromJSON(needs.cicd-test-container-setup.outputs.test_to_run), 'L2_Speaker_dev_run_EndtoEnd_Diarizer_Inference') || needs.cicd-test-container-setup.outputs.all == 'true'
+    with:
+      RUNNER: self-hosted-azure
+      SCRIPT: |
+        python examples/speaker_tasks/diarization/neural_diarizer/e2e_diarize_speech.py \
+        model_path=/home/TestData/an4_diarizer/diar_sortformer_4spk-v1-tiny.nemo \
+        dataset_manifest=/home/TestData/an4_diarizer/simulated_valid/eesd_valid_tiny.json \
+        batch_size=1 
+
   L2_Speaker_dev_run_Speech_to_Label:
     needs: [cicd-test-container-setup]
     uses: ./.github/workflows/_test_template.yml
@@ -4517,6 +4544,8 @@ jobs:
       - L2_Speech_to_Text_EMA
       - L2_Speaker_dev_run_Speaker_Recognition
       - L2_Speaker_dev_run_Speaker_Diarization
+      - L2_Speaker_dev_run_EndtoEnd_Speaker_Diarization_Sortformer
+      - L2_Speaker_dev_run_EndtoEnd_Diarizer_Inference
       - L2_Speaker_dev_run_Speech_to_Label
       - L2_Speaker_dev_run_Speaker_Diarization_with_ASR_Inference
       - L2_Speaker_dev_run_Clustering_Diarizer_Inference

diff --git a/nemo/collections/asr/data/audio_to_diar_label.py b/nemo/collections/asr/data/audio_to_diar_label.py
@@ -1065,6 +1065,7 @@ def __init__(
         round_digits: int = 2,
         soft_targets: bool = False,
         subsampling_factor: int = 8,
+        device: str = 'cpu',
     ):
         super().__init__()
         self.collection = EndtoEndDiarizationSpeechLabel(
@@ -1084,6 +1085,7 @@ def __init__(
         self.soft_targets = soft_targets
         self.round_digits = 2
         self.floor_decimal = 10**self.round_digits
+        self.device = device
 
     def __len__(self):
         return len(self.collection)
@@ -1232,11 +1234,13 @@ def __getitem__(self, index):
         audio_signal = audio_signal[: round(self.featurizer.sample_rate * session_len_sec)]
 
         audio_signal_length = torch.tensor(audio_signal.shape[0]).long()
-        audio_signal, audio_signal_length = audio_signal.to('cpu'), audio_signal_length.to('cpu')
-        target_len = self.get_segment_timestamps(duration=session_len_sec, sample_rate=self.featurizer.sample_rate)
+        audio_signal, audio_signal_length = audio_signal.to(self.device), audio_signal_length.to(self.device)
+        target_len = self.get_segment_timestamps(duration=session_len_sec, sample_rate=self.featurizer.sample_rate).to(
+            self.device
+        )
         targets = self.parse_rttm_for_targets_and_lens(
             rttm_file=sample.rttm_file, offset=offset, duration=session_len_sec, target_len=target_len
-        )
+        ).to(self.device)
         return audio_signal, audio_signal_length, targets, target_len
 
 
@@ -1355,6 +1359,7 @@ def __init__(
         window_stride,
         global_rank: int,
         soft_targets: bool,
+        device: str,
     ):
         super().__init__(
             manifest_filepath=manifest_filepath,
@@ -1365,6 +1370,7 @@ def __init__(
             window_stride=window_stride,
             global_rank=global_rank,
             soft_targets=soft_targets,
+            device=device,
         )
 
     def eesd_train_collate_fn(self, batch):

diff --git a/tests/collections/speaker_tasks/test_diar_datasets.py b/tests/collections/speaker_tasks/test_diar_datasets.py
@@ -0,0 +1,110 @@
+# Copyright (c) 2024, NVIDIA CORPORATION.  All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import json
+import os
+import tempfile
+
+import pytest
+import torch.cuda
+
+from nemo.collections.asr.data.audio_to_diar_label import AudioToSpeechE2ESpkDiarDataset
+from nemo.collections.asr.parts.preprocessing.features import WaveformFeaturizer
+from nemo.collections.asr.parts.utils.speaker_utils import get_vad_out_from_rttm_line, read_rttm_lines
+
+
+def is_rttm_length_too_long(rttm_file_path, wav_len_in_sec):
+    """
+    Check whether all segments in an RTTM file end at or before the end of the provided audio file.
+
+    Args:
+        rttm_file_path (str): Path to the RTTM file.
+        wav_len_in_sec (float): Length of the audio file in seconds.
+
+    Returns:
+        bool: True if the latest segment end (start + duration) is less than or equal to the audio length, else False.
+    """
+    rttm_lines = read_rttm_lines(rttm_file_path)
+    max_rttm_sec = 0  # latest segment end (start + duration) seen so far; stays 0 when there are no lines
+    for line in rttm_lines:
+        start, dur = get_vad_out_from_rttm_line(line)
+        max_rttm_sec = max(max_rttm_sec, start + dur)
+    return max_rttm_sec <= wav_len_in_sec  # NOTE: despite the function name, True means the RTTM is NOT too long
+
+
+class TestAudioToSpeechE2ESpkDiarDataset:
+    """Checks the batches AudioToSpeechE2ESpkDiarDataset produces when wrapped in a DataLoader."""
+
+    @pytest.mark.unit
+    def test_e2e_speaker_diar_dataset(self, test_data_dir):
+        """Load up to 8 manifest entries in batches of 4; check batch size, audio length and absence of NaNs."""
+        manifest_path = os.path.abspath(os.path.join(test_data_dir, 'asr/diarizer/lsm_val.json'))
+        batch_size = 4
+        num_samples = 8
+        device = 'cuda' if torch.cuda.is_available() else 'cpu'
+        data_dict_list = []
+        with tempfile.NamedTemporaryFile(mode='w', encoding='utf-8') as f:
+            with open(manifest_path, 'r', encoding='utf-8') as mfile:
+                for ix, line in enumerate(mfile):
+                    if ix >= num_samples:
+                        break
+                    # Point the manifest's "tests/data/" paths at the actual test data directory.
+                    line = line.replace("tests/data/", test_data_dir + "/").replace("\n", "")
+                    f.write(f"{line}\n")
+                    data_dict_list.append(json.loads(line))
+            # The dataset is given the manifest by file name, so the buffered lines must reach the disk first.
+            f.flush()
+            featurizer = WaveformFeaturizer(sample_rate=16000, int_values=False, augmentor=None)
+
+            dataset = AudioToSpeechE2ESpkDiarDataset(
+                manifest_filepath=f.name,
+                soft_label_thres=0.5,
+                session_len_sec=90,
+                num_spks=4,
+                featurizer=featurizer,
+                window_stride=0.01,
+                global_rank=0,
+                soft_targets=False,
+                device=device,
+            )
+            dataloader_instance = torch.utils.data.DataLoader(
+                dataset=dataset,
+                batch_size=batch_size,
+                collate_fn=dataset.eesd_train_collate_fn,
+                drop_last=False,
+                shuffle=False,
+                num_workers=1,  # NOTE(review): confirm CUDA tensors can be created in the worker when device='cuda'
+                pin_memory=False,
+            )
+            assert len(dataloader_instance) == (num_samples / batch_size)  # Check if the number of batches is correct
+            batch_counts = len(dataloader_instance)
+            deviation_thres_rate = 0.01  # 1% deviation allowed
+            for batch_index, batch in enumerate(dataloader_instance):
+                audio_signals, audio_signal_len, targets, target_lens = batch
+                if batch_index != batch_counts - 1:
+                    # `batch` is a 4-tuple, so len(batch) says nothing about batch size; use the leading tensor dim.
+                    assert audio_signals.shape[0] == batch_size, "Batch size does not match the expected value"
+                for sample_index in range(audio_signals.shape[0]):
+                    loaded_num_samples = audio_signal_len[sample_index].item()
+                    sample_count_deviation = abs(
+                        data_dict_list[batch_size * batch_index + sample_index]['duration'] * featurizer.sample_rate
+                        - loaded_num_samples
+                    )
+                    assert (
+                        sample_count_deviation <= deviation_thres_rate * loaded_num_samples
+                    ), "Duration deviation exceeds 1%"
+                assert not torch.isnan(audio_signals).any(), "audio_signals tensor contains NaN values"
+                assert not torch.isnan(audio_signal_len).any(), "audio_signal_len tensor contains NaN values"
+                assert not torch.isnan(targets).any(), "targets tensor contains NaN values"
+                assert not torch.isnan(target_lens).any(), "target_lens tensor contains NaN values"
diff --git a/...collections/asr/test_diar_label_models.py → ...s/speaker_tasks/test_diar_label_models.py b/...collections/asr/test_diar_label_models.py → ...s/speaker_tasks/test_diar_label_models.py
@@ -16,6 +16,7 @@
 import torch
 from omegaconf import DictConfig
 
+from nemo.collections.asr.losses import BCELoss
 from nemo.collections.asr.models import EncDecDiarLabelModel
 
 
@@ -24,7 +25,12 @@ def msdd_model():
 
     preprocessor = {
         'cls': 'nemo.collections.asr.modules.AudioToMelSpectrogramPreprocessor',
-        'params': {"features": 80, "window_size": 0.025, "window_stride": 0.01, "sample_rate": 16000,},
+        'params': {
+            "features": 80,
+            "window_size": 0.025,
+            "window_stride": 0.01,
+            "sample_rate": 16000,
+        },
     }
 
     speaker_model_encoder = {
@@ -165,3 +171,37 @@ def test_forward_infer(self, msdd_model):
         assert diff <= 1e-6
         diff = torch.max(torch.abs(scale_weights_instance - scale_weights_batch))
         assert diff <= 1e-6
+
+
+class TestBCELoss:
+    @pytest.mark.unit
+    @pytest.mark.parametrize(
+        "probs, labels, target_lens, reduction, expected_output",
+        [
+            (  # case 1: every probability is 0.5, so the expected loss is ln(2) ~= 0.693147
+                torch.tensor([[[0.5, 0.5], [0.5, 0.5]]], dtype=torch.float32),
+                torch.tensor([[[1, 0], [0, 1]]], dtype=torch.float32),
+                torch.tensor([2]),
+                "mean",
+                torch.tensor(0.693147, dtype=torch.float32),
+            ),
+            (  # case 2: same expected value as case 1 although the second frame differs
+                torch.tensor([[[0.5, 0.5], [0.0, 1.0]]], dtype=torch.float32),
+                torch.tensor([[[1, 0], [0, 1]]], dtype=torch.float32),
+                torch.tensor([1]),  # presumably limits scoring to the first frame -- confirm against BCELoss
+                "mean",
+                torch.tensor(0.693147, dtype=torch.float32),
+            ),
+            (  # case 3: every probability is the exact opposite of its label
+                torch.tensor([[[0, 1], [1, 0]]], dtype=torch.float32),
+                torch.tensor([[[1, 0], [0, 1]]], dtype=torch.float32),
+                torch.tensor([2]),
+                "mean",
+                torch.tensor(100, dtype=torch.float32),  # presumably the clamped-log ceiling of BCE -- TODO confirm
+            ),
+        ],
+    )
+    def test_loss(self, probs, labels, target_lens, reduction, expected_output):
+        loss = BCELoss(reduction=reduction)  # BCELoss is imported from nemo.collections.asr.losses
+        result = loss(probs=probs, labels=labels, target_lens=target_lens)
+        assert torch.allclose(result, expected_output), f"Expected {expected_output}, but got {result}"