Fixes numpy >2.0 compatibility for asr while replicating existing behavior #11446

Closed
wants to merge 18 commits
15 changes: 10 additions & 5 deletions nemo/collections/asr/parts/preprocessing/feature_loader.py
@@ -18,12 +18,13 @@


class ExternalFeatureLoader(object):
"""Feature loader that load external features store in certain format.
"""Feature loader that load external features store in certain format.
Currently support pickle, npy and npz format.
"""

def __init__(
self, augmentor: Optional["nemo.collections.asr.parts.perturb.FeatureAugmentor"] = None,
self,
augmentor: Optional["nemo.collections.asr.parts.perturb.FeatureAugmentor"] = None,
):
"""
Feature loader
@@ -50,23 +51,27 @@ def _convert_samples_to_float32(samples: np.ndarray) -> np.ndarray:
Integers will be scaled to [-1, 1] in float32.
"""
float32_samples = samples.astype('float32')
if samples.dtype in np.sctypes['int']:
if samples.dtype in (np.int8, np.int16, np.int32, np.int64):
bits = np.iinfo(samples.dtype).bits
float32_samples *= 1.0 / 2 ** (bits - 1)
elif samples.dtype in np.sctypes['float']:
elif samples.dtype in (np.float16, np.float32, np.float64):
pass
else:
raise TypeError("Unsupported sample type: %s." % samples.dtype)
return float32_samples

def process(self, file_path: str) -> torch.Tensor:
"""Processes the features from the provided `file_path`."""
features = self.load_feature_from_file(file_path)
features = self.process_segment(features)
return features

def process_segment(self, feature_segment):
"""Processes the provided feature segment."""
if self.augmentor:
# augmentor for external features. Here possible augmentor for external embedding feature is Diaconis Augmentation and might be implemented later
# augmentor for external features. Here possible augmentor for
# external embedding feature is Diaconis Augmentation and might
# be implemented later
self.augmentor.perturb(feature_segment)
return torch.tensor(feature_segment, dtype=torch.float)

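Note (editorial, not part of the diff): `np.sctypes` was removed in NumPy 2.0, which is why the lookup above is replaced with explicit dtype tuples. A minimal sketch of an equivalent check using `np.issubdtype` — an alternative the PR does not adopt, shown only to illustrate the same dtype classification:

```python
import numpy as np

def convert_samples_to_float32(samples: np.ndarray) -> np.ndarray:
    """Convert int/float samples to float32, scaling integers to [-1, 1]."""
    float32_samples = samples.astype('float32')
    if np.issubdtype(samples.dtype, np.signedinteger):
        # e.g. int16 is divided by 2**15 so full scale maps to [-1, 1]
        bits = np.iinfo(samples.dtype).bits
        float32_samples *= 1.0 / 2 ** (bits - 1)
    elif np.issubdtype(samples.dtype, np.floating):
        pass  # already floating point, no rescaling needed
    else:
        raise TypeError("Unsupported sample type: %s." % samples.dtype)
    return float32_samples
```

For the dtypes listed in the diff, this behaves the same as the explicit tuples on both NumPy 1.x and 2.x.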
55 changes: 37 additions & 18 deletions nemo/collections/asr/parts/preprocessing/segment.py
@@ -67,14 +67,15 @@

def select_channels(signal: npt.NDArray, channel_selector: Optional[ChannelSelectorType] = None) -> npt.NDArray:
"""
Convert a multi-channel signal to a single-channel signal by averaging over channels or selecting a single channel,
or pass-through multi-channel signal when channel_selector is `None`.
Convert a multi-channel signal to a single-channel signal by averaging over channels or
selecting a single channel, or pass-through multi-channel signal when channel_selector is `None`.

Args:
signal: numpy array with shape (..., num_channels)
channel selector: string denoting the downmix mode, an integer denoting the channel to be selected, or an iterable
of integers denoting a subset of channels. Channel selector is using zero-based indexing.
If set to `None`, the original signal will be returned. Uses zero-based indexing.
channel selector: string denoting the downmix mode, an integer denoting the channel to be selected,
or an iterable of integers denoting a subset of channels. Channel selector is
using zero-based indexing. If set to `None`, the original signal will be returned.
Uses zero-based indexing.

Returns:
numpy array
@@ -92,7 +93,8 @@ def select_channels(signal: npt.NDArray, channel_selector: Optional[ChannelSelec

if num_channels >= num_samples:
logging.warning(
'Number of channels (%d) is greater or equal than number of samples (%d). Check for possible transposition.',
'Number of channels (%d) is greater or equal than number of samples (%d). '
'Check for possible transposition.',
num_channels,
num_samples,
)
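For readers skimming the reflowed docstring: a standalone sketch of the documented channel-selection semantics (illustrative only, not the NeMo implementation; it assumes 'average' as the downmix keyword):

```python
import numpy as np

def select_channels_sketch(signal: np.ndarray, channel_selector=None) -> np.ndarray:
    """Illustrative mirror of the documented behavior of select_channels."""
    if signal.ndim == 1 or channel_selector is None:
        return signal                          # pass-through
    if isinstance(channel_selector, str):      # e.g. 'average' -> downmix over channels
        return np.mean(signal, axis=-1)
    return signal[..., channel_selector]       # int or iterable of ints, zero-based

multichannel = np.random.rand(16000, 2)        # (num_samples, num_channels)
mono = select_channels_sketch(multichannel, 'average')  # shape (16000,)
left = select_channels_sketch(multichannel, 0)          # shape (16000,)
```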
@@ -199,7 +201,8 @@ def __init__(
samples = samples.transpose()
sample_rate = target_sr
if trim:
# librosa is using channels-first layout (num_channels, num_samples), which is transpose of AudioSegment's layout
# librosa is using channels-first layout (num_channels, num_samples),
# which is transpose of AudioSegment's layout
samples = samples.transpose()
samples, _ = librosa.effects.trim(
samples, top_db=trim_top_db, ref=trim_ref, frame_length=trim_frame_length, hop_length=trim_hop_length
@@ -260,10 +263,10 @@ def _convert_samples_to_float32(samples):
Integers will be scaled to [-1, 1] in float32.
"""
float32_samples = samples.astype('float32')
if samples.dtype in np.sctypes['int']:
if samples.dtype in (np.int8, np.int16, np.int32, np.int64):
bits = np.iinfo(samples.dtype).bits
float32_samples *= 1.0 / 2 ** (bits - 1)
elif samples.dtype in np.sctypes['float']:
elif samples.dtype in (np.float16, np.float32, np.float64):
pass
else:
raise TypeError("Unsupported sample type: %s." % samples.dtype)
@@ -303,11 +306,12 @@ def from_file(
:param trim_frame_length: the number of samples per analysis frame
:param trim_hop_length: the number of samples between analysis frames
:param orig_sr: the original sample rate
:param channel selector: string denoting the downmix mode, an integer denoting the channel to be selected, or an iterable
of integers denoting a subset of channels. Channel selector is using zero-based indexing.
If set to `None`, the original signal will be used.
:param channel selector: string denoting the downmix mode, an integer denoting the channel to be selected,
or an iterable of integers denoting a subset of channels. Channel selector is using
zero-based indexing. If set to `None`, the original signal will be used.
:param normalize_db (Optional[float]): if not None, normalize the audio signal to a target RMS value
:param ref_channel (Optional[int]): channel to use as reference for normalizing multi-channel audio, set None to use max RMS across channels
:param ref_channel (Optional[int]): channel to use as reference for normalizing multi-channel audio,
set None to use max RMS across channels
:return: AudioSegment instance
"""
samples = None
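To make the reformatted parameter list above concrete, a hedged usage sketch of `AudioSegment.from_file` (the file path is hypothetical; only arguments that appear in the docstring or surrounding code are used):

```python
from nemo.collections.asr.parts.preprocessing.segment import AudioSegment

# Load audio, resample to 16 kHz, and keep only the first channel (zero-based).
segment = AudioSegment.from_file(
    "speech.wav",           # hypothetical file path
    target_sr=16000,
    channel_selector=0,     # or 'average' to downmix, or e.g. [0, 1] for a subset
)
print(segment.sample_rate, segment.num_channels, segment.duration)
```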
@@ -415,7 +419,8 @@ def from_file_list(
# Shortcut when selecting a single channel
if channel_selector >= len(audio_file_list):
raise RuntimeError(
f'Channel cannot be selected: channel_selector={channel_selector}, num_audio_files={len(audio_file_list)}'
f'Channel cannot be selected: channel_selector={channel_selector}, '
f'num_audio_files={len(audio_file_list)}'
)
# Select only a single file
audio_file_list = [audio_file_list[channel_selector]]
@@ -441,7 +446,8 @@ def from_file_list(
# Only single-channel individual files are supported for now
if a_segment.num_channels != 1:
raise RuntimeError(
f'Expecting a single-channel audio signal, but loaded {a_segment.num_channels} channels from file {a_file}'
f'Expecting a single-channel audio signal, but loaded {a_segment.num_channels} '
f'channels from file {a_file}'
)

if target_sr is None:
@@ -523,14 +529,16 @@ def segment_from_file(
audio_start = math.floor(offset * sample_rate)
if audio_start > max_audio_start:
raise RuntimeError(
f'Provided audio start ({audio_start}) is larger than the maximum possible ({max_audio_start})'
f'Provided audio start ({audio_start}) is larger than the '
f'maximum possible ({max_audio_start})'
)
f.seek(audio_start)
samples = f.read(n_segments_at_original_sr, dtype=dtype)
is_segmented = True
elif n_segments_at_original_sr > len(f):
logging.warning(
f"Number of segments ({n_segments_at_original_sr}) is greater than the length ({len(f)}) of the audio file {audio_file}. This may lead to shape mismatch errors."
f"Number of segments ({n_segments_at_original_sr}) is greater than the length ({len(f)}) "
f"of the audio file {audio_file}. This may lead to shape mismatch errors."
)
samples = f.read(dtype=dtype)
else:
@@ -550,25 +558,30 @@ def segment_from_file(

@property
def samples(self):
"""Returns a copy of the samples."""
return self._samples.copy()

@property
def sample_rate(self):
"""Returns the sample rate of the segment."""
return self._sample_rate

@property
def num_channels(self):
"""Returns the number of channels in the segment."""
if self._samples.ndim == 1:
return 1
else:
return self._samples.shape[-1]

@property
def num_samples(self):
"""Returns the number of samples in the segment."""
return self._samples.shape[0]

@property
def duration(self):
"""Returns the duration of the segment in seconds."""
return self.num_samples / float(self._sample_rate)

@property
@@ -579,21 +592,26 @@ def rms_db(self):

@property
def orig_sr(self):
"""Returns the original sample rate of the segment."""
return self._orig_sr

@property
def offset(self):
"""Returns the offset used for the segment."""
return float(self._offset) if self._offset is not None else None

@property
def audio_file(self):
"""Returns the audio file that the segment was loaded from."""
return str(self._audio_file) if self._audio_file is not None else None

def is_empty(self):
"""Checks if the segment is empty."""
mean_square = np.sum(np.mean(self._samples**2, axis=0))
return self.num_samples == 0 or mean_square == 0

def gain_db(self, gain):
"""Returns the gain in decibels."""
self._samples *= 10.0 ** (gain / 20.0)

def normalize_db(self, target_db=-20, ref_channel=None):
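As a quick sanity check of the dB-to-linear conversion used by `gain_db` (and relied on by `normalize_db`): a gain of about +6.02 dB, i.e. 20·log10(2), doubles the amplitude.

```python
import numpy as np

samples = np.array([0.1, -0.25, 0.5], dtype=np.float32)
gain_db = 20.0 * np.log10(2.0)            # ~6.02 dB
scaled = samples * 10.0 ** (gain_db / 20.0)
print(scaled)                             # approximately [0.2, -0.5, 1.0]
```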
@@ -622,7 +640,8 @@ def pad(self, pad_size, symmetric=False):
pad_width = ((pad_size, pad_size), (0, 0)) if symmetric else ((0, pad_size), (0, 0))
else:
raise NotImplementedError(
f"Padding not implemented for signals with more that 2 dimensions. Current samples dimension: {samples_ndim}."
f"Padding not implemented for signals with more that 2 dimensions. "
f"Current samples dimension: {samples_ndim}."
)
# apply padding
self._samples = np.pad(
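Finally, the padding rule behind the rewrapped NotImplementedError message can be sketched as follows. The 2-D branch mirrors the `pad_width` expression visible in the diff; the 1-D branch and the constant zero-padding mode are assumptions for illustration.

```python
import numpy as np

def pad_samples(samples: np.ndarray, pad_size: int, symmetric: bool = False) -> np.ndarray:
    """Pad 1-D or 2-D sample arrays along the time axis only."""
    if samples.ndim == 1:
        pad_width = (pad_size, pad_size) if symmetric else (0, pad_size)
    elif samples.ndim == 2:
        # time axis is padded, channel axis is left untouched
        pad_width = ((pad_size, pad_size), (0, 0)) if symmetric else ((0, pad_size), (0, 0))
    else:
        raise NotImplementedError(
            f"Padding not implemented for signals with more than 2 dimensions. "
            f"Current samples dimension: {samples.ndim}."
        )
    return np.pad(samples, pad_width, mode='constant', constant_values=0.0)

mono = np.ones(8, dtype=np.float32)
print(pad_samples(mono, 2).shape)                  # (10,) -> padded at the end only
print(pad_samples(mono, 2, symmetric=True).shape)  # (12,) -> padded on both sides
```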