diff --git a/nemo/collections/asr/parts/preprocessing/feature_loader.py b/nemo/collections/asr/parts/preprocessing/feature_loader.py
index 8c629cf4cfd4..e715d2dafb95 100644
--- a/nemo/collections/asr/parts/preprocessing/feature_loader.py
+++ b/nemo/collections/asr/parts/preprocessing/feature_loader.py
@@ -18,12 +18,13 @@


 class ExternalFeatureLoader(object):
-    """Feature loader that load external features store in certain format.
+    """Feature loader that loads external features stored in a certain format.
     Currently support pickle, npy and npz format.
     """

     def __init__(
-        self, augmentor: Optional["nemo.collections.asr.parts.perturb.FeatureAugmentor"] = None,
+        self,
+        augmentor: Optional["nemo.collections.asr.parts.perturb.FeatureAugmentor"] = None,
     ):
         """
         Feature loader
@@ -50,23 +51,27 @@ def _convert_samples_to_float32(samples: np.ndarray) -> np.ndarray:
         Integers will be scaled to [-1, 1] in float32.
         """
         float32_samples = samples.astype('float32')
-        if samples.dtype in np.sctypes['int']:
+        if samples.dtype in (np.int8, np.int16, np.int32, np.int64):
             bits = np.iinfo(samples.dtype).bits
             float32_samples *= 1.0 / 2 ** (bits - 1)
-        elif samples.dtype in np.sctypes['float']:
+        elif samples.dtype in (np.float16, np.float32, np.float64):
             pass
         else:
             raise TypeError("Unsupported sample type: %s." % samples.dtype)
         return float32_samples

     def process(self, file_path: str) -> torch.Tensor:
+        """Processes the features from the provided `file_path`."""
         features = self.load_feature_from_file(file_path)
         features = self.process_segment(features)
         return features

     def process_segment(self, feature_segment):
+        """Processes the provided feature segment."""
         if self.augmentor:
-            # augmentor for external features. Here possible augmentor for external embedding feature is Diaconis Augmentation and might be implemented later
+            # augmentor for external features. A possible augmentor for
+            # external embedding features is Diaconis Augmentation, which
+            # might be implemented later
             self.augmentor.perturb(feature_segment)
         return torch.tensor(feature_segment, dtype=torch.float)

diff --git a/nemo/collections/asr/parts/preprocessing/segment.py b/nemo/collections/asr/parts/preprocessing/segment.py
index aceab6637006..00558769b020 100644
--- a/nemo/collections/asr/parts/preprocessing/segment.py
+++ b/nemo/collections/asr/parts/preprocessing/segment.py
@@ -67,14 +67,15 @@

 def select_channels(signal: npt.NDArray, channel_selector: Optional[ChannelSelectorType] = None) -> npt.NDArray:
     """
-    Convert a multi-channel signal to a single-channel signal by averaging over channels or selecting a single channel,
-    or pass-through multi-channel signal when channel_selector is `None`.
+    Convert a multi-channel signal to a single-channel signal by averaging over channels or
+    selecting a single channel, or pass-through multi-channel signal when channel_selector is `None`.

     Args:
         signal: numpy array with shape (..., num_channels)
-        channel selector: string denoting the downmix mode, an integer denoting the channel to be selected, or an iterable
-            of integers denoting a subset of channels. Channel selector is using zero-based indexing.
-            If set to `None`, the original signal will be returned. Uses zero-based indexing.
+        channel selector: string denoting the downmix mode, an integer denoting the channel to be selected,
+            or an iterable of integers denoting a subset of channels. Channel selector is
+            using zero-based indexing. If set to `None`, the original signal will be returned.
+            Uses zero-based indexing.

     Returns:
         numpy array
@@ -92,7 +93,8 @@ def select_channels(signal: npt.NDArray, channel_selector: Optional[ChannelSelec

     if num_channels >= num_samples:
         logging.warning(
-            'Number of channels (%d) is greater or equal than number of samples (%d). Check for possible transposition.',
+            'Number of channels (%d) is greater than or equal to the number of samples (%d). '
+            'Check for possible transposition.',
             num_channels,
             num_samples,
         )
@@ -199,7 +201,8 @@ def __init__(
             samples = samples.transpose()
             sample_rate = target_sr
         if trim:
-            # librosa is using channels-first layout (num_channels, num_samples), which is transpose of AudioSegment's layout
+            # librosa uses a channels-first layout (num_channels, num_samples),
+            # which is the transpose of AudioSegment's layout
             samples = samples.transpose()
             samples, _ = librosa.effects.trim(
                 samples, top_db=trim_top_db, ref=trim_ref, frame_length=trim_frame_length, hop_length=trim_hop_length
             )
@@ -260,10 +263,10 @@ def _convert_samples_to_float32(samples):
         Integers will be scaled to [-1, 1] in float32.
         """
         float32_samples = samples.astype('float32')
-        if samples.dtype in np.sctypes['int']:
+        if samples.dtype in (np.int8, np.int16, np.int32, np.int64):
             bits = np.iinfo(samples.dtype).bits
             float32_samples *= 1.0 / 2 ** (bits - 1)
-        elif samples.dtype in np.sctypes['float']:
+        elif samples.dtype in (np.float16, np.float32, np.float64):
             pass
         else:
             raise TypeError("Unsupported sample type: %s." % samples.dtype)
@@ -303,11 +306,12 @@ def from_file(
         :param trim_frame_length: the number of samples per analysis frame
         :param trim_hop_length: the number of samples between analysis frames
         :param orig_sr: the original sample rate
-        :param channel selector: string denoting the downmix mode, an integer denoting the channel to be selected, or an iterable
-            of integers denoting a subset of channels. Channel selector is using zero-based indexing.
-            If set to `None`, the original signal will be used.
+        :param channel selector: string denoting the downmix mode, an integer denoting the channel to be selected,
+            or an iterable of integers denoting a subset of channels. Channel selector is using
+            zero-based indexing. If set to `None`, the original signal will be used.
         :param normalize_db (Optional[float]): if not None, normalize the audio signal to a target RMS value
-        :param ref_channel (Optional[int]): channel to use as reference for normalizing multi-channel audio, set None to use max RMS across channels
+        :param ref_channel (Optional[int]): channel to use as reference for normalizing multi-channel audio,
+            set None to use max RMS across channels
         :return: AudioSegment instance
         """
         samples = None
@@ -415,7 +419,8 @@ def from_file_list(
             # Shortcut when selecting a single channel
             if channel_selector >= len(audio_file_list):
                 raise RuntimeError(
-                    f'Channel cannot be selected: channel_selector={channel_selector}, num_audio_files={len(audio_file_list)}'
+                    f'Channel cannot be selected: channel_selector={channel_selector}, '
+                    f'num_audio_files={len(audio_file_list)}'
                 )
             # Select only a single file
             audio_file_list = [audio_file_list[channel_selector]]
@@ -441,7 +446,8 @@ def from_file_list(
             # Only single-channel individual files are supported for now
             if a_segment.num_channels != 1:
                 raise RuntimeError(
-                    f'Expecting a single-channel audio signal, but loaded {a_segment.num_channels} channels from file {a_file}'
+                    f'Expecting a single-channel audio signal, but loaded {a_segment.num_channels} '
+                    f'channels from file {a_file}'
                 )

             if target_sr is None:
@@ -523,14 +529,16 @@ def segment_from_file(
                         audio_start = math.floor(offset * sample_rate)
                         if audio_start > max_audio_start:
                             raise RuntimeError(
-                                f'Provided audio start ({audio_start}) is larger than the maximum possible ({max_audio_start})'
+                                f'Provided audio start ({audio_start}) is larger than the '
+                                f'maximum possible ({max_audio_start})'
                             )
                     f.seek(audio_start)
                     samples = f.read(n_segments_at_original_sr, dtype=dtype)
                     is_segmented = True
                 elif n_segments_at_original_sr > len(f):
                     logging.warning(
-                        f"Number of segments ({n_segments_at_original_sr}) is greater than the length ({len(f)}) of the audio file {audio_file}. This may lead to shape mismatch errors."
+                        f"Number of segments ({n_segments_at_original_sr}) is greater than the length ({len(f)}) "
+                        f"of the audio file {audio_file}. This may lead to shape mismatch errors."
                     )
                     samples = f.read(dtype=dtype)
                 else:
@@ -550,14 +558,17 @@ def segment_from_file(

     @property
     def samples(self):
+        """Returns a copy of the samples."""
         return self._samples.copy()

     @property
     def sample_rate(self):
+        """Returns the sample rate of the segment."""
         return self._sample_rate

     @property
     def num_channels(self):
+        """Returns the number of channels in the segment."""
         if self._samples.ndim == 1:
             return 1
         else:
@@ -565,10 +576,12 @@ def num_channels(self):

     @property
     def num_samples(self):
+        """Returns the number of samples in the segment."""
         return self._samples.shape[0]

     @property
     def duration(self):
+        """Returns the duration of the segment in seconds."""
         return self.num_samples / float(self._sample_rate)

     @property
@@ -579,21 +592,26 @@ def rms_db(self):

     @property
     def orig_sr(self):
+        """Returns the original sample rate of the segment."""
         return self._orig_sr

     @property
     def offset(self):
+        """Returns the offset used for the segment."""
         return float(self._offset) if self._offset is not None else None

     @property
     def audio_file(self):
+        """Returns the audio file that the segment was loaded from."""
         return str(self._audio_file) if self._audio_file is not None else None

     def is_empty(self):
+        """Checks if the segment is empty."""
         mean_square = np.sum(np.mean(self._samples**2, axis=0))
         return self.num_samples == 0 or mean_square == 0

     def gain_db(self, gain):
+        """Applies a gain specified in decibels to the samples."""
         self._samples *= 10.0 ** (gain / 20.0)

     def normalize_db(self, target_db=-20, ref_channel=None):
@@ -622,7 +640,8 @@ def pad(self, pad_size, symmetric=False):
             pad_width = ((pad_size, pad_size), (0, 0)) if symmetric else ((0, pad_size), (0, 0))
         else:
             raise NotImplementedError(
-                f"Padding not implemented for signals with more that 2 dimensions. Current samples dimension: {samples_ndim}."
+                f"Padding not implemented for signals with more than 2 dimensions. "
+                f"Current samples dimension: {samples_ndim}."
             )
         # apply padding
         self._samples = np.pad(
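For reference, a minimal standalone sketch (not part of the patch) of the sample conversion both files now perform: explicit integer and float dtype tuples are checked instead of the `np.sctypes['int']` / `np.sctypes['float']` lookups, presumably because recent NumPy releases no longer provide `np.sctypes`. The helper name `to_float32` is illustrative only and does not exist in the diff.

    import numpy as np

    def to_float32(samples: np.ndarray) -> np.ndarray:
        # Mirrors the patched _convert_samples_to_float32 helpers:
        # scale integer samples to [-1, 1], pass floating-point samples through.
        float32_samples = samples.astype('float32')
        if samples.dtype in (np.int8, np.int16, np.int32, np.int64):
            bits = np.iinfo(samples.dtype).bits
            float32_samples *= 1.0 / 2 ** (bits - 1)
        elif samples.dtype in (np.float16, np.float32, np.float64):
            pass
        else:
            raise TypeError("Unsupported sample type: %s." % samples.dtype)
        return float32_samples

    # Full-scale int16 maps to approximately +/-1.0:
    print(to_float32(np.array([32767, -32768], dtype=np.int16)))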