From 5ac9a397b9a2e408abc8568fae737eb6ef293672 Mon Sep 17 00:00:00 2001 From: andylamp <2177249+andylamp@users.noreply.github.com> Date: Mon, 2 Dec 2024 00:02:22 +0000 Subject: [PATCH 01/16] fix numpy compatibility while replicating existing behavior Signed-off-by: andylamp <2177249+andylamp@users.noreply.github.com> --- nemo/collections/asr/parts/preprocessing/feature_loader.py | 4 ++-- nemo/collections/asr/parts/preprocessing/segment.py | 4 ++-- 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/nemo/collections/asr/parts/preprocessing/feature_loader.py b/nemo/collections/asr/parts/preprocessing/feature_loader.py index 8c629cf4cfd4..17725fcebe35 100644 --- a/nemo/collections/asr/parts/preprocessing/feature_loader.py +++ b/nemo/collections/asr/parts/preprocessing/feature_loader.py @@ -50,10 +50,10 @@ def _convert_samples_to_float32(samples: np.ndarray) -> np.ndarray: Integers will be scaled to [-1, 1] in float32. """ float32_samples = samples.astype('float32') - if samples.dtype in np.sctypes['int']: + if samples.dtype in (np.int8, np.int16, np.int32, np.int64): bits = np.iinfo(samples.dtype).bits float32_samples *= 1.0 / 2 ** (bits - 1) - elif samples.dtype in np.sctypes['float']: + elif samples.dtype in (np.float16, np.float32, np.float64): pass else: raise TypeError("Unsupported sample type: %s." % samples.dtype) diff --git a/nemo/collections/asr/parts/preprocessing/segment.py b/nemo/collections/asr/parts/preprocessing/segment.py index aceab6637006..c518fcfa5b66 100644 --- a/nemo/collections/asr/parts/preprocessing/segment.py +++ b/nemo/collections/asr/parts/preprocessing/segment.py @@ -260,10 +260,10 @@ def _convert_samples_to_float32(samples): Integers will be scaled to [-1, 1] in float32. """ float32_samples = samples.astype('float32') - if samples.dtype in np.sctypes['int']: + if samples.dtype in (np.int8, np.int16, np.int32, np.int64): bits = np.iinfo(samples.dtype).bits float32_samples *= 1.0 / 2 ** (bits - 1) - elif samples.dtype in np.sctypes['float']: + elif samples.dtype in (np.float16, np.float32, np.float64): pass else: raise TypeError("Unsupported sample type: %s." % samples.dtype) From 9738306d13d8a6682007d0b2ede0d496a5a2ec7e Mon Sep 17 00:00:00 2001 From: andylamp Date: Mon, 2 Dec 2024 00:36:35 +0000 Subject: [PATCH 02/16] Apply isort and black reformatting Signed-off-by: andylamp Signed-off-by: andylamp <2177249+andylamp@users.noreply.github.com> --- nemo/collections/asr/parts/preprocessing/feature_loader.py | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/nemo/collections/asr/parts/preprocessing/feature_loader.py b/nemo/collections/asr/parts/preprocessing/feature_loader.py index 17725fcebe35..858e3acb68c8 100644 --- a/nemo/collections/asr/parts/preprocessing/feature_loader.py +++ b/nemo/collections/asr/parts/preprocessing/feature_loader.py @@ -18,12 +18,13 @@ class ExternalFeatureLoader(object): - """Feature loader that load external features store in certain format. + """Feature loader that load external features store in certain format. Currently support pickle, npy and npz format. """ def __init__( - self, augmentor: Optional["nemo.collections.asr.parts.perturb.FeatureAugmentor"] = None, + self, + augmentor: Optional["nemo.collections.asr.parts.perturb.FeatureAugmentor"] = None, ): """ Feature loader From 00de6879378b4dbd8122e38525d88699a524da03 Mon Sep 17 00:00:00 2001 From: andylamp <2177249+andylamp@users.noreply.github.com> Date: Mon, 2 Dec 2024 01:40:36 +0000 Subject: [PATCH 03/16] add docstrings and fix line lengths --- .../asr/parts/preprocessing/feature_loader.py | 6 ++- .../asr/parts/preprocessing/segment.py | 48 +++++++++++++------ 2 files changed, 38 insertions(+), 16 deletions(-) diff --git a/nemo/collections/asr/parts/preprocessing/feature_loader.py b/nemo/collections/asr/parts/preprocessing/feature_loader.py index 858e3acb68c8..e715d2dafb95 100644 --- a/nemo/collections/asr/parts/preprocessing/feature_loader.py +++ b/nemo/collections/asr/parts/preprocessing/feature_loader.py @@ -61,13 +61,17 @@ def _convert_samples_to_float32(samples: np.ndarray) -> np.ndarray: return float32_samples def process(self, file_path: str) -> torch.Tensor: + """Processes the features from the provided `file_path`.""" features = self.load_feature_from_file(file_path) features = self.process_segment(features) return features def process_segment(self, feature_segment): + """Processes the provided feature segment.""" if self.augmentor: - # augmentor for external features. Here possible augmentor for external embedding feature is Diaconis Augmentation and might be implemented later + # augmentor for external features. Here possible augmentor for + # external embedding feature is Diaconis Augmentation and might + # be implemented later self.augmentor.perturb(feature_segment) return torch.tensor(feature_segment, dtype=torch.float) diff --git a/nemo/collections/asr/parts/preprocessing/segment.py b/nemo/collections/asr/parts/preprocessing/segment.py index c518fcfa5b66..d43fd3aa53d0 100644 --- a/nemo/collections/asr/parts/preprocessing/segment.py +++ b/nemo/collections/asr/parts/preprocessing/segment.py @@ -67,14 +67,15 @@ def select_channels(signal: npt.NDArray, channel_selector: Optional[ChannelSelectorType] = None) -> npt.NDArray: """ - Convert a multi-channel signal to a single-channel signal by averaging over channels or selecting a single channel, - or pass-through multi-channel signal when channel_selector is `None`. + Convert a multi-channel signal to a single-channel signal by averaging over channels or + selecting a single channel, or pass-through multi-channel signal when channel_selector is `None`. Args: signal: numpy array with shape (..., num_channels) - channel selector: string denoting the downmix mode, an integer denoting the channel to be selected, or an iterable - of integers denoting a subset of channels. Channel selector is using zero-based indexing. - If set to `None`, the original signal will be returned. Uses zero-based indexing. + channel selector: string denoting the downmix mode, an integer denoting the channel to be selected, + or an iterable of integers denoting a subset of channels. Channel selector is + using zero-based indexing. If set to `None`, the original signal will be returned. + Uses zero-based indexing. Returns: numpy array @@ -92,7 +93,8 @@ def select_channels(signal: npt.NDArray, channel_selector: Optional[ChannelSelec if num_channels >= num_samples: logging.warning( - 'Number of channels (%d) is greater or equal than number of samples (%d). Check for possible transposition.', + 'Number of channels (%d) is greater or equal than number of samples (%d). ' + 'Check for possible transposition.', num_channels, num_samples, ) @@ -199,7 +201,8 @@ def __init__( samples = samples.transpose() sample_rate = target_sr if trim: - # librosa is using channels-first layout (num_channels, num_samples), which is transpose of AudioSegment's layout + # librosa is using channels-first layout (num_channels, num_samples), + # which is transpose of AudioSegment's layout samples = samples.transpose() samples, _ = librosa.effects.trim( samples, top_db=trim_top_db, ref=trim_ref, frame_length=trim_frame_length, hop_length=trim_hop_length @@ -303,11 +306,12 @@ def from_file( :param trim_frame_length: the number of samples per analysis frame :param trim_hop_length: the number of samples between analysis frames :param orig_sr: the original sample rate - :param channel selector: string denoting the downmix mode, an integer denoting the channel to be selected, or an iterable - of integers denoting a subset of channels. Channel selector is using zero-based indexing. - If set to `None`, the original signal will be used. + :param channel selector: string denoting the downmix mode, an integer denoting the channel to be selected, + or an iterable of integers denoting a subset of channels. Channel selector is using + zero-based indexing. If set to `None`, the original signal will be used. :param normalize_db (Optional[float]): if not None, normalize the audio signal to a target RMS value - :param ref_channel (Optional[int]): channel to use as reference for normalizing multi-channel audio, set None to use max RMS across channels + :param ref_channel (Optional[int]): channel to use as reference for normalizing multi-channel audio, + set None to use max RMS across channels :return: AudioSegment instance """ samples = None @@ -441,7 +445,8 @@ def from_file_list( # Only single-channel individual files are supported for now if a_segment.num_channels != 1: raise RuntimeError( - f'Expecting a single-channel audio signal, but loaded {a_segment.num_channels} channels from file {a_file}' + f'Expecting a single-channel audio signal, but loaded {a_segment.num_channels} ' + f'channels from file {a_file}' ) if target_sr is None: @@ -523,14 +528,16 @@ def segment_from_file( audio_start = math.floor(offset * sample_rate) if audio_start > max_audio_start: raise RuntimeError( - f'Provided audio start ({audio_start}) is larger than the maximum possible ({max_audio_start})' + f'Provided audio start ({audio_start}) is larger than the ' + f'maximum possible ({max_audio_start})' ) f.seek(audio_start) samples = f.read(n_segments_at_original_sr, dtype=dtype) is_segmented = True elif n_segments_at_original_sr > len(f): logging.warning( - f"Number of segments ({n_segments_at_original_sr}) is greater than the length ({len(f)}) of the audio file {audio_file}. This may lead to shape mismatch errors." + f"Number of segments ({n_segments_at_original_sr}) is greater than the length ({len(f)}) " + f"of the audio file {audio_file}. This may lead to shape mismatch errors." ) samples = f.read(dtype=dtype) else: @@ -550,14 +557,17 @@ def segment_from_file( @property def samples(self): + """Returns a copy of the samples.""" return self._samples.copy() @property def sample_rate(self): + """Returns the sample rate of the segment.""" return self._sample_rate @property def num_channels(self): + """Returns the number of channels in the segment.""" if self._samples.ndim == 1: return 1 else: @@ -565,10 +575,12 @@ def num_channels(self): @property def num_samples(self): + """Returns the number of samples in the segment.""" return self._samples.shape[0] @property def duration(self): + """Returns the duration of the segment in seconds.""" return self.num_samples / float(self._sample_rate) @property @@ -579,21 +591,26 @@ def rms_db(self): @property def orig_sr(self): + """Returns the original sample rate of the segment.""" return self._orig_sr @property def offset(self): + """Returns the offset used for the segment.""" return float(self._offset) if self._offset is not None else None @property def audio_file(self): + """Returns the audio file that the segment was loaded from.""" return str(self._audio_file) if self._audio_file is not None else None def is_empty(self): + """Checks if the segment is empty.""" mean_square = np.sum(np.mean(self._samples**2, axis=0)) return self.num_samples == 0 or mean_square == 0 def gain_db(self, gain): + """Returns the gain in decibels.""" self._samples *= 10.0 ** (gain / 20.0) def normalize_db(self, target_db=-20, ref_channel=None): @@ -622,7 +639,8 @@ def pad(self, pad_size, symmetric=False): pad_width = ((pad_size, pad_size), (0, 0)) if symmetric else ((0, pad_size), (0, 0)) else: raise NotImplementedError( - f"Padding not implemented for signals with more that 2 dimensions. Current samples dimension: {samples_ndim}." + f"Padding not implemented for signals with more that 2 dimensions. " + f"Current samples dimension: {samples_ndim}." ) # apply padding self._samples = np.pad( From 8a2c620ac9ec2e34bce94281c79224ffd9e7e78c Mon Sep 17 00:00:00 2001 From: andylamp <2177249+andylamp@users.noreply.github.com> Date: Mon, 2 Dec 2024 01:56:01 +0000 Subject: [PATCH 04/16] fix final warning for line len --- nemo/collections/asr/parts/preprocessing/segment.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/nemo/collections/asr/parts/preprocessing/segment.py b/nemo/collections/asr/parts/preprocessing/segment.py index d43fd3aa53d0..00558769b020 100644 --- a/nemo/collections/asr/parts/preprocessing/segment.py +++ b/nemo/collections/asr/parts/preprocessing/segment.py @@ -419,7 +419,8 @@ def from_file_list( # Shortcut when selecting a single channel if channel_selector >= len(audio_file_list): raise RuntimeError( - f'Channel cannot be selected: channel_selector={channel_selector}, num_audio_files={len(audio_file_list)}' + f'Channel cannot be selected: channel_selector={channel_selector}, ' + f'num_audio_files={len(audio_file_list)}' ) # Select only a single file audio_file_list = [audio_file_list[channel_selector]] From 5f6495a6226bd6501f40b91ac00d98199011260c Mon Sep 17 00:00:00 2001 From: andylamp <2177249+andylamp@users.noreply.github.com> Date: Mon, 2 Dec 2024 00:02:22 +0000 Subject: [PATCH 05/16] fix numpy compatibility while replicating existing behavior Signed-off-by: andylamp <2177249+andylamp@users.noreply.github.com> --- nemo/collections/asr/parts/preprocessing/feature_loader.py | 4 ++-- nemo/collections/asr/parts/preprocessing/segment.py | 4 ++-- 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/nemo/collections/asr/parts/preprocessing/feature_loader.py b/nemo/collections/asr/parts/preprocessing/feature_loader.py index 8c629cf4cfd4..17725fcebe35 100644 --- a/nemo/collections/asr/parts/preprocessing/feature_loader.py +++ b/nemo/collections/asr/parts/preprocessing/feature_loader.py @@ -50,10 +50,10 @@ def _convert_samples_to_float32(samples: np.ndarray) -> np.ndarray: Integers will be scaled to [-1, 1] in float32. """ float32_samples = samples.astype('float32') - if samples.dtype in np.sctypes['int']: + if samples.dtype in (np.int8, np.int16, np.int32, np.int64): bits = np.iinfo(samples.dtype).bits float32_samples *= 1.0 / 2 ** (bits - 1) - elif samples.dtype in np.sctypes['float']: + elif samples.dtype in (np.float16, np.float32, np.float64): pass else: raise TypeError("Unsupported sample type: %s." % samples.dtype) diff --git a/nemo/collections/asr/parts/preprocessing/segment.py b/nemo/collections/asr/parts/preprocessing/segment.py index aceab6637006..c518fcfa5b66 100644 --- a/nemo/collections/asr/parts/preprocessing/segment.py +++ b/nemo/collections/asr/parts/preprocessing/segment.py @@ -260,10 +260,10 @@ def _convert_samples_to_float32(samples): Integers will be scaled to [-1, 1] in float32. """ float32_samples = samples.astype('float32') - if samples.dtype in np.sctypes['int']: + if samples.dtype in (np.int8, np.int16, np.int32, np.int64): bits = np.iinfo(samples.dtype).bits float32_samples *= 1.0 / 2 ** (bits - 1) - elif samples.dtype in np.sctypes['float']: + elif samples.dtype in (np.float16, np.float32, np.float64): pass else: raise TypeError("Unsupported sample type: %s." % samples.dtype) From 183d5c5204ec5fea7c16617e18294e0918b55ad9 Mon Sep 17 00:00:00 2001 From: andylamp Date: Mon, 2 Dec 2024 00:36:35 +0000 Subject: [PATCH 06/16] Apply isort and black reformatting Signed-off-by: andylamp Signed-off-by: andylamp <2177249+andylamp@users.noreply.github.com> --- nemo/collections/asr/parts/preprocessing/feature_loader.py | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/nemo/collections/asr/parts/preprocessing/feature_loader.py b/nemo/collections/asr/parts/preprocessing/feature_loader.py index 17725fcebe35..858e3acb68c8 100644 --- a/nemo/collections/asr/parts/preprocessing/feature_loader.py +++ b/nemo/collections/asr/parts/preprocessing/feature_loader.py @@ -18,12 +18,13 @@ class ExternalFeatureLoader(object): - """Feature loader that load external features store in certain format. + """Feature loader that load external features store in certain format. Currently support pickle, npy and npz format. """ def __init__( - self, augmentor: Optional["nemo.collections.asr.parts.perturb.FeatureAugmentor"] = None, + self, + augmentor: Optional["nemo.collections.asr.parts.perturb.FeatureAugmentor"] = None, ): """ Feature loader From 5d842fa4297f93b2c3bcf11e33195791467ee95e Mon Sep 17 00:00:00 2001 From: andylamp <2177249+andylamp@users.noreply.github.com> Date: Mon, 2 Dec 2024 01:40:36 +0000 Subject: [PATCH 07/16] add docstrings and fix line lengths Signed-off-by: andylamp <2177249+andylamp@users.noreply.github.com> --- .../asr/parts/preprocessing/feature_loader.py | 6 ++- .../asr/parts/preprocessing/segment.py | 48 +++++++++++++------ 2 files changed, 38 insertions(+), 16 deletions(-) diff --git a/nemo/collections/asr/parts/preprocessing/feature_loader.py b/nemo/collections/asr/parts/preprocessing/feature_loader.py index 858e3acb68c8..e715d2dafb95 100644 --- a/nemo/collections/asr/parts/preprocessing/feature_loader.py +++ b/nemo/collections/asr/parts/preprocessing/feature_loader.py @@ -61,13 +61,17 @@ def _convert_samples_to_float32(samples: np.ndarray) -> np.ndarray: return float32_samples def process(self, file_path: str) -> torch.Tensor: + """Processes the features from the provided `file_path`.""" features = self.load_feature_from_file(file_path) features = self.process_segment(features) return features def process_segment(self, feature_segment): + """Processes the provided feature segment.""" if self.augmentor: - # augmentor for external features. Here possible augmentor for external embedding feature is Diaconis Augmentation and might be implemented later + # augmentor for external features. Here possible augmentor for + # external embedding feature is Diaconis Augmentation and might + # be implemented later self.augmentor.perturb(feature_segment) return torch.tensor(feature_segment, dtype=torch.float) diff --git a/nemo/collections/asr/parts/preprocessing/segment.py b/nemo/collections/asr/parts/preprocessing/segment.py index c518fcfa5b66..d43fd3aa53d0 100644 --- a/nemo/collections/asr/parts/preprocessing/segment.py +++ b/nemo/collections/asr/parts/preprocessing/segment.py @@ -67,14 +67,15 @@ def select_channels(signal: npt.NDArray, channel_selector: Optional[ChannelSelectorType] = None) -> npt.NDArray: """ - Convert a multi-channel signal to a single-channel signal by averaging over channels or selecting a single channel, - or pass-through multi-channel signal when channel_selector is `None`. + Convert a multi-channel signal to a single-channel signal by averaging over channels or + selecting a single channel, or pass-through multi-channel signal when channel_selector is `None`. Args: signal: numpy array with shape (..., num_channels) - channel selector: string denoting the downmix mode, an integer denoting the channel to be selected, or an iterable - of integers denoting a subset of channels. Channel selector is using zero-based indexing. - If set to `None`, the original signal will be returned. Uses zero-based indexing. + channel selector: string denoting the downmix mode, an integer denoting the channel to be selected, + or an iterable of integers denoting a subset of channels. Channel selector is + using zero-based indexing. If set to `None`, the original signal will be returned. + Uses zero-based indexing. Returns: numpy array @@ -92,7 +93,8 @@ def select_channels(signal: npt.NDArray, channel_selector: Optional[ChannelSelec if num_channels >= num_samples: logging.warning( - 'Number of channels (%d) is greater or equal than number of samples (%d). Check for possible transposition.', + 'Number of channels (%d) is greater or equal than number of samples (%d). ' + 'Check for possible transposition.', num_channels, num_samples, ) @@ -199,7 +201,8 @@ def __init__( samples = samples.transpose() sample_rate = target_sr if trim: - # librosa is using channels-first layout (num_channels, num_samples), which is transpose of AudioSegment's layout + # librosa is using channels-first layout (num_channels, num_samples), + # which is transpose of AudioSegment's layout samples = samples.transpose() samples, _ = librosa.effects.trim( samples, top_db=trim_top_db, ref=trim_ref, frame_length=trim_frame_length, hop_length=trim_hop_length @@ -303,11 +306,12 @@ def from_file( :param trim_frame_length: the number of samples per analysis frame :param trim_hop_length: the number of samples between analysis frames :param orig_sr: the original sample rate - :param channel selector: string denoting the downmix mode, an integer denoting the channel to be selected, or an iterable - of integers denoting a subset of channels. Channel selector is using zero-based indexing. - If set to `None`, the original signal will be used. + :param channel selector: string denoting the downmix mode, an integer denoting the channel to be selected, + or an iterable of integers denoting a subset of channels. Channel selector is using + zero-based indexing. If set to `None`, the original signal will be used. :param normalize_db (Optional[float]): if not None, normalize the audio signal to a target RMS value - :param ref_channel (Optional[int]): channel to use as reference for normalizing multi-channel audio, set None to use max RMS across channels + :param ref_channel (Optional[int]): channel to use as reference for normalizing multi-channel audio, + set None to use max RMS across channels :return: AudioSegment instance """ samples = None @@ -441,7 +445,8 @@ def from_file_list( # Only single-channel individual files are supported for now if a_segment.num_channels != 1: raise RuntimeError( - f'Expecting a single-channel audio signal, but loaded {a_segment.num_channels} channels from file {a_file}' + f'Expecting a single-channel audio signal, but loaded {a_segment.num_channels} ' + f'channels from file {a_file}' ) if target_sr is None: @@ -523,14 +528,16 @@ def segment_from_file( audio_start = math.floor(offset * sample_rate) if audio_start > max_audio_start: raise RuntimeError( - f'Provided audio start ({audio_start}) is larger than the maximum possible ({max_audio_start})' + f'Provided audio start ({audio_start}) is larger than the ' + f'maximum possible ({max_audio_start})' ) f.seek(audio_start) samples = f.read(n_segments_at_original_sr, dtype=dtype) is_segmented = True elif n_segments_at_original_sr > len(f): logging.warning( - f"Number of segments ({n_segments_at_original_sr}) is greater than the length ({len(f)}) of the audio file {audio_file}. This may lead to shape mismatch errors." + f"Number of segments ({n_segments_at_original_sr}) is greater than the length ({len(f)}) " + f"of the audio file {audio_file}. This may lead to shape mismatch errors." ) samples = f.read(dtype=dtype) else: @@ -550,14 +557,17 @@ def segment_from_file( @property def samples(self): + """Returns a copy of the samples.""" return self._samples.copy() @property def sample_rate(self): + """Returns the sample rate of the segment.""" return self._sample_rate @property def num_channels(self): + """Returns the number of channels in the segment.""" if self._samples.ndim == 1: return 1 else: @@ -565,10 +575,12 @@ def num_channels(self): @property def num_samples(self): + """Returns the number of samples in the segment.""" return self._samples.shape[0] @property def duration(self): + """Returns the duration of the segment in seconds.""" return self.num_samples / float(self._sample_rate) @property @@ -579,21 +591,26 @@ def rms_db(self): @property def orig_sr(self): + """Returns the original sample rate of the segment.""" return self._orig_sr @property def offset(self): + """Returns the offset used for the segment.""" return float(self._offset) if self._offset is not None else None @property def audio_file(self): + """Returns the audio file that the segment was loaded from.""" return str(self._audio_file) if self._audio_file is not None else None def is_empty(self): + """Checks if the segment is empty.""" mean_square = np.sum(np.mean(self._samples**2, axis=0)) return self.num_samples == 0 or mean_square == 0 def gain_db(self, gain): + """Returns the gain in decibels.""" self._samples *= 10.0 ** (gain / 20.0) def normalize_db(self, target_db=-20, ref_channel=None): @@ -622,7 +639,8 @@ def pad(self, pad_size, symmetric=False): pad_width = ((pad_size, pad_size), (0, 0)) if symmetric else ((0, pad_size), (0, 0)) else: raise NotImplementedError( - f"Padding not implemented for signals with more that 2 dimensions. Current samples dimension: {samples_ndim}." + f"Padding not implemented for signals with more that 2 dimensions. " + f"Current samples dimension: {samples_ndim}." ) # apply padding self._samples = np.pad( From 9588148c5293c54ec91fdbbd3cfc37dbf5c75a1a Mon Sep 17 00:00:00 2001 From: andylamp <2177249+andylamp@users.noreply.github.com> Date: Mon, 2 Dec 2024 01:56:01 +0000 Subject: [PATCH 08/16] fix final warning for line len Signed-off-by: andylamp <2177249+andylamp@users.noreply.github.com> --- nemo/collections/asr/parts/preprocessing/segment.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/nemo/collections/asr/parts/preprocessing/segment.py b/nemo/collections/asr/parts/preprocessing/segment.py index d43fd3aa53d0..00558769b020 100644 --- a/nemo/collections/asr/parts/preprocessing/segment.py +++ b/nemo/collections/asr/parts/preprocessing/segment.py @@ -419,7 +419,8 @@ def from_file_list( # Shortcut when selecting a single channel if channel_selector >= len(audio_file_list): raise RuntimeError( - f'Channel cannot be selected: channel_selector={channel_selector}, num_audio_files={len(audio_file_list)}' + f'Channel cannot be selected: channel_selector={channel_selector}, ' + f'num_audio_files={len(audio_file_list)}' ) # Select only a single file audio_file_list = [audio_file_list[channel_selector]] From 5cfafb529ef39f39b69f8a888b9a988643d75f3c Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?oliver=20k=C3=B6nig?= Date: Thu, 28 Nov 2024 01:19:27 +0100 Subject: [PATCH 09/16] ci: Allow dry-run of release (#11418) * ci: Allow dry-run of release Signed-off-by: Oliver Koenig * fix Signed-off-by: Oliver Koenig * finalize Signed-off-by: Oliver Koenig --------- Signed-off-by: Oliver Koenig Signed-off-by: andylamp <2177249+andylamp@users.noreply.github.com> --- .github/workflows/release.yml | 9 ++++++++- 1 file changed, 8 insertions(+), 1 deletion(-) diff --git a/.github/workflows/release.yml b/.github/workflows/release.yml index 03474251f995..81db8e1160d9 100644 --- a/.github/workflows/release.yml +++ b/.github/workflows/release.yml @@ -20,10 +20,15 @@ on: description: Ref (SHA or branch name) to release required: true type: string + dry-run: + description: Do not publish a wheel and GitHub release. + required: true + default: true + type: boolean jobs: release: - uses: NVIDIA/NeMo-FW-CI-templates/.github/workflows/_release_library.yml@v0.12.3 + uses: NVIDIA/NeMo-FW-CI-templates/.github/workflows/_release_library.yml@v0.15.0 with: release-ref: ${{ inputs.release-ref }} image-name: nemo_container @@ -35,8 +40,10 @@ jobs: python-package: nemo container-workdir: /workspace library-name: Neural Modules + dry-run: ${{ inputs.dry-run }} secrets: TWINE_USERNAME: ${{ secrets.TWINE_USERNAME }} TWINE_PASSWORD: ${{ secrets.TWINE_PASSWORD }} SLACK_RELEASE_ENDPOINT: ${{ secrets.SLACK_RELEASE_ENDPOINT }} PAT: ${{ secrets.PAT }} + SLACK_WEBHOOK: ${{ secrets.SLACK_WEBHOOK }} From 4d4070963a86b93329f9ad88ee7f575956777186 Mon Sep 17 00:00:00 2001 From: Alexandros Koumparoulis <153118171+akoumpa@users.noreply.github.com> Date: Thu, 28 Nov 2024 01:03:58 -0800 Subject: [PATCH 10/16] fix dtype when init HF model from config (#11420) * fix dtype when init HF model from config Signed-off-by: Alexandros Koumparoulis * Apply isort and black reformatting Signed-off-by: akoumpa --------- Signed-off-by: Alexandros Koumparoulis Signed-off-by: akoumpa Co-authored-by: akoumpa Signed-off-by: andylamp <2177249+andylamp@users.noreply.github.com> --- .../llm/gpt/model/hf_auto_model_for_causal_lm.py | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) diff --git a/nemo/collections/llm/gpt/model/hf_auto_model_for_causal_lm.py b/nemo/collections/llm/gpt/model/hf_auto_model_for_causal_lm.py index 8f4595bd6cee..481dd9a0e187 100644 --- a/nemo/collections/llm/gpt/model/hf_auto_model_for_causal_lm.py +++ b/nemo/collections/llm/gpt/model/hf_auto_model_for_causal_lm.py @@ -41,6 +41,7 @@ def __init__( model_transform=None, model_accelerator=None, trust_remote_code=False, + default_dtype=torch.bfloat16, ): super().__init__() self.save_hyperparameters() @@ -53,6 +54,7 @@ def __init__( self.model_transform = model_transform self.model_accelerator = model_accelerator self.trust_remote_code = trust_remote_code + self.default_dtype = default_dtype @property def tokenizer(self): @@ -79,7 +81,10 @@ def configure_model(self): from transformers import AutoConfig config = AutoConfig.from_pretrained(self.model_name, trust_remote_code=self.trust_remote_code) - self.model = AutoModelForCausalLM.from_config(config, trust_remote_code=self.trust_remote_code) + dtype = getattr(config, 'torch_dtype', self.default_dtype) + self.model = AutoModelForCausalLM.from_config( + config, torch_dtype=dtype, trust_remote_code=self.trust_remote_code + ) if self.model_accelerator is not None: self.model_accelerator(self.model) From f7fa43f1cc7f00772922ed3d996ce30baafb109c Mon Sep 17 00:00:00 2001 From: nune-tadevosyan <152167970+nune-tadevosyan@users.noreply.github.com> Date: Thu, 28 Nov 2024 13:27:01 +0400 Subject: [PATCH 11/16] Removing unnecessary lines (#11408) Signed-off-by: Nune Signed-off-by: andylamp <2177249+andylamp@users.noreply.github.com> --- nemo/collections/asr/data/audio_to_text_dataset.py | 1 - 1 file changed, 1 deletion(-) diff --git a/nemo/collections/asr/data/audio_to_text_dataset.py b/nemo/collections/asr/data/audio_to_text_dataset.py index f91710de3cb3..3e1301dd4d53 100644 --- a/nemo/collections/asr/data/audio_to_text_dataset.py +++ b/nemo/collections/asr/data/audio_to_text_dataset.py @@ -871,7 +871,6 @@ def write_on_batch_end( item["audio_filepath"] = sample.recording.sources[0].source else: item["audio_filepath"] = sample.id - item["audio_filepath"] = sample.recording.sources[0].source item["offset"] = sample.start item["duration"] = sample.duration item["text"] = sample.supervisions[0].text or '' From ab0ac8bc276a5efd7b0d75934e621d9c56202d7c Mon Sep 17 00:00:00 2001 From: Jan Lasek Date: Fri, 29 Nov 2024 17:01:56 +0100 Subject: [PATCH 12/16] Handle import errors in virtual environment when running vLLM tests (#11435) * Remove try / catch block to propagate import errors Signed-off-by: Jan Lasek * Small rewrite to handle import errors in export/deploy scripts Signed-off-by: Jan Lasek * Apply isort and black reformatting Signed-off-by: janekl --------- Signed-off-by: Jan Lasek Signed-off-by: janekl Co-authored-by: janekl Signed-off-by: andylamp <2177249+andylamp@users.noreply.github.com> --- nemo/deploy/nlp/__init__.py | 14 ++------------ scripts/deploy/nlp/query_inframework.py | 2 +- tests/deploy/nemo_deploy.py | 2 +- tests/export/nemo_export.py | 12 ++++-------- 4 files changed, 8 insertions(+), 22 deletions(-) diff --git a/nemo/deploy/nlp/__init__.py b/nemo/deploy/nlp/__init__.py index 5ebbe6816664..633544e300ed 100644 --- a/nemo/deploy/nlp/__init__.py +++ b/nemo/deploy/nlp/__init__.py @@ -12,15 +12,5 @@ # See the License for the specific language governing permissions and # limitations under the License. - -use_query_llm = True -try: - from nemo.deploy.nlp.query_llm import NemoQueryLLM, NemoQueryLLMPyTorch -except Exception: - use_query_llm = False - -use_megatron_llm = True -try: - from nemo.deploy.nlp.megatronllm_deployable import MegatronLLMDeployable -except Exception: - use_megatron_llm = False +from nemo.deploy.nlp.megatronllm_deployable import MegatronLLMDeployable +from nemo.deploy.nlp.query_llm import NemoQueryLLM, NemoQueryLLMPyTorch diff --git a/scripts/deploy/nlp/query_inframework.py b/scripts/deploy/nlp/query_inframework.py index e77ab72a1f04..a62e09fa071d 100644 --- a/scripts/deploy/nlp/query_inframework.py +++ b/scripts/deploy/nlp/query_inframework.py @@ -15,7 +15,7 @@ import argparse import sys -from nemo.deploy.nlp.query_llm import NemoQueryLLMPyTorch +from nemo.deploy.nlp import NemoQueryLLMPyTorch def get_args(argv): diff --git a/tests/deploy/nemo_deploy.py b/tests/deploy/nemo_deploy.py index 23db7c4f01f3..45f2bae3425e 100644 --- a/tests/deploy/nemo_deploy.py +++ b/tests/deploy/nemo_deploy.py @@ -21,7 +21,7 @@ import torch -from nemo.deploy.nlp.megatronllm_deployable import MegatronLLMDeployable +from nemo.deploy.nlp import MegatronLLMDeployable from tests.infer_data_path import get_infer_test_data run_export_tests = True diff --git a/tests/export/nemo_export.py b/tests/export/nemo_export.py index df6a68828d41..cb2b3619e4d3 100644 --- a/tests/export/nemo_export.py +++ b/tests/export/nemo_export.py @@ -43,7 +43,8 @@ from nemo.deploy.nlp import MegatronLLMDeployable, NemoQueryLLMPyTorch except Exception as e: LOGGER.warning( - f"Cannot import MegatronLLMDeployable, in-framework inference will not be available. {type(e).__name__}: {e}" + "Cannot import MegatronLLMDeployable or NemoQueryLLMPyTorch," + f" in-framework inference will not be available. {type(e).__name__}: {e}" ) in_framework_supported = False @@ -104,12 +105,7 @@ def get_accuracy_with_lambada(model, nq, task_ids, lora_uids, test_data_path): all_expected_outputs.append(expected_output) if model is not None: - in_framework_model = False - if in_framework_supported: - if isinstance(model, MegatronLLMDeployable): - in_framework_model = True - - if in_framework_model: + if in_framework_supported and isinstance(model, MegatronLLMDeployable): model_output = model.generate( inputs=[prompt], length_params={"min_length": 1, "max_length": 1}, @@ -153,7 +149,7 @@ def get_accuracy_with_lambada(model, nq, task_ids, lora_uids, test_data_path): correct_answers_relaxed += 1 if nq is not None: - if isinstance(nq, NemoQueryLLMPyTorch): + if in_framework_supported and isinstance(nq, NemoQueryLLMPyTorch): deployed_output = nq.query_llm( prompts=[prompt], max_length=1, From 06e0e4ea496ca6ab41ae3d82040270d3c71099a4 Mon Sep 17 00:00:00 2001 From: andylamp <2177249+andylamp@users.noreply.github.com> Date: Mon, 2 Dec 2024 00:02:22 +0000 Subject: [PATCH 13/16] fix numpy compatibility while replicating existing behavior Signed-off-by: andylamp <2177249+andylamp@users.noreply.github.com> --- nemo/collections/asr/parts/preprocessing/feature_loader.py | 4 ++-- nemo/collections/asr/parts/preprocessing/segment.py | 4 ++-- 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/nemo/collections/asr/parts/preprocessing/feature_loader.py b/nemo/collections/asr/parts/preprocessing/feature_loader.py index 8c629cf4cfd4..17725fcebe35 100644 --- a/nemo/collections/asr/parts/preprocessing/feature_loader.py +++ b/nemo/collections/asr/parts/preprocessing/feature_loader.py @@ -50,10 +50,10 @@ def _convert_samples_to_float32(samples: np.ndarray) -> np.ndarray: Integers will be scaled to [-1, 1] in float32. """ float32_samples = samples.astype('float32') - if samples.dtype in np.sctypes['int']: + if samples.dtype in (np.int8, np.int16, np.int32, np.int64): bits = np.iinfo(samples.dtype).bits float32_samples *= 1.0 / 2 ** (bits - 1) - elif samples.dtype in np.sctypes['float']: + elif samples.dtype in (np.float16, np.float32, np.float64): pass else: raise TypeError("Unsupported sample type: %s." % samples.dtype) diff --git a/nemo/collections/asr/parts/preprocessing/segment.py b/nemo/collections/asr/parts/preprocessing/segment.py index aceab6637006..c518fcfa5b66 100644 --- a/nemo/collections/asr/parts/preprocessing/segment.py +++ b/nemo/collections/asr/parts/preprocessing/segment.py @@ -260,10 +260,10 @@ def _convert_samples_to_float32(samples): Integers will be scaled to [-1, 1] in float32. """ float32_samples = samples.astype('float32') - if samples.dtype in np.sctypes['int']: + if samples.dtype in (np.int8, np.int16, np.int32, np.int64): bits = np.iinfo(samples.dtype).bits float32_samples *= 1.0 / 2 ** (bits - 1) - elif samples.dtype in np.sctypes['float']: + elif samples.dtype in (np.float16, np.float32, np.float64): pass else: raise TypeError("Unsupported sample type: %s." % samples.dtype) From eab5901d847c9fac9089aaa79a484f550c3b602e Mon Sep 17 00:00:00 2001 From: andylamp Date: Mon, 2 Dec 2024 00:36:35 +0000 Subject: [PATCH 14/16] Apply isort and black reformatting Signed-off-by: andylamp Signed-off-by: andylamp <2177249+andylamp@users.noreply.github.com> --- nemo/collections/asr/parts/preprocessing/feature_loader.py | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/nemo/collections/asr/parts/preprocessing/feature_loader.py b/nemo/collections/asr/parts/preprocessing/feature_loader.py index 17725fcebe35..858e3acb68c8 100644 --- a/nemo/collections/asr/parts/preprocessing/feature_loader.py +++ b/nemo/collections/asr/parts/preprocessing/feature_loader.py @@ -18,12 +18,13 @@ class ExternalFeatureLoader(object): - """Feature loader that load external features store in certain format. + """Feature loader that load external features store in certain format. Currently support pickle, npy and npz format. """ def __init__( - self, augmentor: Optional["nemo.collections.asr.parts.perturb.FeatureAugmentor"] = None, + self, + augmentor: Optional["nemo.collections.asr.parts.perturb.FeatureAugmentor"] = None, ): """ Feature loader From 4b2e882660f3da66e7738955b8e5f18c21b104df Mon Sep 17 00:00:00 2001 From: andylamp <2177249+andylamp@users.noreply.github.com> Date: Mon, 2 Dec 2024 01:40:36 +0000 Subject: [PATCH 15/16] add docstrings and fix line lengths Signed-off-by: andylamp <2177249+andylamp@users.noreply.github.com> --- .../asr/parts/preprocessing/feature_loader.py | 6 ++- .../asr/parts/preprocessing/segment.py | 48 +++++++++++++------ 2 files changed, 38 insertions(+), 16 deletions(-) diff --git a/nemo/collections/asr/parts/preprocessing/feature_loader.py b/nemo/collections/asr/parts/preprocessing/feature_loader.py index 858e3acb68c8..e715d2dafb95 100644 --- a/nemo/collections/asr/parts/preprocessing/feature_loader.py +++ b/nemo/collections/asr/parts/preprocessing/feature_loader.py @@ -61,13 +61,17 @@ def _convert_samples_to_float32(samples: np.ndarray) -> np.ndarray: return float32_samples def process(self, file_path: str) -> torch.Tensor: + """Processes the features from the provided `file_path`.""" features = self.load_feature_from_file(file_path) features = self.process_segment(features) return features def process_segment(self, feature_segment): + """Processes the provided feature segment.""" if self.augmentor: - # augmentor for external features. Here possible augmentor for external embedding feature is Diaconis Augmentation and might be implemented later + # augmentor for external features. Here possible augmentor for + # external embedding feature is Diaconis Augmentation and might + # be implemented later self.augmentor.perturb(feature_segment) return torch.tensor(feature_segment, dtype=torch.float) diff --git a/nemo/collections/asr/parts/preprocessing/segment.py b/nemo/collections/asr/parts/preprocessing/segment.py index c518fcfa5b66..d43fd3aa53d0 100644 --- a/nemo/collections/asr/parts/preprocessing/segment.py +++ b/nemo/collections/asr/parts/preprocessing/segment.py @@ -67,14 +67,15 @@ def select_channels(signal: npt.NDArray, channel_selector: Optional[ChannelSelectorType] = None) -> npt.NDArray: """ - Convert a multi-channel signal to a single-channel signal by averaging over channels or selecting a single channel, - or pass-through multi-channel signal when channel_selector is `None`. + Convert a multi-channel signal to a single-channel signal by averaging over channels or + selecting a single channel, or pass-through multi-channel signal when channel_selector is `None`. Args: signal: numpy array with shape (..., num_channels) - channel selector: string denoting the downmix mode, an integer denoting the channel to be selected, or an iterable - of integers denoting a subset of channels. Channel selector is using zero-based indexing. - If set to `None`, the original signal will be returned. Uses zero-based indexing. + channel selector: string denoting the downmix mode, an integer denoting the channel to be selected, + or an iterable of integers denoting a subset of channels. Channel selector is + using zero-based indexing. If set to `None`, the original signal will be returned. + Uses zero-based indexing. Returns: numpy array @@ -92,7 +93,8 @@ def select_channels(signal: npt.NDArray, channel_selector: Optional[ChannelSelec if num_channels >= num_samples: logging.warning( - 'Number of channels (%d) is greater or equal than number of samples (%d). Check for possible transposition.', + 'Number of channels (%d) is greater or equal than number of samples (%d). ' + 'Check for possible transposition.', num_channels, num_samples, ) @@ -199,7 +201,8 @@ def __init__( samples = samples.transpose() sample_rate = target_sr if trim: - # librosa is using channels-first layout (num_channels, num_samples), which is transpose of AudioSegment's layout + # librosa is using channels-first layout (num_channels, num_samples), + # which is transpose of AudioSegment's layout samples = samples.transpose() samples, _ = librosa.effects.trim( samples, top_db=trim_top_db, ref=trim_ref, frame_length=trim_frame_length, hop_length=trim_hop_length @@ -303,11 +306,12 @@ def from_file( :param trim_frame_length: the number of samples per analysis frame :param trim_hop_length: the number of samples between analysis frames :param orig_sr: the original sample rate - :param channel selector: string denoting the downmix mode, an integer denoting the channel to be selected, or an iterable - of integers denoting a subset of channels. Channel selector is using zero-based indexing. - If set to `None`, the original signal will be used. + :param channel selector: string denoting the downmix mode, an integer denoting the channel to be selected, + or an iterable of integers denoting a subset of channels. Channel selector is using + zero-based indexing. If set to `None`, the original signal will be used. :param normalize_db (Optional[float]): if not None, normalize the audio signal to a target RMS value - :param ref_channel (Optional[int]): channel to use as reference for normalizing multi-channel audio, set None to use max RMS across channels + :param ref_channel (Optional[int]): channel to use as reference for normalizing multi-channel audio, + set None to use max RMS across channels :return: AudioSegment instance """ samples = None @@ -441,7 +445,8 @@ def from_file_list( # Only single-channel individual files are supported for now if a_segment.num_channels != 1: raise RuntimeError( - f'Expecting a single-channel audio signal, but loaded {a_segment.num_channels} channels from file {a_file}' + f'Expecting a single-channel audio signal, but loaded {a_segment.num_channels} ' + f'channels from file {a_file}' ) if target_sr is None: @@ -523,14 +528,16 @@ def segment_from_file( audio_start = math.floor(offset * sample_rate) if audio_start > max_audio_start: raise RuntimeError( - f'Provided audio start ({audio_start}) is larger than the maximum possible ({max_audio_start})' + f'Provided audio start ({audio_start}) is larger than the ' + f'maximum possible ({max_audio_start})' ) f.seek(audio_start) samples = f.read(n_segments_at_original_sr, dtype=dtype) is_segmented = True elif n_segments_at_original_sr > len(f): logging.warning( - f"Number of segments ({n_segments_at_original_sr}) is greater than the length ({len(f)}) of the audio file {audio_file}. This may lead to shape mismatch errors." + f"Number of segments ({n_segments_at_original_sr}) is greater than the length ({len(f)}) " + f"of the audio file {audio_file}. This may lead to shape mismatch errors." ) samples = f.read(dtype=dtype) else: @@ -550,14 +557,17 @@ def segment_from_file( @property def samples(self): + """Returns a copy of the samples.""" return self._samples.copy() @property def sample_rate(self): + """Returns the sample rate of the segment.""" return self._sample_rate @property def num_channels(self): + """Returns the number of channels in the segment.""" if self._samples.ndim == 1: return 1 else: @@ -565,10 +575,12 @@ def num_channels(self): @property def num_samples(self): + """Returns the number of samples in the segment.""" return self._samples.shape[0] @property def duration(self): + """Returns the duration of the segment in seconds.""" return self.num_samples / float(self._sample_rate) @property @@ -579,21 +591,26 @@ def rms_db(self): @property def orig_sr(self): + """Returns the original sample rate of the segment.""" return self._orig_sr @property def offset(self): + """Returns the offset used for the segment.""" return float(self._offset) if self._offset is not None else None @property def audio_file(self): + """Returns the audio file that the segment was loaded from.""" return str(self._audio_file) if self._audio_file is not None else None def is_empty(self): + """Checks if the segment is empty.""" mean_square = np.sum(np.mean(self._samples**2, axis=0)) return self.num_samples == 0 or mean_square == 0 def gain_db(self, gain): + """Returns the gain in decibels.""" self._samples *= 10.0 ** (gain / 20.0) def normalize_db(self, target_db=-20, ref_channel=None): @@ -622,7 +639,8 @@ def pad(self, pad_size, symmetric=False): pad_width = ((pad_size, pad_size), (0, 0)) if symmetric else ((0, pad_size), (0, 0)) else: raise NotImplementedError( - f"Padding not implemented for signals with more that 2 dimensions. Current samples dimension: {samples_ndim}." + f"Padding not implemented for signals with more that 2 dimensions. " + f"Current samples dimension: {samples_ndim}." ) # apply padding self._samples = np.pad( From aa306e3c02b64f6e9a655f1c617792a135f9b96f Mon Sep 17 00:00:00 2001 From: andylamp <2177249+andylamp@users.noreply.github.com> Date: Mon, 2 Dec 2024 01:56:01 +0000 Subject: [PATCH 16/16] fix final warning for line len Signed-off-by: andylamp <2177249+andylamp@users.noreply.github.com> --- nemo/collections/asr/parts/preprocessing/segment.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/nemo/collections/asr/parts/preprocessing/segment.py b/nemo/collections/asr/parts/preprocessing/segment.py index d43fd3aa53d0..00558769b020 100644 --- a/nemo/collections/asr/parts/preprocessing/segment.py +++ b/nemo/collections/asr/parts/preprocessing/segment.py @@ -419,7 +419,8 @@ def from_file_list( # Shortcut when selecting a single channel if channel_selector >= len(audio_file_list): raise RuntimeError( - f'Channel cannot be selected: channel_selector={channel_selector}, num_audio_files={len(audio_file_list)}' + f'Channel cannot be selected: channel_selector={channel_selector}, ' + f'num_audio_files={len(audio_file_list)}' ) # Select only a single file audio_file_list = [audio_file_list[channel_selector]]