Fixes numpy >2.0 compatibility for asr while replicating existing behavior #11446

Closed
wants to merge 18 commits
15 changes: 10 additions & 5 deletions nemo/collections/asr/parts/preprocessing/feature_loader.py
@@ -18,12 +18,13 @@


class ExternalFeatureLoader(object):
"""Feature loader that load external features store in certain format.
"""Feature loader that load external features store in certain format.
Currently support pickle, npy and npz format.
"""

def __init__(
self, augmentor: Optional["nemo.collections.asr.parts.perturb.FeatureAugmentor"] = None,
self,
augmentor: Optional["nemo.collections.asr.parts.perturb.FeatureAugmentor"] = None,
):
"""
Feature loader
@@ -50,23 +51,27 @@ def _convert_samples_to_float32(samples: np.ndarray) -> np.ndarray:
Integers will be scaled to [-1, 1] in float32.
"""
float32_samples = samples.astype('float32')
if samples.dtype in np.sctypes['int']:
if samples.dtype in (np.int8, np.int16, np.int32, np.int64):
bits = np.iinfo(samples.dtype).bits
float32_samples *= 1.0 / 2 ** (bits - 1)
elif samples.dtype in np.sctypes['float']:
elif samples.dtype in (np.float16, np.float32, np.float64):
pass
else:
raise TypeError("Unsupported sample type: %s." % samples.dtype)
return float32_samples

def process(self, file_path: str) -> torch.Tensor:
"""Processes the features from the provided `file_path`."""
features = self.load_feature_from_file(file_path)
features = self.process_segment(features)
return features

def process_segment(self, feature_segment):
"""Processes the provided feature segment."""
if self.augmentor:
# augmentor for external features. Here possible augmentor for external embedding feature is Diaconis Augmentation and might be implemented later
# augmentor for external features. Here possible augmentor for
# external embedding feature is Diaconis Augmentation and might
# be implemented later
self.augmentor.perturb(feature_segment)
return torch.tensor(feature_segment, dtype=torch.float)

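Note (editorial, not part of the diff): `np.sctypes` was removed in NumPy 2.0, which is why the lookup above is replaced with explicit dtype tuples. A minimal sketch of an equivalent check using `np.issubdtype` — an alternative the PR does not adopt, shown only to illustrate the same dtype classification:

```python
import numpy as np

def convert_samples_to_float32(samples: np.ndarray) -> np.ndarray:
    """Convert int/float samples to float32, scaling integers to [-1, 1]."""
    float32_samples = samples.astype('float32')
    if np.issubdtype(samples.dtype, np.signedinteger):
        # e.g. int16 is divided by 2**15 so full scale maps to [-1, 1]
        bits = np.iinfo(samples.dtype).bits
        float32_samples *= 1.0 / 2 ** (bits - 1)
    elif np.issubdtype(samples.dtype, np.floating):
        pass  # already floating point, no rescaling needed
    else:
        raise TypeError("Unsupported sample type: %s." % samples.dtype)
    return float32_samples
```

For the dtypes listed in the diff, this behaves the same as the explicit tuples on both NumPy 1.x and 2.x.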
55 changes: 37 additions & 18 deletions nemo/collections/asr/parts/preprocessing/segment.py
@@ -67,14 +67,15 @@

def select_channels(signal: npt.NDArray, channel_selector: Optional[ChannelSelectorType] = None) -> npt.NDArray:
"""
Convert a multi-channel signal to a single-channel signal by averaging over channels or selecting a single channel,
or pass-through multi-channel signal when channel_selector is `None`.
Convert a multi-channel signal to a single-channel signal by averaging over channels or
selecting a single channel, or pass-through multi-channel signal when channel_selector is `None`.

Args:
signal: numpy array with shape (..., num_channels)
channel selector: string denoting the downmix mode, an integer denoting the channel to be selected, or an iterable
of integers denoting a subset of channels. Channel selector is using zero-based indexing.
If set to `None`, the original signal will be returned. Uses zero-based indexing.
channel selector: string denoting the downmix mode, an integer denoting the channel to be selected,
or an iterable of integers denoting a subset of channels. Channel selector is
using zero-based indexing. If set to `None`, the original signal will be returned.
Uses zero-based indexing.

Returns:
numpy array
@@ -92,7 +93,8 @@ def select_channels(signal: npt.NDArray, channel_selector: Optional[ChannelSelec

if num_channels >= num_samples:
logging.warning(
'Number of channels (%d) is greater or equal than number of samples (%d). Check for possible transposition.',
'Number of channels (%d) is greater or equal than number of samples (%d). '
'Check for possible transposition.',
num_channels,
num_samples,
)
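For readers skimming the reflowed docstring: a standalone sketch of the documented channel-selection semantics (illustrative only, not the NeMo implementation; it assumes 'average' as the downmix keyword):

```python
import numpy as np

def select_channels_sketch(signal: np.ndarray, channel_selector=None) -> np.ndarray:
    """Illustrative mirror of the documented behavior of select_channels."""
    if signal.ndim == 1 or channel_selector is None:
        return signal                          # pass-through
    if isinstance(channel_selector, str):      # e.g. 'average' -> downmix over channels
        return np.mean(signal, axis=-1)
    return signal[..., channel_selector]       # int or iterable of ints, zero-based

multichannel = np.random.rand(16000, 2)        # (num_samples, num_channels)
mono = select_channels_sketch(multichannel, 'average')  # shape (16000,)
left = select_channels_sketch(multichannel, 0)          # shape (16000,)
```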
@@ -199,7 +201,8 @@ def __init__(
samples = samples.transpose()
sample_rate = target_sr
if trim:
# librosa is using channels-first layout (num_channels, num_samples), which is transpose of AudioSegment's layout
# librosa is using channels-first layout (num_channels, num_samples),
# which is transpose of AudioSegment's layout
samples = samples.transpose()
samples, _ = librosa.effects.trim(
samples, top_db=trim_top_db, ref=trim_ref, frame_length=trim_frame_length, hop_length=trim_hop_length
@@ -260,10 +263,10 @@ def _convert_samples_to_float32(samples):
Integers will be scaled to [-1, 1] in float32.
"""
float32_samples = samples.astype('float32')
if samples.dtype in np.sctypes['int']:
if samples.dtype in (np.int8, np.int16, np.int32, np.int64):
bits = np.iinfo(samples.dtype).bits
float32_samples *= 1.0 / 2 ** (bits - 1)
elif samples.dtype in np.sctypes['float']:
elif samples.dtype in (np.float16, np.float32, np.float64):
pass
else:
raise TypeError("Unsupported sample type: %s." % samples.dtype)
@@ -303,11 +306,12 @@ def from_file(
:param trim_frame_length: the number of samples per analysis frame
:param trim_hop_length: the number of samples between analysis frames
:param orig_sr: the original sample rate
:param channel selector: string denoting the downmix mode, an integer denoting the channel to be selected, or an iterable
of integers denoting a subset of channels. Channel selector is using zero-based indexing.
If set to `None`, the original signal will be used.
:param channel selector: string denoting the downmix mode, an integer denoting the channel to be selected,
or an iterable of integers denoting a subset of channels. Channel selector is using
zero-based indexing. If set to `None`, the original signal will be used.
:param normalize_db (Optional[float]): if not None, normalize the audio signal to a target RMS value
:param ref_channel (Optional[int]): channel to use as reference for normalizing multi-channel audio, set None to use max RMS across channels
:param ref_channel (Optional[int]): channel to use as reference for normalizing multi-channel audio,
set None to use max RMS across channels
:return: AudioSegment instance
"""
samples = None
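To make the reformatted parameter list above concrete, a hedged usage sketch of `AudioSegment.from_file` (the file path is hypothetical; only arguments that appear in the docstring or surrounding code are used):

```python
from nemo.collections.asr.parts.preprocessing.segment import AudioSegment

# Load audio, resample to 16 kHz, and keep only the first channel (zero-based).
segment = AudioSegment.from_file(
    "speech.wav",           # hypothetical file path
    target_sr=16000,
    channel_selector=0,     # or 'average' to downmix, or e.g. [0, 1] for a subset
)
print(segment.sample_rate, segment.num_channels, segment.duration)
```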
@@ -415,7 +419,8 @@ def from_file_list(
# Shortcut when selecting a single channel
if channel_selector >= len(audio_file_list):
raise RuntimeError(
f'Channel cannot be selected: channel_selector={channel_selector}, num_audio_files={len(audio_file_list)}'
f'Channel cannot be selected: channel_selector={channel_selector}, '
f'num_audio_files={len(audio_file_list)}'
)
# Select only a single file
audio_file_list = [audio_file_list[channel_selector]]
@@ -441,7 +446,8 @@ def from_file_list(
# Only single-channel individual files are supported for now
if a_segment.num_channels != 1:
raise RuntimeError(
f'Expecting a single-channel audio signal, but loaded {a_segment.num_channels} channels from file {a_file}'
f'Expecting a single-channel audio signal, but loaded {a_segment.num_channels} '
f'channels from file {a_file}'
)

if target_sr is None:
@@ -523,14 +529,16 @@ def segment_from_file(
audio_start = math.floor(offset * sample_rate)
if audio_start > max_audio_start:
raise RuntimeError(
f'Provided audio start ({audio_start}) is larger than the maximum possible ({max_audio_start})'
f'Provided audio start ({audio_start}) is larger than the '
f'maximum possible ({max_audio_start})'
)
f.seek(audio_start)
samples = f.read(n_segments_at_original_sr, dtype=dtype)
is_segmented = True
elif n_segments_at_original_sr > len(f):
logging.warning(
f"Number of segments ({n_segments_at_original_sr}) is greater than the length ({len(f)}) of the audio file {audio_file}. This may lead to shape mismatch errors."
f"Number of segments ({n_segments_at_original_sr}) is greater than the length ({len(f)}) "
f"of the audio file {audio_file}. This may lead to shape mismatch errors."
)
samples = f.read(dtype=dtype)
else:
@@ -550,25 +558,30 @@ def segment_from_file(

@property
def samples(self):
"""Returns a copy of the samples."""
return self._samples.copy()

@property
def sample_rate(self):
"""Returns the sample rate of the segment."""
return self._sample_rate

@property
def num_channels(self):
"""Returns the number of channels in the segment."""
if self._samples.ndim == 1:
return 1
else:
return self._samples.shape[-1]

@property
def num_samples(self):
"""Returns the number of samples in the segment."""
return self._samples.shape[0]

@property
def duration(self):
"""Returns the duration of the segment in seconds."""
return self.num_samples / float(self._sample_rate)

@property
@@ -579,21 +592,26 @@ def rms_db(self):

@property
def orig_sr(self):
"""Returns the original sample rate of the segment."""
return self._orig_sr

@property
def offset(self):
"""Returns the offset used for the segment."""
return float(self._offset) if self._offset is not None else None

@property
def audio_file(self):
"""Returns the audio file that the segment was loaded from."""
return str(self._audio_file) if self._audio_file is not None else None

def is_empty(self):
"""Checks if the segment is empty."""
mean_square = np.sum(np.mean(self._samples**2, axis=0))
return self.num_samples == 0 or mean_square == 0

def gain_db(self, gain):
"""Returns the gain in decibels."""
self._samples *= 10.0 ** (gain / 20.0)

def normalize_db(self, target_db=-20, ref_channel=None):
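As a quick sanity check of the dB-to-linear conversion used by `gain_db` (and relied on by `normalize_db`): a gain of about +6.02 dB, i.e. 20·log10(2), doubles the amplitude.

```python
import numpy as np

samples = np.array([0.1, -0.25, 0.5], dtype=np.float32)
gain_db = 20.0 * np.log10(2.0)            # ~6.02 dB
scaled = samples * 10.0 ** (gain_db / 20.0)
print(scaled)                             # approximately [0.2, -0.5, 1.0]
```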
@@ -622,7 +640,8 @@ def pad(self, pad_size, symmetric=False):
pad_width = ((pad_size, pad_size), (0, 0)) if symmetric else ((0, pad_size), (0, 0))
else:
raise NotImplementedError(
f"Padding not implemented for signals with more that 2 dimensions. Current samples dimension: {samples_ndim}."
f"Padding not implemented for signals with more that 2 dimensions. "
f"Current samples dimension: {samples_ndim}."
)
# apply padding
self._samples = np.pad(
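Finally, the padding rule behind the rewrapped NotImplementedError message can be sketched as follows. The 2-D branch mirrors the `pad_width` expression visible in the diff; the 1-D branch and the constant zero-padding mode are assumptions for illustration.

```python
import numpy as np

def pad_samples(samples: np.ndarray, pad_size: int, symmetric: bool = False) -> np.ndarray:
    """Pad 1-D or 2-D sample arrays along the time axis only."""
    if samples.ndim == 1:
        pad_width = (pad_size, pad_size) if symmetric else (0, pad_size)
    elif samples.ndim == 2:
        # time axis is padded, channel axis is left untouched
        pad_width = ((pad_size, pad_size), (0, 0)) if symmetric else ((0, pad_size), (0, 0))
    else:
        raise NotImplementedError(
            f"Padding not implemented for signals with more than 2 dimensions. "
            f"Current samples dimension: {samples.ndim}."
        )
    return np.pad(samples, pad_width, mode='constant', constant_values=0.0)

mono = np.ones(8, dtype=np.float32)
print(pad_samples(mono, 2).shape)                  # (10,) -> padded at the end only
print(pad_samples(mono, 2, symmetric=True).shape)  # (12,) -> padded on both sides
```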