Skip to content

Commit

Permalink
Fixes ASR numpy 2.x compatibility issues
Browse files Browse the repository at this point in the history
Signed-off-by: andylamp <[email protected]>
  • Loading branch information
andylamp committed Dec 2, 2024
1 parent 8becd57 commit 80c0147
Show file tree
Hide file tree
Showing 2 changed files with 47 additions and 23 deletions.
15 changes: 10 additions & 5 deletions nemo/collections/asr/parts/preprocessing/feature_loader.py
Original file line number Diff line number Diff line change
Expand Up @@ -18,12 +18,13 @@


class ExternalFeatureLoader(object):
"""Feature loader that load external features store in certain format.
"""Feature loader that load external features store in certain format.
Currently support pickle, npy and npz format.
"""

def __init__(
self, augmentor: Optional["nemo.collections.asr.parts.perturb.FeatureAugmentor"] = None,
self,
augmentor: Optional["nemo.collections.asr.parts.perturb.FeatureAugmentor"] = None,
):
"""
Feature loader
Expand All @@ -50,23 +51,27 @@ def _convert_samples_to_float32(samples: np.ndarray) -> np.ndarray:
Integers will be scaled to [-1, 1] in float32.
"""
float32_samples = samples.astype('float32')
if samples.dtype in np.sctypes['int']:
if samples.dtype in (np.int8, np.int16, np.int32, np.int64):
bits = np.iinfo(samples.dtype).bits
float32_samples *= 1.0 / 2 ** (bits - 1)
elif samples.dtype in np.sctypes['float']:
elif samples.dtype in (np.float16, np.float32, np.float64):
pass
else:
raise TypeError("Unsupported sample type: %s." % samples.dtype)
return float32_samples

def process(self, file_path: str) -> torch.Tensor:
    """Load the features stored at `file_path` and post-process them.

    Args:
        file_path: location handed to `self.load_feature_from_file`.

    Returns:
        Whatever `self.process_segment` produces for the loaded features.
    """
    # Loading and segment processing are delegated to sibling methods.
    return self.process_segment(self.load_feature_from_file(file_path))

def process_segment(self, feature_segment):
    """Optionally perturb a feature segment and return it as a float tensor.

    Args:
        feature_segment: data accepted by `torch.tensor`.

    Returns:
        A `torch.float` tensor built from `feature_segment`.
    """
    augmentor = self.augmentor
    if augmentor:
        # Augmentor for external features. A possible augmentor for external
        # embedding features is Diaconis Augmentation, which might be
        # implemented later. The return value of `perturb` is discarded, so
        # any effect must happen in place (NOTE(review): confirm with the
        # augmentor implementation).
        augmentor.perturb(feature_segment)
    return torch.tensor(feature_segment, dtype=torch.float)

Expand Down
55 changes: 37 additions & 18 deletions nemo/collections/asr/parts/preprocessing/segment.py
Original file line number Diff line number Diff line change
Expand Up @@ -67,14 +67,15 @@

def select_channels(signal: npt.NDArray, channel_selector: Optional[ChannelSelectorType] = None) -> npt.NDArray:
"""
Convert a multi-channel signal to a single-channel signal by averaging over channels or selecting a single channel,
or pass-through multi-channel signal when channel_selector is `None`.
Convert a multi-channel signal to a single-channel signal by averaging over channels or
selecting a single channel, or pass-through multi-channel signal when channel_selector is `None`.
Args:
signal: numpy array with shape (..., num_channels)
channel selector: string denoting the downmix mode, an integer denoting the channel to be selected, or an iterable
of integers denoting a subset of channels. Channel selector is using zero-based indexing.
If set to `None`, the original signal will be returned. Uses zero-based indexing.
channel selector: string denoting the downmix mode, an integer denoting the channel to be selected,
or an iterable of integers denoting a subset of channels. Channel selector is
using zero-based indexing. If set to `None`, the original signal will be returned.
Uses zero-based indexing.
Returns:
numpy array
Expand All @@ -92,7 +93,8 @@ def select_channels(signal: npt.NDArray, channel_selector: Optional[ChannelSelec

if num_channels >= num_samples:
logging.warning(
'Number of channels (%d) is greater or equal than number of samples (%d). Check for possible transposition.',
'Number of channels (%d) is greater or equal than number of samples (%d). '
'Check for possible transposition.',
num_channels,
num_samples,
)
Expand Down Expand Up @@ -199,7 +201,8 @@ def __init__(
samples = samples.transpose()
sample_rate = target_sr
if trim:
# librosa is using channels-first layout (num_channels, num_samples), which is transpose of AudioSegment's layout
# librosa is using channels-first layout (num_channels, num_samples),
# which is transpose of AudioSegment's layout
samples = samples.transpose()
samples, _ = librosa.effects.trim(
samples, top_db=trim_top_db, ref=trim_ref, frame_length=trim_frame_length, hop_length=trim_hop_length
Expand Down Expand Up @@ -260,10 +263,10 @@ def _convert_samples_to_float32(samples):
Integers will be scaled to [-1, 1] in float32.
"""
float32_samples = samples.astype('float32')
if samples.dtype in np.sctypes['int']:
if samples.dtype in (np.int8, np.int16, np.int32, np.int64):
bits = np.iinfo(samples.dtype).bits
float32_samples *= 1.0 / 2 ** (bits - 1)
elif samples.dtype in np.sctypes['float']:
elif samples.dtype in (np.float16, np.float32, np.float64):
pass
else:
raise TypeError("Unsupported sample type: %s." % samples.dtype)
Expand Down Expand Up @@ -303,11 +306,12 @@ def from_file(
:param trim_frame_length: the number of samples per analysis frame
:param trim_hop_length: the number of samples between analysis frames
:param orig_sr: the original sample rate
:param channel selector: string denoting the downmix mode, an integer denoting the channel to be selected, or an iterable
of integers denoting a subset of channels. Channel selector is using zero-based indexing.
If set to `None`, the original signal will be used.
:param channel selector: string denoting the downmix mode, an integer denoting the channel to be selected,
or an iterable of integers denoting a subset of channels. Channel selector is using
zero-based indexing. If set to `None`, the original signal will be used.
:param normalize_db (Optional[float]): if not None, normalize the audio signal to a target RMS value
:param ref_channel (Optional[int]): channel to use as reference for normalizing multi-channel audio, set None to use max RMS across channels
:param ref_channel (Optional[int]): channel to use as reference for normalizing multi-channel audio,
set None to use max RMS across channels
:return: AudioSegment instance
"""
samples = None
Expand Down Expand Up @@ -415,7 +419,8 @@ def from_file_list(
# Shortcut when selecting a single channel
if channel_selector >= len(audio_file_list):
raise RuntimeError(
f'Channel cannot be selected: channel_selector={channel_selector}, num_audio_files={len(audio_file_list)}'
f'Channel cannot be selected: channel_selector={channel_selector}, '
f'num_audio_files={len(audio_file_list)}'
)
# Select only a single file
audio_file_list = [audio_file_list[channel_selector]]
Expand All @@ -441,7 +446,8 @@ def from_file_list(
# Only single-channel individual files are supported for now
if a_segment.num_channels != 1:
raise RuntimeError(
f'Expecting a single-channel audio signal, but loaded {a_segment.num_channels} channels from file {a_file}'
f'Expecting a single-channel audio signal, but loaded {a_segment.num_channels} '
f'channels from file {a_file}'
)

if target_sr is None:
Expand Down Expand Up @@ -523,14 +529,16 @@ def segment_from_file(
audio_start = math.floor(offset * sample_rate)
if audio_start > max_audio_start:
raise RuntimeError(
f'Provided audio start ({audio_start}) is larger than the maximum possible ({max_audio_start})'
f'Provided audio start ({audio_start}) is larger than the '
f'maximum possible ({max_audio_start})'
)
f.seek(audio_start)
samples = f.read(n_segments_at_original_sr, dtype=dtype)
is_segmented = True
elif n_segments_at_original_sr > len(f):
logging.warning(
f"Number of segments ({n_segments_at_original_sr}) is greater than the length ({len(f)}) of the audio file {audio_file}. This may lead to shape mismatch errors."
f"Number of segments ({n_segments_at_original_sr}) is greater than the length ({len(f)}) "
f"of the audio file {audio_file}. This may lead to shape mismatch errors."
)
samples = f.read(dtype=dtype)
else:
Expand All @@ -550,25 +558,30 @@ def segment_from_file(

@property
def samples(self):
    """Return an independent copy of the stored samples.

    Callers may modify the result without affecting `self._samples`.
    """
    snapshot = self._samples.copy()
    return snapshot

@property
def sample_rate(self):
    """Return the sample rate stored on this segment (`_sample_rate`)."""
    return self._sample_rate

@property
def num_channels(self):
    """Return the channel count of the stored samples.

    A one-dimensional sample array counts as a single channel; otherwise the
    size of the last axis is reported.
    """
    data = self._samples
    return 1 if data.ndim == 1 else data.shape[-1]

@property
def num_samples(self):
    """Return the length of the first axis of the stored samples."""
    first_axis_len = self._samples.shape[0]
    return first_axis_len

@property
def duration(self):
    """Return the segment length as `num_samples` divided by the sample rate.

    The result is in seconds when the sample rate is expressed in samples
    per second.
    """
    count = self.num_samples
    rate = float(self._sample_rate)
    return count / rate

@property
Expand All @@ -579,21 +592,26 @@ def rms_db(self):

@property
def orig_sr(self):
    """Return the original sample rate recorded for this segment (`_orig_sr`)."""
    return self._orig_sr

@property
def offset(self):
    """Return the stored offset as a float, or None when no offset is set."""
    raw_offset = self._offset
    if raw_offset is None:
        return None
    return float(raw_offset)

@property
def audio_file(self):
    """Return the stored audio file reference as a string, or None if unset."""
    source = self._audio_file
    if source is None:
        return None
    return str(source)

def is_empty(self):
    """Check whether the segment has no samples or contains only zeros.

    Returns:
        True when `num_samples` is zero, or when the per-channel mean square
        of the samples sums to exactly zero; False otherwise.
    """
    # Test for the zero-length case first: taking the mean of an empty array
    # makes NumPy emit "Mean of empty slice" warnings and produce NaN.
    if self.num_samples == 0:
        return True
    mean_square = np.sum(np.mean(self._samples**2, axis=0))
    return bool(mean_square == 0)

def gain_db(self, gain):
    """Apply a gain, given in decibels, to the stored samples in place.

    Args:
        gain: gain in dB; converted to a linear factor of 10 ** (gain / 20).

    Returns:
        None. `self._samples` is scaled by the in-place multiplication.
    """
    linear_scale = 10.0 ** (gain / 20.0)
    self._samples *= linear_scale

def normalize_db(self, target_db=-20, ref_channel=None):
Expand Down Expand Up @@ -622,7 +640,8 @@ def pad(self, pad_size, symmetric=False):
pad_width = ((pad_size, pad_size), (0, 0)) if symmetric else ((0, pad_size), (0, 0))
else:
raise NotImplementedError(
f"Padding not implemented for signals with more that 2 dimensions. Current samples dimension: {samples_ndim}."
f"Padding not implemented for signals with more that 2 dimensions. "
f"Current samples dimension: {samples_ndim}."
)
# apply padding
self._samples = np.pad(
Expand Down

0 comments on commit 80c0147

Please sign in to comment.