Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

migrating to new api #18

Merged
merged 3 commits into from
Jan 8, 2023
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
3 changes: 3 additions & 0 deletions .gitignore
Original file line number Diff line number Diff line change
@@ -1,3 +1,6 @@
/target
Cargo.lock
/py-speechsauce/.venv
py-speechsauce/speechsauce/*.abi3.so
py-speechsauce/src/lib.rs
py-speechsauce/**/__pycache__/*
5 changes: 4 additions & 1 deletion py-speechsauce/Cargo.toml
Original file line number Diff line number Diff line change
Expand Up @@ -5,9 +5,12 @@ edition = "2018"

# See more keys and their definitions at https://doc.rust-lang.org/cargo/reference/manifest.html
[lib]
name = "speechsauce"
name = "speechsauce_python"
crate-type = ["cdylib"]

[package.metadata.maturin]
name = "speechsauce._internal"

[dependencies]
ndarray="^0.15"
pyo3 = {version= "0.16.5", features=["extension-module","abi3-py37"]}
Expand Down
82 changes: 82 additions & 0 deletions py-speechsauce/speechsauce/__init__.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,82 @@
from functools import lru_cache
from ._internal import mfcc as internal_mfcc, _speech_config, cmvn, preemphasis

__all__ = ["mfcc", "preemphasis", "cmvn"]


@lru_cache(maxsize=32)
def _get_speech_config(
    sampling_frequency,
    frame_length=0.020,
    frame_stride=0.01,
    num_cepstral=13,
    num_filters=40,
    fft_length=512,
    low_frequency=0,
    high_frequency=None,
    dc_elimination=True,
):
    """Return the config object that the Rust ``mfcc`` binding consumes.

    Internal helper. The arguments are forwarded positionally, in this same
    order, to ``_speech_config`` from the ``_internal`` extension module.

    The ``lru_cache`` decorator keeps up to 32 argument combinations, so
    repeated calls with the same settings reuse the previously built object
    instead of constructing a new one. Because of the cache, every argument
    must be hashable.
    """
    return _speech_config(
        sampling_frequency,
        frame_length,
        frame_stride,
        num_cepstral,
        num_filters,
        fft_length,
        low_frequency,
        high_frequency,
        dc_elimination,
    )


def mfcc(
    signal,
    sampling_frequency,
    frame_length=0.020,
    frame_stride=0.01,
    num_cepstral=13,
    num_filters=40,
    fft_length=512,
    low_frequency=0,
    high_frequency=None,
    dc_elimination=True,
):
    """Compute MFCC features from an audio signal.

    Args:
        signal (array): the audio signal from which to compute features.
            The Rust binding declares it as a one-dimensional float64 array.
        sampling_frequency (int): the sampling frequency of the signal
            we are working with.
        frame_length (float): the length of each frame in seconds.
            Default is 0.020s.
        frame_stride (float): the step between successive frames in seconds.
            Default is 0.01s, i.e. half of the default frame_length, so
            consecutive frames overlap.
        num_cepstral (int): number of cepstral coefficients. Default is 13.
        num_filters (int): the number of filters in the filterbank.
            Default is 40.
        fft_length (int): number of FFT points. Default is 512.
        low_frequency (float): lowest band edge of mel filters.
            In Hz, default is 0.
        high_frequency (float): highest band edge of mel filters, in Hz.
            Default is None, in which case sampling_frequency / 2 is used.
        dc_elimination (bool): if the first dc component should
            be eliminated or not. Default is True.

    Returns:
        array: A numpy array of size (num_frames x num_cepstral) containing mfcc features.
    """
    # The config lookup is memoized, so repeated calls with the same settings
    # reuse one config object.
    config = _get_speech_config(
        sampling_frequency,
        frame_length,
        frame_stride,
        num_cepstral,
        num_filters,
        fft_length,
        low_frequency,
        high_frequency,
        dc_elimination,
    )
    return internal_mfcc(signal, config)
97 changes: 71 additions & 26 deletions py-speechsauce/src/lib.rs
Original file line number Diff line number Diff line change
@@ -1,10 +1,28 @@
use std::sync::Arc;

use pyo3::prelude::*;
use numpy::{IntoPyArray, PyReadonlyArray1, PyReadonlyArray2, PyArray2, PyArray1};
use speechsauce::{feature,processing};
use numpy::{IntoPyArray, PyArray1, PyArray2, PyReadonlyArray1, PyReadonlyArray2, ToPyArray};
use pyo3::{callback::IntoPyCallbackOutput, prelude::*};
use speechsauce::{config::SpeechConfig, feature, processing};
/// Python-visible wrapper around the core crate's [`SpeechConfig`].
///
/// A single-field newtype so that the config can carry `#[pyclass]` and be
/// passed between Python and the bindings in this module. The wrapped value
/// is private and reached as `.0` from within this file.
#[pyclass]
#[repr(transparent)]
#[derive(Clone)]
pub struct PySpeechConfig(SpeechConfig);

/// Identity conversion: a `PySpeechConfig` produced as callback output is
/// handed back unchanged.
impl IntoPyCallbackOutput<Self> for PySpeechConfig {
    /// Always succeeds; the interpreter token is not needed for this conversion.
    fn convert(self, _py: Python<'_>) -> PyResult<Self> {
        Ok(self)
    }
}

impl IntoPy<SpeechConfig> for PySpeechConfig {
fn into_py(self, py: Python<'_>) -> SpeechConfig {
self.0
}
}

#[pymodule]
fn speechsauce(_py: Python<'_>, m: &PyModule) -> PyResult<()>{
fn speechsauce(_py: Python<'_>, m: &PyModule) -> PyResult<()> {
m.add_class::<PySpeechConfig>()?;
/// Compute MFCC features from an audio signal.
/// Args:
/// signal : the audio signal from which to compute features.
Expand All @@ -29,37 +47,64 @@ fn speechsauce(_py: Python<'_>, m: &PyModule) -> PyResult<()>{
/// array: A numpy array of size (num_frames x num_cepstral) containing mfcc features.
#[pyfn(m)]
fn mfcc<'py>(
py: Python<'py>,
py: Python<'py>,
signal: PyReadonlyArray1<f64>,
sampling_frequency: usize,
frame_length: f64, // =0.020,
frame_stride: f64, // =0.01,
num_cepstral: usize, // =13,
num_filters: usize, // =40,
fft_length: usize, // =512,
low_frequency: f64, // =0,
high_frequency: Option<f64>, // =None,
dc_elimination: bool, //True
) -> &'py PyArray2<f64>{
feature::mfcc(signal.as_array(), sampling_frequency, frame_length, frame_stride, num_cepstral, num_filters, fft_length, low_frequency, high_frequency, dc_elimination).into_pyarray(py)
config: Py<PySpeechConfig>,
) -> &'py PyArray2<f64> {
let cell = config.as_ref(py);
let obj_ref = cell.borrow();
let speech_config = &obj_ref.0;
feature::mfcc(signal.as_array(), &speech_config).to_pyarray(py)
}

//TODO: #14 make signal a mutable borrow (PyReadWriteArray) once the next version of numpy-rust is released
#[pyfn(m)]
fn preemphasis<'py>(
py: Python<'py>,
signal: PyReadonlyArray1<f64>,
shift: isize,
cof: f64
) -> &'py PyArray1<f64>{
py: Python<'py>,
signal: PyReadonlyArray1<f64>,
shift: isize,
cof: f64,
) -> &'py PyArray1<f64> {
processing::preemphasis(signal.as_array().to_owned(), shift, cof).into_pyarray(py)
}

#[pyfn(m)]
fn cmvn<'py>(py: Python<'py>, vec: PyReadonlyArray2<f64>, variance_normalization: bool)-> &'py PyArray2<f64>
{
fn cmvn<'py>(
py: Python<'py>,
vec: PyReadonlyArray2<f64>,
variance_normalization: bool,
) -> &'py PyArray2<f64> {
processing::cmvn(vec.as_array(), variance_normalization).into_pyarray(py)
}


#[pyfn(m)]
fn _speech_config<'py>(
py: Python<'py>,
sampling_frequency: usize,
frame_length: f64, // =0.020,
frame_stride: f64, // =0.01,
num_cepstral: usize, // =13,
num_filters: usize, // =40,
fft_length: usize, // =512,
low_frequency: f64, // =0,
high_frequency: Option<f64>, // =None,
dc_elimination: bool, //True
) -> Py<PySpeechConfig> {
Py::new(
py,
PySpeechConfig(SpeechConfig::new(
sampling_frequency,
fft_length,
frame_length,
frame_stride,
num_cepstral,
num_filters,
low_frequency,
high_frequency.unwrap_or(sampling_frequency as f64 / 2.0),
dc_elimination,
)),
)
.unwrap()
}
Ok(())
}
}
2 changes: 2 additions & 0 deletions speechsauce/Cargo.toml
Original file line number Diff line number Diff line change
Expand Up @@ -14,3 +14,5 @@ num-traits = "0.2.15"
ndarray={version="^0.15",features=["approx"]}
ndarray-rand = "0.14.0"



59 changes: 30 additions & 29 deletions speechsauce/src/config.rs
Original file line number Diff line number Diff line change
Expand Up @@ -4,7 +4,7 @@ use ndrustfft::{DctHandler, R2cFftHandler};
use crate::feature::filterbanks;

#[derive(Default)]
pub struct MfccConfigBuilder {
pub struct SpeechConfigBuilder {
///sampling frequency of the signal
sample_rate: usize,
/// number of FFT points.
Expand All @@ -25,9 +25,9 @@ pub struct MfccConfigBuilder {
dc_elimination: bool,
}

impl MfccConfigBuilder {
fn new(sample_rate: usize) -> MfccConfigBuilder {
MfccConfigBuilder {
impl SpeechConfigBuilder {
pub fn new(sample_rate: usize) -> SpeechConfigBuilder {
SpeechConfigBuilder {
sample_rate,
fft_points: 512,
frame_length: 0.02,
Expand All @@ -40,43 +40,43 @@ impl MfccConfigBuilder {
}
}

pub fn high_freq(mut self, high_frequency: f64) -> MfccConfigBuilder {
pub fn high_freq(mut self, high_frequency: f64) -> SpeechConfigBuilder {
self.high_frequency = high_frequency;
self
}

pub fn dc_elimination(mut self, dc_elimination: bool) -> MfccConfigBuilder {
pub fn dc_elimination(mut self, dc_elimination: bool) -> SpeechConfigBuilder {
self.dc_elimination = dc_elimination;
self
}

pub fn low_freq(mut self, low_frequency: f64) -> MfccConfigBuilder {
pub fn low_freq(mut self, low_frequency: f64) -> SpeechConfigBuilder {
self.low_frequency = low_frequency;
self
}

pub fn num_cepstral(mut self, num_cepstral: usize) -> MfccConfigBuilder {
pub fn num_cepstral(mut self, num_cepstral: usize) -> SpeechConfigBuilder {
self.num_cepstral = num_cepstral;
self
}

pub fn frame_stride(mut self, frame_stride: f64) -> MfccConfigBuilder {
pub fn frame_stride(mut self, frame_stride: f64) -> SpeechConfigBuilder {
self.frame_stride = frame_stride;
self
}

pub fn frame_length(mut self, frame_length: f64) -> MfccConfigBuilder {
pub fn frame_length(mut self, frame_length: f64) -> SpeechConfigBuilder {
self.frame_length = frame_length;
self
}

pub fn fft_points(mut self, fft_points: usize) -> MfccConfigBuilder {
pub fn fft_points(mut self, fft_points: usize) -> SpeechConfigBuilder {
self.fft_points = fft_points;
self
}

pub fn build(self) -> MfccConfig {
MfccConfig::new(
pub fn build(self) -> SpeechConfig {
SpeechConfig::new(
self.sample_rate,
self.fft_points,
self.frame_length,
Expand All @@ -90,33 +90,34 @@ impl MfccConfigBuilder {
}
}

pub struct MfccConfig {
#[derive(Clone)]
pub struct SpeechConfig {
///sampling frequency of the signal
sample_rate: usize,
pub sample_rate: usize,
/// number of FFT points.
fft_points: usize,
pub fft_points: usize,
/// the length of each frame in seconds.
frame_length: f64, // =0.020,
pub frame_length: f64, // =0.020,
/// the step between successive frames in seconds.
frame_stride: f64, // =0.01,
pub frame_stride: f64, // =0.01,
/// Number of cepstral coefficients.
num_cepstral: usize, // =13,
pub num_cepstral: usize, // =13,
/// the number of filters in the filterbank
num_filters: usize, // =40,
pub num_filters: usize, // =40,
///lowest band edge of mel filters in Hz
low_frequency: f64,
pub low_frequency: f64,
///highest band edge of mel filters in Hz.
high_frequency: f64,
pub high_frequency: f64,
/// If the first dc component should be eliminated or not
dc_elimination: bool,
pub dc_elimination: bool,
///for
dct_handler: DctHandler<f64>,
fft_handler: R2cFftHandler<f64>,
pub dct_handler: DctHandler<f64>,
pub fft_handler: R2cFftHandler<f64>,
/// Mel-filterbanks
filter_banks: Array2<f64>,
pub filter_banks: Array2<f64>,
}

impl MfccConfig {
impl SpeechConfig {
pub fn new(
sample_rate: usize,
fft_points: usize,
Expand Down Expand Up @@ -150,7 +151,7 @@ impl MfccConfig {
}
}

pub fn builder() -> MfccConfigBuilder {
MfccConfigBuilder::default()
pub fn builder() -> SpeechConfigBuilder {
SpeechConfigBuilder::default()
}
}
Loading