From 690ec94d3125ebf51e6829e0ac1b19f836a79e59 Mon Sep 17 00:00:00 2001 From: Sevag H Date: Fri, 28 Jul 2023 01:01:02 -0400 Subject: [PATCH] Improvements (#2) * Vendor Eigen, use Eigen Matrix/Tensor for waveform/spectrogram classes --- .gitmodules | 3 + CMakeLists.txt | 7 +- src/dsp.cpp | 203 ++++++++++++++++++++++----------------------- src/dsp.hpp | 58 +++---------- src/lstm.cpp | 118 +++++++++++++++----------- src/lstm.hpp | 13 ++- src/model.cpp | 134 ++++++++++++++++++++++++++++++ src/model.hpp | 5 ++ src/tensor.hpp | 16 ++++ test/test_dsp.cpp | 207 ++++++++++++++++++++++++---------------------- umx.cpp | 162 +++++++----------------------------- vendor/eigen | 1 + 12 files changed, 487 insertions(+), 440 deletions(-) create mode 100644 src/tensor.hpp create mode 160000 vendor/eigen diff --git a/.gitmodules b/.gitmodules index 59565e6..b18592b 100644 --- a/.gitmodules +++ b/.gitmodules @@ -4,3 +4,6 @@ [submodule "vendor/libnyquist"] path = vendor/libnyquist url = https://github.com/ddiakopoulos/libnyquist +[submodule "vendor/eigen"] + path = vendor/eigen + url = https://gitlab.com/libeigen/eigen diff --git a/CMakeLists.txt b/CMakeLists.txt index 487792c..498d06e 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -41,8 +41,7 @@ set(LIBNYQUIST_BUILD_EXAMPLE OFF CACHE BOOL "Disable libnyquist example") add_subdirectory(vendor/libnyquist) # add library Eigen3 -find_package(Eigen3 REQUIRED) -include_directories(${EIGEN3_INCLUDE_DIR}) +include_directories(vendor/eigen) # add OpenBLAS for blas + lapack find_package(BLAS REQUIRED) @@ -57,9 +56,9 @@ include_directories(src) # include src/*.cpp and src/*.c as source files file(GLOB SOURCES "src/*.cpp") -# compile library, link against libnyquist and Eigen3 +# compile library, link against libnyquist add_library(umx.cpp.lib SHARED ${SOURCES}) -target_link_libraries(umx.cpp.lib libnyquist Eigen3::Eigen ${BLAS_LIBRARIES} ${LAPACK_LIBRARIES}) +target_link_libraries(umx.cpp.lib libnyquist ${BLAS_LIBRARIES} ${LAPACK_LIBRARIES} lapacke) if(OPENMP_FOUND) target_link_libraries(umx.cpp.lib ${OpenMP_CXX_LIBRARIES}) endif() diff --git a/src/dsp.cpp b/src/dsp.cpp index 6229098..7b2f1e7 100644 --- a/src/dsp.cpp +++ b/src/dsp.cpp @@ -15,7 +15,7 @@ using namespace nqr; static constexpr float PI = 3.14159265359F; -umxcpp::StereoWaveform umxcpp::load_audio(std::string filename) +Eigen::MatrixXf umxcpp::load_audio(std::string filename) { // load a wav file with libnyquist std::shared_ptr fileData = std::make_shared(); @@ -47,17 +47,15 @@ umxcpp::StereoWaveform umxcpp::load_audio(std::string filename) size_t N = fileData->samples.size() / fileData->channelCount; // create a struct to hold two float vectors for left and right channels - umxcpp::StereoWaveform ret; - ret.left.resize(N); - ret.right.resize(N); + Eigen::MatrixXf ret(2, N); if (fileData->channelCount == 1) { // Mono case for (size_t i = 0; i < N; ++i) { - ret.left[i] = fileData->samples[i]; // left channel - ret.right[i] = fileData->samples[i]; // right channel + ret(0, i) = fileData->samples[i]; // left channel + ret(1, i) = fileData->samples[i]; // right channel } } else @@ -65,8 +63,8 @@ umxcpp::StereoWaveform umxcpp::load_audio(std::string filename) // Stereo case for (size_t i = 0; i < N; ++i) { - ret.left[i] = fileData->samples[2 * i]; // left channel - ret.right[i] = fileData->samples[2 * i + 1]; // right channel + ret(0, i) = fileData->samples[2 * i]; // left channel + ret(1, i) = fileData->samples[2 * i + 1]; // right channel } } @@ -74,7 +72,7 @@ umxcpp::StereoWaveform umxcpp::load_audio(std::string filename) } // write a function to write a StereoWaveform to a wav file -void umxcpp::write_audio_file(const umxcpp::StereoWaveform &waveform, +void umxcpp::write_audio_file(const Eigen::MatrixXf &waveform, std::string filename) { // create a struct to hold the audio data @@ -87,18 +85,13 @@ void umxcpp::write_audio_file(const umxcpp::StereoWaveform &waveform, fileData->channelCount = 2; // set the number of samples - fileData->samples.resize(waveform.left.size() * 2); + fileData->samples.resize(waveform.cols() * 2); // write the left channel - for (size_t i = 0; i < waveform.left.size(); ++i) + for (size_t i = 0; i < waveform.cols(); ++i) { - fileData->samples[2 * i] = waveform.left[i]; - } - - // write the right channel - for (size_t i = 0; i < waveform.right.size(); ++i) - { - fileData->samples[2 * i + 1] = waveform.right[i]; + fileData->samples[2 * i] = waveform(0, i); + fileData->samples[2 * i + 1] = waveform(1, i); } int encoderStatus = @@ -135,134 +128,138 @@ std::vector hann_window(int window_size) } // reflect padding -std::vector pad_signal(const std::vector &signal, int n_fft) +void pad_signal(std::vector &signal, int n_fft) { int pad = n_fft / 2; std::vector pad_start(signal.begin(), signal.begin() + pad); std::vector pad_end(signal.end() - pad, signal.end()); std::reverse(pad_start.begin(), pad_start.end()); std::reverse(pad_end.begin(), pad_end.end()); - std::vector padded_signal = signal; - padded_signal.insert(padded_signal.begin(), pad_start.begin(), - pad_start.end()); - padded_signal.insert(padded_signal.end(), pad_end.begin(), pad_end.end()); - return padded_signal; + signal.insert(signal.begin(), pad_start.begin(), pad_start.end()); + signal.insert(signal.end(), pad_end.begin(), pad_end.end()); } // reflect unpadding -std::vector unpad_signal(const std::vector &signal, int n_fft) +void unpad_signal(std::vector &signal, int n_fft) { int pad = n_fft / 2; - std::vector unpadded_signal = signal; - unpadded_signal.erase(unpadded_signal.begin(), - unpadded_signal.begin() + - pad); // remove 'pad' elements from the start - unpadded_signal.erase( - unpadded_signal.end() - pad, - unpadded_signal.end()); // remove 'pad' elements from the end - return unpadded_signal; + signal.erase(signal.begin(), + signal.begin() + pad); // remove 'pad' elements from the start + + auto it = signal.end() - pad; + signal.erase(it, signal.end()); // remove 'pad' elements from the end } -umxcpp::StereoSpectrogramR umxcpp::magnitude(const StereoSpectrogramC &spec) +Eigen::Tensor3dXcf umxcpp::polar_to_complex(const Eigen::Tensor3dXf &magnitude, + const Eigen::Tensor3dXf &phase) { - // compute the magnitude of a complex spectrogram - StereoSpectrogramR ret; - ret.left.resize(spec.left.size()); - ret.right.resize(spec.right.size()); + // Assert dimensions are the same + assert(magnitude.dimensions() == phase.dimensions()); - for (std::size_t i = 0; i < spec.left.size(); ++i) - { - ret.left[i].resize(spec.left[i].size()); - ret.right[i].resize(spec.right[i].size()); + // Get dimensions for convenience + int dim1 = magnitude.dimension(0); + int dim2 = magnitude.dimension(1); + int dim3 = magnitude.dimension(2); - for (std::size_t j = 0; j < spec.left[i].size(); ++j) + // Initialize complex spectrogram tensor + Eigen::Tensor3dXcf complex_spectrogram(dim1, dim2, dim3); + + // Iterate over all indices and apply the transformation + for (int i = 0; i < dim1; ++i) + { + for (int j = 0; j < dim2; ++j) { - // compute the magnitude on the std::complex - ret.left[i][j] = std::abs(spec.left[i][j]); - ret.right[i][j] = std::abs(spec.right[i][j]); + for (int k = 0; k < dim3; ++k) + { + float mag = magnitude(i, j, k); + float ph = phase(i, j, k); + complex_spectrogram(i, j, k) = std::polar(mag, ph); + } } } - return ret; + return complex_spectrogram; } -// repeat the above magnitude function but adapt for phase -umxcpp::StereoSpectrogramR umxcpp::phase(const StereoSpectrogramC &spec) +Eigen::Tensor3dXcf umxcpp::stft(const Eigen::MatrixXf &audio) { - // compute the phase of a complex spectrogram - StereoSpectrogramR ret; - ret.left.resize(spec.left.size()); - ret.right.resize(spec.right.size()); + auto window = hann_window(FFT_WINDOW_SIZE); - for (std::size_t i = 0; i < spec.left.size(); ++i) - { - ret.left[i].resize(spec.left[i].size()); - ret.right[i].resize(spec.right[i].size()); + // apply padding equivalent to center padding with center=True + // in torch.stft: + // https://pytorch.org/docs/stable/generated/torch.stft.html - for (std::size_t j = 0; j < spec.left[i].size(); ++j) - { - // compute phase using std::complex - ret.left[i][j] = std::arg(spec.left[i][j]); - ret.right[i][j] = std::arg(spec.right[i][j]); - } - } + std::vector audio_left(audio.row(0).size()); + Eigen::VectorXf row_vec = audio.row(0); + std::copy_n(row_vec.data(), row_vec.size(), audio_left.begin()); - return ret; -} + std::vector audio_right(audio.row(1).size()); + row_vec = audio.row(1); + std::copy_n(row_vec.data(), row_vec.size(), audio_right.begin()); -umxcpp::StereoSpectrogramC umxcpp::combine(const StereoSpectrogramR &mag, - const StereoSpectrogramR &phase) -{ - // combine magnitude and phase into a complex spectrogram - StereoSpectrogramC ret; - ret.left.resize(mag.left.size()); - ret.right.resize(mag.right.size()); + pad_signal(audio_left, FFT_WINDOW_SIZE); + pad_signal(audio_right, FFT_WINDOW_SIZE); - for (std::size_t i = 0; i < mag.left.size(); ++i) - { - ret.left[i].resize(mag.left[i].size()); - ret.right[i].resize(mag.right[i].size()); + auto stft_left = + stft_inner(audio_left, window, FFT_WINDOW_SIZE, FFT_HOP_SIZE); + auto stft_right = + stft_inner(audio_right, window, FFT_WINDOW_SIZE, FFT_HOP_SIZE); + + // get the size of rows and cols + int rows = stft_left.size(); + int cols = stft_left[0].size(); + + Eigen::Tensor3dXcf spec(2, rows, cols); - for (std::size_t j = 0; j < mag.left[i].size(); ++j) + for (int i = 0; i < rows; ++i) + { + for (int j = 0; j < cols; ++j) { - // compute the complex number from the polar form - ret.left[i][j] = std::polar(mag.left[i][j], phase.left[i][j]); - ret.right[i][j] = std::polar(mag.right[i][j], phase.right[i][j]); + spec(0, i, j) = stft_left[i][j]; + spec(1, i, j) = stft_right[i][j]; } } - return ret; + return spec; } -umxcpp::StereoSpectrogramC umxcpp::stft(const StereoWaveform &audio) +Eigen::MatrixXf umxcpp::istft(const Eigen::Tensor3dXcf &spec) { - StereoSpectrogramC spec; auto window = hann_window(FFT_WINDOW_SIZE); - // apply padding equivalent to center padding with center=True - // in torch.stft: - // https://pytorch.org/docs/stable/generated/torch.stft.html - auto chn_left = pad_signal(audio.left, FFT_WINDOW_SIZE); - auto chn_right = pad_signal(audio.right, FFT_WINDOW_SIZE); + int rows = spec.dimension(1); + int cols = spec.dimension(2); - spec.left = stft_inner(chn_left, window, FFT_WINDOW_SIZE, FFT_HOP_SIZE); - spec.right = stft_inner(chn_right, window, FFT_WINDOW_SIZE, FFT_HOP_SIZE); + // Create the nested vectors + std::vector>> stft_left( + rows, std::vector>(cols)); + std::vector>> stft_right( + rows, std::vector>(cols)); - return spec; -} + // Populate the nested vectors + for (int i = 0; i < rows; ++i) + { + for (int j = 0; j < cols; ++j) + { + stft_left[i][j] = spec(0, i, j); + stft_right[i][j] = spec(1, i, j); + } + } -umxcpp::StereoWaveform umxcpp::istft(const StereoSpectrogramC &spec) -{ - StereoWaveform audio; - auto window = hann_window(FFT_WINDOW_SIZE); + std::vector chn_left = + istft_inner(stft_left, window, FFT_WINDOW_SIZE, FFT_HOP_SIZE); + std::vector chn_right = + istft_inner(stft_right, window, FFT_WINDOW_SIZE, FFT_HOP_SIZE); + + unpad_signal(chn_left, FFT_WINDOW_SIZE); + unpad_signal(chn_right, FFT_WINDOW_SIZE); - auto chn_left = - istft_inner(spec.left, window, FFT_WINDOW_SIZE, FFT_HOP_SIZE); - auto chn_right = - istft_inner(spec.right, window, FFT_WINDOW_SIZE, FFT_HOP_SIZE); + Eigen::MatrixXf audio(2, chn_left.size()); - audio.left = unpad_signal(chn_left, FFT_WINDOW_SIZE); - audio.right = unpad_signal(chn_right, FFT_WINDOW_SIZE); + audio.row(0) = + Eigen::Map(chn_left.data(), 1, chn_left.size()); + audio.row(1) = + Eigen::Map(chn_right.data(), 1, chn_right.size()); return audio; } diff --git a/src/dsp.hpp b/src/dsp.hpp index 269dd35..3c71276 100644 --- a/src/dsp.hpp +++ b/src/dsp.hpp @@ -1,8 +1,10 @@ #ifndef DSP_HPP #define DSP_HPP +#include #include #include +#include #include #include @@ -14,57 +16,17 @@ const int FFT_WINDOW_SIZE = 4096; const int FFT_HOP_SIZE = 1024; // 25% hop i.e. 75% overlap -// create a struct to hold two float vectors for left and right channels -// simply give it two fields std::vector left and std::vector -// right -struct StereoWaveform -{ - std::vector left; - std::vector right; -}; - -StereoWaveform load_audio(std::string filename); - -void write_audio_file(const umxcpp::StereoWaveform &waveform, - std::string filename); - -// complex-valued -struct StereoSpectrogramC -{ - // dimensions: frames x frequency bins (typically 1+NFFT//2) - std::vector>> left; - std::vector>> right; -}; - -// real-valued (can be used for mag or phase) -struct StereoSpectrogramR -{ - // dimensions: frames x frequency bins (typically 1+NFFT//2) - std::vector> left; - std::vector> right; - - // Define a default constructor that initializes the vectors to empty - StereoSpectrogramR() : left(), right() {} - - // Define a copy constructor that performs a deep copy of the vectors - StereoSpectrogramR(const StereoSpectrogramR &other) - : left(other.left), right(other.right) - { - } -}; +// waveform = 2d: (channels, samples) +Eigen::MatrixXf load_audio(std::string filename); -// get real StereoSpectrogramR which are the magnitude, phase of the input -// StereoSpectrogramC -StereoSpectrogramR magnitude(const StereoSpectrogramC &spec); -StereoSpectrogramR phase(const StereoSpectrogramC &spec); +void write_audio_file(const Eigen::MatrixXf &waveform, std::string filename); -// combine magnitude and phase StereoSpectrogramR into complex -// StereoSpectrogramC -StereoSpectrogramC combine(const StereoSpectrogramR &mag, - const StereoSpectrogramR &phase); +// combine magnitude and phase spectrograms into complex +Eigen::Tensor3dXcf polar_to_complex(const Eigen::Tensor3dXf &magnitude, + const Eigen::Tensor3dXf &phase); -StereoSpectrogramC stft(const StereoWaveform &audio); -StereoWaveform istft(const StereoSpectrogramC &spec); +Eigen::Tensor3dXcf stft(const Eigen::MatrixXf &audio); +Eigen::MatrixXf istft(const Eigen::Tensor3dXcf &spec); } // namespace umxcpp diff --git a/src/lstm.cpp b/src/lstm.cpp index b8f9e07..fabd201 100644 --- a/src/lstm.cpp +++ b/src/lstm.cpp @@ -38,50 +38,66 @@ static Eigen::MatrixXf sigmoid(const Eigen::MatrixXf &x) return 1.0 / (1.0 + (-x).array().exp()); } -Eigen::MatrixXf umxcpp::umx_lstm_forward(const struct umxcpp::umx_model &model, +struct umxcpp::lstm_data umxcpp::create_lstm_data(int hidden_size, int seq_len) +{ + int hidden_state_size = hidden_size; + int cell_state_size = hidden_state_size; + + return {// output_per_direction[3][2] + { + {Eigen::MatrixXf::Zero(seq_len, hidden_state_size), + Eigen::MatrixXf::Zero(seq_len, hidden_state_size)}, + {Eigen::MatrixXf::Zero(seq_len, hidden_state_size), + Eigen::MatrixXf::Zero(seq_len, hidden_state_size)}, + {Eigen::MatrixXf::Zero(seq_len, hidden_state_size), + Eigen::MatrixXf::Zero(seq_len, hidden_state_size)}, + }, + // output[3] + { + Eigen::MatrixXf::Zero(seq_len, 2 * hidden_state_size), + Eigen::MatrixXf::Zero(seq_len, 2 * hidden_state_size), + Eigen::MatrixXf::Zero(seq_len, 2 * hidden_state_size), + }, + // h[3][2] + { + {Eigen::MatrixXf::Zero(hidden_state_size, 1), + Eigen::MatrixXf::Zero(hidden_state_size, 1)}, + {Eigen::MatrixXf::Zero(hidden_state_size, 1), + Eigen::MatrixXf::Zero(hidden_state_size, 1)}, + {Eigen::MatrixXf::Zero(hidden_state_size, 1), + Eigen::MatrixXf::Zero(hidden_state_size, 1)}, + }, + // c[3][2] + { + {Eigen::MatrixXf::Zero(cell_state_size, 1), + Eigen::MatrixXf::Zero(cell_state_size, 1)}, + {Eigen::MatrixXf::Zero(cell_state_size, 1), + Eigen::MatrixXf::Zero(cell_state_size, 1)}, + {Eigen::MatrixXf::Zero(cell_state_size, 1), + Eigen::MatrixXf::Zero(cell_state_size, 1)}, + }}; +} + +Eigen::MatrixXf umxcpp::umx_lstm_forward(struct umxcpp::umx_model *model, int target, const Eigen::MatrixXf &input, + struct umxcpp::lstm_data *data, int hidden_size) { int seq_len = input.rows(); int hidden_state_size = hidden_size; - int cell_state_size = hidden_state_size; - - Eigen::MatrixXf output_per_direction[3][2] = { - {Eigen::MatrixXf::Zero(seq_len, hidden_state_size), - Eigen::MatrixXf::Zero(seq_len, hidden_state_size)}, - {Eigen::MatrixXf::Zero(seq_len, hidden_state_size), - Eigen::MatrixXf::Zero(seq_len, hidden_state_size)}, - {Eigen::MatrixXf::Zero(seq_len, hidden_state_size), - Eigen::MatrixXf::Zero(seq_len, hidden_state_size)}, - }; - - Eigen::MatrixXf output[3] = { - Eigen::MatrixXf::Zero(seq_len, 2 * hidden_state_size), - Eigen::MatrixXf::Zero(seq_len, 2 * hidden_state_size), - Eigen::MatrixXf::Zero(seq_len, 2 * hidden_state_size)}; - - Eigen::MatrixXf h[3][2] = {{ - Eigen::MatrixXf::Zero(hidden_state_size, 1), - Eigen::MatrixXf::Zero(hidden_state_size, 1), - }, - { - Eigen::MatrixXf::Zero(hidden_state_size, 1), - Eigen::MatrixXf::Zero(hidden_state_size, 1), - }, - {Eigen::MatrixXf::Zero(hidden_state_size, 1), - Eigen::MatrixXf::Zero(hidden_state_size, 1)}}; - Eigen::MatrixXf c[3][2] = {{ - Eigen::MatrixXf::Zero(cell_state_size, 1), - Eigen::MatrixXf::Zero(cell_state_size, 1), - }, - { - Eigen::MatrixXf::Zero(cell_state_size, 1), - Eigen::MatrixXf::Zero(cell_state_size, 1), - }, - {Eigen::MatrixXf::Zero(cell_state_size, 1), - Eigen::MatrixXf::Zero(cell_state_size, 1)}}; + // set all data to zero + for (int i = 0; i < 3; ++i) + { + data->output[i].setZero(); + for (int j = 0; j < 2; ++j) + { + data->output_per_direction[i][j].setZero(); + data->h[i][j].setZero(); + data->c[i][j].setZero(); + } + } Eigen::MatrixXf loop_input = input; @@ -108,12 +124,14 @@ Eigen::MatrixXf umxcpp::umx_lstm_forward(const struct umxcpp::umx_model &model, // // the initial values for h and c are 0 Eigen::MatrixXf gates = - model.lstm_ih_w[target][lstm_layer][direction].transpose() * + model->lstm_ih_w[target][lstm_layer][direction] + .transpose() * loop_input.row(t).transpose() + - model.lstm_ih_b[target][lstm_layer][direction] + - model.lstm_hh_w[target][lstm_layer][direction].transpose() * - h[lstm_layer][direction] + - model.lstm_hh_b[target][lstm_layer][direction]; + model->lstm_ih_b[target][lstm_layer][direction] + + model->lstm_hh_w[target][lstm_layer][direction] + .transpose() * + data->h[lstm_layer][direction] + + model->lstm_hh_b[target][lstm_layer][direction]; // slice up the gates into i|f|g|o-sized chunks Eigen::MatrixXf i_t = @@ -128,28 +146,28 @@ Eigen::MatrixXf umxcpp::umx_lstm_forward(const struct umxcpp::umx_model &model, 3 * hidden_state_size, 0, hidden_state_size, 1)); Eigen::MatrixXf c_t = - f_t.array() * c[lstm_layer][direction].array() + + f_t.array() * data->c[lstm_layer][direction].array() + i_t.array() * g_t.array(); Eigen::MatrixXf h_t = o_t.array() * (c_t.array().tanh()); // store the hidden and cell states for later use - h[lstm_layer][direction] = h_t; - c[lstm_layer][direction] = c_t; + data->h[lstm_layer][direction] = h_t; + data->c[lstm_layer][direction] = c_t; - output_per_direction[lstm_layer][direction].row(t) + data->output_per_direction[lstm_layer][direction].row(t) << h_t.transpose(); } } // after both directions are done per LSTM layer, concatenate the // outputs - output[lstm_layer] << output_per_direction[lstm_layer][0], - output_per_direction[lstm_layer][1]; + data->output[lstm_layer] << data->output_per_direction[lstm_layer][0], + data->output_per_direction[lstm_layer][1]; - loop_input = output[lstm_layer]; + loop_input = data->output[lstm_layer]; } // return the concatenated forward and backward hidden state as the final // output - return output[2]; + return data->output[2]; } diff --git a/src/lstm.hpp b/src/lstm.hpp index a249f66..34b96d7 100644 --- a/src/lstm.hpp +++ b/src/lstm.hpp @@ -6,10 +6,19 @@ namespace umxcpp { +struct lstm_data +{ + Eigen::MatrixXf output_per_direction[3][2]; + Eigen::MatrixXf output[3]; + Eigen::MatrixXf h[3][2]; + Eigen::MatrixXf c[3][2]; +}; -Eigen::MatrixXf umx_lstm_forward(const struct umx_model &model, int target, - const Eigen::MatrixXf &input, int hidden_size); +struct lstm_data create_lstm_data(int hidden_size, int seq_len); +Eigen::MatrixXf umx_lstm_forward(struct umx_model *model, int target, + const Eigen::MatrixXf &input, + struct lstm_data *data, int hidden_size); }; // namespace umxcpp #endif // LSTM_HPP diff --git a/src/model.cpp b/src/model.cpp index 6577bdd..b3ecc0a 100644 --- a/src/model.cpp +++ b/src/model.cpp @@ -1,5 +1,6 @@ #include "model.hpp" #include "dsp.hpp" +#include "lstm.hpp" #include #include #include @@ -599,3 +600,136 @@ static size_t load_single_matrix(FILE *f, std::string &name, return nbytes_tensor; } + +std::array umxcpp::umx_inference(struct umx_model *model, + const Eigen::MatrixXf &x, + int hidden_size) +{ + // clone input mix mag x to operate on targets x_{0,1,2,3} + std::array x_inputs; + + std::cout << "Input scaling" << std::endl; + +#pragma omp parallel for + for (int target = 0; target < 4; ++target) + { + x_inputs[target] = x; + // opportunistically apply input scaling and mean + + // apply formula x = x*input_scale + input_mean +#pragma omp parallel for + for (int i = 0; i < x_inputs[target].rows(); i++) + { + x_inputs[target].row(i) = x_inputs[target].row(i).array() * + model->input_scale[target].array() + + model->input_mean[target].array(); + } + } + + // create pointer to a Eigen::MatrixXf to modify in the for loop + // there are classes in Eigen for this + +#pragma omp parallel for + for (int target = 0; target < 4; ++target) + { + // y = x A^T + b + // A = weights = (out_features, in_features) + // A^T = A transpose = (in_features, out_features) + std::cout << "Target " << target << " fc1" << std::endl; + x_inputs[target] = x_inputs[target] * model->fc1_w[target]; + + std::cout << "Target " << target << " bn1" << std::endl; + // batchnorm1d calculation + // y=(x-E[x])/(sqrt(Var[x]+ϵ) * gamma + Beta +#pragma omp parallel for + for (int i = 0; i < x_inputs[target].rows(); i++) + { + x_inputs[target].row(i) = + (((x_inputs[target].row(i).array() - + model->bn1_rm[target].array()) / + (model->bn1_rv[target].array() + 1e-5).sqrt()) * + model->bn1_w[target].array() + + model->bn1_b[target].array()) + .tanh(); + } + + // now lstm time + int lstm_hidden_size = hidden_size / 2; + + // umx_lstm_forward applies bidirectional 3-layer lstm using a + // LSTMCell-like approach + // https://pytorch.org/docs/stable/generated/torch.nn.LSTMCell.html + + auto lstm_data = + umxcpp::create_lstm_data(lstm_hidden_size, x_inputs[target].rows()); + + std::cout << "Target " << target << " lstm" << std::endl; + auto lstm_out_0 = umxcpp::umx_lstm_forward( + model, target, x_inputs[target], &lstm_data, lstm_hidden_size); + + // now the concat trick from umx for the skip conn + // # apply 3-layers of stacked LSTM + // lstm_out = self.lstm(x) + // # lstm skip connection + // x = torch.cat([x, lstm_out[0]], -1) + // concat the lstm_out with the input x + Eigen::MatrixXf x_inputs_target_concat(x_inputs[target].rows(), + x_inputs[target].cols() + + lstm_out_0.cols()); + x_inputs_target_concat.leftCols(x_inputs[target].cols()) = + x_inputs[target]; + x_inputs_target_concat.rightCols(lstm_out_0.cols()) = lstm_out_0; + + x_inputs[target] = x_inputs_target_concat; + + std::cout << "Target " << target << " fc2" << std::endl; + // now time for fc2 + x_inputs[target] = x_inputs[target] * model->fc2_w[target]; + + std::cout << "Target " << target << " bn2" << std::endl; + // batchnorm1d calculation + // y=(x-E[x])/(sqrt(Var[x]+ϵ) * gamma + Beta +#pragma omp parallel for + for (int i = 0; i < x_inputs[target].rows(); i++) + { + x_inputs[target].row(i) = + (((x_inputs[target].row(i).array() - + model->bn2_rm[target].array()) / + (model->bn2_rv[target].array() + 1e-5).sqrt()) * + model->bn2_w[target].array() + + model->bn2_b[target].array()) + .cwiseMax(0); + } + + std::cout << "Target " << target << " fc3" << std::endl; + x_inputs[target] = x_inputs[target] * model->fc3_w[target]; + + std::cout << "Target " << target << " bn3" << std::endl; + // batchnorm1d calculation + // y=(x-E[x])/(sqrt(Var[x]+ϵ) * gamma + Beta +#pragma omp parallel for + for (int i = 0; i < x_inputs[target].rows(); i++) + { + x_inputs[target].row(i) = + ((x_inputs[target].row(i).array() - + model->bn3_rm[target].array()) / + (model->bn3_rv[target].array() + 1e-5).sqrt()) * + model->bn3_w[target].array() + + model->bn3_b[target].array(); + } + + std::cout << "Target " << target << " output scaling" << std::endl; + // now output scaling + // apply formula x = x*output_scale + output_mean +#pragma omp parallel for + for (int i = 0; i < x_inputs[target].rows(); i++) + { + x_inputs[target].row(i) = (x_inputs[target].row(i).array() * + model->output_scale[target].array() + + model->output_mean[target].array()) + .cwiseMax(0); + } + } + + return x_inputs; +} diff --git a/src/model.hpp b/src/model.hpp index b8ddb3a..c774a56 100644 --- a/src/model.hpp +++ b/src/model.hpp @@ -2,6 +2,7 @@ #define MODEL_HPP #include +#include #include #include @@ -52,6 +53,10 @@ struct umx_model bool load_umx_model(const std::string &model_dir, struct umx_model *model); +std::array umx_inference(struct umx_model *model, + const Eigen::MatrixXf &x, + int hidden_size); + } // namespace umxcpp #endif // MODEL_HPP diff --git a/src/tensor.hpp b/src/tensor.hpp new file mode 100644 index 0000000..32e21d9 --- /dev/null +++ b/src/tensor.hpp @@ -0,0 +1,16 @@ +#ifndef TENSOR_HPP +#define TENSOR_HPP + +#include +#include +#include +#include + +namespace Eigen +{ +// define Tensor3dXf, Tensor3dXcf for spectrograms etc. +typedef Tensor Tensor3dXf; +typedef Tensor, 3> Tensor3dXcf; +} // namespace Eigen + +#endif // TENSOR_HPP diff --git a/test/test_dsp.cpp b/test/test_dsp.cpp index a28b362..64d2d57 100644 --- a/test/test_dsp.cpp +++ b/test/test_dsp.cpp @@ -11,15 +11,14 @@ TEST(LoadAudioForKissfft, LoadMonoAudio) { // load a wav file with libnyquist std::string filename = "../test/data/gspi_mono.wav"; - umxcpp::StereoWaveform ret = umxcpp::load_audio(filename); + Eigen::MatrixXf ret = umxcpp::load_audio(filename); // check the number of samples - EXPECT_EQ(ret.left.size(), 262144); - EXPECT_EQ(ret.right.size(), 262144); + EXPECT_EQ(ret.cols(), 262144); // check the first and last samples - EXPECT_EQ(ret.left[0], ret.right[0]); - EXPECT_EQ(ret.left[262143], ret.right[262143]); + EXPECT_EQ(ret(0, 0), ret(1, 0)); + EXPECT_EQ(ret(0, 262143), ret(1, 262143)); } // write a basic test case for a stereo file @@ -27,57 +26,50 @@ TEST(LoadAudioForKissfft, LoadStereoAudio) { // load a wav file with libnyquist std::string filename = "../test/data/gspi_stereo.wav"; - umxcpp::StereoWaveform ret = umxcpp::load_audio(filename); + + Eigen::MatrixXf ret = umxcpp::load_audio(filename); // check the number of samples - EXPECT_EQ(ret.left.size(), 262144); - EXPECT_EQ(ret.right.size(), 262144); + EXPECT_EQ(ret.cols(), 262144); // check the first and last samples - EXPECT_EQ(ret.left[0], ret.right[0]); - EXPECT_EQ(ret.left[262143], ret.right[262143]); + EXPECT_EQ(ret(0, 0), ret(1, 0)); + EXPECT_EQ(ret(0, 262143), ret(1, 262143)); } // write a basic test case for the stft function TEST(DSP_STFT, STFTRoundtripRandWaveform) { - umxcpp::StereoWaveform audio_in; - - audio_in.left.resize(4096); - audio_in.right.resize(4096); + Eigen::MatrixXf audio_in(2, 4096); // populate the audio_in with some random data // between -1 and 1 for (size_t i = 0; i < 4096; ++i) { - audio_in.left[i] = (float)rand() / (float)RAND_MAX; - audio_in.right[i] = (float)rand() / (float)RAND_MAX; + audio_in(0, i) = (float)rand() / (float)RAND_MAX; + audio_in(1, i) = (float)rand() / (float)RAND_MAX; } // compute the stft - umxcpp::StereoSpectrogramC spec = umxcpp::stft(audio_in); + Eigen::Tensor3dXcf spec = umxcpp::stft(audio_in); // check the number of frequency bins per frames, first and last - auto n_frames = spec.left.size(); - EXPECT_EQ(n_frames, spec.right.size()); + auto n_frames = spec.dimension(1); std::cout << "n_frames: " << n_frames << std::endl; - EXPECT_EQ(spec.left[0].size(), 2049); - EXPECT_EQ(spec.right[0].size(), 2049); - EXPECT_EQ(spec.left[n_frames - 1].size(), 2049); - EXPECT_EQ(spec.right[n_frames - 1].size(), 2049); + EXPECT_EQ(spec.dimension(2), 2049); - umxcpp::StereoWaveform audio_out = umxcpp::istft(spec); + Eigen::MatrixXf audio_out = umxcpp::istft(spec); - EXPECT_EQ(audio_in.left.size(), audio_out.left.size()); - EXPECT_EQ(audio_in.right.size(), audio_out.right.size()); + EXPECT_EQ(audio_in.rows(), audio_out.rows()); + EXPECT_EQ(audio_in.cols(), audio_out.cols()); - for (size_t i = 0; i < audio_in.left.size(); ++i) + for (size_t i = 0; i < audio_in.cols(); ++i) { // expect similar samples with a small floating point error - EXPECT_NEAR(audio_in.left[i], audio_out.left[i], NEAR_TOLERANCE); - EXPECT_NEAR(audio_in.right[i], audio_out.right[i], NEAR_TOLERANCE); + EXPECT_NEAR(audio_in(0, i), audio_out(0, i), NEAR_TOLERANCE); + EXPECT_NEAR(audio_in(1, i), audio_out(1, i), NEAR_TOLERANCE); } } @@ -85,93 +77,103 @@ TEST(DSP_STFT, STFTRoundtripRandWaveform) // with real gspi.wav TEST(DSP_STFT, STFTRoundtripGlockenspiel) { - umxcpp::StereoWaveform audio_in = - umxcpp::load_audio("../test/data/gspi_mono.wav"); + Eigen::MatrixXf audio_in = umxcpp::load_audio("../test/data/gspi_mono.wav"); // compute the stft - umxcpp::StereoSpectrogramC spec = umxcpp::stft(audio_in); + Eigen::Tensor3dXcf spec = umxcpp::stft(audio_in); // check the number of frequency bins per frames, first and last - auto n_frames = spec.left.size(); - EXPECT_EQ(n_frames, spec.right.size()); + auto n_frames = spec.dimension(1); std::cout << "n_frames: " << n_frames << std::endl; - EXPECT_EQ(spec.left[0].size(), 2049); - EXPECT_EQ(spec.right[0].size(), 2049); - EXPECT_EQ(spec.left[n_frames - 1].size(), 2049); - EXPECT_EQ(spec.right[n_frames - 1].size(), 2049); + EXPECT_EQ(spec.dimension(2), 2049); - umxcpp::StereoWaveform audio_out = umxcpp::istft(spec); + Eigen::MatrixXf audio_out = umxcpp::istft(spec); - EXPECT_EQ(audio_in.left.size(), audio_out.left.size()); - EXPECT_EQ(audio_in.right.size(), audio_out.right.size()); + EXPECT_EQ(audio_in.rows(), audio_out.rows()); + EXPECT_EQ(audio_in.cols(), audio_out.cols()); - for (size_t i = 0; i < audio_in.left.size(); ++i) + for (size_t i = 0; i < audio_in.cols(); ++i) { // expect similar samples with a small floating point error - EXPECT_NEAR(audio_in.left[i], audio_out.left[i], NEAR_TOLERANCE); - EXPECT_NEAR(audio_in.right[i], audio_out.right[i], NEAR_TOLERANCE); + EXPECT_NEAR(audio_in(0, i), audio_out(0, i), NEAR_TOLERANCE); + EXPECT_NEAR(audio_in(1, i), audio_out(1, i), NEAR_TOLERANCE); } } // write a test for the magnitude and phase functions // and test a roundtrip with the combine function -TEST(DSP_STFT, MagnitudePhaseCombine) +TEST(DSP_STFT, MagnitudePhaseCombineMono) { - umxcpp::StereoWaveform audio_in = - umxcpp::load_audio("../test/data/gspi_mono.wav"); + Eigen::MatrixXf audio_in = umxcpp::load_audio("../test/data/gspi_mono.wav"); // compute the stft - umxcpp::StereoSpectrogramC spec = umxcpp::stft(audio_in); + Eigen::Tensor3dXcf spec = umxcpp::stft(audio_in); // compute the magnitude and phase - umxcpp::StereoSpectrogramR mag = umxcpp::magnitude(spec); - umxcpp::StereoSpectrogramR phase = umxcpp::phase(spec); + Eigen::Tensor3dXf mag = spec.unaryExpr([](const std::complex &c) + { return std::abs(c); }); + + Eigen::Tensor3dXf phase = spec.unaryExpr([](const std::complex &c) + { return std::arg(c); }); + + // print each dimension of spec + std::cout << "spec.dimensions(): "; + for (size_t i = 0; i < spec.dimensions().size(); ++i) + { + std::cout << spec.dimensions()[i] << " "; + } + std::cout << std::endl; // ensure all magnitude are positive - for (size_t i = 0; i < mag.left.size(); ++i) + for (size_t chan = 0; chan < mag.dimension(0); ++chan) { - for (size_t j = 0; j < mag.left[i].size(); ++j) + for (size_t i = 0; i < mag.dimension(1); ++i) { - EXPECT_GE(mag.left[i][j], 0.0); - EXPECT_GE(mag.right[i][j], 0.0); + for (size_t j = 0; j < mag.dimension(2); ++j) + { + EXPECT_GE(mag(chan, i, j), 0.0); + } } } // combine the magnitude and phase - umxcpp::StereoSpectrogramC spec2 = umxcpp::combine(mag, phase); + Eigen::Tensor3dXcf spec2 = umxcpp::polar_to_complex(mag, phase); // ensure spec and spec2 are the same // first check their sizes - EXPECT_EQ(spec.left.size(), spec2.left.size()); + EXPECT_EQ(spec.dimensions().size(), spec2.dimensions().size()); + for (size_t i = 0; i < spec.dimensions().size(); ++i) + { + EXPECT_EQ(spec.dimensions()[i], spec2.dimensions()[i]); + } - for (size_t i = 0; i < spec.left.size(); ++i) + for (size_t chan = 0; chan < spec.dimension(0); ++chan) { - for (size_t j = 0; j < spec.left[i].size(); ++j) + for (size_t i = 0; i < spec.dimension(1); ++i) { - EXPECT_NEAR(std::real(spec.left[i][j]), std::real(spec2.left[i][j]), - NEAR_TOLERANCE); - EXPECT_NEAR(std::imag(spec.left[i][j]), std::imag(spec2.left[i][j]), - NEAR_TOLERANCE); - EXPECT_NEAR(std::real(spec.right[i][j]), - std::real(spec2.right[i][j]), NEAR_TOLERANCE); - EXPECT_NEAR(std::imag(spec.right[i][j]), - std::imag(spec2.right[i][j]), NEAR_TOLERANCE); + for (size_t j = 0; j < spec.dimension(2); ++j) + { + EXPECT_NEAR(std::real(spec(chan, i, j)), + std::real(spec2(chan, i, j)), NEAR_TOLERANCE); + EXPECT_NEAR(std::imag(spec(chan, i, j)), + std::imag(spec2(chan, i, j)), NEAR_TOLERANCE); + } } } // compute the istft - umxcpp::StereoWaveform audio_out = umxcpp::istft(spec2); + Eigen::MatrixXf audio_out = umxcpp::istft(spec2); - EXPECT_EQ(audio_in.left.size(), audio_out.left.size()); - EXPECT_EQ(audio_in.right.size(), audio_out.right.size()); + EXPECT_EQ(audio_in.rows(), audio_out.rows()); + EXPECT_EQ(audio_in.cols(), audio_out.cols()); - for (size_t i = 0; i < audio_in.left.size(); ++i) + for (size_t i = 0; i < audio_in.cols(); ++i) { // expect similar samples with a small floating point error - EXPECT_NEAR(audio_in.left[i], audio_out.left[i], NEAR_TOLERANCE); - EXPECT_NEAR(audio_in.right[i], audio_out.right[i], NEAR_TOLERANCE); + EXPECT_NEAR(audio_in(0, i), audio_out(0, i), NEAR_TOLERANCE); + EXPECT_NEAR(audio_in(1, i), audio_out(1, i), NEAR_TOLERANCE); } } @@ -179,59 +181,66 @@ TEST(DSP_STFT, MagnitudePhaseCombine) // and test a roundtrip with the combine function TEST(DSP_STFT, MagnitudePhaseCombineStereo) { - umxcpp::StereoWaveform audio_in = + Eigen::MatrixXf audio_in = umxcpp::load_audio("../test/data/gspi_stereo.wav"); // compute the stft - umxcpp::StereoSpectrogramC spec = umxcpp::stft(audio_in); + Eigen::Tensor3dXcf spec = umxcpp::stft(audio_in); // compute the magnitude and phase - umxcpp::StereoSpectrogramR mag = umxcpp::magnitude(spec); - umxcpp::StereoSpectrogramR phase = umxcpp::phase(spec); + Eigen::Tensor3dXf mag = spec.unaryExpr([](const std::complex &c) + { return std::abs(c); }); + + Eigen::Tensor3dXf phase = spec.unaryExpr([](const std::complex &c) + { return std::arg(c); }); // ensure all magnitude are positive - for (size_t i = 0; i < mag.left.size(); ++i) + for (size_t chan = 0; chan < mag.dimension(0); ++chan) { - for (size_t j = 0; j < mag.left[i].size(); ++j) + for (size_t i = 0; i < mag.dimension(1); ++i) { - EXPECT_GE(mag.left[i][j], 0.0); - EXPECT_GE(mag.right[i][j], 0.0); + for (size_t j = 0; j < mag.dimension(2); ++j) + { + EXPECT_GE(mag(chan, i, j), 0.0); + } } } // combine the magnitude and phase - umxcpp::StereoSpectrogramC spec2 = umxcpp::combine(mag, phase); + Eigen::Tensor3dXcf spec2 = umxcpp::polar_to_complex(mag, phase); // ensure spec and spec2 are the same // first check their sizes - EXPECT_EQ(spec.left.size(), spec2.left.size()); + EXPECT_EQ(spec.dimensions().size(), spec2.dimensions().size()); + for (size_t i = 0; i < spec.dimensions().size(); ++i) + { + EXPECT_EQ(spec.dimensions()[i], spec2.dimensions()[i]); + } - for (size_t i = 0; i < spec.left.size(); ++i) + for (size_t chan = 0; chan < spec.dimension(0); ++chan) { - for (size_t j = 0; j < spec.left[i].size(); ++j) + for (size_t i = 0; i < spec.dimension(1); ++i) { - // rewrite above for std::complex - EXPECT_NEAR(std::real(spec.left[i][j]), std::real(spec2.left[i][j]), - NEAR_TOLERANCE); - EXPECT_NEAR(std::imag(spec.left[i][j]), std::imag(spec2.left[i][j]), - NEAR_TOLERANCE); - EXPECT_NEAR(std::real(spec.right[i][j]), - std::real(spec2.right[i][j]), NEAR_TOLERANCE); - EXPECT_NEAR(std::imag(spec.right[i][j]), - std::imag(spec2.right[i][j]), NEAR_TOLERANCE); + for (size_t j = 0; j < spec.dimension(2); ++j) + { + EXPECT_NEAR(std::real(spec(chan, i, j)), + std::real(spec2(chan, i, j)), NEAR_TOLERANCE); + EXPECT_NEAR(std::imag(spec(chan, i, j)), + std::imag(spec2(chan, i, j)), NEAR_TOLERANCE); + } } } // compute the istft - umxcpp::StereoWaveform audio_out = umxcpp::istft(spec2); + Eigen::MatrixXf audio_out = umxcpp::istft(spec2); - EXPECT_EQ(audio_in.left.size(), audio_out.left.size()); - EXPECT_EQ(audio_in.right.size(), audio_out.right.size()); + EXPECT_EQ(audio_in.rows(), audio_out.rows()); + EXPECT_EQ(audio_in.cols(), audio_out.cols()); - for (size_t i = 0; i < audio_in.left.size(); ++i) + for (size_t i = 0; i < audio_in.cols(); ++i) { // expect similar samples with a small floating point error - EXPECT_NEAR(audio_in.left[i], audio_out.left[i], NEAR_TOLERANCE); - EXPECT_NEAR(audio_in.right[i], audio_out.right[i], NEAR_TOLERANCE); + EXPECT_NEAR(audio_in(0, i), audio_out(0, i), NEAR_TOLERANCE); + EXPECT_NEAR(audio_in(1, i), audio_out(1, i), NEAR_TOLERANCE); } } diff --git a/umx.cpp b/umx.cpp index 2f51127..8b26278 100644 --- a/umx.cpp +++ b/umx.cpp @@ -44,7 +44,7 @@ int main(int argc, const char **argv) std::cout << "Number of physical cores: " << nb_cores << std::endl; Eigen::setNbThreads(nb_cores - 1); - StereoWaveform audio = load_audio(wav_file); + Eigen::MatrixXf audio = load_audio(wav_file); // initialize a struct umx_model struct umx_model model @@ -62,17 +62,19 @@ int main(int argc, const char **argv) // let's get a stereo complex spectrogram first std::cout << "Computing STFT" << std::endl; - StereoSpectrogramC spectrogram = stft(audio); + Eigen::Tensor3dXcf spectrogram = stft(audio); - std::cout << "spec shape: (incl 2 chan) " << spectrogram.left.size() - << " x " << spectrogram.left[0].size() << std::endl; + std::cout << "spec shape: (" << spectrogram.dimensions()[0] << ", " + << spectrogram.dimensions()[1] << ", " + << spectrogram.dimensions()[2] << ")" << std::endl; std::cout << "Computing STFT magnitude" << std::endl; // now let's get a stereo magnitude spectrogram - StereoSpectrogramR mix_mag = magnitude(spectrogram); + Eigen::Tensor3dXf mix_mag = spectrogram.abs(); std::cout << "Computing STFT phase" << std::endl; - StereoSpectrogramR mix_phase = phase(spectrogram); + Eigen::Tensor3dXf mix_phase = spectrogram.unaryExpr( + [](const std::complex &c) { return std::arg(c); }); // apply umx inference to the magnitude spectrogram // first create a ggml_tensor for the input @@ -82,8 +84,8 @@ int main(int argc, const char **argv) // input shape is (nb_frames*nb_samples, nb_channels*nb_bins) i.e. 2049*2 int nb_bins = 2049; - int nb_frames = mix_mag.left.size(); - int nb_real_bins = mix_mag.left[0].size(); + int nb_frames = mix_mag.dimension(1); + int nb_real_bins = mix_mag.dimension(2); assert(nb_real_bins == nb_bins); @@ -104,154 +106,46 @@ int main(int argc, const char **argv) { // interleave fft frames from each channel // fill first half of 2974/2 bins from left - x(i, j) = mix_mag.left[i][j]; + x(i, j) = mix_mag(0, i, j); // fill second half of 2974/2 bins from right - x(i, j + nb_bins_cropped) = mix_mag.right[i][j]; + x(i, j + nb_bins_cropped) = mix_mag(1, i, j); } } std::cout << "Running inference with Eigen matrices" << std::endl; - // clone input mix mag x to operate on targets x_{0,1,2,3} - Eigen::MatrixXf x_inputs[4]; + std::array x_outputs = + umx_inference(&model, x, hidden_size); #pragma omp parallel for for (int target = 0; target < 4; ++target) { - x_inputs[target] = x; -// opportunistically apply input scaling and mean - -// apply formula x = x*input_scale + input_mean -#pragma omp parallel for - for (int i = 0; i < x_inputs[target].rows(); i++) - { - x_inputs[target].row(i) = x_inputs[target].row(i).array() * - model.input_scale[target].array() + - model.input_mean[target].array(); - } - } - - // create pointer to a Eigen::MatrixXf to modify in the for loop - // there are classes in Eigen for this - -#pragma omp parallel for - for (int target = 0; target < 4; ++target) - { - // y = x A^T + b - // A = weights = (out_features, in_features) - // A^T = A transpose = (in_features, out_features) - x_inputs[target] = x_inputs[target] * model.fc1_w[target]; - -// batchnorm1d calculation -// y=(x-E[x])/(sqrt(Var[x]+ϵ) * gamma + Beta -#pragma omp parallel for - for (int i = 0; i < x_inputs[target].rows(); i++) - { - x_inputs[target].row(i) = - (((x_inputs[target].row(i).array() - - model.bn1_rm[target].array()) / - (model.bn1_rv[target].array() + 1e-5).sqrt()) * - model.bn1_w[target].array() + - model.bn1_b[target].array()) - .tanh(); - } - - // now lstm time - int lstm_hidden_size = hidden_size / 2; - - // umx_lstm_forward applies bidirectional 3-layer lstm using a - // LSTMCell-like approach - // https://pytorch.org/docs/stable/generated/torch.nn.LSTMCell.html - - auto lstm_out_0 = umxcpp::umx_lstm_forward( - model, target, x_inputs[target], lstm_hidden_size); - - // now the concat trick from umx for the skip conn - // # apply 3-layers of stacked LSTM - // lstm_out = self.lstm(x) - // # lstm skip connection - // x = torch.cat([x, lstm_out[0]], -1) - // concat the lstm_out with the input x - Eigen::MatrixXf x_inputs_target_concat(x_inputs[target].rows(), - x_inputs[target].cols() + - lstm_out_0.cols()); - x_inputs_target_concat.leftCols(x_inputs[target].cols()) = - x_inputs[target]; - x_inputs_target_concat.rightCols(lstm_out_0.cols()) = lstm_out_0; - - x_inputs[target] = x_inputs_target_concat; - - // now time for fc2 - x_inputs[target] = x_inputs[target] * model.fc2_w[target]; - -// batchnorm1d calculation -// y=(x-E[x])/(sqrt(Var[x]+ϵ) * gamma + Beta -#pragma omp parallel for - for (int i = 0; i < x_inputs[target].rows(); i++) - { - x_inputs[target].row(i) = - (((x_inputs[target].row(i).array() - - model.bn2_rm[target].array()) / - (model.bn2_rv[target].array() + 1e-5).sqrt()) * - model.bn2_w[target].array() + - model.bn2_b[target].array()) - .cwiseMax(0); - } - - x_inputs[target] = x_inputs[target] * model.fc3_w[target]; - -// batchnorm1d calculation -// y=(x-E[x])/(sqrt(Var[x]+ϵ) * gamma + Beta -#pragma omp parallel for - for (int i = 0; i < x_inputs[target].rows(); i++) - { - x_inputs[target].row(i) = - ((x_inputs[target].row(i).array() - - model.bn3_rm[target].array()) / - (model.bn3_rv[target].array() + 1e-5).sqrt()) * - model.bn3_w[target].array() + - model.bn3_b[target].array(); - } - -// now output scaling -// apply formula x = x*output_scale + output_mean -#pragma omp parallel for - for (int i = 0; i < x_inputs[target].rows(); i++) - { - x_inputs[target].row(i) = (x_inputs[target].row(i).array() * - model.output_scale[target].array() + - model.output_mean[target].array()) - .cwiseMax(0); - } - - // print min and max elements of x_inputs[target] - std::cout << "POST-RELU-FINAL x_inputs[target] min: " - << x_inputs[target].minCoeff() - << " x_inputs[target] max: " << x_inputs[target].maxCoeff() + std::cout << "POST-RELU-FINAL x_outputs[target] min: " + << x_outputs[target].minCoeff() + << " x_inputs[target] max: " << x_outputs[target].maxCoeff() << std::endl; // copy mix-mag - StereoSpectrogramR mix_mag_target(mix_mag); + Eigen::Tensor3dXf mix_mag_target(mix_mag); -// element-wise multiplication, taking into account the stacked outputs of the -// neural network + // element-wise multiplication, taking into account the stacked outputs + // of the neural network #pragma omp parallel for - for (std::size_t i = 0; i < mix_mag.left.size(); i++) + for (std::size_t i = 0; i < mix_mag.dimension(1); i++) { #pragma omp parallel for - for (std::size_t j = 0; j < mix_mag.left[0].size(); j++) + for (std::size_t j = 0; j < mix_mag.dimension(2); j++) { - mix_mag_target.left[i][j] *= x_inputs[target](i, j); - mix_mag_target.right[i][j] *= - x_inputs[target](i, j + mix_mag.left[0].size()); + mix_mag_target(0, i, j) *= x_outputs[target](i, j); + mix_mag_target(1, i, j) *= + x_outputs[target](i, j + mix_mag.dimension(2)); } } // now let's get a stereo waveform back first with phase - StereoSpectrogramC mix_complex_target = - combine(mix_mag_target, mix_phase); - - StereoWaveform audio_target = istft(mix_complex_target); + // initial estimate + Eigen::MatrixXf audio_target = + istft(polar_to_complex(mix_mag_target, mix_phase)); // now write the 4 audio waveforms to files in the output dir // using libnyquist diff --git a/vendor/eigen b/vendor/eigen new file mode 160000 index 0000000..9995c3d --- /dev/null +++ b/vendor/eigen @@ -0,0 +1 @@ +Subproject commit 9995c3da6ffa8634e2bce9e7ffcc9904da39b491