Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Streaming decoder for compatible engine #875

Merged
merged 6 commits into from
Nov 26, 2024
Merged
Show file tree
Hide file tree
Changes from 5 commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
29 changes: 29 additions & 0 deletions crates/test_util/build.rs
Original file line number Diff line number Diff line change
Expand Up @@ -202,6 +202,35 @@ fn generate_example_data_json(dist: &Path) -> anyhow::Result<()> {
phoneme.to_vec()
},
},
intermediate: typing::IntermediateExampleData {
f0_length: 69,
phoneme_size: 45,
feature_dim: 80,
margin_width: 14,
f0_vector: {
let mut f0 = [0.; 69];
f0[9..24].fill(5.905218);
f0[37..60].fill(5.565851);
f0.to_vec()
},
phoneme_vector: {
let mut phoneme = [0.; 45 * 69];
let mut set_one = |index, range| {
for i in range {
phoneme[(i * 45 + index) as usize] = 1.;
}
};
set_one(0, 0..9);
set_one(37, 9..13);
set_one(14, 13..24);
set_one(35, 24..30);
set_one(6, 30..37);
set_one(37, 37..45);
set_one(30, 45..60);
set_one(0, 60..69);
phoneme.to_vec()
},
},
};

fs_err::write(
Expand Down
8 changes: 8 additions & 0 deletions crates/test_util/compatible_engine.h
Original file line number Diff line number Diff line change
Expand Up @@ -25,4 +25,12 @@ bool yukarin_sa_forward(int64_t length, int64_t *vowel_phoneme_list,
bool decode_forward(int64_t length, int64_t phoneme_size, float *f0,
float *phoneme, int64_t *speaker_id, float *output);

/*
 * Generates the full intermediate feature array for a style and writes it to
 * `output`.
 *
 * Buffer sizes, as documented by the Rust implementation of this symbol:
 *   - f0:         `length` floats (read)
 *   - phoneme:    `phoneme_size * length` floats (read)
 *   - speaker_id: exactly one int64_t (read)
 *   - output:     `(length + 2 * margin_width) * feature_dim` floats (written)
 *
 * `margin_width` and `feature_dim` are only used to size `output`; the
 * implementation does not forward them to the synthesizer.
 *
 * Returns true on success and false when the synthesizer reports an error
 * (the message is presumably retrievable through last_error_message() --
 * TODO confirm).
 */
bool generate_full_intermediate(int64_t length, int64_t phoneme_size,
int64_t margin_width, int64_t feature_dim,
float *f0, float *phoneme, int64_t *speaker_id,
float *output);

/*
 * Renders a segment of intermediate features into `output`.
 *
 * Buffer sizes, as documented by the Rust implementation of this symbol:
 *   - audio_feature: `length * feature_dim` floats, read as a row-major
 *                    [length, feature_dim] array
 *   - speaker_id:    exactly one int64_t (read)
 *   - output:        `length * 256` floats (written)
 *
 * Returns true on success and false when the synthesizer reports an error
 * (the message is presumably retrievable through last_error_message() --
 * TODO confirm).
 */
bool render_audio_segment(int64_t length, int64_t feature_dim, float *audio_feature,
int64_t *speaker_id, float *output);

const char *last_error_message();
11 changes: 11 additions & 0 deletions crates/test_util/src/typing.rs
Original file line number Diff line number Diff line change
Expand Up @@ -31,11 +31,22 @@ pub struct DecodeExampleData {
pub phoneme_vector: Vec<f32>,
}

/// Example inputs for the `generate_full_intermediate` / `render_audio_segment` test path.
///
/// The build script fills this in and the compatible-engine test reads it back to size its
/// buffers and to drive the two calls.
#[derive(Debug, Serialize, Deserialize)]
pub struct IntermediateExampleData {
/// Number of entries in `f0_vector`; passed as the `length` argument.
pub f0_length: i64,
/// Number of phoneme slots per entry; `phoneme_vector` holds `phoneme_size * f0_length` values.
pub phoneme_size: i64,
/// Second dimension of the intermediate feature array; used to size the feature buffer.
pub feature_dim: i64,
/// Margin added on both sides: the feature buffer has `f0_length + 2 * margin_width` rows.
pub margin_width: i64,
/// f0 input values, `f0_length` entries long.
pub f0_vector: Vec<f32>,
/// Flattened phoneme input; the build script writes a `1.` at `i * phoneme_size + index`
/// for each entry `i`.
pub phoneme_vector: Vec<f32>,
}

/// Top-level bundle of example data shared between the build script and the tests.
#[derive(Debug, Serialize, Deserialize)]
pub struct ExampleData {
/// Speaker ID handed to the inference calls through their `speaker_id` pointer argument.
pub speaker_id: i64,

/// Example data for the duration inference.
pub duration: DurationExampleData,
/// Example data for the intonation inference.
pub intonation: IntonationExampleData,
/// Example data for the decode inference.
pub decode: DecodeExampleData,
/// Example data for the intermediate-feature generation and segment rendering path.
pub intermediate: IntermediateExampleData,
}
1 change: 1 addition & 0 deletions crates/voicevox_core_c_api/Cargo.toml
Original file line number Diff line number Diff line change
Expand Up @@ -30,6 +30,7 @@ easy-ext.workspace = true
educe.workspace = true
itertools.workspace = true
libc.workspace = true
ndarray.workspace = true
parking_lot = { workspace = true, features = ["arc_lock"] }
process_path.workspace = true
ref-cast.workspace = true
Expand Down
106 changes: 106 additions & 0 deletions crates/voicevox_core_c_api/src/compatible_engine.rs
Original file line number Diff line number Diff line change
Expand Up @@ -359,6 +359,112 @@ pub unsafe extern "C" fn decode_forward(
}
}

/// Runs the synthesizer's `generate_full_intermediate` and copies the result into `output`.
///
/// Returns `true` on success. On failure the error text is handed to `set_message` and `false`
/// is returned. Failures include errors reported by the synthesizer, size arguments that do not
/// fit in `usize` (e.g. negative values), and a result whose element count differs from
/// `(length + 2 * margin_width) * feature_dim`.
///
/// `margin_width` and `feature_dim` are not forwarded to the synthesizer; they are only used to
/// validate the size of the result before it is written to `output`.
///
/// # Safety
///
/// - `f0` must be interpretable as a Rust `&[f32; length as usize]`.
/// - `phoneme` must be interpretable as a Rust `&[f32; phoneme_size * length as usize]`.
/// - `speaker_id` must be interpretable as a Rust `&[i64; 1]`.
/// - `output` must be interpretable as a Rust `&mut [MaybeUninit<f32>; ((length + 2 * margin_width) * feature_dim) as usize]`.
#[unsafe(no_mangle)] // SAFETY: no other symbol with this name exists in the libraries that make up voicevox_core_c_api
pub unsafe extern "C" fn generate_full_intermediate(
    length: i64,
    phoneme_size: i64,
    margin_width: i64,
    feature_dim: i64,
    f0: *mut f32,
    phoneme: *mut f32,
    speaker_id: *mut i64,
    output: *mut f32,
) -> bool {
    init_logger_once();
    assert_aligned(f0);
    assert_aligned(phoneme);
    assert_aligned(speaker_id);
    assert_aligned(output);

    // A plain `as usize` cast would turn a negative size into a huge slice length, so reject
    // such values up front through the normal error channel.
    let (Ok(length), Ok(phoneme_size), Ok(margin_width), Ok(feature_dim)) = (
        usize::try_from(length),
        usize::try_from(phoneme_size),
        usize::try_from(margin_width),
        usize::try_from(feature_dim),
    ) else {
        set_message(&format!(
            "size arguments must be non-negative: length={length}, phoneme_size={phoneme_size}, \
             margin_width={margin_width}, feature_dim={feature_dim}",
        ));
        return false;
    };

    let synthesizer = &*lock_synthesizer();
    let result = ensure_initialized!(synthesizer).generate_full_intermediate(
        length,
        phoneme_size,
        // SAFETY: The safety contract must be upheld by the caller.
        unsafe { std::slice::from_raw_parts(f0, length) },
        // SAFETY: The safety contract must be upheld by the caller.
        unsafe { std::slice::from_raw_parts(phoneme, phoneme_size * length) },
        // SAFETY: The safety contract must be upheld by the caller.
        StyleId::new(unsafe { *speaker_id as u32 }),
    );
    match result {
        Ok(output_arr) => {
            let output_len = (length + 2 * margin_width) * feature_dim;
            if output_arr.len() != output_len {
                // Panicking here would abort the host process from inside an `extern "C"`
                // function; report the mismatch like any other failure instead.
                set_message(&format!(
                    "expected {}, got {}",
                    output_len,
                    output_arr.len()
                ));
                return false;
            }
            // Make sure the elements are contiguous in row-major order before the raw copy.
            let output_arr = output_arr.as_standard_layout();
            // SAFETY: The safety contract must be upheld by the caller.
            unsafe {
                output_arr
                    .as_ptr()
                    .copy_to_nonoverlapping(output, output_len);
            }
            true
        }
        Err(err) => {
            set_message(&format!("{err}"));
            false
        }
    }
}

/// # Safety
///
/// - `audio_feature`はRustの`&[f32; (length * feature_dim) as usize]`として解釈できなければならない。
/// - `speaker_id`はRustの`&[i64; 1]`として解釈できなければならない。
/// - `output`はRustの`&mut [MaybeUninit<f32>; length as usize * 256]`として解釈できなければならない。
#[unsafe(no_mangle)] // SAFETY: voicevox_core_c_apiを構成するライブラリの中に、これと同名のシンボルは存在しない
pub unsafe extern "C" fn render_audio_segment(
length: i64,
feature_dim: i64,
audio_feature: *mut f32,
speaker_id: *mut i64,
output: *mut f32,
) -> bool {
Yosshi999 marked this conversation as resolved.
Show resolved Hide resolved
init_logger_once();
assert_aligned(audio_feature);
assert_aligned(speaker_id);
assert_aligned(output);
let length = length as usize;
let feature_dim = feature_dim as usize;
let synthesizer = &*lock_synthesizer();
let result = ensure_initialized!(synthesizer).render_audio_segment(
// SAFETY: The safety contract must be upheld by the caller.
unsafe {
ndarray::ArrayView2::from_shape_ptr([length, feature_dim], audio_feature).to_owned()
},
StyleId::new(unsafe { *speaker_id as u32 }),
);
match result {
Ok(output_arr) => {
let output_len = length * 256;
if output_arr.len() != output_len {
panic!("expected {}, got {}", output_len, output_arr.len());
}
let output_arr = output_arr.as_standard_layout();
// SAFETY: The safety contract must be upheld by the caller.
unsafe {
output_arr
.as_ptr()
.copy_to_nonoverlapping(output, output_len);
}
true
}
Err(err) => {
set_message(&format!("{err}"));
false
}
}
}

#[track_caller]
fn assert_aligned(ptr: *mut impl Sized) {
assert!(
Expand Down
Original file line number Diff line number Diff line change
@@ -1,7 +1,7 @@
// エンジンを起動してyukarin_s・yukarin_sa・decodeの推論を行う

use std::ffi::CStr;
use std::sync::LazyLock;
use std::{cmp::min, ffi::CStr};

use assert_cmd::assert::AssertResult;
use libloading::Library;
Expand Down Expand Up @@ -83,12 +83,88 @@ impl assert_cdylib::TestCase for TestCase {
wave
};

// 中間生成物を経由した場合の生成音声
let wave2 = {
let length_with_margin =
EXAMPLE_DATA.intermediate.f0_length + 2 * EXAMPLE_DATA.intermediate.margin_width;
let mut audio_feature =
vec![0.; (length_with_margin * EXAMPLE_DATA.intermediate.feature_dim) as usize];
let mut wave = vec![0.; 256 * length_with_margin as usize];
assert!(lib.generate_full_intermediate(
EXAMPLE_DATA.intermediate.f0_length,
EXAMPLE_DATA.intermediate.phoneme_size,
EXAMPLE_DATA.intermediate.margin_width,
EXAMPLE_DATA.intermediate.feature_dim,
EXAMPLE_DATA.intermediate.f0_vector.as_ptr() as *mut f32,
EXAMPLE_DATA.intermediate.phoneme_vector.as_ptr() as *mut f32,
&mut { EXAMPLE_DATA.speaker_id } as *mut i64,
audio_feature.as_mut_ptr(),
));
assert!(lib.render_audio_segment(
length_with_margin,
EXAMPLE_DATA.intermediate.feature_dim,
audio_feature.as_ptr() as *mut f32,
&mut { EXAMPLE_DATA.speaker_id } as *mut i64,
wave.as_mut_ptr(),
));
wave[256 * EXAMPLE_DATA.intermediate.margin_width as usize
..wave.len() - 256 * EXAMPLE_DATA.intermediate.margin_width as usize]
.to_vec()
};

// 中間生成物を経由し、さらにチャンクごとに変換した場合の生成音声
let wave3 = {
let length_with_margin =
EXAMPLE_DATA.intermediate.f0_length + 2 * EXAMPLE_DATA.intermediate.margin_width;
let mut audio_feature =
vec![0.; (length_with_margin * EXAMPLE_DATA.intermediate.feature_dim) as usize];
let mut wave = vec![0.; 256 * EXAMPLE_DATA.intermediate.f0_length as usize];
assert!(lib.generate_full_intermediate(
EXAMPLE_DATA.intermediate.f0_length,
EXAMPLE_DATA.intermediate.phoneme_size,
EXAMPLE_DATA.intermediate.margin_width,
EXAMPLE_DATA.intermediate.feature_dim,
EXAMPLE_DATA.intermediate.f0_vector.as_ptr() as *mut f32,
EXAMPLE_DATA.intermediate.phoneme_vector.as_ptr() as *mut f32,
&mut { EXAMPLE_DATA.speaker_id } as *mut i64,
audio_feature.as_mut_ptr(),
));
let full_length = EXAMPLE_DATA.intermediate.f0_length as usize;
let pitch = EXAMPLE_DATA.intermediate.feature_dim as usize;
for render_start in (0..full_length).step_by(10) {
// render_start .. render_end の音声を取得する
let render_end = min(render_start + 10, full_length);
let slice_start = render_start;
let slice_end = render_end + 2 * EXAMPLE_DATA.intermediate.margin_width as usize;
let feature_segment = &audio_feature[slice_start * pitch..slice_end * pitch];
let slice_length = slice_end - slice_start;
let mut wave_segment_with_margin = vec![0.; 256 * slice_length];
assert!(lib.render_audio_segment(
slice_length as i64,
pitch as i64,
feature_segment.as_ptr() as *mut f32,
&mut { EXAMPLE_DATA.speaker_id } as *mut i64,
wave_segment_with_margin.as_mut_ptr(),
));
let wave_segment = &wave_segment_with_margin[256
* EXAMPLE_DATA.intermediate.margin_width as usize
..wave_segment_with_margin.len()
- 256 * EXAMPLE_DATA.intermediate.margin_width as usize];
wave[render_start * 256..render_end * 256].clone_from_slice(wave_segment);
}
wave
};

std::assert_eq!(SNAPSHOTS.metas, metas_json);

float_assert::close_l1(&phoneme_length, &EXAMPLE_DATA.duration.result, 0.01);
float_assert::close_l1(&intonation_list, &EXAMPLE_DATA.intonation.result, 0.01);

assert!(wave.iter().copied().all(f32::is_normal));
assert!(wave2.iter().copied().all(f32::is_normal));
assert!(wave3.iter().copied().all(f32::is_normal));
float_assert::close_l1(&wave2, &wave, 0.01);
float_assert::close_l1(&wave3, &wave, 0.01);
Yosshi999 marked this conversation as resolved.
Show resolved Hide resolved

lib.finalize();
Ok(())
Expand Down
Loading