From de9c3e678ca37b8309e1744ae1a212d9009ff3c9 Mon Sep 17 00:00:00 2001 From: PeterD1524 <53310459+PeterD1524@users.noreply.github.com> Date: Wed, 7 Aug 2024 04:40:11 +0800 Subject: [PATCH] [Example] Add piper example (#145) * Add piper example Signed-off-by: PeterD1524 * update dependencies Signed-off-by: PeterD1524 * fix typo Signed-off-by: PeterD1524 * Add a GitHub workflow to ensure that the example can run successfully Signed-off-by: PeterD1524 * simplify layout Signed-off-by: PeterD1524 * Provide a simple explanation of the related dependency installation Signed-off-by: PeterD1524 * add config description Signed-off-by: PeterD1524 * use -DWASMEDGE_USE_LLVM=OFF to disable all AOT-related components Signed-off-by: PeterD1524 * ask users to download and install the onnx runtime Signed-off-by: PeterD1524 * add sudo for mv Signed-off-by: PeterD1524 * ldconfig for installing onnxruntime Signed-off-by: PeterD1524 * use the install script from the WasmEdge repo to install ONNX Runtime Signed-off-by: PeterD1524 --------- Signed-off-by: PeterD1524 --- .github/workflows/piper.yml | 68 ++++++++++++++ wasmedge-piper/Cargo.toml | 10 ++ wasmedge-piper/README.md | 126 ++++++++++++++++++++++++++ wasmedge-piper/config.schema.json | 68 ++++++++++++++ wasmedge-piper/dependencies.d2 | 7 ++ wasmedge-piper/dependencies.svg | 99 ++++++++++++++++++++ wasmedge-piper/json_input.schema.json | 20 ++++ wasmedge-piper/src/main.rs | 31 +++++++ 8 files changed, 429 insertions(+) create mode 100644 .github/workflows/piper.yml create mode 100644 wasmedge-piper/Cargo.toml create mode 100644 wasmedge-piper/README.md create mode 100644 wasmedge-piper/config.schema.json create mode 100644 wasmedge-piper/dependencies.d2 create mode 100644 wasmedge-piper/dependencies.svg create mode 100644 wasmedge-piper/json_input.schema.json create mode 100644 wasmedge-piper/src/main.rs diff --git a/.github/workflows/piper.yml b/.github/workflows/piper.yml new file mode 100644 index 0000000..402dd7e --- /dev/null 
+++ b/.github/workflows/piper.yml @@ -0,0 +1,68 @@ +name: Piper Example + +on: + schedule: + - cron: "0 0 * * *" + push: + paths: + - ".github/workflows/piper.yml" + - "wasmedge-piper/**" + pull_request: + paths: + - ".github/workflows/piper.yml" + - "wasmedge-piper/**" + +jobs: + build: + runs-on: ubuntu-22.04 + steps: + - name: Install Dependencies for building WasmEdge + run: | + sudo apt-get update + sudo apt-get install ninja-build + + - name: Checkout WasmEdge + uses: actions/checkout@v4 + with: + repository: WasmEdge/WasmEdge + path: WasmEdge + + - name: Install ONNX Runtime + run: sudo bash utils/wasi-nn/install-onnxruntime.sh + working-directory: WasmEdge + + - name: Build WasmEdge with WASI-NN Piper plugin + run: | + cmake -GNinja -Bbuild -DCMAKE_BUILD_TYPE=Release -DWASMEDGE_USE_LLVM=OFF -DWASMEDGE_PLUGIN_WASI_NN_BACKEND=Piper + cmake --build build + working-directory: WasmEdge + + - name: Install Rust target for wasm + run: rustup target add wasm32-wasi + + - name: Checkout WasmEdge-WASINN-examples + uses: actions/checkout@v4 + with: + path: WasmEdge-WASINN-examples + + - name: Build wasm + run: cargo build --target wasm32-wasi --release + working-directory: WasmEdge-WASINN-examples/wasmedge-piper + + - name: Download model + run: curl -LO https://huggingface.co/rhasspy/piper-voices/resolve/main/en/en_US/lessac/medium/en_US-lessac-medium.onnx + + - name: Download config + run: curl -LO https://huggingface.co/rhasspy/piper-voices/resolve/main/en/en_US/lessac/medium/en_US-lessac-medium.onnx.json + + - name: Download espeak-ng-data + run: | + curl -LO https://github.com/rhasspy/piper/releases/download/2023.11.14-2/piper_linux_x86_64.tar.gz + tar -xzf piper_linux_x86_64.tar.gz piper/espeak-ng-data --strip-components=1 + rm piper_linux_x86_64.tar.gz + + - name: Execute + run: WASMEDGE_PLUGIN_PATH=WasmEdge/build/plugins/wasi_nn WasmEdge/build/tools/wasmedge/wasmedge --dir .:. 
WasmEdge-WASINN-examples/wasmedge-piper/target/wasm32-wasi/release/wasmedge-piper.wasm + + - name: Verify output + run: test "$(file --brief welcome.wav)" == 'RIFF (little-endian) data, WAVE audio, Microsoft PCM, 16 bit, mono 22050 Hz' diff --git a/wasmedge-piper/Cargo.toml b/wasmedge-piper/Cargo.toml new file mode 100644 index 0000000..d791bec --- /dev/null +++ b/wasmedge-piper/Cargo.toml @@ -0,0 +1,10 @@ +[package] +name = "wasmedge-piper" +version = "0.1.0" +edition = "2021" + +# See more keys and their definitions at https://doc.rust-lang.org/cargo/reference/manifest.html + +[dependencies] +serde_json = "1.0.120" +wasmedge-wasi-nn = "0.8.0" diff --git a/wasmedge-piper/README.md b/wasmedge-piper/README.md new file mode 100644 index 0000000..dbc9a38 --- /dev/null +++ b/wasmedge-piper/README.md @@ -0,0 +1,126 @@ +# Text to speech example with WasmEdge WASI-NN Piper plugin + +This example demonstrates how to use the WasmEdge WASI-NN Piper plugin to perform TTS. + +## Build WasmEdge with WASI-NN Piper plugin + +Overview of WASI-NN Piper plugin dependencies: + +![d2 --layout elk dependencies.d2 dependencies.svg](dependencies.svg) + +- [piper](https://github.com/rhasspy/piper): A fast, local neural text to speech system. +- [piper-phonemize](https://github.com/rhasspy/piper-phonemize): C++ library for converting text to phonemes for Piper. +- [espeak-ng](https://github.com/rhasspy/espeak-ng): An open source speech synthesizer that supports more than a hundred languages and accents. Piper uses it for text-to-phoneme translation. +- [onnxruntime](https://github.com/microsoft/onnxruntime): A cross-platform inference and training machine-learning accelerator. [ONNX](https://onnx.ai/) is an open format built to represent machine learning models. Piper uses ONNX Runtime as an inference backend for its ONNX models to convert phoneme ids to WAV audio. + +The WasmEdge WASI-NN Piper plugin relies on the ONNX Runtime C++ API. 
For installation instructions, please refer to the installation table on the [official website](https://onnxruntime.ai/getting-started). + +Example of installing ONNX Runtime 1.14.1 on Ubuntu: + +```bash +curl -LO https://github.com/microsoft/onnxruntime/releases/download/v1.14.1/onnxruntime-linux-x64-1.14.1.tgz +tar zxf onnxruntime-linux-x64-1.14.1.tgz +mv onnxruntime-linux-x64-1.14.1/include/* /usr/local/include/ +mv onnxruntime-linux-x64-1.14.1/lib/* /usr/local/lib/ +rm -rf onnxruntime-linux-x64-1.14.1.tgz onnxruntime-linux-x64-1.14.1 +ldconfig +``` + +For other dependencies, WasmEdge will download and build them automatically. + +Build WasmEdge from source: + +```bash +cd /path/to/wasmedge/source/folder + +cmake -GNinja -Bbuild -DCMAKE_BUILD_TYPE=Release -DWASMEDGE_USE_LLVM=OFF -DWASMEDGE_PLUGIN_WASI_NN_BACKEND=Piper +cmake --build build +``` + +Then you will have an executable `wasmedge` runtime at `build/tools/wasmedge/wasmedge` and the WASI-NN with Piper backend plug-in at `build/plugins/wasi_nn/libwasmedgePluginWasiNN.so`. + +## Model Download Link + +In this example, we will use the [en_US-lessac-medium](https://huggingface.co/rhasspy/piper-voices/tree/main/en/en_US/lessac/medium) model. + +[MODEL CARD](https://huggingface.co/rhasspy/piper-voices/blob/main/en/en_US/lessac/medium/MODEL_CARD): + +``` +# Model card for lessac (medium) + +* Language: en_US (English, United States) +* Speakers: 1 +* Quality: medium +* Samplerate: 22,050Hz + +## Dataset + +* URL: https://www.cstr.ed.ac.uk/projects/blizzard/2013/lessac_blizzard2013/ +* License: https://www.cstr.ed.ac.uk/projects/blizzard/2013/lessac_blizzard2013/license.html + +## Training + +Trained from scratch. 
+ +``` + +It has a model file [en_US-lessac-medium.onnx](https://huggingface.co/rhasspy/piper-voices/resolve/main/en/en_US/lessac/medium/en_US-lessac-medium.onnx) and a config file [en_US-lessac-medium.onnx.json](https://huggingface.co/rhasspy/piper-voices/resolve/main/en/en_US/lessac/medium/en_US-lessac-medium.onnx.json). + +```bash +# Download model +curl -LO https://huggingface.co/rhasspy/piper-voices/resolve/main/en/en_US/lessac/medium/en_US-lessac-medium.onnx +# Download config +curl -LO https://huggingface.co/rhasspy/piper-voices/resolve/main/en/en_US/lessac/medium/en_US-lessac-medium.onnx.json +``` + +This model uses [eSpeak NG](https://github.com/rhasspy/espeak-ng) to convert text to phonemes, so we also need to download the required espeak-ng-data. + +This will download and extract the espeak-ng-data directory to the current working directory: + +```bash +curl -LO https://github.com/rhasspy/piper/releases/download/2023.11.14-2/piper_linux_x86_64.tar.gz +tar -xzf piper_linux_x86_64.tar.gz piper/espeak-ng-data --strip-components=1 +``` + +## Build wasm + +Run the following command to build the WASM file; the output will be at `target/wasm32-wasi/release/`. + +```bash +cargo build --target wasm32-wasi --release +``` + +## Execute + +Execute the WASM file with `wasmedge`. + +```bash +WASMEDGE_PLUGIN_PATH=/path/to/parent/directory/of/libwasmedgePluginWasiNN.so /path/to/wasmedge --dir .:. /path/to/wasm +``` + +Example layout: + +``` +. +├── en_US-lessac-medium.onnx +├── en_US-lessac-medium.onnx.json +├── espeak-ng-data/ +├── WasmEdge/build/ +│ ├── plugins/wasi_nn/libwasmedgePluginWasiNN.so +│ └── tools/wasmedge/wasmedge +└── WasmEdge-WASINN-examples/wasmedge-piper/target/wasm32-wasi/release/wasmedge-piper.wasm +``` + +Then the command will be: + +```bash +WASMEDGE_PLUGIN_PATH=WasmEdge/build/plugins/wasi_nn WasmEdge/build/tools/wasmedge/wasmedge --dir .:. 
WasmEdge-WASINN-examples/wasmedge-piper/target/wasm32-wasi/release/wasmedge-piper.wasm +``` + +The output `welcome.wav` is the synthesized audio. + +## Config options + +The JSON config options passed to the WasmEdge WASI-NN Piper plugin via `bytes_array` in `wasmedge_wasi_nn::GraphBuilder::build_from_bytes` are similar to the Piper command-line program options. + +See [config.schema.json](config.schema.json) for available options and [json_input.schema.json](json_input.schema.json) for JSON input. diff --git a/wasmedge-piper/config.schema.json b/wasmedge-piper/config.schema.json new file mode 100644 index 0000000..bf71bae --- /dev/null +++ b/wasmedge-piper/config.schema.json @@ -0,0 +1,68 @@ +{ + "$schema": "http://json-schema.org/draft-07/schema#", + "properties": { + "model": { + "description": "Path to .onnx voice file", + "type": "string" + }, + "config": { + "description": "Path to JSON voice config file, default is model path + .json", + "type": "string" + }, + "output_type": { + "default": "wav", + "description": "Type of output to produce", + "enum": [ + "raw", + "wav" + ] + }, + "speaker": { + "default": 0, + "description": "Numerical id of the default speaker (multi-speaker voices)", + "type": "number" + }, + "noise_scale": { + "default": 0.667, + "description": "Amount of noise to add during audio generation, default value can be overridden by the value in voice model config", + "type": "number" + }, + "length_scale": { + "default": 1.0, + "description": "Speed of speaking (1 = normal, < 1 is faster, > 1 is slower), default value can be overridden by the value in voice model config", + "type": "number" + }, + "noise_w": { + "default": 0.8, + "description": "Variation in phoneme lengths, default value can be overridden by the value in voice model config", + "type": "number" + }, + "sentence_silence": { + "default": 0.2, + "description": "Seconds of silence to add after each sentence", + "type": "number" + }, + "espeak_data": { + "description": "Path to 
espeak-ng data directory, required for espeak phonemes", + "type": "string" + }, + "tashkeel_model": { + "description": "Path to libtashkeel ort model (https://github.com/mush42/libtashkeel), required for Arabic", + "type": "string" + }, + "json_input": { + "default": false, + "description": "input is JSON instead of text", + "type": "boolean" + }, + "phoneme_silence": { + "additionalProperties": { + "type": "number" + }, + "description": "Seconds of extra silence to insert after a single phoneme, this is a mapping from single codepoints to seconds" + } + }, + "required": [ + "model" + ] +} \ No newline at end of file diff --git a/wasmedge-piper/dependencies.d2 b/wasmedge-piper/dependencies.d2 new file mode 100644 index 0000000..3594ed5 --- /dev/null +++ b/wasmedge-piper/dependencies.d2 @@ -0,0 +1,7 @@ +direction: right +WasmEdge WASI-NN Piper plugin -> piper +piper -> piper-phonemize +piper -> espeak-ng +piper -> onnxruntime +piper-phonemize -> espeak-ng +piper-phonemize -> onnxruntime diff --git a/wasmedge-piper/dependencies.svg b/wasmedge-piper/dependencies.svg new file mode 100644 index 0000000..ec78295 --- /dev/null +++ b/wasmedge-piper/dependencies.svg @@ -0,0 +1,99 @@ +WasmEdge WASI-NN Piper pluginpiperpiper-phonemizeespeak-ngonnxruntime + + + + + + + diff --git a/wasmedge-piper/json_input.schema.json b/wasmedge-piper/json_input.schema.json new file mode 100644 index 0000000..3e735c7 --- /dev/null +++ b/wasmedge-piper/json_input.schema.json @@ -0,0 +1,20 @@ +{ + "$schema": "http://json-schema.org/draft-07/schema#", + "properties": { + "text": { + "description": "Text input for speech synthesis", + "type": "string" + }, + "speaker_id": { + "description": "Override the default speaker id, takes precedence over speaker", + "type": "number" + }, + "speaker": { + "description": "Override the default speaker by name", + "type": "string" + } + }, + "required": [ + "text" + ] +} \ No newline at end of file diff --git a/wasmedge-piper/src/main.rs 
b/wasmedge-piper/src/main.rs new file mode 100644 index 0000000..3d06e13 --- /dev/null +++ b/wasmedge-piper/src/main.rs @@ -0,0 +1,31 @@ +fn main() { + // create graph with the config + let config = serde_json::json!({ + "model": "en_US-lessac-medium.onnx", // path to .onnx voice file, required + "config": "en_US-lessac-medium.onnx.json", // path to JSON voice config file, optional, default is model path + .json + "espeak_data": "espeak-ng-data", // path to espeak-ng data directory, required for espeak phonemes + }); + let graph = wasmedge_wasi_nn::GraphBuilder::new( + wasmedge_wasi_nn::GraphEncoding::Piper, + wasmedge_wasi_nn::ExecutionTarget::CPU, + ) + .build_from_bytes([config.to_string()]) + .unwrap(); + + let mut context = graph.init_execution_context().unwrap(); + + // set the input text + let text = "Welcome to the world of speech synthesis!"; + context + .set_input(0, wasmedge_wasi_nn::TensorType::U8, &[1], text.as_bytes()) + .unwrap(); + + // synthesize the audio + context.compute().unwrap(); + + // retrieve the output, output is wav by default + let mut out_buffer = vec![0u8; 1 << 20]; + let size = context.get_output(0, &mut out_buffer).unwrap(); + + std::fs::write("welcome.wav", &out_buffer[..size]).unwrap(); +}