diff --git a/whisper-basic/Cargo.toml b/whisper-basic/Cargo.toml
new file mode 100644
index 0000000..06938a2
--- /dev/null
+++ b/whisper-basic/Cargo.toml
@@ -0,0 +1,7 @@
+[package]
+name = "whisper-basic"
+version = "0.1.0"
+edition = "2021"
+
+[dependencies]
+wasmedge-wasi-nn = "0.8.0"
diff --git a/whisper-basic/README.md b/whisper-basic/README.md
new file mode 100644
index 0000000..100d695
--- /dev/null
+++ b/whisper-basic/README.md
@@ -0,0 +1,78 @@
+# Basic Example For WASI-NN with Whisper Backend
+
+This example performs basic audio recognition with the WASI-NN Whisper backend in WasmEdge.
+Currently, WasmEdge implements the Whisper backend of WASI-NN for English only. We'll extend more options in the future.
+
+## Dependencies
+
+This crate depends on `wasmedge-wasi-nn`, as declared in `Cargo.toml`:
+
+```toml
+[dependencies]
+wasmedge-wasi-nn = "0.8.0"
+```
+
+## Build
+
+Compile the application to WebAssembly:
+
+```bash
+cargo build --target=wasm32-wasi --release
+```
+
+The output WASM file will be at [`target/wasm32-wasi/release/whisper-basic.wasm`](whisper-basic.wasm).
+To speed up the processing, we can enable the AOT mode in WasmEdge with:
+
+```bash
+wasmedge compile target/wasm32-wasi/release/whisper-basic.wasm whisper-basic_aot.wasm
+```
+
+## Run
+
+### Test data
+
+The test audio is located at `./test.wav`.
+
+Users should download the model by following the guide in the [whisper.cpp repository](https://github.com/ggerganov/whisper.cpp/tree/master/models):
+
+```bash
+curl -sSf https://raw.githubusercontent.com/ggerganov/whisper.cpp/master/models/download-ggml-model.sh | bash -s -- base.en
+```
+
+The model will be stored at `./ggml-base.en.bin`.
+
+### Input Audio
+
+The WASI-NN whisper backend for WasmEdge currently supports 16 kHz, 1-channel audio in the `pcm_s16le` format.
+
+Users can convert their input audio with the following `ffmpeg` command:
+
+```bash
+ffmpeg -i test.m4a -acodec pcm_s16le -ac 1 -ar 16000 test.wav
+```
+
+### Execute
+
+> Note: This is prepared for `0.14.2` or a later release in the future. For now, please build from source.
+
+Users should [install WasmEdge with the WASI-NN plug-in for the Whisper backend](https://wasmedge.org/docs/start/install/#wasi-nn-plug-ins).
+
+```bash
+curl -sSf https://raw.githubusercontent.com/WasmEdge/WasmEdge/master/utils/install.sh | bash -s -- --plugins wasi_nn-whisper
+```
+
+Execute the WASM with `wasmedge` and the WASI-NN plug-in:
+
+```bash
+wasmedge --dir .:. whisper-basic_aot.wasm ggml-base.en.bin test.wav
+```
+
+You will get the recognized text from the audio file in the output:
+
+```bash
+Read model, size in bytes: 147964211
+Loaded graph into wasi-nn with ID: Graph#0
+Read input tensor, size in bytes: 141408
+Recognized from audio:
+[00:00:00.000 --> 00:00:04.300] This is a test record for whisper.cpp
+```
diff --git a/whisper-basic/src/main.rs b/whisper-basic/src/main.rs
new file mode 100644
index 0000000..8a5872b
--- /dev/null
+++ b/whisper-basic/src/main.rs
@@ -0,0 +1,60 @@
+use std::env;
+use std::error::Error;
+use std::fs;
+use wasmedge_wasi_nn::{ExecutionTarget, GraphBuilder, GraphEncoding, TensorType};
+
+/// Size in bytes of the buffer that receives the recognized text.
+const OUTPUT_BUFFER_SIZE: usize = 2048;
+
+/// Runs Whisper speech recognition on a WAV file through WASI-NN.
+///
+/// Usage: `whisper-basic <model.bin> <audio.wav>`
+///
+/// # Errors
+///
+/// Returns an error when an argument is missing, when either file cannot be
+/// read, or when any WASI-NN call fails.
+pub fn main() -> Result<(), Box<dyn Error>> {
+    let args: Vec<String> = env::args().collect();
+    // Report missing arguments as a usage error instead of panicking on an
+    // out-of-bounds index; extra arguments are still ignored.
+    let (model_bin_name, wav_name) = match args.as_slice() {
+        [_, model, wav, ..] => (model.as_str(), wav.as_str()),
+        _ => return Err("usage: whisper-basic <model.bin> <audio.wav>".into()),
+    };
+
+    let model_bin = fs::read(model_bin_name)?;
+    println!("Read model, size in bytes: {}", model_bin.len());
+
+    let graph = GraphBuilder::new(GraphEncoding::Whisper, ExecutionTarget::CPU)
+        .build_from_bytes(&[&model_bin])?;
+    let mut ctx = graph.init_execution_context()?;
+    println!("Loaded graph into wasi-nn with ID: {}", graph);
+
+    // Load the audio file as-is; its raw bytes are handed to the backend.
+    let wav_buf = fs::read(wav_name)?;
+    println!("Read input tensor, size in bytes: {}", wav_buf.len());
+
+    // Set input.
+    // NOTE(review): the tensor is declared as F32 although the data is the raw
+    // file bytes; presumably the Whisper backend ignores the type -- confirm.
+    ctx.set_input(0, TensorType::F32, &[1, wav_buf.len()], &wav_buf)?;
+
+    // Execute the inference.
+    ctx.compute()?;
+
+    // Retrieve the output, keeping only the bytes the backend actually wrote so
+    // that the zero padding of the buffer is not printed.
+    let mut output_buffer = vec![0u8; OUTPUT_BUFFER_SIZE];
+    let written = ctx.get_output(0, &mut output_buffer)?;
+    output_buffer.truncate(written);
+
+    // Lossy decoding: a transcript cut off mid-character should still print
+    // rather than abort the program.
+    println!(
+        "Recognized from audio: \n{}",
+        String::from_utf8_lossy(&output_buffer)
+    );
+
+    Ok(())
+}
diff --git a/whisper-basic/test.wav b/whisper-basic/test.wav
new file mode 100644
index 0000000..8de174e
Binary files /dev/null and b/whisper-basic/test.wav differ
diff --git a/whisper-basic/whisper-basic.wasm b/whisper-basic/whisper-basic.wasm
new file mode 100755
index 0000000..7c55cd9
Binary files /dev/null and b/whisper-basic/whisper-basic.wasm differ