From da18b35c3c911a40a5d2784947ce78610ce51daf Mon Sep 17 00:00:00 2001
From: Han-Wen Tsao
Date: Fri, 14 Jun 2024 13:24:26 +0800
Subject: [PATCH] Add neural speed example (#135)

* feat: add neural speed example

* feat: change finiSingle to unload

* fix: backend name
---
 wasmedge-neuralspeed/.gitignore  |  1 +
 wasmedge-neuralspeed/Cargo.toml  | 11 +++++
 wasmedge-neuralspeed/README.md   | 78 ++++++++++++++++++++++++++++++++
 wasmedge-neuralspeed/src/main.rs | 62 +++++++++++++++++++++++++
 4 files changed, 152 insertions(+)
 create mode 100644 wasmedge-neuralspeed/.gitignore
 create mode 100644 wasmedge-neuralspeed/Cargo.toml
 create mode 100644 wasmedge-neuralspeed/README.md
 create mode 100644 wasmedge-neuralspeed/src/main.rs

diff --git a/wasmedge-neuralspeed/.gitignore b/wasmedge-neuralspeed/.gitignore
new file mode 100644
index 0000000..ea8c4bf
--- /dev/null
+++ b/wasmedge-neuralspeed/.gitignore
@@ -0,0 +1 @@
+/target
diff --git a/wasmedge-neuralspeed/Cargo.toml b/wasmedge-neuralspeed/Cargo.toml
new file mode 100644
index 0000000..f9341cb
--- /dev/null
+++ b/wasmedge-neuralspeed/Cargo.toml
@@ -0,0 +1,11 @@
+[package]
+name = "wasmedge-neural-speed"
+version = "0.1.0"
+edition = "2021"
+
+# See more keys and their definitions at https://doc.rust-lang.org/cargo/reference/manifest.html
+
+[dependencies]
+tokenizers = { version = "0.19.1", features = ["unstable_wasm"], default-features = false }
+serde_json = "1.0"
+wasmedge-wasi-nn = "0.7.1"
diff --git a/wasmedge-neuralspeed/README.md b/wasmedge-neuralspeed/README.md
new file mode 100644
index 0000000..4cc555f
--- /dev/null
+++ b/wasmedge-neuralspeed/README.md
@@ -0,0 +1,78 @@
+# Neural chat example with WasmEdge WASI-NN Neural Speed plugin
+
+This example demonstrates how to use the WasmEdge WASI-NN Neural Speed plugin to perform an inference task with the Neural Chat model.
+
+## Install WasmEdge with the WASI-NN Neural Speed plugin
+
+The Neural Speed backend relies on the Neural Speed Python package; we recommend installing it with the following commands:
+
+``` bash
+sudo apt update
+sudo apt upgrade
+sudo apt install python3-dev
+wget https://raw.githubusercontent.com/intel/neural-speed/main/requirements.txt
+pip install -r requirements.txt
+pip install neural-speed
+```
+
+Then build and install WasmEdge from source:
+
+``` bash
+cd <path/to/your/wasmedge/source/folder>
+
+cmake -GNinja -Bbuild -DCMAKE_BUILD_TYPE=Release -DWASMEDGE_PLUGIN_WASI_NN_BACKEND="neuralspeed"
+cmake --build build
+
+# For the WASI-NN plugin, you should install this project.
+cmake --install build
+```
+
+After installation you will have the `wasmedge` runtime under `/usr/local/bin` and the WASI-NN plug-in with the Neural Speed backend under `/usr/local/lib/wasmedge/libwasmedgePluginWasiNN.so`.
+
+## Model Download Link
+
+In this example, we will use the neural-chat-7b-v3-1.Q4_0 model in GGUF format.
+
+``` bash
+# Download the model weights
+wget https://huggingface.co/TheBloke/neural-chat-7B-v3-1-GGUF/resolve/main/neural-chat-7b-v3-1.Q4_0.gguf
+# Download the tokenizer
+wget https://huggingface.co/Intel/neural-chat-7b-v3-1/raw/main/tokenizer.json -O neural-chat-tokenizer.json
+```
+
+## Build wasm
+
+Run the following command to build the wasm; the output WASM file will be at `target/wasm32-wasi/release/`:
+
+```bash
+cargo build --target wasm32-wasi --release
+```
+
+## Execute
+
+Execute the WASM with `wasmedge`, using `--nn-preload` to load the model.
+
+```bash
+wasmedge --dir .:. \
+  --nn-preload default:NeuralSpeed:AUTO:neural-chat-7b-v3-1.Q4_0.gguf \
+  ./target/wasm32-wasi/release/wasmedge-neural-speed.wasm default
+```
+
+## Other
+
+You can change `tokenizer_path` to the path of your own tokenizer file.
+
+``` rust
+let tokenizer_path = "neural-chat-tokenizer.json";
+```
+
+`prompt` is the default model input.
+
+``` rust
+let prompt = "Once upon a time, there existed a little girl,";
+```
+
+If your model type is not llama, you can set the `model_type` parameter to load a different model.
+
+``` rust
+let graph = GraphBuilder::new(GraphEncoding::NeuralSpeed, ExecutionTarget::AUTO)
+    .config(serde_json::to_string(&json!({"model_type": "mistral"})).expect("Failed to serialize options"))
+```
\ No newline at end of file
diff --git a/wasmedge-neuralspeed/src/main.rs b/wasmedge-neuralspeed/src/main.rs
new file mode 100644
index 0000000..1c26b3e
--- /dev/null
+++ b/wasmedge-neuralspeed/src/main.rs
@@ -0,0 +1,62 @@
+use serde_json::json;
+use std::env;
+use tokenizers::tokenizer::Tokenizer;
+use wasmedge_wasi_nn::{
+    self, ExecutionTarget, GraphBuilder, GraphEncoding, GraphExecutionContext, TensorType,
+};
+
+// Read the raw bytes of the output tensor at `index` from the execution context.
+fn get_data_from_context(context: &GraphExecutionContext, index: usize) -> Vec<u8> {
+    // Reserve room for 4096 tokens with an average of 8 bytes per token.
+    const MAX_OUTPUT_BUFFER_SIZE: usize = 4096 * 8;
+    let mut output_buffer = vec![0u8; MAX_OUTPUT_BUFFER_SIZE];
+    let output_size = context
+        .get_output(index, &mut output_buffer)
+        .expect("Failed to get output");
+    output_buffer.truncate(output_size);
+
+    output_buffer
+}
+
+fn get_output_from_context(context: &GraphExecutionContext) -> Vec<u8> {
+    get_data_from_context(context, 0)
+}
+
+fn main() {
+    let tokenizer_path = "neural-chat-tokenizer.json";
+    let prompt = "Once upon a time, there existed a little girl,";
+    let args: Vec<String> = env::args().collect();
+    let model_name: &str = &args[1];
+
+    // Tokenize the prompt and encode each token ID as a little-endian u64 (8 bytes).
+    let tokenizer: Tokenizer = Tokenizer::from_file(tokenizer_path).unwrap();
+    let encoding = tokenizer.encode(prompt, true).unwrap();
+    let inputs = encoding.get_ids();
+    let mut tensor_data: Vec<u8> = Vec::with_capacity(inputs.len() * 8);
+    for &val in inputs {
+        tensor_data.extend_from_slice(&u64::from(val).to_le_bytes());
+    }
+
+    // Build the graph with the Neural Speed backend and run inference on the prompt.
+    let graph = GraphBuilder::new(GraphEncoding::NeuralSpeed, ExecutionTarget::AUTO)
+        .config(serde_json::to_string(&json!({"model_type": "mistral"})).expect("Failed to serialize options"))
+        .build_from_cache(model_name)
+        .expect("Failed to build graph");
+    let mut context = graph
+        .init_execution_context()
+        .expect("Failed to init context");
+    context
+        .set_input(0, TensorType::U8, &[1], &tensor_data)
+        .expect("Failed to set input");
+    context.compute().expect("Failed to compute");
+
+    // Decode the output bytes (one little-endian u64 per token) back into token IDs.
+    let output_bytes = get_output_from_context(&context);
+    let output_id: Vec<u32> = output_bytes
+        .chunks(8)
+        .map(|chunk| {
+            chunk
+                .iter()
+                .enumerate()
+                .fold(0u64, |acc, (i, &byte)| acc + ((byte as u64) << (i * 8))) as u32
+        })
+        .collect();
+    let output = tokenizer.decode(&output_id, true).unwrap();
+    println!("{}", output);
+
+    graph.unload().expect("Failed to free resource");
+}
\ No newline at end of file
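
A note on the tensor byte layout used by `src/main.rs` above: the example writes token IDs into the input tensor as little-endian `u64` values (8 bytes per token) and reads generated token IDs back from the output tensor in the same layout. Below is a minimal, self-contained Rust sketch of that round trip; it uses only the standard library, makes no WASI-NN calls, and the token IDs are arbitrary placeholders rather than real tokenizer output.

```rust
fn main() {
    // Placeholder token IDs standing in for tokenizer output (u32 values).
    let ids: Vec<u32> = vec![1, 6804, 2897];

    // Encode: each ID becomes 8 little-endian bytes, the same layout the
    // example passes to set_input.
    let tensor_data: Vec<u8> = ids
        .iter()
        .flat_map(|&id| u64::from(id).to_le_bytes())
        .collect();
    assert_eq!(tensor_data.len(), ids.len() * 8);

    // Decode: turn each 8-byte chunk back into a u64 and narrow to u32, the
    // same transformation the example applies to the output buffer.
    let decoded: Vec<u32> = tensor_data
        .chunks(8)
        .map(|chunk| u64::from_le_bytes(chunk.try_into().unwrap()) as u32)
        .collect();

    assert_eq!(decoded, ids);
    println!("round trip ok: {:?}", decoded);
}
```

The byte-wise `fold` in `main.rs` and the `u64::from_le_bytes` used here compute the same value; either form works as long as both sides agree on the little-endian layout.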