Commit
* feat: add neural speed example
* feat: change finiSingle to unload
* fix: backend name
Showing 4 changed files with 152 additions and 0 deletions.
@@ -0,0 +1 @@
/target
@@ -0,0 +1,11 @@
[package]
name = "wasmedge-neural-speed"
version = "0.1.0"
edition = "2021"

# See more keys and their definitions at https://doc.rust-lang.org/cargo/reference/manifest.html

[dependencies]
tokenizers = { version = "0.19.1", features = ["unstable_wasm"], default-features = false }
serde_json = "1.0"
wasmedge-wasi-nn = "0.7.1"
@@ -0,0 +1,78 @@
# Neural chat example with WasmEdge WASI-NN Neural Speed plugin

This example demonstrates how to use the WasmEdge WASI-NN Neural Speed plugin to perform an inference task with the Neural chat model.

## Install WasmEdge with WASI-NN Neural Speed plugin

The Neural Speed backend relies on Neural Speed; we recommend installing it with the following commands.

``` bash
sudo apt update
sudo apt upgrade
sudo apt install python3-dev
wget https://raw.githubusercontent.com/intel/neural-speed/main/requirements.txt
pip install -r requirements.txt
pip install neural-speed
```
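
As a quick sanity check of the Python side, you can try importing the package; this is a sketch assuming the `neural-speed` pip package exposes the module name `neural_speed`:

```bash
# Assumption: the pip package "neural-speed" installs the Python module "neural_speed".
python3 -c "import neural_speed; print('neural-speed import OK')"
```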

Then build and install WasmEdge from source:

``` bash
cd <path/to/your/wasmedge/source/folder>

cmake -GNinja -Bbuild -DCMAKE_BUILD_TYPE=Release -DWASMEDGE_PLUGIN_WASI_NN_BACKEND="neuralspeed"
cmake --build build

# For the WASI-NN plugin, you should install this project.
cmake --install build
```

Then you will have an executable `wasmedge` runtime under `/usr/local/bin` and the WASI-NN plugin with the Neural Speed backend under `/usr/local/lib/wasmedge/libwasmedgePluginWasiNN.so` after installation.
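
As a quick check that the runtime and plugin landed where expected (paths as above; adjust if you installed to a different prefix):

```bash
# Confirm the runtime is on PATH and the WASI-NN plugin shared library exists.
wasmedge --version
ls /usr/local/lib/wasmedge/libwasmedgePluginWasiNN.so
```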

## Model Download Link

In this example, we will use the neural-chat-7b-v3-1.Q4_0 model in GGUF format.

``` bash
# Download model weight
wget https://huggingface.co/TheBloke/neural-chat-7B-v3-1-GGUF/resolve/main/neural-chat-7b-v3-1.Q4_0.gguf
# Download tokenizer
wget https://huggingface.co/Intel/neural-chat-7b-v3-1/raw/main/tokenizer.json -O neural-chat-tokenizer.json
```

## Build wasm

Run the following command to build the WASM module; the output file will be at `target/wasm32-wasi/release/`.

```bash
cargo build --target wasm32-wasi --release
```
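
If the `wasm32-wasi` target is not already installed for your Rust toolchain, add it first (standard rustup usage, not specific to this example):

```bash
rustup target add wasm32-wasi
```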

## Execute

Execute the WASM with `wasmedge`, using `--nn-preload` to load the model.

```bash
wasmedge --dir .:. \
  --nn-preload default:NeuralSpeed:AUTO:neural-chat-7b-v3-1.Q4_0.gguf \
  ./target/wasm32-wasi/release/wasmedge-neural-speed.wasm default
```
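
Reading the command above, the `--nn-preload` value follows the pattern `alias:backend:target:model-file`, and the alias (`default`) is also passed to the WASM module as its first argument, which the example program hands to `build_from_cache`. A sketch with a hypothetical alias and model file name, assuming the same pattern holds:

```bash
# Hypothetical names for illustration only: alias "my-chat", weights "my-model.Q4_0.gguf".
wasmedge --dir .:. \
  --nn-preload my-chat:NeuralSpeed:AUTO:my-model.Q4_0.gguf \
  ./target/wasm32-wasi/release/wasmedge-neural-speed.wasm my-chat
```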

## Other

You can change `tokenizer_path` to point to your own tokenizer file.

``` rust
let tokenizer_path = "neural-chat-tokenizer.json";
```

The `prompt` variable is the default model input.

``` rust
let prompt = "Once upon a time, there existed a little girl,";
```

If your model type is not llama, you can set the `model_type` parameter to load a different model.

``` rust
let graph = GraphBuilder::new(GraphEncoding::NeuralSpeed, ExecutionTarget::AUTO)
    .config(serde_json::to_string(&json!({"model_type": "mistral"})).expect("Failed to serialize options"))
```
@@ -0,0 +1,62 @@ | ||
use tokenizers::tokenizer::Tokenizer; | ||
use serde_json::json; | ||
use wasmedge_wasi_nn::{ | ||
self, ExecutionTarget, GraphBuilder, GraphEncoding, GraphExecutionContext, | ||
TensorType, | ||
}; | ||
use std::env; | ||
fn get_data_from_context(context: &GraphExecutionContext, index: usize) -> Vec<u8> { | ||
// Preserve for 4096 tokens with average token length 8 | ||
const MAX_OUTPUT_BUFFER_SIZE: usize = 4096 * 8; | ||
let mut output_buffer = vec![0u8; MAX_OUTPUT_BUFFER_SIZE]; | ||
let _ = context | ||
.get_output(index, &mut output_buffer) | ||
.expect("Failed to get output"); | ||
|
||
return output_buffer; | ||
} | ||
|
||
fn get_output_from_context(context: &GraphExecutionContext) -> Vec<u8> { | ||
get_data_from_context(context, 0) | ||
} | ||
fn main() { | ||
let tokenizer_path = "neural-chat-tokenizer.json"; | ||
let prompt = "Once upon a time, there existed a little girl,"; | ||
let args: Vec<String> = env::args().collect(); | ||
let model_name: &str = &args[1]; | ||
let tokenizer:Tokenizer = Tokenizer::from_file(tokenizer_path).unwrap(); | ||
let encoding = tokenizer.encode(prompt, true).unwrap(); | ||
let inputs = encoding.get_ids(); | ||
let mut tensor_data: Vec<u8> = Vec::with_capacity(inputs.len() * 8); | ||
|
||
for &val in inputs { | ||
let mut bytes = u64::from(val).to_be_bytes(); | ||
bytes.reverse(); | ||
tensor_data.extend_from_slice(&bytes); | ||
} | ||
let graph = GraphBuilder::new(GraphEncoding::NeuralSpeed, ExecutionTarget::AUTO) | ||
.config(serde_json::to_string(&json!({"model_type": "mistral"})).expect("Failed to serialize options")) | ||
.build_from_cache(model_name) | ||
.expect("Failed to build graph"); | ||
let mut context = graph | ||
.init_execution_context() | ||
.expect("Failed to init context"); | ||
context | ||
.set_input(0, TensorType::U8, &[1], &tensor_data) | ||
.expect("Failed to set input"); | ||
context.compute().expect("Failed to compute"); | ||
let output_bytes = get_output_from_context(&context); | ||
let output_id:Vec<u32> = output_bytes | ||
.chunks(8) | ||
.map(|chunk| { | ||
chunk | ||
.iter() | ||
.enumerate() | ||
.fold(0u64, |acc, (i, &byte)| acc + ((byte as u64) << (i * 8))) as u32 | ||
}) | ||
.collect(); | ||
let output = tokenizer.decode(&output_id, true).unwrap(); | ||
println!("{}", output); | ||
graph.unload().expect("Failed to free resource"); | ||
|
||
} |