From da18b35c3c911a40a5d2784947ce78610ce51daf Mon Sep 17 00:00:00 2001
From: Han-Wen Tsao
Date: Fri, 14 Jun 2024 13:24:26 +0800
Subject: [PATCH] Add neural speed example (#135)

* feat: add neural speed example

* feat: change finiSingle to unload

* fix: backend name
---
 wasmedge-neuralspeed/.gitignore  |  1 +
 wasmedge-neuralspeed/Cargo.toml  | 11 +++++
 wasmedge-neuralspeed/README.md   | 78 ++++++++++++++++++++++++++++++++
 wasmedge-neuralspeed/src/main.rs | 62 +++++++++++++++++++++++++
 4 files changed, 152 insertions(+)
 create mode 100644 wasmedge-neuralspeed/.gitignore
 create mode 100644 wasmedge-neuralspeed/Cargo.toml
 create mode 100644 wasmedge-neuralspeed/README.md
 create mode 100644 wasmedge-neuralspeed/src/main.rs

diff --git a/wasmedge-neuralspeed/.gitignore b/wasmedge-neuralspeed/.gitignore
new file mode 100644
index 0000000..ea8c4bf
--- /dev/null
+++ b/wasmedge-neuralspeed/.gitignore
@@ -0,0 +1 @@
+/target
diff --git a/wasmedge-neuralspeed/Cargo.toml b/wasmedge-neuralspeed/Cargo.toml
new file mode 100644
index 0000000..f9341cb
--- /dev/null
+++ b/wasmedge-neuralspeed/Cargo.toml
@@ -0,0 +1,11 @@
+[package]
+name = "wasmedge-neural-speed"
+version = "0.1.0"
+edition = "2021"
+
+# See more keys and their definitions at https://doc.rust-lang.org/cargo/reference/manifest.html
+
+[dependencies]
+tokenizers = { version = "0.19.1", features = ["unstable_wasm"], default-features = false }
+serde_json = "1.0"
+wasmedge-wasi-nn = "0.7.1"
diff --git a/wasmedge-neuralspeed/README.md b/wasmedge-neuralspeed/README.md
new file mode 100644
index 0000000..4cc555f
--- /dev/null
+++ b/wasmedge-neuralspeed/README.md
@@ -0,0 +1,78 @@
+# Neural chat example with WasmEdge WASI-NN Neural Speed plugin
+
+This example demonstrates how to use the WasmEdge WASI-NN Neural Speed plugin to perform an inference task with the Neural Chat model.
+
+## Install WasmEdge with the WASI-NN Neural Speed plugin
+
+The Neural Speed backend relies on the Neural Speed Python package; we recommend installing it with the following commands:
+
+``` bash
+sudo apt update
+sudo apt upgrade
+sudo apt install python3-dev
+wget https://raw.githubusercontent.com/intel/neural-speed/main/requirements.txt
+pip install -r requirements.txt
+pip install neural-speed
+```
+
+Then build and install WasmEdge from source:
+
+``` bash
+cd <path/to/your/wasmedge/source/folder>
+
+cmake -GNinja -Bbuild -DCMAKE_BUILD_TYPE=Release -DWASMEDGE_PLUGIN_WASI_NN_BACKEND="neuralspeed"
+cmake --build build
+
+# For the WASI-NN plugin, you should install this project.
+cmake --install build
+```
+
+After installation you will have the `wasmedge` runtime under `/usr/local/bin` and the WASI-NN plug-in with the Neural Speed backend under `/usr/local/lib/wasmedge/libwasmedgePluginWasiNN.so`.
+
+## Model Download Link
+
+In this example, we will use the neural-chat-7b-v3-1.Q4_0 model in GGUF format.
+
+``` bash
+# Download the model weights
+wget https://huggingface.co/TheBloke/neural-chat-7B-v3-1-GGUF/resolve/main/neural-chat-7b-v3-1.Q4_0.gguf
+# Download the tokenizer
+wget https://huggingface.co/Intel/neural-chat-7b-v3-1/raw/main/tokenizer.json -O neural-chat-tokenizer.json
+```
+
+## Build wasm
+
+Run the following command to build the wasm; the output WASM file will be at `target/wasm32-wasi/release/`:
+
+```bash
+cargo build --target wasm32-wasi --release
+```
+
+## Execute
+
+Execute the WASM with `wasmedge`, using `--nn-preload` to load the model.
+
+```bash
+wasmedge --dir .:. \
+  --nn-preload default:NeuralSpeed:AUTO:neural-chat-7b-v3-1.Q4_0.gguf \
+  ./target/wasm32-wasi/release/wasmedge-neural-speed.wasm default
+```
+
+## Other
+
+You can change `tokenizer_path` to the path of your own tokenizer file.
+
+``` rust
+let tokenizer_path = "neural-chat-tokenizer.json";
+```
+
+`prompt` is the default model input.
+
+``` rust
+let prompt = "Once upon a time, there existed a little girl,";
+```
+
+If your model type is not llama, you can set the `model_type` parameter to load a different model.
+
+``` rust
+let graph = GraphBuilder::new(GraphEncoding::NeuralSpeed, ExecutionTarget::AUTO)
+    .config(serde_json::to_string(&json!({"model_type": "mistral"})).expect("Failed to serialize options"))
+```
\ No newline at end of file
diff --git a/wasmedge-neuralspeed/src/main.rs b/wasmedge-neuralspeed/src/main.rs
new file mode 100644
index 0000000..1c26b3e
--- /dev/null
+++ b/wasmedge-neuralspeed/src/main.rs
@@ -0,0 +1,62 @@
+use serde_json::json;
+use std::env;
+use tokenizers::tokenizer::Tokenizer;
+use wasmedge_wasi_nn::{
+    self, ExecutionTarget, GraphBuilder, GraphEncoding, GraphExecutionContext, TensorType,
+};
+
+// Read the raw bytes of the output tensor at `index` from the execution context.
+fn get_data_from_context(context: &GraphExecutionContext, index: usize) -> Vec<u8> {
+    // Reserve room for 4096 tokens with an average of 8 bytes per token.
+    const MAX_OUTPUT_BUFFER_SIZE: usize = 4096 * 8;
+    let mut output_buffer = vec![0u8; MAX_OUTPUT_BUFFER_SIZE];
+    let output_size = context
+        .get_output(index, &mut output_buffer)
+        .expect("Failed to get output");
+    output_buffer.truncate(output_size);
+
+    output_buffer
+}
+
+fn get_output_from_context(context: &GraphExecutionContext) -> Vec<u8> {
+    get_data_from_context(context, 0)
+}
+
+fn main() {
+    let tokenizer_path = "neural-chat-tokenizer.json";
+    let prompt = "Once upon a time, there existed a little girl,";
+    let args: Vec<String> = env::args().collect();
+    let model_name: &str = &args[1];
+
+    // Tokenize the prompt and encode each token ID as a little-endian u64 (8 bytes).
+    let tokenizer: Tokenizer = Tokenizer::from_file(tokenizer_path).unwrap();
+    let encoding = tokenizer.encode(prompt, true).unwrap();
+    let inputs = encoding.get_ids();
+    let mut tensor_data: Vec<u8> = Vec::with_capacity(inputs.len() * 8);
+    for &val in inputs {
+        tensor_data.extend_from_slice(&u64::from(val).to_le_bytes());
+    }
+
+    // Build the graph with the Neural Speed backend and run inference on the prompt.
+    let graph = GraphBuilder::new(GraphEncoding::NeuralSpeed, ExecutionTarget::AUTO)
+        .config(serde_json::to_string(&json!({"model_type": "mistral"})).expect("Failed to serialize options"))
+        .build_from_cache(model_name)
+        .expect("Failed to build graph");
+    let mut context = graph
+        .init_execution_context()
+        .expect("Failed to init context");
+    context
+        .set_input(0, TensorType::U8, &[1], &tensor_data)
+        .expect("Failed to set input");
+    context.compute().expect("Failed to compute");
+
+    // Decode the output bytes (one little-endian u64 per token) back into token IDs.
+    let output_bytes = get_output_from_context(&context);
+    let output_id: Vec<u32> = output_bytes
+        .chunks(8)
+        .map(|chunk| {
+            chunk
+                .iter()
+                .enumerate()
+                .fold(0u64, |acc, (i, &byte)| acc + ((byte as u64) << (i * 8))) as u32
+        })
+        .collect();
+    let output = tokenizer.decode(&output_id, true).unwrap();
+    println!("{}", output);
+
+    graph.unload().expect("Failed to free resource");
+}
\ No newline at end of file
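
A note on the tensor byte layout used by `src/main.rs` above: the example writes token IDs into the input tensor as little-endian `u64` values (8 bytes per token) and reads generated token IDs back from the output tensor in the same layout. Below is a minimal, self-contained Rust sketch of that round trip; it uses only the standard library, makes no WASI-NN calls, and the token IDs are arbitrary placeholders rather than real tokenizer output.

```rust
fn main() {
    // Placeholder token IDs standing in for tokenizer output (u32 values).
    let ids: Vec<u32> = vec![1, 6804, 2897];

    // Encode: each ID becomes 8 little-endian bytes, the same layout the
    // example passes to set_input.
    let tensor_data: Vec<u8> = ids
        .iter()
        .flat_map(|&id| u64::from(id).to_le_bytes())
        .collect();
    assert_eq!(tensor_data.len(), ids.len() * 8);

    // Decode: turn each 8-byte chunk back into a u64 and narrow to u32, the
    // same transformation the example applies to the output buffer.
    let decoded: Vec<u32> = tensor_data
        .chunks(8)
        .map(|chunk| u64::from_le_bytes(chunk.try_into().unwrap()) as u32)
        .collect();

    assert_eq!(decoded, ids);
    println!("round trip ok: {:?}", decoded);
}
```

The byte-wise `fold` in `main.rs` and the `u64::from_le_bytes` used here compute the same value; either form works as long as both sides agree on the little-endian layout.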