diff --git a/.github/workflows/llama.yml b/.github/workflows/llama.yml
index c0b9cd3..534e318 100644
--- a/.github/workflows/llama.yml
+++ b/.github/workflows/llama.yml
@@ -25,8 +25,8 @@ jobs:
     strategy:
       matrix:
         runner: [ubuntu-20.04, macos-m1]
-        wasmedge: ["0.13.5", "0.14.0"]
-        plugin: [wasi_nn-ggml]
+        wasmedge: ["0.14.1"]
+        plugin: [wasi_nn-ggml-b4381]
         job:
           - name: "Tiny Llama"
             run: |
@@ -301,6 +301,23 @@ jobs:
                 default \
                 $'[INST] <<SYS>>\nYou are a helpful, respectful and honest assistant. Always output JSON format string.\n<</SYS>>\nGive me a JSON array of Apple products.[/INST]'

+          - name: Qwen2-VL
+            run: |
+              test -f ~/.wasmedge/env && source ~/.wasmedge/env
+              cd wasmedge-ggml/qwen2vl
+              curl -LO https://huggingface.co/second-state/Qwen2-VL-2B-Instruct-GGUF/resolve/main/Qwen2-VL-2B-Instruct-vision-encoder.gguf
+              curl -LO https://huggingface.co/second-state/Qwen2-VL-2B-Instruct-GGUF/resolve/main/Qwen2-VL-2B-Instruct-Q5_K_M.gguf
+              curl -LO https://llava-vl.github.io/static/images/monalisa.jpg
+              cargo build --target wasm32-wasi --release
+              time wasmedge --dir .:. \
+                --env n_gpu_layers="$NGL" \
+                --nn-preload default:GGML:AUTO:Qwen2-VL-2B-Instruct-Q5_K_M.gguf \
+                --env mmproj=Qwen2-VL-2B-Instruct-vision-encoder.gguf \
+                --env image=monalisa.jpg \
+                target/wasm32-wasi/release/wasmedge-ggml-qwen2vl.wasm \
+                default \
+                $'<|im_start|>system\nYou are a helpful assistant.<|im_end|>\n<|im_start|>user\n<|vision_start|><image><|vision_end|>what is in this picture?<|im_end|>\n<|im_start|>assistant\n'
+
           - name: Build llama-stream
             run: |
               cd wasmedge-ggml/llama-stream
diff --git a/wasmedge-ggml/qwen2vl/Cargo.toml b/wasmedge-ggml/qwen2vl/Cargo.toml
new file mode 100644
index 0000000..122469d
--- /dev/null
+++ b/wasmedge-ggml/qwen2vl/Cargo.toml
@@ -0,0 +1,8 @@
+[package]
+name = "wasmedge-ggml-qwen2vl"
+version = "0.1.0"
+edition = "2021"
+
+[dependencies]
+serde_json = "1.0"
+wasmedge-wasi-nn = "0.7.1"
diff --git a/wasmedge-ggml/qwen2vl/README.md b/wasmedge-ggml/qwen2vl/README.md
new file mode 100644
index 0000000..1143a4e
--- /dev/null
+++ b/wasmedge-ggml/qwen2vl/README.md
@@ -0,0 +1,47 @@
+# Qwen2-VL Example For WASI-NN with GGML Backend
+
+> [!NOTE]
+> Please refer to the [wasmedge-ggml/README.md](../README.md) for the general introduction and the setup of the WASI-NN plugin with GGML backend. This document focuses on the specific example of the Qwen2-VL model.
+
+## Get Qwen2-VL Model
+
+In this example, we are going to use the pre-converted [Qwen2-VL-2B](https://huggingface.co/second-state/Qwen2-VL-2B-Instruct-GGUF/tree/main) model.
+
+Download the model:
+
+```bash
+curl -LO https://huggingface.co/second-state/Qwen2-VL-2B-Instruct-GGUF/resolve/main/Qwen2-VL-2B-Instruct-vision-encoder.gguf
+curl -LO https://huggingface.co/second-state/Qwen2-VL-2B-Instruct-GGUF/resolve/main/Qwen2-VL-2B-Instruct-Q5_K_M.gguf
+```
+
+## Prepare the Image
+
+Download the image you want to perform inference on:
+
+```bash
+curl -LO https://llava-vl.github.io/static/images/monalisa.jpg
+```
+
+## Parameters
+
+> [!NOTE]
+> Please check the parameters section of [wasmedge-ggml/README.md](https://github.com/second-state/WasmEdge-WASINN-examples/tree/master/wasmedge-ggml#parameters) first.
+
+In Qwen2-VL inference, we recommend using a `ctx-size` of at least `4096` for better results.
+
+```rust
+options.insert("ctx-size", Value::from(4096));
+```
+
+## Execute
+
+Execute the WASM with `wasmedge` using the named model feature to preload a large model:
+
+```bash
+wasmedge --dir .:. \
+  --nn-preload default:GGML:AUTO:Qwen2-VL-2B-Instruct-Q5_K_M.gguf \
+  --env mmproj=Qwen2-VL-2B-Instruct-vision-encoder.gguf \
+  --env image=monalisa.jpg \
+  --env ctx_size=4096 \
+  wasmedge-ggml-qwen2vl.wasm default
+```
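+
+You can also pass a prompt as an extra argument after the model name. In that case the program performs a single inference with that prompt and exits instead of entering interactive mode (this is how the CI workflow drives the example). A sketch of such an invocation, reusing the prompt from the CI step:
+
+```bash
+wasmedge --dir .:. \
+  --nn-preload default:GGML:AUTO:Qwen2-VL-2B-Instruct-Q5_K_M.gguf \
+  --env mmproj=Qwen2-VL-2B-Instruct-vision-encoder.gguf \
+  --env image=monalisa.jpg \
+  wasmedge-ggml-qwen2vl.wasm default \
+  $'<|im_start|>system\nYou are a helpful assistant.<|im_end|>\n<|im_start|>user\n<|vision_start|><image><|vision_end|>what is in this picture?<|im_end|>\n<|im_start|>assistant\n'
+```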
diff --git a/wasmedge-ggml/qwen2vl/src/main.rs b/wasmedge-ggml/qwen2vl/src/main.rs
new file mode 100644
index 0000000..73356dd
--- /dev/null
+++ b/wasmedge-ggml/qwen2vl/src/main.rs
@@ -0,0 +1,197 @@
+use serde_json::Value;
+use std::collections::HashMap;
+use std::env;
+use std::io;
+use wasmedge_wasi_nn::{
+    self, BackendError, Error, ExecutionTarget, GraphBuilder, GraphEncoding, GraphExecutionContext,
+    TensorType,
+};
+
+fn read_input() -> String {
+    loop {
+        let mut answer = String::new();
+        io::stdin()
+            .read_line(&mut answer)
+            .expect("Failed to read line");
+        if !answer.is_empty() && answer != "\n" && answer != "\r\n" {
+            return answer.trim().to_string();
+        }
+    }
+}
+
+fn get_options_from_env() -> HashMap<&'static str, Value> {
+    let mut options = HashMap::new();
+
+    // Required parameters for Qwen2-VL: the vision encoder (mmproj) and the input image.
+    if let Ok(val) = env::var("mmproj") {
+        options.insert("mmproj", Value::from(val.as_str()));
+    } else {
+        eprintln!("Failed to get mmproj model.");
+        std::process::exit(1);
+    }
+    if let Ok(val) = env::var("image") {
+        options.insert("image", Value::from(val.as_str()));
+    } else {
+        eprintln!("Failed to get the target image.");
+        std::process::exit(1);
+    }
+
+    // Optional parameters
+    if let Ok(val) = env::var("enable_log") {
+        options.insert("enable-log", serde_json::from_str(val.as_str()).unwrap());
+    } else {
+        options.insert("enable-log", Value::from(false));
+    }
+    if let Ok(val) = env::var("ctx_size") {
+        options.insert("ctx-size", serde_json::from_str(val.as_str()).unwrap());
+    } else {
+        options.insert("ctx-size", Value::from(4096));
+    }
+    if let Ok(val) = env::var("n_gpu_layers") {
+        options.insert("n-gpu-layers", serde_json::from_str(val.as_str()).unwrap());
+    } else {
+        options.insert("n-gpu-layers", Value::from(0));
+    }
+    options
+}
+
+fn set_data_to_context(context: &mut GraphExecutionContext, data: Vec<u8>) -> Result<(), Error> {
+    context.set_input(0, TensorType::U8, &[1], &data)
+}
+
+fn get_data_from_context(context: &GraphExecutionContext, index: usize) -> String {
+    // Reserve room for 4096 tokens with an average token length of 6 bytes.
+    const MAX_OUTPUT_BUFFER_SIZE: usize = 4096 * 6;
+    let mut output_buffer = vec![0u8; MAX_OUTPUT_BUFFER_SIZE];
+    let mut output_size = context
+        .get_output(index, &mut output_buffer)
+        .expect("Failed to get output");
+    output_size = std::cmp::min(MAX_OUTPUT_BUFFER_SIZE, output_size);
+
+    String::from_utf8_lossy(&output_buffer[..output_size]).to_string()
+}
+
+fn get_output_from_context(context: &GraphExecutionContext) -> String {
+    get_data_from_context(context, 0)
+}
+
+fn get_metadata_from_context(context: &GraphExecutionContext) -> Value {
+    serde_json::from_str(&get_data_from_context(context, 1)).expect("Failed to get metadata")
+}
+
+fn main() {
+    let args: Vec<String> = env::args().collect();
+    let model_name: &str = &args[1];
+
+    // Set options for the graph. Check our README for more details:
+    // https://github.com/second-state/WasmEdge-WASINN-examples/tree/master/wasmedge-ggml#parameters
+    let options = get_options_from_env();
+    // You could also set the options manually like this:
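+    // (Illustrative values only; the keys and defaults below mirror the ones
+    // handled in get_options_from_env above.)
+    // options.insert("enable-log", Value::from(false));
+    // options.insert("ctx-size", Value::from(4096));
+    // options.insert("n-gpu-layers", Value::from(0));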
+
+    // Create graph and initialize context.
+    let graph = GraphBuilder::new(GraphEncoding::Ggml, ExecutionTarget::AUTO)
+        .config(serde_json::to_string(&options).expect("Failed to serialize options"))
+        .build_from_cache(model_name)
+        .expect("Failed to build graph");
+    let mut context = graph
+        .init_execution_context()
+        .expect("Failed to init context");
+
+    // If there is a third argument, use it as the prompt and enter non-interactive mode.
+    // This is mainly for the CI workflow.
+    if args.len() >= 3 {
+        let prompt = &args[2];
+        // Set the prompt.
+        println!("Prompt:\n{}", prompt);
+        let tensor_data = prompt.as_bytes().to_vec();
+        context
+            .set_input(0, TensorType::U8, &[1], &tensor_data)
+            .expect("Failed to set input");
+        println!("Response:");
+
+        // Get the number of input tokens and the llama.cpp version information.
+        let input_metadata = get_metadata_from_context(&context);
+        println!("[INFO] llama_commit: {}", input_metadata["llama_commit"]);
+        println!(
+            "[INFO] llama_build_number: {}",
+            input_metadata["llama_build_number"]
+        );
+        println!(
+            "[INFO] Number of input tokens: {}",
+            input_metadata["input_tokens"]
+        );
+
+        // Get the output.
+        context.compute().expect("Failed to compute");
+        let output = get_output_from_context(&context);
+        println!("{}", output.trim());
+
+        // Retrieve the output metadata.
+        let metadata = get_metadata_from_context(&context);
+        println!(
+            "[INFO] Number of input tokens: {}",
+            metadata["input_tokens"]
+        );
+        println!(
+            "[INFO] Number of output tokens: {}",
+            metadata["output_tokens"]
+        );
+        std::process::exit(0);
+    }
+
+    let mut saved_prompt = String::new();
+    let system_prompt = String::from("You are a helpful assistant.");
+    let image_placeholder = "<image>";
+
+    loop {
+        println!("USER:");
+        let input = read_input();
+
+        // Qwen2-VL prompt format: <|im_start|>system\n{system_prompt}<|im_end|>\n<|im_start|>user\n<|vision_start|>{image_placeholder}<|vision_end|>{user_prompt}<|im_end|>\n<|im_start|>assistant\n
+        if saved_prompt.is_empty() {
+            saved_prompt = format!(
+                "<|im_start|>system\n{}<|im_end|>\n<|im_start|>user\n<|vision_start|>{}<|vision_end|>{}<|im_end|>\n<|im_start|>assistant\n",
+                system_prompt, image_placeholder, input
+            );
+        } else {
+            saved_prompt = format!(
+                "{}<|im_start|>user\n{}<|im_end|>\n<|im_start|>assistant\n",
+                saved_prompt, input
+            );
+        }
+
+        // Set prompt to the input tensor.
+        set_data_to_context(&mut context, saved_prompt.as_bytes().to_vec())
+            .expect("Failed to set input");
+
+        // Execute the inference.
+        let mut reset_prompt = false;
+        match context.compute() {
+            Ok(_) => (),
+            Err(Error::BackendError(BackendError::ContextFull)) => {
+                println!("\n[INFO] Context full, we'll reset the context and continue.");
+                reset_prompt = true;
+            }
+            Err(Error::BackendError(BackendError::PromptTooLong)) => {
+                println!("\n[INFO] Prompt too long, we'll reset the context and continue.");
+                reset_prompt = true;
+            }
+            Err(err) => {
+                println!("\n[ERROR] {}", err);
+                std::process::exit(1);
+            }
+        }
+
+        // Retrieve the output.
+        let mut output = get_output_from_context(&context);
+        println!("ASSISTANT:\n{}", output.trim());
+
+        // Update the saved prompt.
+        if reset_prompt {
+            saved_prompt.clear();
+        } else {
+            output = output.trim().to_string();
+            saved_prompt = format!("{}{}<|im_end|>\n", saved_prompt, output);
+        }
+    }
+}
diff --git a/wasmedge-ggml/qwen2vl/wasmedge-ggml-qwen2vl.wasm b/wasmedge-ggml/qwen2vl/wasmedge-ggml-qwen2vl.wasm
new file mode 100755
index 0000000..e713ff0
Binary files /dev/null and b/wasmedge-ggml/qwen2vl/wasmedge-ggml-qwen2vl.wasm differ