[Example] ggml: add grammar example (#126)

* [Example] ggml: add grammar example Signed-off-by: dm4 <[email protected]> * [CI] add grammar test Signed-off-by: dm4 <[email protected]> --------- Signed-off-by: dm4 <[email protected]>
second-state · Apr 3, 2024 · 9a2451a · 9a2451a
1 parent 653daf0
commit 9a2451a
Show file tree

Hide file tree

Showing 5 changed files with 248 additions and 0 deletions.
diff --git a/.github/workflows/llama.yml b/.github/workflows/llama.yml
@@ -171,6 +171,19 @@ jobs:
                 default \
                 '<start_of_turn>user Where is the capital of Japan? <end_of_turn><start_of_turn>model'
 
+          - name: Grammar Example
+            run: |
+              test -f ~/.wasmedge/env && source ~/.wasmedge/env
+              cd wasmedge-ggml/grammar
+              curl -LO https://huggingface.co/TheBloke/Llama-2-7b-GGUF/resolve/main/llama-2-7b.Q5_K_M.gguf
+              cargo build --target wasm32-wasi --release
+              time wasmedge --dir .:. \
+                --env n_gpu_layers="$NGL" \
+                --nn-preload default:GGML:AUTO:llama-2-7b.Q5_K_M.gguf \
+                target/wasm32-wasi/release/wasmedge-ggml-grammar.wasm \
+                default \
+                'JSON object with 5 country names as keys and their capitals as values: '
+
           - name: Build llama-stream
             run: |
               cd wasmedge-ggml/llama-stream

diff --git a/wasmedge-ggml/grammar/Cargo.toml b/wasmedge-ggml/grammar/Cargo.toml
@@ -0,0 +1,8 @@
+[package]
+name = "wasmedge-ggml-grammar"
+version = "0.1.0"
+edition = "2021"
+
+[dependencies]
+serde_json = "1.0"
+wasmedge-wasi-nn = "0.7.0"
diff --git a/wasmedge-ggml/grammar/README.md b/wasmedge-ggml/grammar/README.md
@@ -0,0 +1,37 @@
+# Grammar Example For WASI-NN with GGML Backend
+
+> [!NOTE]
+> Please refer to the [wasmedge-ggml/README.md](../README.md) for the general introduction and the setup of the WASI-NN plugin with GGML backend. This document will focus on the specific example of using grammar in ggml.
+
+## Get the Model
+
+In this example, we are going to use the [llama-2-7b](https://huggingface.co/TheBloke/Llama-2-7B-GGUF) model. Please note that we are not using a fine-tuned chat model.
+
+```bash
+curl -LO https://huggingface.co/TheBloke/Llama-2-7B-GGUF/resolve/main/llama-2-7b.Q5_K_M.gguf
+```
+
+## Parameters
+
+> [!NOTE]
+> Please check the parameters section of [wasmedge-ggml/README.md](https://github.com/second-state/WasmEdge-WASINN-examples/tree/master/wasmedge-ggml#parameters) first.
+
+In this example, we are going to use the `grammar` option to constrain the model to generate the JSON output in a specific format.
+
+You can check [the documents at llama.cpp](https://github.com/ggerganov/llama.cpp/tree/master/grammars) for more details about grammars.
+
+## Execute
+
+In this example, we are going to use the `n_predict` option to prevent the model from generating too much output.
+
+```console
+$ wasmedge --dir .:. \
+  --env n_predict=99 \
+  --nn-preload default:GGML:AUTO:llama-2-7b.Q5_K_M.gguf \
+  wasmedge-ggml-grammar.wasm default
+
+USER:
+JSON object with 5 country names as keys and their capitals as values:
+ASSISTANT:
+{"US": "Washington", "UK": "London", "Germany": "Berlin", "France": "Paris", "Italy": "Rome"}
+```
diff --git a/wasmedge-ggml/grammar/src/main.rs b/wasmedge-ggml/grammar/src/main.rs
@@ -0,0 +1,190 @@
+use serde_json::json;
+use serde_json::Value;
+use std::env;
+use std::io;
+use wasmedge_wasi_nn::{
+    self, BackendError, Error, ExecutionTarget, GraphBuilder, GraphEncoding, GraphExecutionContext,
+    TensorType,
+};
+
+/// Reads the next non-blank line from standard input and returns it trimmed.
+///
+/// Empty and whitespace-only lines are skipped. When standard input reaches
+/// end-of-file the process exits with status 0, because no further prompt can
+/// ever arrive (previously this case spun forever in a busy loop).
+///
+/// # Panics
+///
+/// Panics if reading from standard input fails.
+fn read_input() -> String {
+    // One buffer reused across iterations instead of a fresh allocation per line.
+    let mut line = String::new();
+    loop {
+        line.clear();
+        let bytes_read = io::stdin()
+            .read_line(&mut line)
+            .expect("Failed to read line");
+        if bytes_read == 0 {
+            // `read_line` returns Ok(0) only at end-of-file.
+            std::process::exit(0);
+        }
+        let trimmed = line.trim();
+        if !trimmed.is_empty() {
+            return trimmed.to_string();
+        }
+    }
+}
+
+/// Builds the graph configuration object from environment variables.
+///
+/// Recognised variables (each value is parsed as JSON):
+/// - `enable_log`   -> `"enable-log"`, defaults to `false`
+/// - `n_gpu_layers` -> `"n-gpu-layers"`, defaults to `0`
+/// - `n_predict`    -> `"n-predict"`, left unset when the variable is absent
+///
+/// `"ctx-size"` is always set to `1024`.
+///
+/// # Panics
+///
+/// Panics if a recognised variable is set to something that is not valid JSON.
+fn get_options_from_env() -> Value {
+    // Start from the defaults and override with whatever the environment provides.
+    let mut options = json!({
+        "enable-log": false,
+        "n-gpu-layers": 0,
+        "ctx-size": 1024,
+    });
+
+    if let Ok(val) = env::var("enable_log") {
+        options["enable-log"] = serde_json::from_str(&val)
+            .expect("invalid value for enable-log option (true/false)");
+    }
+    if let Ok(val) = env::var("n_gpu_layers") {
+        options["n-gpu-layers"] =
+            serde_json::from_str(&val).expect("invalid ngl value (unsigned integer)");
+    }
+    if let Ok(val) = env::var("n_predict") {
+        options["n-predict"] =
+            serde_json::from_str(&val).expect("invalid n-predict value (unsigned integer)");
+    }
+
+    options
+}
+
+/// Stores `data` as a `U8` tensor with dimensions `[1]` in input slot 0 of `context`.
+///
+/// Returns whatever `set_input` returns.
+fn set_data_to_context(context: &mut GraphExecutionContext, data: Vec<u8>) -> Result<(), Error> {
+    const INPUT_INDEX: usize = 0;
+    let dimensions: [usize; 1] = [1];
+    context.set_input(INPUT_INDEX, TensorType::U8, &dimensions, &data)
+}
+
+/// Stores `data` as a `U8` tensor with dimensions `[1]` in input slot 1 of `context`.
+///
+/// Not called anywhere in this file, hence the `dead_code` allowance.
+#[allow(dead_code)]
+fn set_metadata_to_context(
+    context: &mut GraphExecutionContext,
+    data: Vec<u8>,
+) -> Result<(), Error> {
+    const METADATA_INDEX: usize = 1;
+    let dimensions: [usize; 1] = [1];
+    context.set_input(METADATA_INDEX, TensorType::U8, &dimensions, &data)
+}
+
+/// Reads output slot `index` of `context` and returns it as a `String`.
+///
+/// The output is copied into a fixed-size scratch buffer; invalid UTF-8
+/// sequences are replaced rather than rejected.
+///
+/// # Panics
+///
+/// Panics if `get_output` returns an error.
+fn get_data_from_context(context: &GraphExecutionContext, index: usize) -> String {
+    // Room for 4096 tokens at an average of 6 bytes per token.
+    const MAX_OUTPUT_BUFFER_SIZE: usize = 4096 * 6;
+
+    let mut buffer = vec![0u8; MAX_OUTPUT_BUFFER_SIZE];
+    let reported_size = context
+        .get_output(index, &mut buffer)
+        .expect("Failed to get output");
+
+    // Never slice past the end of the buffer, whatever size was reported.
+    let len = reported_size.min(MAX_OUTPUT_BUFFER_SIZE);
+    String::from_utf8_lossy(&buffer[..len]).into_owned()
+}
+
+/// Returns the contents of output slot 0 of `context` as a `String`.
+fn get_output_from_context(context: &GraphExecutionContext) -> String {
+    const OUTPUT_INDEX: usize = 0;
+    get_data_from_context(context, OUTPUT_INDEX)
+}
+
+/// Reads output slot 1 of `context` and parses it as JSON.
+///
+/// # Panics
+///
+/// Panics if the slot contents are not valid JSON.
+fn get_metadata_from_context(context: &GraphExecutionContext) -> Value {
+    const METADATA_INDEX: usize = 1;
+    let raw = get_data_from_context(context, METADATA_INDEX);
+    serde_json::from_str(raw.as_str()).expect("Failed to get metadata")
+}
+
+/// Grammar text passed to the backend through the `"grammar"` option so that
+/// generated output is constrained to JSON.
+///
+/// The `root` rule is `object`, so the top-level value must be a JSON object.
+/// See <https://github.com/ggerganov/llama.cpp/tree/master/grammars> for the
+/// grammar syntax.
+const JSON_GRAMMAR: &str = r#"
+root   ::= object
+value  ::= object | array | string | number | ("true" | "false" | "null") ws
+object ::=
+  "{" ws (
+            string ":" ws value
+    ("," ws string ":" ws value)*
+  )? "}" ws
+array  ::=
+  "[" ws (
+            value
+    ("," ws value)*
+  )? "]" ws
+string ::=
+  "\"" (
+    [^"\\\x7F\x00-\x1F] |
+    "\\" (["\\/bfnrt] | "u" [0-9a-fA-F] [0-9a-fA-F] [0-9a-fA-F] [0-9a-fA-F])
+  )* "\"" ws
+number ::= ("-"? ([0-9] | [1-9] [0-9]*)) ("." [0-9]+)? ([eE] [-+]? [0-9]+)? ws
+ws ::= ([ \t\n] ws)?
+"#;
+
+/// Entry point.
+///
+/// Usage: `wasmedge-ggml-grammar.wasm <model_name> [prompt]`
+///
+/// * `model_name` - name of the preloaded model handed to `build_from_cache`.
+/// * `prompt`     - optional; when present the program runs one inference
+///   non-interactively, prints the result plus metadata, and exits with
+///   status 0 (mainly for the CI workflow). Otherwise it enters an endless
+///   interactive `USER:` / `ASSISTANT:` loop reading prompts from stdin.
+///
+/// Exits with status 1 and a usage message when `model_name` is missing.
+///
+/// # Panics
+///
+/// Panics if the graph or execution context cannot be created, or, in
+/// non-interactive mode, if setting the input or computing fails.
+fn main() {
+    let args: Vec<String> = env::args().collect();
+
+    // The model name is mandatory; report a usage error instead of panicking
+    // with an index-out-of-bounds message when it is missing.
+    let model_name: &str = match args.get(1) {
+        Some(name) => name.as_str(),
+        None => {
+            eprintln!("Usage: wasmedge-ggml-grammar.wasm <model_name> [prompt]");
+            std::process::exit(1);
+        }
+    };
+
+    // Set options for the graph. Check our README for more details:
+    // https://github.com/second-state/WasmEdge-WASINN-examples/tree/master/wasmedge-ggml#parameters
+    let mut options = get_options_from_env();
+
+    // Add grammar for JSON output.
+    // Check [here](https://github.com/ggerganov/llama.cpp/tree/master/grammars) for more details.
+    options["grammar"] = JSON_GRAMMAR.into();
+
+    // Make the output more consistent.
+    options["temp"] = json!(0.1);
+
+    // Create graph and initialize context.
+    let graph = GraphBuilder::new(GraphEncoding::Ggml, ExecutionTarget::AUTO)
+        .config(serde_json::to_string(&options).expect("Failed to serialize options"))
+        .build_from_cache(model_name)
+        .expect("Failed to build graph");
+    let mut context = graph
+        .init_execution_context()
+        .expect("Failed to init context");
+
+    // If there is a third argument, use it as the prompt and enter non-interactive mode.
+    // This is mainly for the CI workflow.
+    if let Some(prompt) = args.get(2) {
+        // Set the prompt, using the same helper as the interactive path.
+        println!("Prompt:\n{}", prompt);
+        set_data_to_context(&mut context, prompt.as_bytes().to_vec())
+            .expect("Failed to set input");
+        println!("Response:");
+
+        // Get the number of input tokens and llama.cpp versions.
+        let input_metadata = get_metadata_from_context(&context);
+        println!("[INFO] llama_commit: {}", input_metadata["llama_commit"]);
+        println!(
+            "[INFO] llama_build_number: {}",
+            input_metadata["llama_build_number"]
+        );
+        println!(
+            "[INFO] Number of input tokens: {}",
+            input_metadata["input_tokens"]
+        );
+
+        // Get the output.
+        context.compute().expect("Failed to compute");
+        let output = get_output_from_context(&context);
+        println!("{}", output.trim());
+
+        // Retrieve the output metadata.
+        let metadata = get_metadata_from_context(&context);
+        println!(
+            "[INFO] Number of input tokens: {}",
+            metadata["input_tokens"]
+        );
+        println!(
+            "[INFO] Number of output tokens: {}",
+            metadata["output_tokens"]
+        );
+        std::process::exit(0);
+    }
+
+    loop {
+        println!("USER:");
+        let input = read_input();
+
+        // Set prompt to the input tensor.
+        set_data_to_context(&mut context, input.as_bytes().to_vec()).expect("Failed to set input");
+
+        // Execute the inference. Errors are reported but do not end the session;
+        // whatever output is available is still printed below.
+        match context.compute() {
+            Ok(_) => (),
+            Err(Error::BackendError(BackendError::ContextFull)) => {
+                println!("\n[INFO] Context full, we'll reset the context and continue.");
+            }
+            Err(Error::BackendError(BackendError::PromptTooLong)) => {
+                println!("\n[INFO] Prompt too long, we'll reset the context and continue.");
+            }
+            Err(err) => {
+                println!("\n[ERROR] {}", err);
+            }
+        }
+
+        // Retrieve the output.
+        let output = get_output_from_context(&context);
+        println!("ASSISTANT:\n{}", output.trim());
+    }
+}
diff --git a/wasmedge-ggml/grammar/wasmedge-ggml-grammar.wasm b/wasmedge-ggml/grammar/wasmedge-ggml-grammar.wasm