diff --git a/wasmedge-mlx/README.md b/wasmedge-mlx/README.md
index 95dbfd4..bc054b3 100644
--- a/wasmedge-mlx/README.md
+++ b/wasmedge-mlx/README.md
@@ -70,11 +70,20 @@ wasmedge --dir .:. \
 
 There are some metadata for MLX plugin you can set.
 
+### Basic settings
+
 - model_type (required): LLM model type.
 - tokenizer (required): tokenizer.json path
 - max_token (option): maximum generate token number, default is 1024.
 - enable_debug_log (option): if print debug log, default is false.
 
+### Quantization
+
+The following three parameters must be set together.
+- is_quantized (option): Whether the weights are already quantized. If is_quantized is false, the MLX backend will quantize the weights itself.
+- group_size (option): The group size to use for quantization.
+- q_bits (option): The number of bits to quantize to.
+
 ``` rust
 let graph = GraphBuilder::new(GraphEncoding::Mlx, ExecutionTarget::AUTO)
     .config(serde_json::to_string(&json!({"model_type": "tiny_llama_1.1B_chat_v1.0", "tokenizer":tokenizer_path, "max_token":100}))
diff --git a/wasmedge-mlx/src/main.rs b/wasmedge-mlx/src/main.rs
index 14df217..098e75e 100644
--- a/wasmedge-mlx/src/main.rs
+++ b/wasmedge-mlx/src/main.rs
@@ -24,7 +24,7 @@ fn main() {
     let args: Vec<String> = env::args().collect();
     let model_name: &str = &args[1];
     let graph = GraphBuilder::new(GraphEncoding::Mlx, ExecutionTarget::AUTO)
-        .config(serde_json::to_string(&json!({"model_type": "tiny_llama_1.1B_chat_v1.0", "tokenizer":tokenizer_path, "max_token":100})).expect("Failed to serialize options"))
+        .config(serde_json::to_string(&json!({"is_quantized": false, "group_size": 64, "q_bits": 4, "model_type": "tiny_llama_1.1B_chat_v1.0", "tokenizer":tokenizer_path, "max_token":100})).expect("Failed to serialize options"))
         .build_from_cache(model_name)
         .expect("Failed to build graph");
     let mut context = graph
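
The main.rs change above passes `is_quantized: false`, so the MLX backend quantizes the weights at load time. For contrast, here is a minimal sketch of the opposite case, assuming the weights in the model cache were quantized ahead of time; presumably `is_quantized: true` tells the backend to load them as-is, using the same `group_size` and `q_bits` they were produced with. The tokenizer path and cache name are placeholders, not values from this diff; the API calls mirror those used in main.rs above.

``` rust
use serde_json::json;
use wasmedge_wasi_nn::{ExecutionTarget, GraphBuilder, GraphEncoding};

fn main() {
    let tokenizer_path = "tokenizer.json"; // placeholder path
    let graph = GraphBuilder::new(GraphEncoding::Mlx, ExecutionTarget::AUTO)
        .config(
            serde_json::to_string(&json!({
                "model_type": "tiny_llama_1.1B_chat_v1.0",
                "tokenizer": tokenizer_path,
                "max_token": 100,
                // All three quantization parameters are set together.
                // With is_quantized = true the backend is expected to skip
                // its own quantization pass (assumption based on the README:
                // "If is_quantized is false, the MLX backend will quantize
                // the weights itself").
                "is_quantized": true,
                "group_size": 64,
                "q_bits": 4
            }))
            .expect("Failed to serialize options"),
        )
        .build_from_cache("default") // placeholder cache name
        .expect("Failed to build graph");
    let _context = graph; // inference setup continues as in main.rs
}
```

Keeping the three parameters in one group, as the README now requires, avoids a mismatch where weights quantized with one `group_size`/`q_bits` pair are dequantized with another.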