fix: flash attention param typo (#50)

Co-authored-by: vansangpfiev <[email protected]>
janhq · May 23, 2024 · 7d0b2e3 · 7d0b2e3
1 parent 4da4d3a
commit 7d0b2e3
Show file tree

Hide file tree

Showing 2 changed files with 6 additions and 3 deletions.
diff --git a/README.md b/README.md
@@ -145,4 +145,4 @@ Table of parameters
 |`model_type` | String | Model type we want to use: llm or embedding, default value is llm|
 |`model_alias`| String | Used as model_id if specified in request, mandatory in loadmodel|
 |`model`      | String | Used as model_id if specified in request, mandatory in chat/embedding request|
-|`flash-attn` | Boolean| To enable Flash Attention, default is false|
+|`flash_attn` | Boolean| To enable Flash Attention, default is false|
diff --git a/src/llama_engine.cc b/src/llama_engine.cc
@@ -334,8 +334,11 @@ bool LlamaEngine::LoadModelImpl(std::shared_ptr<Json::Value> jsonBody) {
         jsonBody->get("cpu_threads", std::thread::hardware_concurrency())
             .asInt();
     params.cont_batching = jsonBody->get("cont_batching", false).asBool();
-    params.flash_attn = jsonBody->get("flash-attn", false).asBool();
-    if(params.flash_attn) {
+    // Accept both the legacy "flash-attn" key and "flash_attn" for backward compatibility
+    auto fa0 = jsonBody->get("flash-attn", false).asBool();
+    auto fa1 = jsonBody->get("flash_attn", false).asBool();
+    params.flash_attn = fa0 || fa1;
+    if (params.flash_attn) {
       LOG_DEBUG << "Enabled Flash Attention";
     }
     server_map_[model_id].caching_enabled =