Update llama.cpp submodule to latest release b4397 (#352)
* Update submodule to latest release b4397

* fix: build

---------

Co-authored-by: github-actions[bot] <github-actions[bot]@users.noreply.github.com>
Co-authored-by: vansangpfiev <[email protected]>
3 people authored Dec 30, 2024
1 parent eb45b83 commit 5a94d51
Showing 8 changed files with 22 additions and 22 deletions.
2 changes: 1 addition & 1 deletion .github/workflows/build.yml
@@ -65,7 +65,7 @@ jobs:
- os: "linux"
name: "arm64"
runs-on: "ubuntu-2004-arm64"
cmake-flags: "-DCORTEXLLAMA_VERSION=${{needs.create-draft-release.outputs.version}} -DBUILD_SHARED_LIBS=OFF -DLLAMA_BUILD_COMMON=ON -DCMAKE_CXX_COMPILER_LAUNCHER=ccache -DCMAKE_C_COMPILER_LAUNCHER=ccache -DCMAKE_BUILD_TYPE='Release' -GNinja"
cmake-flags: "-DCORTEXLLAMA_VERSION=${{needs.create-draft-release.outputs.version}} -DBUILD_SHARED_LIBS=OFF -DLLAMA_BUILD_COMMON=ON -DGGML_NATIVE=OFF -DCMAKE_CXX_COMPILER_LAUNCHER=ccache -DCMAKE_C_COMPILER_LAUNCHER=ccache -DCMAKE_BUILD_TYPE='Release' -GNinja"
run-e2e: true
vulkan: false
ccache: true
2 changes: 1 addition & 1 deletion .github/workflows/nightly-build.yml
@@ -64,7 +64,7 @@ jobs:
- os: "linux"
name: "arm64"
runs-on: "ubuntu-2004-arm64"
cmake-flags: "-DCORTEXLLAMA_VERSION=${{needs.create-draft-release.outputs.version}} -DBUILD_SHARED_LIBS=OFF -DLLAMA_BUILD_COMMON=ON -DCMAKE_CXX_COMPILER_LAUNCHER=ccache -DCMAKE_C_COMPILER_LAUNCHER=ccache -DCMAKE_BUILD_TYPE='Release' -GNinja"
cmake-flags: "-DCORTEXLLAMA_VERSION=${{needs.create-draft-release.outputs.version}} -DBUILD_SHARED_LIBS=OFF -DLLAMA_BUILD_COMMON=ON -DGGML_NATIVE=OFF -DCMAKE_CXX_COMPILER_LAUNCHER=ccache -DCMAKE_C_COMPILER_LAUNCHER=ccache -DCMAKE_BUILD_TYPE='Release' -GNinja"
run-e2e: true
vulkan: false
ccache: true
2 changes: 1 addition & 1 deletion .github/workflows/template-e2e-weekend-test.yml
@@ -33,7 +33,7 @@ jobs:
- os: "linux"
name: "arm64"
runs-on: "ubuntu-2004-arm64"
cmake-flags: "-DCORTEXLLAMA_VERSION=${{github.event.pull_request.head.sha}} -DBUILD_SHARED_LIBS=OFF -DLLAMA_BUILD_COMMON=ON -DCMAKE_CXX_COMPILER_LAUNCHER=ccache -DCMAKE_C_COMPILER_LAUNCHER=ccache -DCMAKE_BUILD_TYPE='Release' -GNinja"
cmake-flags: "-DCORTEXLLAMA_VERSION=${{github.event.pull_request.head.sha}} -DGGML_NATIVE=OFF -DBUILD_SHARED_LIBS=OFF -DLLAMA_BUILD_COMMON=ON -DCMAKE_CXX_COMPILER_LAUNCHER=ccache -DCMAKE_C_COMPILER_LAUNCHER=ccache -DCMAKE_BUILD_TYPE='Release' -GNinja"
run-e2e: true
vulkan: false
ccache: true
2 changes: 1 addition & 1 deletion .github/workflows/template-quality-gate-pr.yml
@@ -31,7 +31,7 @@ jobs:
- os: "linux"
name: "arm64"
runs-on: "ubuntu-2004-arm64"
cmake-flags: "-DCORTEXLLAMA_VERSION=${{github.event.pull_request.head.sha}} -DBUILD_SHARED_LIBS=OFF -DLLAMA_BUILD_COMMON=ON -DCMAKE_CXX_COMPILER_LAUNCHER=ccache -DCMAKE_C_COMPILER_LAUNCHER=ccache -DCMAKE_BUILD_TYPE='Release' -GNinja"
cmake-flags: "-DCORTEXLLAMA_VERSION=${{github.event.pull_request.head.sha}} -DGGML_NATIVE=OFF -DBUILD_SHARED_LIBS=OFF -DLLAMA_BUILD_COMMON=ON -DCMAKE_CXX_COMPILER_LAUNCHER=ccache -DCMAKE_C_COMPILER_LAUNCHER=ccache -DCMAKE_BUILD_TYPE='Release' -GNinja"
run-e2e: true
vulkan: false
ccache: true
2 changes: 1 addition & 1 deletion .github/workflows/template-quality-gate-submodule.yml
@@ -31,7 +31,7 @@ jobs:
- os: "linux"
name: "arm64"
runs-on: "ubuntu-2004-arm64"
cmake-flags: "-DCORTEXLLAMA_VERSION=${{github.event.pull_request.head.sha}} -DBUILD_SHARED_LIBS=OFF -DLLAMA_BUILD_COMMON=ON -DCMAKE_CXX_COMPILER_LAUNCHER=ccache -DCMAKE_C_COMPILER_LAUNCHER=ccache -DCMAKE_BUILD_TYPE='Release' -GNinja"
cmake-flags: "-DCORTEXLLAMA_VERSION=${{github.event.pull_request.head.sha}} -DBUILD_SHARED_LIBS=OFF -DLLAMA_BUILD_COMMON=ON -DGGML_NATIVE=OFF -DCMAKE_CXX_COMPILER_LAUNCHER=ccache -DCMAKE_C_COMPILER_LAUNCHER=ccache -DCMAKE_BUILD_TYPE='Release' -GNinja"
run-e2e: true
vulkan: false
ccache: true
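Note on the workflow changes above: each of the five Linux arm64 jobs adds -DGGML_NATIVE=OFF to cmake-flags. In llama.cpp's build system this option enables native-CPU tuning (roughly -march=native), so turning it off plausibly keeps the CI artifacts portable across arm64 machines instead of being tied to the instruction set of the particular build runner. A minimal C++ sketch, using only standard predefined compiler macros (nothing from this repository), of how a native-tuned build bakes the build machine's CPU features into the binary:

#include <cstdio>

int main() {
  // These macros are set by the compiler from the target flags. With
  // GGML_NATIVE=ON (-march=native) they mirror the build machine's CPU;
  // with GGML_NATIVE=OFF a fixed baseline is used, so the resulting binary
  // runs on any machine of that architecture.
#if defined(__AVX2__)
  std::puts("built with AVX2 enabled");
#elif defined(__ARM_NEON)
  std::puts("built with NEON enabled");
#else
  std::puts("built for a baseline target");
#endif
  return 0;
}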
2 changes: 1 addition & 1 deletion llama.cpp
Submodule llama.cpp updated 113 files
30 changes: 15 additions & 15 deletions src/llama_client_slot.cc
@@ -76,19 +76,19 @@ json LlamaClientSlot::GetFormatedTimings() {
}

void LlamaClientSlot::PrintTimings() const {
- LOG_DEBUG << __func__ << ": prompt eval time = " << t_prompt_processing
- << "ms / " << num_prompt_tokens_processed << " tokens ("
- << t_prompt_processing / num_prompt_tokens_processed
- << " ms per "
- "token, "
- << 1e3 / t_prompt_processing * num_prompt_tokens_processed
- << " tokens per second)";
- LOG_DEBUG << __func__ << ": eval time = " << t_token_generation
- << " ms / " << n_decoded << " runs ("
- << t_token_generation / n_decoded
- << " ms per "
- "token, "
- << 1e3 / t_token_generation * n_decoded << " tokens per second)\n";
- LOG_DEBUG << __func__ << ": total time = "
- << t_prompt_processing + t_token_generation << " ms";
+ LOG_INFO << __func__ << ": prompt eval time = " << t_prompt_processing
+ << "ms / " << num_prompt_tokens_processed << " tokens ("
+ << t_prompt_processing / num_prompt_tokens_processed
+ << " ms per "
+ "token, "
+ << 1e3 / t_prompt_processing * num_prompt_tokens_processed
+ << " tokens per second)";
+ LOG_INFO << __func__ << ": eval time = " << t_token_generation
+ << " ms / " << n_decoded << " runs ("
+ << t_token_generation / n_decoded
+ << " ms per "
+ "token, "
+ << 1e3 / t_token_generation * n_decoded << " tokens per second)\n";
+ LOG_INFO << __func__ << ": total time = "
+ << t_prompt_processing + t_token_generation << " ms";
}
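The only change in this file is the log level: the per-request timing summary moves from LOG_DEBUG to LOG_INFO, so it appears without enabling debug logging. The figures are computed the same way before and after; as a quick reference, a self-contained sketch of the tokens-per-second arithmetic used above (TokensPerSecond is an illustrative helper, not part of this repository):

#include <cstdio>

// tokens per second = n_tokens / (t_ms / 1000) = 1e3 / t_ms * n_tokens
static double TokensPerSecond(double t_ms, int n_tokens) {
  return t_ms > 0.0 ? 1e3 / t_ms * n_tokens : 0.0;
}

int main() {
  // Example: a 512-token prompt processed in 250 ms -> 2048 tokens per second.
  std::printf("%.1f tokens per second\n", TokensPerSecond(250.0, 512));
  return 0;
}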
2 changes: 1 addition & 1 deletion src/llama_server_context.cc
@@ -1066,7 +1066,7 @@ void LlamaServerContext::SendEmbedding(LlamaClientSlot& slot) {
continue;
}

- common_embd_normalize(embd, embd_res.data(), n_embd);
+ common_embd_normalize(embd, embd_res.data(), n_embd, 2);
}
res.result_json = json{
{"tokens_evaluated", slot.num_prompt_tokens},
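The added fourth argument to common_embd_normalize above tracks an upstream API change: in the updated llama.cpp the helper takes an explicit norm selector, and passing 2 appears to request Euclidean (L2) normalization of the embedding, preserving the previous behaviour. A rough self-contained sketch of that operation under that assumption (L2Normalize is an illustrative stand-in, not the upstream function):

#include <cmath>
#include <vector>

// Scale a vector to unit Euclidean length (the effect of passing 2 above).
static void L2Normalize(const float* inp, float* out, int n) {
  double sum = 0.0;
  for (int i = 0; i < n; ++i) sum += static_cast<double>(inp[i]) * inp[i];
  const double scale = sum > 0.0 ? 1.0 / std::sqrt(sum) : 0.0;
  for (int i = 0; i < n; ++i) out[i] = static_cast<float>(inp[i] * scale);
}

int main() {
  std::vector<float> embd = {3.0f, 4.0f};
  std::vector<float> unit(embd.size());
  L2Normalize(embd.data(), unit.data(), static_cast<int>(embd.size()));
  // unit is now {0.6, 0.8}, a unit-length embedding.
  return 0;
}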
