From 5a94d516977472e0b3f2ba244381097ab5104c00 Mon Sep 17 00:00:00 2001
From: jan-service-account <136811300+jan-service-account@users.noreply.github.com>
Date: Mon, 30 Dec 2024 11:05:42 +0700
Subject: [PATCH] Update llama.cpp submodule to latest release b4397 (#352)

* Update submodule to latest release b4397

* fix: build

---------

Co-authored-by: github-actions[bot]
Co-authored-by: vansangpfiev
---
 .github/workflows/build.yml                     |  2 +-
 .github/workflows/nightly-build.yml             |  2 +-
 .../workflows/template-e2e-weekend-test.yml     |  2 +-
 .../workflows/template-quality-gate-pr.yml      |  2 +-
 .../template-quality-gate-submodule.yml         |  2 +-
 llama.cpp                                       |  2 +-
 src/llama_client_slot.cc                        | 30 +++++++++----------
 src/llama_server_context.cc                     |  2 +-
 8 files changed, 22 insertions(+), 22 deletions(-)

diff --git a/.github/workflows/build.yml b/.github/workflows/build.yml
index 9fee8fa..e3543f9 100644
--- a/.github/workflows/build.yml
+++ b/.github/workflows/build.yml
@@ -65,7 +65,7 @@ jobs:
         - os: "linux"
           name: "arm64"
           runs-on: "ubuntu-2004-arm64"
-          cmake-flags: "-DCORTEXLLAMA_VERSION=${{needs.create-draft-release.outputs.version}} -DBUILD_SHARED_LIBS=OFF -DLLAMA_BUILD_COMMON=ON -DCMAKE_CXX_COMPILER_LAUNCHER=ccache -DCMAKE_C_COMPILER_LAUNCHER=ccache -DCMAKE_BUILD_TYPE='Release' -GNinja"
+          cmake-flags: "-DCORTEXLLAMA_VERSION=${{needs.create-draft-release.outputs.version}} -DBUILD_SHARED_LIBS=OFF -DLLAMA_BUILD_COMMON=ON -DGGML_NATIVE=OFF -DCMAKE_CXX_COMPILER_LAUNCHER=ccache -DCMAKE_C_COMPILER_LAUNCHER=ccache -DCMAKE_BUILD_TYPE='Release' -GNinja"
           run-e2e: true
           vulkan: false
           ccache: true
diff --git a/.github/workflows/nightly-build.yml b/.github/workflows/nightly-build.yml
index 9a64901..db19c4b 100644
--- a/.github/workflows/nightly-build.yml
+++ b/.github/workflows/nightly-build.yml
@@ -64,7 +64,7 @@ jobs:
         - os: "linux"
           name: "arm64"
           runs-on: "ubuntu-2004-arm64"
-          cmake-flags: "-DCORTEXLLAMA_VERSION=${{needs.create-draft-release.outputs.version}} -DBUILD_SHARED_LIBS=OFF -DLLAMA_BUILD_COMMON=ON -DCMAKE_CXX_COMPILER_LAUNCHER=ccache -DCMAKE_C_COMPILER_LAUNCHER=ccache -DCMAKE_BUILD_TYPE='Release' -GNinja"
+          cmake-flags: "-DCORTEXLLAMA_VERSION=${{needs.create-draft-release.outputs.version}} -DBUILD_SHARED_LIBS=OFF -DLLAMA_BUILD_COMMON=ON -DGGML_NATIVE=OFF -DCMAKE_CXX_COMPILER_LAUNCHER=ccache -DCMAKE_C_COMPILER_LAUNCHER=ccache -DCMAKE_BUILD_TYPE='Release' -GNinja"
           run-e2e: true
           vulkan: false
           ccache: true
diff --git a/.github/workflows/template-e2e-weekend-test.yml b/.github/workflows/template-e2e-weekend-test.yml
index 7161a74..fdd280d 100644
--- a/.github/workflows/template-e2e-weekend-test.yml
+++ b/.github/workflows/template-e2e-weekend-test.yml
@@ -33,7 +33,7 @@ jobs:
         - os: "linux"
           name: "arm64"
           runs-on: "ubuntu-2004-arm64"
-          cmake-flags: "-DCORTEXLLAMA_VERSION=${{github.event.pull_request.head.sha}} -DBUILD_SHARED_LIBS=OFF -DLLAMA_BUILD_COMMON=ON -DCMAKE_CXX_COMPILER_LAUNCHER=ccache -DCMAKE_C_COMPILER_LAUNCHER=ccache -DCMAKE_BUILD_TYPE='Release' -GNinja"
+          cmake-flags: "-DCORTEXLLAMA_VERSION=${{github.event.pull_request.head.sha}} -DGGML_NATIVE=OFF -DBUILD_SHARED_LIBS=OFF -DLLAMA_BUILD_COMMON=ON -DCMAKE_CXX_COMPILER_LAUNCHER=ccache -DCMAKE_C_COMPILER_LAUNCHER=ccache -DCMAKE_BUILD_TYPE='Release' -GNinja"
           run-e2e: true
           vulkan: false
           ccache: true
diff --git a/.github/workflows/template-quality-gate-pr.yml b/.github/workflows/template-quality-gate-pr.yml
index 93e055d..875b320 100644
--- a/.github/workflows/template-quality-gate-pr.yml
+++ b/.github/workflows/template-quality-gate-pr.yml
@@ -31,7 +31,7 @@ jobs:
         - os: "linux"
           name: "arm64"
           runs-on: "ubuntu-2004-arm64"
-          cmake-flags: "-DCORTEXLLAMA_VERSION=${{github.event.pull_request.head.sha}} -DBUILD_SHARED_LIBS=OFF -DLLAMA_BUILD_COMMON=ON -DCMAKE_CXX_COMPILER_LAUNCHER=ccache -DCMAKE_C_COMPILER_LAUNCHER=ccache -DCMAKE_BUILD_TYPE='Release' -GNinja"
+          cmake-flags: "-DCORTEXLLAMA_VERSION=${{github.event.pull_request.head.sha}} -DGGML_NATIVE=OFF -DBUILD_SHARED_LIBS=OFF -DLLAMA_BUILD_COMMON=ON -DCMAKE_CXX_COMPILER_LAUNCHER=ccache -DCMAKE_C_COMPILER_LAUNCHER=ccache -DCMAKE_BUILD_TYPE='Release' -GNinja"
           run-e2e: true
           vulkan: false
           ccache: true
diff --git a/.github/workflows/template-quality-gate-submodule.yml b/.github/workflows/template-quality-gate-submodule.yml
index 8d7e45d..ed61dec 100644
--- a/.github/workflows/template-quality-gate-submodule.yml
+++ b/.github/workflows/template-quality-gate-submodule.yml
@@ -31,7 +31,7 @@ jobs:
         - os: "linux"
           name: "arm64"
           runs-on: "ubuntu-2004-arm64"
-          cmake-flags: "-DCORTEXLLAMA_VERSION=${{github.event.pull_request.head.sha}} -DBUILD_SHARED_LIBS=OFF -DLLAMA_BUILD_COMMON=ON -DCMAKE_CXX_COMPILER_LAUNCHER=ccache -DCMAKE_C_COMPILER_LAUNCHER=ccache -DCMAKE_BUILD_TYPE='Release' -GNinja"
+          cmake-flags: "-DCORTEXLLAMA_VERSION=${{github.event.pull_request.head.sha}} -DBUILD_SHARED_LIBS=OFF -DLLAMA_BUILD_COMMON=ON -DGGML_NATIVE=OFF -DCMAKE_CXX_COMPILER_LAUNCHER=ccache -DCMAKE_C_COMPILER_LAUNCHER=ccache -DCMAKE_BUILD_TYPE='Release' -GNinja"
           run-e2e: true
           vulkan: false
           ccache: true
diff --git a/llama.cpp b/llama.cpp
index 0e70ba6..a813bad 160000
--- a/llama.cpp
+++ b/llama.cpp
@@ -1 +1 @@
-Subproject commit 0e70ba686e6c717a0aa41d88284e2a392c2bd0cd
+Subproject commit a813badbbdf0d38705f249df7a0c99af5cdee678
diff --git a/src/llama_client_slot.cc b/src/llama_client_slot.cc
index a47a0f5..b2ad2e4 100644
--- a/src/llama_client_slot.cc
+++ b/src/llama_client_slot.cc
@@ -76,19 +76,19 @@ json LlamaClientSlot::GetFormatedTimings() {
 }
 
 void LlamaClientSlot::PrintTimings() const {
-  LOG_DEBUG << __func__ << ": prompt eval time = " << t_prompt_processing
-            << "ms / " << num_prompt_tokens_processed << " tokens ("
-            << t_prompt_processing / num_prompt_tokens_processed
-            << " ms per "
-               "token, "
-            << 1e3 / t_prompt_processing * num_prompt_tokens_processed
-            << " tokens per second)";
-  LOG_DEBUG << __func__ << ": eval time = " << t_token_generation
-            << " ms / " << n_decoded << " runs ("
-            << t_token_generation / n_decoded
-            << " ms per "
-               "token, "
-            << 1e3 / t_token_generation * n_decoded << " tokens per second)\n";
-  LOG_DEBUG << __func__ << ": total time = "
-            << t_prompt_processing + t_token_generation << " ms";
+  LOG_INFO << __func__ << ": prompt eval time = " << t_prompt_processing
+           << "ms / " << num_prompt_tokens_processed << " tokens ("
+           << t_prompt_processing / num_prompt_tokens_processed
+           << " ms per "
+              "token, "
+           << 1e3 / t_prompt_processing * num_prompt_tokens_processed
+           << " tokens per second)";
+  LOG_INFO << __func__ << ": eval time = " << t_token_generation
+           << " ms / " << n_decoded << " runs ("
+           << t_token_generation / n_decoded
+           << " ms per "
+              "token, "
+           << 1e3 / t_token_generation * n_decoded << " tokens per second)\n";
+  LOG_INFO << __func__ << ": total time = "
+           << t_prompt_processing + t_token_generation << " ms";
 }
diff --git a/src/llama_server_context.cc b/src/llama_server_context.cc
index a13a3d9..7118df4 100644
--- a/src/llama_server_context.cc
+++ b/src/llama_server_context.cc
@@ -1066,7 +1066,7 @@ void LlamaServerContext::SendEmbedding(LlamaClientSlot& slot) {
         continue;
       }
 
-      common_embd_normalize(embd, embd_res.data(), n_embd);
+      common_embd_normalize(embd, embd_res.data(), n_embd, 2);
     }
     res.result_json = json{
         {"tokens_evaluated", slot.num_prompt_tokens},
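
Note on the last hunk (the "fix: build" part of this change): the llama.cpp common library around release b4397 expects an explicit norm-type argument on its embedding-normalization helper, so the old three-argument call no longer compiles. A minimal sketch of the adapted call site, assuming a signature of the form common_embd_normalize(const float* inp, float* out, int n, int embd_norm), where passing 2 is understood to request Euclidean (L2) normalization:

    // Sketch only; parameter names are assumptions, not taken from this patch.
    //   void common_embd_normalize(const float* inp, float* out, int n, int embd_norm);
    std::vector<float> embd_res(n_embd, 0.0f);        // one float per embedding dimension
    common_embd_normalize(embd, embd_res.data(), n_embd, /*embd_norm=*/2);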