From 5a94d516977472e0b3f2ba244381097ab5104c00 Mon Sep 17 00:00:00 2001
From: jan-service-account <136811300+jan-service-account@users.noreply.github.com>
Date: Mon, 30 Dec 2024 11:05:42 +0700
Subject: [PATCH] Update llama.cpp submodule to latest release b4397 (#352)

* Update submodule to latest release b4397

* fix: build

---------

Co-authored-by: github-actions[bot]
Co-authored-by: vansangpfiev
---
 .github/workflows/build.yml                     |  2 +-
 .github/workflows/nightly-build.yml             |  2 +-
 .../workflows/template-e2e-weekend-test.yml     |  2 +-
 .../workflows/template-quality-gate-pr.yml      |  2 +-
 .../template-quality-gate-submodule.yml         |  2 +-
 llama.cpp                                       |  2 +-
 src/llama_client_slot.cc                        | 30 +++++++++----------
 src/llama_server_context.cc                     |  2 +-
 8 files changed, 22 insertions(+), 22 deletions(-)

diff --git a/.github/workflows/build.yml b/.github/workflows/build.yml
index 9fee8fa..e3543f9 100644
--- a/.github/workflows/build.yml
+++ b/.github/workflows/build.yml
@@ -65,7 +65,7 @@ jobs:
         - os: "linux"
           name: "arm64"
           runs-on: "ubuntu-2004-arm64"
-          cmake-flags: "-DCORTEXLLAMA_VERSION=${{needs.create-draft-release.outputs.version}} -DBUILD_SHARED_LIBS=OFF -DLLAMA_BUILD_COMMON=ON -DCMAKE_CXX_COMPILER_LAUNCHER=ccache -DCMAKE_C_COMPILER_LAUNCHER=ccache -DCMAKE_BUILD_TYPE='Release' -GNinja"
+          cmake-flags: "-DCORTEXLLAMA_VERSION=${{needs.create-draft-release.outputs.version}} -DBUILD_SHARED_LIBS=OFF -DLLAMA_BUILD_COMMON=ON -DGGML_NATIVE=OFF -DCMAKE_CXX_COMPILER_LAUNCHER=ccache -DCMAKE_C_COMPILER_LAUNCHER=ccache -DCMAKE_BUILD_TYPE='Release' -GNinja"
           run-e2e: true
           vulkan: false
           ccache: true
diff --git a/.github/workflows/nightly-build.yml b/.github/workflows/nightly-build.yml
index 9a64901..db19c4b 100644
--- a/.github/workflows/nightly-build.yml
+++ b/.github/workflows/nightly-build.yml
@@ -64,7 +64,7 @@ jobs:
         - os: "linux"
           name: "arm64"
           runs-on: "ubuntu-2004-arm64"
-          cmake-flags: "-DCORTEXLLAMA_VERSION=${{needs.create-draft-release.outputs.version}} -DBUILD_SHARED_LIBS=OFF -DLLAMA_BUILD_COMMON=ON -DCMAKE_CXX_COMPILER_LAUNCHER=ccache -DCMAKE_C_COMPILER_LAUNCHER=ccache -DCMAKE_BUILD_TYPE='Release' -GNinja"
+          cmake-flags: "-DCORTEXLLAMA_VERSION=${{needs.create-draft-release.outputs.version}} -DBUILD_SHARED_LIBS=OFF -DLLAMA_BUILD_COMMON=ON -DGGML_NATIVE=OFF -DCMAKE_CXX_COMPILER_LAUNCHER=ccache -DCMAKE_C_COMPILER_LAUNCHER=ccache -DCMAKE_BUILD_TYPE='Release' -GNinja"
           run-e2e: true
           vulkan: false
           ccache: true
diff --git a/.github/workflows/template-e2e-weekend-test.yml b/.github/workflows/template-e2e-weekend-test.yml
index 7161a74..fdd280d 100644
--- a/.github/workflows/template-e2e-weekend-test.yml
+++ b/.github/workflows/template-e2e-weekend-test.yml
@@ -33,7 +33,7 @@ jobs:
         - os: "linux"
           name: "arm64"
           runs-on: "ubuntu-2004-arm64"
-          cmake-flags: "-DCORTEXLLAMA_VERSION=${{github.event.pull_request.head.sha}} -DBUILD_SHARED_LIBS=OFF -DLLAMA_BUILD_COMMON=ON -DCMAKE_CXX_COMPILER_LAUNCHER=ccache -DCMAKE_C_COMPILER_LAUNCHER=ccache -DCMAKE_BUILD_TYPE='Release' -GNinja"
+          cmake-flags: "-DCORTEXLLAMA_VERSION=${{github.event.pull_request.head.sha}} -DGGML_NATIVE=OFF -DBUILD_SHARED_LIBS=OFF -DLLAMA_BUILD_COMMON=ON -DCMAKE_CXX_COMPILER_LAUNCHER=ccache -DCMAKE_C_COMPILER_LAUNCHER=ccache -DCMAKE_BUILD_TYPE='Release' -GNinja"
           run-e2e: true
           vulkan: false
           ccache: true
diff --git a/.github/workflows/template-quality-gate-pr.yml b/.github/workflows/template-quality-gate-pr.yml
index 93e055d..875b320 100644
--- a/.github/workflows/template-quality-gate-pr.yml
+++ b/.github/workflows/template-quality-gate-pr.yml
@@ -31,7 +31,7 @@ jobs:
         - os: "linux"
           name: "arm64"
           runs-on: "ubuntu-2004-arm64"
-          cmake-flags: "-DCORTEXLLAMA_VERSION=${{github.event.pull_request.head.sha}} -DBUILD_SHARED_LIBS=OFF -DLLAMA_BUILD_COMMON=ON -DCMAKE_CXX_COMPILER_LAUNCHER=ccache -DCMAKE_C_COMPILER_LAUNCHER=ccache -DCMAKE_BUILD_TYPE='Release' -GNinja"
+          cmake-flags: "-DCORTEXLLAMA_VERSION=${{github.event.pull_request.head.sha}} -DGGML_NATIVE=OFF -DBUILD_SHARED_LIBS=OFF -DLLAMA_BUILD_COMMON=ON -DCMAKE_CXX_COMPILER_LAUNCHER=ccache -DCMAKE_C_COMPILER_LAUNCHER=ccache -DCMAKE_BUILD_TYPE='Release' -GNinja"
           run-e2e: true
           vulkan: false
           ccache: true
diff --git a/.github/workflows/template-quality-gate-submodule.yml b/.github/workflows/template-quality-gate-submodule.yml
index 8d7e45d..ed61dec 100644
--- a/.github/workflows/template-quality-gate-submodule.yml
+++ b/.github/workflows/template-quality-gate-submodule.yml
@@ -31,7 +31,7 @@ jobs:
         - os: "linux"
           name: "arm64"
           runs-on: "ubuntu-2004-arm64"
-          cmake-flags: "-DCORTEXLLAMA_VERSION=${{github.event.pull_request.head.sha}} -DBUILD_SHARED_LIBS=OFF -DLLAMA_BUILD_COMMON=ON -DCMAKE_CXX_COMPILER_LAUNCHER=ccache -DCMAKE_C_COMPILER_LAUNCHER=ccache -DCMAKE_BUILD_TYPE='Release' -GNinja"
+          cmake-flags: "-DCORTEXLLAMA_VERSION=${{github.event.pull_request.head.sha}} -DBUILD_SHARED_LIBS=OFF -DLLAMA_BUILD_COMMON=ON -DGGML_NATIVE=OFF -DCMAKE_CXX_COMPILER_LAUNCHER=ccache -DCMAKE_C_COMPILER_LAUNCHER=ccache -DCMAKE_BUILD_TYPE='Release' -GNinja"
           run-e2e: true
           vulkan: false
           ccache: true
diff --git a/llama.cpp b/llama.cpp
index 0e70ba6..a813bad 160000
--- a/llama.cpp
+++ b/llama.cpp
@@ -1 +1 @@
-Subproject commit 0e70ba686e6c717a0aa41d88284e2a392c2bd0cd
+Subproject commit a813badbbdf0d38705f249df7a0c99af5cdee678
diff --git a/src/llama_client_slot.cc b/src/llama_client_slot.cc
index a47a0f5..b2ad2e4 100644
--- a/src/llama_client_slot.cc
+++ b/src/llama_client_slot.cc
@@ -76,19 +76,19 @@ json LlamaClientSlot::GetFormatedTimings() {
 }
 
 void LlamaClientSlot::PrintTimings() const {
-  LOG_DEBUG << __func__ << ": prompt eval time = " << t_prompt_processing
-            << "ms / " << num_prompt_tokens_processed << " tokens ("
-            << t_prompt_processing / num_prompt_tokens_processed
-            << " ms per "
-               "token, "
-            << 1e3 / t_prompt_processing * num_prompt_tokens_processed
-            << " tokens per second)";
-  LOG_DEBUG << __func__ << ": eval time = " << t_token_generation
-            << " ms / " << n_decoded << " runs ("
-            << t_token_generation / n_decoded
-            << " ms per "
-               "token, "
-            << 1e3 / t_token_generation * n_decoded << " tokens per second)\n";
-  LOG_DEBUG << __func__ << ": total time = "
-            << t_prompt_processing + t_token_generation << " ms";
+  LOG_INFO << __func__ << ": prompt eval time = " << t_prompt_processing
+           << "ms / " << num_prompt_tokens_processed << " tokens ("
+           << t_prompt_processing / num_prompt_tokens_processed
+           << " ms per "
+              "token, "
+           << 1e3 / t_prompt_processing * num_prompt_tokens_processed
+           << " tokens per second)";
+  LOG_INFO << __func__ << ": eval time = " << t_token_generation
+           << " ms / " << n_decoded << " runs ("
+           << t_token_generation / n_decoded
+           << " ms per "
+              "token, "
+           << 1e3 / t_token_generation * n_decoded << " tokens per second)\n";
+  LOG_INFO << __func__ << ": total time = "
+           << t_prompt_processing + t_token_generation << " ms";
 }
diff --git a/src/llama_server_context.cc b/src/llama_server_context.cc
index a13a3d9..7118df4 100644
--- a/src/llama_server_context.cc
+++ b/src/llama_server_context.cc
@@ -1066,7 +1066,7 @@ void LlamaServerContext::SendEmbedding(LlamaClientSlot& slot) {
         continue;
       }
 
-      common_embd_normalize(embd, embd_res.data(), n_embd);
+      common_embd_normalize(embd, embd_res.data(), n_embd, 2);
     }
     res.result_json = json{
         {"tokens_evaluated", slot.num_prompt_tokens},
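
Note on the last hunk (the "fix: build" part of this change): the llama.cpp common library around release b4397 expects an explicit norm-type argument on its embedding-normalization helper, so the old three-argument call no longer compiles. A minimal sketch of the adapted call site, assuming a signature of the form common_embd_normalize(const float* inp, float* out, int n, int embd_norm), where passing 2 is understood to request Euclidean (L2) normalization:

    // Sketch only; parameter names are assumptions, not taken from this patch.
    //   void common_embd_normalize(const float* inp, float* out, int n, int embd_norm);
    std::vector<float> embd_res(n_embd, 0.0f);        // one float per embedding dimension
    common_embd_normalize(embd, embd_res.data(), n_embd, /*embd_norm=*/2);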