Update llama.cpp submodule to latest release b4341 (#338)
* Update submodule to latest release b4341

* fix: build

---------

Co-authored-by: github-actions[bot] <github-actions[bot]@users.noreply.github.com>
Co-authored-by: vansangpfiev <[email protected]>
3 people authored Dec 18, 2024
1 parent 430ca85 commit 6c85652
Showing 3 changed files with 29 additions and 10 deletions.
2 changes: 1 addition & 1 deletion llama.cpp
Submodule llama.cpp updated 120 files
34 changes: 28 additions & 6 deletions src/llama_engine.cc
@@ -268,6 +268,28 @@ std::string CreateReturnJson(const std::string& id, const std::string& model,
// producing compact output.
return Json::writeString(writer, root);
}

const std::vector<ggml_type> kv_cache_types = {
GGML_TYPE_F32,
GGML_TYPE_F16,
GGML_TYPE_BF16,
GGML_TYPE_Q8_0,
GGML_TYPE_Q4_0,
GGML_TYPE_Q4_1,
GGML_TYPE_IQ4_NL,
GGML_TYPE_Q5_0,
GGML_TYPE_Q5_1,
};

ggml_type kv_cache_type_from_str(const std::string & s) {
for (const auto & type : kv_cache_types) {
if (ggml_type_name(type) == s) {
return type;
}
}
throw std::runtime_error("Unsupported cache type: " + s);
}

} // namespace

void LlamaEngine::Load(EngineLoadOption opts) {
@@ -608,18 +630,19 @@ bool LlamaEngine::LoadModelImpl(std::shared_ptr<Json::Value> json_body) {
params.cont_batching =
json_body->get("cont_batching", true)
.asBool(); // default true according to llama.cpp upstream

params.cache_type_k = json_body->get("cache_type", kTypeF16).asString();
if (!IsValidCacheType(params.cache_type_k)) {
auto cache_type_k = json_body->get("cache_type", kTypeF16).asString();
if (!IsValidCacheType(cache_type_k)) {
LOG_WARN << "Unsupported cache type: " << params.cache_type_k
<< ", fallback to f16";
params.cache_type_k = kTypeF16;
params.cache_type_k = GGML_TYPE_F16;
} else {
params.cache_type_k = kv_cache_type_from_str(cache_type_k);
}
params.cache_type_v = params.cache_type_k;
LOG_DEBUG << "cache_type: " << params.cache_type_k;

auto fa = json_body->get("flash_attn", true).asBool();
auto force_enable_fa = params.cache_type_k != kTypeF16;
auto force_enable_fa = params.cache_type_k != GGML_TYPE_F16;
if (force_enable_fa) {
LOG_DEBUG << "Using KV cache quantization, force enable Flash Attention";
}
@@ -767,7 +790,6 @@ void LlamaEngine::HandleInferenceImpl(
data["mirostat"] = completion.mirostat;
data["mirostat_tau"] = completion.mirostat_tau;
data["mirostat_eta"] = completion.mirostat_eta;
data["penalize_nl"] = completion.penalize_nl;
data["ignore_eos"] = completion.ignore_eos;
data["n_probs"] = completion.n_probs;
data["min_keep"] = completion.min_keep;
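Context for the llama_engine.cc hunks above: the updated llama.cpp now takes the KV-cache type as a ggml_type enum rather than a string (as the hunks show), so the engine translates its textual cache_type option through the new kv_cache_type_from_str() helper and falls back to f16 when the value is not recognized, forcing Flash Attention for any non-f16 type. Below is a self-contained sketch of that lookup, assuming ggml.h from the updated submodule; the main() harness and the "int8" value are illustrative only, not part of the commit.

#include "ggml.h"

#include <cassert>
#include <stdexcept>
#include <string>
#include <vector>

// The same lookup table and helper the commit adds, reproduced so the
// example compiles on its own.
static const std::vector<ggml_type> kv_cache_types = {
    GGML_TYPE_F32,  GGML_TYPE_F16,  GGML_TYPE_BF16,   GGML_TYPE_Q8_0,
    GGML_TYPE_Q4_0, GGML_TYPE_Q4_1, GGML_TYPE_IQ4_NL, GGML_TYPE_Q5_0,
    GGML_TYPE_Q5_1};

static ggml_type kv_cache_type_from_str(const std::string& s) {
  for (const auto& type : kv_cache_types) {
    // ggml_type_name() yields the canonical short name, e.g. "q8_0".
    if (ggml_type_name(type) == s) {
      return type;
    }
  }
  throw std::runtime_error("Unsupported cache type: " + s);
}

int main() {
  // A recognized "cache_type" string maps straight onto the enum.
  assert(kv_cache_type_from_str("q8_0") == GGML_TYPE_Q8_0);

  // An unrecognized string throws; LoadModelImpl avoids this path by checking
  // IsValidCacheType() first and falling back to GGML_TYPE_F16 with a warning.
  try {
    kv_cache_type_from_str("int8");  // illustrative bad value
  } catch (const std::runtime_error&) {
    // engine behaviour: cache_type_k = GGML_TYPE_F16
  }
  return 0;
}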
3 changes: 0 additions & 3 deletions src/llama_server_context.cc
@@ -479,8 +479,6 @@ bool LlamaServerContext::LaunchSlotWithData(LlamaClientSlot*& slot, json data) {
json_value(data, "mirostat_tau", default_sparams.mirostat_tau);
slot->sparams.mirostat_eta =
json_value(data, "mirostat_eta", default_sparams.mirostat_eta);
slot->sparams.penalize_nl =
json_value(data, "penalize_nl", default_sparams.penalize_nl);
slot->params.n_keep = json_value(data, "n_keep", slot->params.n_keep);
slot->params.seed = json_value(data, "seed", default_params.seed);
slot->sparams.grammar = json_value(data, "grammar", default_sparams.grammar);
@@ -917,7 +915,6 @@ json LlamaServerContext::GetFormatedGeneration(LlamaClientSlot& slot) {
{"mirostat", slot.sparams.mirostat},
{"mirostat_tau", slot.sparams.mirostat_tau},
{"mirostat_eta", slot.sparams.mirostat_eta},
{"penalize_nl", slot.sparams.penalize_nl},
{"stop", slot.params.antiprompt},
{"n_predict", slot.params.n_predict},
{"n_keep", params.n_keep},
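On the llama_server_context.cc side the change is purely a removal: penalize_nl is no longer read from the request or reported back, matching the updated submodule's sampling parameters, which drop that field. The surrounding lines all go through the json_value() helper (look up a key in the request body, fall back to a default). Below is a simplified, self-contained stand-in for that pattern, assuming nlohmann::json as used by llama.cpp's server code; it is a sketch, not the project's exact implementation.

#include <nlohmann/json.hpp>

#include <string>

using json = nlohmann::json;

// Simplified stand-in for the json_value() helper used in the hunks above:
// read `key` from the request body and fall back to `default_value` when the
// key is missing, null, or of the wrong type.
template <typename T>
static T json_value(const json& body, const std::string& key,
                    const T& default_value) {
  if (!body.contains(key) || body.at(key).is_null()) {
    return default_value;
  }
  try {
    return body.at(key).get<T>();
  } catch (const json::exception&) {
    return default_value;  // mistyped value in the request: keep the default
  }
}

int main() {
  json data = {{"mirostat_tau", 4.0f}};
  float default_tau = 5.0f;
  // Present key: the value from the request wins.
  float tau = json_value(data, "mirostat_tau", default_tau);  // 4.0f
  // Absent key (e.g. the removed "penalize_nl"): the default is kept.
  bool penalize_nl = json_value(data, "penalize_nl", false);  // false
  (void)tau;
  (void)penalize_nl;
  return 0;
}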
