
Commit

Merge pull request #229 from tikikun/main
multiple important fixes for CPU specific optimization
tikikun authored Dec 2, 2023
2 parents 37944d5 + 16c46a7 commit f777d79
Showing 3 changed files with 10 additions and 9 deletions.
README.md (1 change: 1 addition & 0 deletions)
@@ -107,6 +107,7 @@ Table of parameters
| `system_prompt` | String | The prompt to use for system rules. |
| `pre_prompt` | String | The prompt to use for internal configuration. |
| `cpu_threads` | Integer | The number of threads to use for inferencing (CPU MODE ONLY) |
+| `n_batch` | Integer | The batch size for prompt eval step |

***OPTIONAL***: You can run Nitro on a different port like 5000 instead of 3928 by running it manually in terminal
```zsh
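For context, here is a minimal sketch of how the new `n_batch` parameter could be passed when loading a model. The `/inferences/llamacpp/loadmodel` path, the `llama_model_path` field, and the example values are assumptions for illustration and are not part of this diff; only `ctx_len`, `cpu_threads`, and `n_batch` appear in the changed code.

```zsh
# Hypothetical load request: n_batch is the new knob added by this commit.
# Endpoint path, model path, and values are illustrative assumptions.
curl http://localhost:3928/inferences/llamacpp/loadmodel \
  -H 'Content-Type: application/json' \
  -d '{
        "llama_model_path": "/path/to/model.gguf",
        "ctx_len": 2048,
        "cpu_threads": 4,
        "n_batch": 512
      }'
```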
controllers/llamaCPP.cc (17 changes: 8 additions & 9 deletions)
@@ -170,6 +170,9 @@ void llamaCPP::chatCompletion(
data["cache_prompt"] = true;
data["n_keep"] = -1;

+// Passing load value
+data["repeat_last_n"] = this->repeat_last_n;
+
data["stream"] = (*jsonBody).get("stream", false).asBool();
data["n_predict"] = (*jsonBody).get("max_tokens", 500).asInt();
data["top_p"] = (*jsonBody).get("top_p", 0.95).asFloat();
@@ -200,6 +203,8 @@ void llamaCPP::chatCompletion(
stopWords.push_back(stop_word.asString());
}
// specify default stop words
+// Ensure success case for chatML
+stopWords.push_back("<|im_end|>");
stopWords.push_back(nitro_utils::rtrim(user_prompt));
data["stop"] = stopWords;
}
@@ -374,7 +379,7 @@ void llamaCPP::loadModel(
params.n_ctx = (*jsonBody).get("ctx_len", 2048).asInt();
params.embedding = (*jsonBody).get("embedding", true).asBool();
// Check if n_parallel exists in jsonBody, if not, set to drogon_thread
-
+params.n_batch = (*jsonBody).get("n_batch", 512).asInt();
params.n_parallel = (*jsonBody).get("n_parallel", drogon_thread).asInt();
params.n_threads =
(*jsonBody)
@@ -386,14 +391,8 @@
this->ai_prompt = (*jsonBody).get("ai_prompt", "ASSISTANT: ").asString();
this->system_prompt =
(*jsonBody).get("system_prompt", "ASSISTANT's RULE: ").asString();
-this->pre_prompt =
-    (*jsonBody)
-        .get("pre_prompt",
-             "A chat between a curious user and an artificial "
-             "intelligence "
-             "assistant. The assistant follows the given rules no matter "
-             "what.\\n")
-        .asString();
+this->pre_prompt = (*jsonBody).get("pre_prompt", "").asString();
+this->repeat_last_n = (*jsonBody).get("repeat_last_n", 32).asInt();
}
#ifdef GGML_USE_CUBLAS
LOG_INFO << "Setting up GGML CUBLAS PARAMS";
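The llamaCPP.cc changes also make `repeat_last_n` a load-time setting (default 32) that is stored on the controller and injected into every chat-completion request, and they change the default `pre_prompt` to an empty string. A minimal sketch of overriding both at load time follows; the endpoint path, `llama_model_path`, and the example values are assumptions, not part of the diff.

```zsh
# Hypothetical: repeat_last_n (default 32) and pre_prompt (default "")
# are read once here and reused for every subsequent completion.
curl http://localhost:3928/inferences/llamacpp/loadmodel \
  -H 'Content-Type: application/json' \
  -d '{
        "llama_model_path": "/path/to/model.gguf",
        "repeat_last_n": 64,
        "pre_prompt": "A chat between a curious user and an AI assistant.\n"
      }'
```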
controllers/llamaCPP.h (1 change: 1 addition & 0 deletions)
@@ -2161,5 +2161,6 @@ class llamaCPP : public drogon::HttpController<llamaCPP> {
std::string ai_prompt;
std::string system_prompt;
std::string pre_prompt;
+int repeat_last_n;
};
}; // namespace inferences
