fix: support ctx_len for model start cli #1766

Merged (3 commits) on Dec 4, 2024
7 changes: 4 additions & 3 deletions docs/docs/cli/models/index.mdx
@@ -159,9 +159,10 @@ This command uses a `model_id` from the model that you have downloaded or available

| Option | Description | Required | Default value | Example |
|---------------------------|---------------------------------------------------------------------------|----------|----------------------------------------------|------------------------|
| `model_id` | The identifier of the model you want to start. | Yes | `Prompt to select from the available models` | `mistral` |
| `--gpus` | List of GPUs to use. | No | - | `[0,1]` |
| `-h`, `--help` | Display help information for the command. | No | - | `-h` |
| `model_id` | The identifier of the model you want to start. | Yes | `Prompt to select from the available models` | `mistral` |
| `--gpus` | List of GPUs to use. | No | - | `[0,1]` |
| `--ctx_len` | Maximum context length for inference. | No | `min(8192, max_model_context_length)` | `1024` |
| `-h`, `--help` | Display help information for the command. | No | - | `-h` |

## `cortex models stop`
:::info
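
With this change, the context window can be capped when a model is started, for example `cortex models start mistral --ctx_len 1024`; if the flag is omitted, the effective value falls back to `min(8192, max_model_context_length)` as documented above.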
1 change: 1 addition & 0 deletions docs/docs/cli/models/start.md
@@ -33,6 +33,7 @@ cortex models start [model_id]:[engine] [options]
|---------------------------|----------------------------------------------------------|----------|----------------------------------------------|-------------------|
| `model_id` | The identifier of the model you want to start. | No | `Prompt to select from the available models` | `mistral` |
| `--gpus` | List of GPUs to use. | No | - | `[0,1]` |
| `--ctx_len` | Maximum context length for inference. | No | `min(8192, max_model_context_length)` | `1024` |
| `-h`, `--help` | Display help information for the command. | No | - | `-h` |


5 changes: 3 additions & 2 deletions docs/docs/cli/run.mdx
@@ -36,7 +36,8 @@ You can use the `--verbose` flag to display more detailed output of the internal

| Option | Description | Required | Default value | Example |
|-----------------------------|-----------------------------------------------------------------------------|----------|----------------------------------------------|------------------------|
| `model_id` | The identifier of the model you want to chat with. | Yes | - | `mistral` |
| `--gpus` | List of GPUs to use. | No | - | `[0,1]` |
| `model_id` | The identifier of the model you want to chat with. | Yes | - | `mistral` |
| `--gpus` | List of GPUs to use. | No | - | `[0,1]` |
| `--ctx_len` | Maximum context length for inference. | No | `min(8192, max_model_context_length)` | `1024` |
| `-h`, `--help` | Display help information for the command. | No | - | `-h` |
<!-- | `-t`, `--thread <thread_id>` | Specify the Thread ID. Defaults to creating a new thread if none specified. | No | - | `-t jan_1717650808` | | `-c` | -->
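
The same flag is wired into the one-shot flow, for example `cortex run mistral --ctx_len 1024`.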
18 changes: 11 additions & 7 deletions engine/cli/command_line_parser.cc
@@ -163,16 +163,18 @@ void CommandLineParser::SetupCommonCommands() {
run_cmd->usage("Usage:\n" + commands::GetCortexBinary() +
" run [options] [model_id]");
run_cmd->add_option("model_id", cml_data_.model_id, "");
run_cmd->add_option("--gpus", hw_activate_opts_["gpus"],
run_cmd->add_option("--gpus", run_settings_["gpus"],
"List of GPU to activate, for example [0, 1]");
run_cmd->add_option("--ctx_len", run_settings_["ctx_len"],
"Maximum context length for inference");
run_cmd->add_flag("-d,--detach", cml_data_.run_detach, "Detached mode");
run_cmd->callback([this, run_cmd] {
if (std::exchange(executed_, true))
return;
commands::RunCmd rc(cml_data_.config.apiServerHost,
std::stoi(cml_data_.config.apiServerPort),
cml_data_.model_id, download_service_);
rc.Exec(cml_data_.run_detach, hw_activate_opts_);
rc.Exec(cml_data_.run_detach, run_settings_);
});
}

@@ -203,8 +205,10 @@ void CommandLineParser::SetupModelCommands() {
model_start_cmd->usage("Usage:\n" + commands::GetCortexBinary() +
" models start [model_id]");
model_start_cmd->add_option("model_id", cml_data_.model_id, "");
model_start_cmd->add_option("--gpus", hw_activate_opts_["gpus"],
model_start_cmd->add_option("--gpus", run_settings_["gpus"],
"List of GPU to activate, for example [0, 1]");
model_start_cmd->add_option("--ctx_len", run_settings_["ctx_len"],
"Maximum context length for inference");
model_start_cmd->group(kSubcommands);
model_start_cmd->callback([this, model_start_cmd]() {
if (std::exchange(executed_, true))
@@ -216,7 +220,7 @@
};
commands::ModelStartCmd().Exec(cml_data_.config.apiServerHost,
std::stoi(cml_data_.config.apiServerPort),
cml_data_.model_id, hw_activate_opts_);
cml_data_.model_id, run_settings_);
});

auto stop_model_cmd =
@@ -562,7 +566,7 @@ void CommandLineParser::SetupHardwareCommands() {
hw_activate_cmd->usage("Usage:\n" + commands::GetCortexBinary() +
" hardware activate --gpus [list_gpu]");
hw_activate_cmd->group(kSubcommands);
hw_activate_cmd->add_option("--gpus", hw_activate_opts_["gpus"],
hw_activate_cmd->add_option("--gpus", run_settings_["gpus"],
"List of GPU to activate, for example [0, 1]");
hw_activate_cmd->callback([this, hw_activate_cmd]() {
if (std::exchange(executed_, true))
return;
}

if (hw_activate_opts_["gpus"].empty()) {
if (run_settings_["gpus"].empty()) {
CLI_LOG("[list_gpu] is required\n");
CLI_LOG(hw_activate_cmd->help());
return;
}
commands::HardwareActivateCmd().Exec(
cml_data_.config.apiServerHost,
std::stoi(cml_data_.config.apiServerPort), hw_activate_opts_);
std::stoi(cml_data_.config.apiServerPort), run_settings_);
});
}

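
As a reference for reviewers, the option-binding pattern used throughout this parser reduces to the following standalone sketch. It assumes only CLI11; the app name and `main` driver are illustrative, not part of the PR. `operator[]` default-constructs each map entry and CLI11 stores a reference to it, so options left unset remain empty strings, which is what the `.empty()` checks above rely on.

```cpp
// Minimal sketch of the option-binding pattern above: every runtime flag
// is bound to an entry in one string map, so adding --ctx_len costs a
// single add_option call. Illustrative, not the PR's actual parser class.
#include <string>
#include <unordered_map>

#include "CLI/CLI.hpp"

int main(int argc, char** argv) {
  CLI::App app{"run_settings demo"};
  std::unordered_map<std::string, std::string> run_settings;
  // operator[] default-constructs each entry; CLI11 keeps a reference to it.
  app.add_option("--gpus", run_settings["gpus"],
                 "List of GPU to activate, for example [0, 1]");
  app.add_option("--ctx_len", run_settings["ctx_len"],
                 "Maximum context length for inference");
  CLI11_PARSE(app, argc, argv);
  // Unset options stay empty, which callers test with .empty().
  return 0;
}
```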
2 changes: 1 addition & 1 deletion engine/cli/command_line_parser.h
@@ -79,5 +79,5 @@ class CommandLineParser {
std::unordered_map<std::string, std::string> config_update_opts_;
bool executed_ = false;
commands::HarwareOptions hw_opts_;
std::unordered_map<std::string, std::string> hw_activate_opts_;
std::unordered_map<std::string, std::string> run_settings_;
};
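
The rename from `hw_activate_opts_` to `run_settings_` reflects that this map now carries general runtime settings (currently `gpus` and `ctx_len`) rather than hardware-activation options only.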
19 changes: 17 additions & 2 deletions engine/cli/commands/model_start_cmd.cc
@@ -30,8 +30,8 @@ bool ModelStartCmd::Exec(

//
bool should_activate_hw = false;
for (auto const& [_, v] : options) {
if (!v.empty()) {
for (auto const& [k, v] : options) {
if (k == "gpus" && !v.empty()) {
should_activate_hw = true;
break;
}

Json::Value json_data;
json_data["model"] = model_id.value();
for (auto const& [k, v] : options) {
UpdateConfig(json_data, k, v);
}
auto data_str = json_data.toStyledString();
auto res = curl_utils::SimplePostJson(url.ToFullPath(), data_str);
if (res.has_error()) {
}
return true;
}

bool ModelStartCmd::UpdateConfig(Json::Value& data, const std::string& key,
const std::string& value) {
if (key == "ctx_len" && !value.empty()) {
try {
data["ctx_len"] = std::stoi(value);
} catch (const std::exception& e) {
CLI_LOG("Failed to parse numeric value for " << key << ": " << e.what());
}
}
return true;
}
}; // namespace commands
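
The behaviour of the new `UpdateConfig` helper can be exercised with a small standalone sketch (the `SetCtxLen` wrapper and `main` driver here are illustrative, not part of the PR): `std::stoi` throws `std::invalid_argument` or `std::out_of_range` on bad input, so a non-numeric `--ctx_len` is logged and the key is simply left out of the POST body rather than aborting the start.

```cpp
// Standalone sketch of the parse guard in UpdateConfig. Illustrative only.
#include <iostream>
#include <string>

#include "json/json.h"

void SetCtxLen(Json::Value& data, const std::string& value) {
  if (value.empty()) return;  // option was never set on the command line
  try {
    data["ctx_len"] = std::stoi(value);
  } catch (const std::exception& e) {
    std::cerr << "Failed to parse numeric value for ctx_len: " << e.what()
              << '\n';
  }
}

int main() {
  Json::Value body;
  body["model"] = "mistral";
  SetCtxLen(body, "1024");  // body["ctx_len"] becomes 1024
  SetCtxLen(body, "lots");  // logged and skipped; body is unchanged
  std::cout << body.toStyledString();  // {"ctx_len": 1024, "model": "mistral"}
}
```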
4 changes: 4 additions & 0 deletions engine/cli/commands/model_start_cmd.h
@@ -2,6 +2,7 @@

#include <string>
#include <unordered_map>
#include "json/json.h"

namespace commands {

@@ -10,5 +11,8 @@ class ModelStartCmd {
bool Exec(const std::string& host, int port, const std::string& model_handle,
const std::unordered_map<std::string, std::string>& options,
bool print_success_log = true);
private:
bool UpdateConfig(Json::Value& data, const std::string& key,
const std::string& value);
};
} // namespace commands
10 changes: 8 additions & 2 deletions engine/services/model_service.cc
@@ -702,6 +702,8 @@ cpp::result<StartModelResult, std::string> ModelService::StartModel(
config::YamlHandler yaml_handler;

try {
constexpr const int kDefautlContextLength = 8192;
int max_model_context_length = kDefautlContextLength;
Json::Value json_data;
// Currently we don't support download vision models, so we need to bypass check
if (!params_override.bypass_model_check()) {
@@ -732,6 +734,8 @@ cpp::result<StartModelResult, std::string> ModelService::StartModel(
json_data["system_prompt"] = mc.system_template;
json_data["user_prompt"] = mc.user_template;
json_data["ai_prompt"] = mc.ai_template;
json_data["ctx_len"] = std::min(kDefautlContextLength, mc.ctx_len);
max_model_context_length = mc.ctx_len;
} else {
bypass_stop_check_set_.insert(model_handle);
}
@@ -753,12 +757,14 @@
ASSIGN_IF_PRESENT(json_data, params_override, cache_enabled);
ASSIGN_IF_PRESENT(json_data, params_override, ngl);
ASSIGN_IF_PRESENT(json_data, params_override, n_parallel);
ASSIGN_IF_PRESENT(json_data, params_override, ctx_len);
ASSIGN_IF_PRESENT(json_data, params_override, cache_type);
ASSIGN_IF_PRESENT(json_data, params_override, mmproj);
ASSIGN_IF_PRESENT(json_data, params_override, model_path);
#undef ASSIGN_IF_PRESENT

if (params_override.ctx_len) {
json_data["ctx_len"] =
std::min(params_override.ctx_len.value(), max_model_context_length);
}
CTL_INF(json_data.toStyledString());
auto may_fallback_res = MayFallbackToCpu(json_data["model_path"].asString(),
json_data["ngl"].asInt(),
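
Taken together, the two `std::min` call sites implement a single rule for the effective context length, sketched below as a pure function (hypothetical names, not from the PR): with no `--ctx_len`, the model's own limit capped at the 8192 default; with `--ctx_len`, the user's value capped at what the model supports. When the model check is bypassed, `max_model_context_length` stays at the 8192 default.

```cpp
// Sketch of the effective context-length rule introduced in StartModel,
// extracted as a pure function. Illustrative only.
#include <algorithm>
#include <cassert>
#include <optional>

constexpr int kDefaultContextLength = 8192;

int EffectiveCtxLen(int max_model_context_length,
                    std::optional<int> ctx_len_override) {
  // No --ctx_len: the model's own limit, capped at the 8192 default.
  int ctx_len = std::min(kDefaultContextLength, max_model_context_length);
  // --ctx_len given: the user's value, capped at what the model supports.
  if (ctx_len_override)
    ctx_len = std::min(*ctx_len_override, max_model_context_length);
  return ctx_len;
}

int main() {
  assert(EffectiveCtxLen(32768, std::nullopt) == 8192);  // default cap
  assert(EffectiveCtxLen(4096, std::nullopt) == 4096);   // small model
  assert(EffectiveCtxLen(32768, 1024) == 1024);          // user override
  assert(EffectiveCtxLen(4096, 8192) == 4096);           // clamped to model max
}
```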