diff --git a/docs/docs/cli/models/index.mdx b/docs/docs/cli/models/index.mdx index b75bf9d49..dff452788 100644 --- a/docs/docs/cli/models/index.mdx +++ b/docs/docs/cli/models/index.mdx @@ -159,9 +159,10 @@ This command uses a `model_id` from the model that you have downloaded or availa | Option | Description | Required | Default value | Example | |---------------------------|---------------------------------------------------------------------------|----------|----------------------------------------------|------------------------| -| `model_id` | The identifier of the model you want to start. | Yes | `Prompt to select from the available models` | `mistral` | -| `--gpus` | List of GPUs to use. | No | - | `[0,1]` | -| `-h`, `--help` | Display help information for the command. | No | - | `-h` | +| `model_id` | The identifier of the model you want to start. | Yes | `Prompt to select from the available models` | `mistral` | +| `--gpus` | List of GPUs to use. | No | - | `[0,1]` | +| `--ctx_len` | Maximum context length for inference. | No | `min(8192, max_model_context_length)` | `1024` | +| `-h`, `--help` | Display help information for the command. | No | - | `-h` | ## `cortex models stop` :::info diff --git a/docs/docs/cli/models/start.md b/docs/docs/cli/models/start.md index 77addd0b4..3880cd477 100644 --- a/docs/docs/cli/models/start.md +++ b/docs/docs/cli/models/start.md @@ -33,6 +33,7 @@ cortex models start [model_id]:[engine] [options] |---------------------------|----------------------------------------------------------|----------|----------------------------------------------|-------------------| | `model_id` | The identifier of the model you want to start. | No | `Prompt to select from the available models` | `mistral` | | `--gpus` | List of GPUs to use. | No | - | `[0,1]` | +| `--ctx_len` | Maximum context length for inference. | No | `min(8192, max_model_context_length)` | `1024` | | `-h`, `--help` | Display help information for the command. | No | - | `-h` | diff --git a/docs/docs/cli/run.mdx b/docs/docs/cli/run.mdx index bbce017f1..57c8358a2 100644 --- a/docs/docs/cli/run.mdx +++ b/docs/docs/cli/run.mdx @@ -36,7 +36,8 @@ You can use the `--verbose` flag to display more detailed output of the internal | Option | Description | Required | Default value | Example | |-----------------------------|-----------------------------------------------------------------------------|----------|----------------------------------------------|------------------------| -| `model_id` | The identifier of the model you want to chat with. | Yes | - | `mistral` | -| `--gpus` | List of GPUs to use. | No | - | `[0,1]` | +| `model_id` | The identifier of the model you want to chat with. | Yes | - | `mistral` | +| `--gpus` | List of GPUs to use. | No | - | `[0,1]` | +| `--ctx_len` | Maximum context length for inference. | No | `min(8192, max_model_context_length)` | `1024` | | `-h`, `--help` | Display help information for the command. | No | - | `-h` | diff --git a/engine/cli/command_line_parser.cc b/engine/cli/command_line_parser.cc index 34c6b9069..9d5d83ffc 100644 --- a/engine/cli/command_line_parser.cc +++ b/engine/cli/command_line_parser.cc @@ -163,8 +163,10 @@ void CommandLineParser::SetupCommonCommands() { run_cmd->usage("Usage:\n" + commands::GetCortexBinary() + " run [options] [model_id]"); run_cmd->add_option("model_id", cml_data_.model_id, ""); - run_cmd->add_option("--gpus", hw_activate_opts_["gpus"], + run_cmd->add_option("--gpus", run_settings_["gpus"], "List of GPU to activate, for example [0, 1]"); + run_cmd->add_option("--ctx_len", run_settings_["ctx_len"], + "Maximum context length for inference"); run_cmd->add_flag("-d,--detach", cml_data_.run_detach, "Detached mode"); run_cmd->callback([this, run_cmd] { if (std::exchange(executed_, true)) @@ -172,7 +174,7 @@ void CommandLineParser::SetupCommonCommands() { commands::RunCmd rc(cml_data_.config.apiServerHost, std::stoi(cml_data_.config.apiServerPort), cml_data_.model_id, download_service_); - rc.Exec(cml_data_.run_detach, hw_activate_opts_); + rc.Exec(cml_data_.run_detach, run_settings_); }); } @@ -203,8 +205,10 @@ void CommandLineParser::SetupModelCommands() { model_start_cmd->usage("Usage:\n" + commands::GetCortexBinary() + " models start [model_id]"); model_start_cmd->add_option("model_id", cml_data_.model_id, ""); - model_start_cmd->add_option("--gpus", hw_activate_opts_["gpus"], + model_start_cmd->add_option("--gpus", run_settings_["gpus"], "List of GPU to activate, for example [0, 1]"); + model_start_cmd->add_option("--ctx_len", run_settings_["ctx_len"], + "Maximum context length for inference"); model_start_cmd->group(kSubcommands); model_start_cmd->callback([this, model_start_cmd]() { if (std::exchange(executed_, true)) @@ -216,7 +220,7 @@ void CommandLineParser::SetupModelCommands() { }; commands::ModelStartCmd().Exec(cml_data_.config.apiServerHost, std::stoi(cml_data_.config.apiServerPort), - cml_data_.model_id, hw_activate_opts_); + cml_data_.model_id, run_settings_); }); auto stop_model_cmd = @@ -562,7 +566,7 @@ void CommandLineParser::SetupHardwareCommands() { hw_activate_cmd->usage("Usage:\n" + commands::GetCortexBinary() + " hardware activate --gpus [list_gpu]"); hw_activate_cmd->group(kSubcommands); - hw_activate_cmd->add_option("--gpus", hw_activate_opts_["gpus"], + hw_activate_cmd->add_option("--gpus", run_settings_["gpus"], "List of GPU to activate, for example [0, 1]"); hw_activate_cmd->callback([this, hw_activate_cmd]() { if (std::exchange(executed_, true)) @@ -572,14 +576,14 @@ void CommandLineParser::SetupHardwareCommands() { return; } - if (hw_activate_opts_["gpus"].empty()) { + if (run_settings_["gpus"].empty()) { CLI_LOG("[list_gpu] is required\n"); CLI_LOG(hw_activate_cmd->help()); return; } commands::HardwareActivateCmd().Exec( cml_data_.config.apiServerHost, - std::stoi(cml_data_.config.apiServerPort), hw_activate_opts_); + std::stoi(cml_data_.config.apiServerPort), run_settings_); }); } diff --git a/engine/cli/command_line_parser.h b/engine/cli/command_line_parser.h index f7ca3f507..aec10dcb4 100644 --- a/engine/cli/command_line_parser.h +++ b/engine/cli/command_line_parser.h @@ -79,5 +79,5 @@ class CommandLineParser { std::unordered_map config_update_opts_; bool executed_ = false; commands::HarwareOptions hw_opts_; - std::unordered_map hw_activate_opts_; + std::unordered_map run_settings_; }; diff --git a/engine/cli/commands/model_start_cmd.cc b/engine/cli/commands/model_start_cmd.cc index ea6b81e5a..12aec944d 100644 --- a/engine/cli/commands/model_start_cmd.cc +++ b/engine/cli/commands/model_start_cmd.cc @@ -30,8 +30,8 @@ bool ModelStartCmd::Exec( // bool should_activate_hw = false; - for (auto const& [_, v] : options) { - if (!v.empty()) { + for (auto const& [k, v] : options) { + if (k == "gpus" && !v.empty()) { should_activate_hw = true; break; } @@ -57,6 +57,9 @@ bool ModelStartCmd::Exec( Json::Value json_data; json_data["model"] = model_id.value(); + for (auto const& [k, v] : options) { + UpdateConfig(json_data, k, v); + } auto data_str = json_data.toStyledString(); auto res = curl_utils::SimplePostJson(url.ToFullPath(), data_str); if (res.has_error()) { @@ -75,4 +78,16 @@ bool ModelStartCmd::Exec( } return true; } + +bool ModelStartCmd::UpdateConfig(Json::Value& data, const std::string& key, + const std::string& value) { + if (key == "ctx_len" && !value.empty()) { + try { + data["ctx_len"] = std::stoi(value); + } catch (const std::exception& e) { + CLI_LOG("Failed to parse numeric value for " << key << ": " << e.what()); + } + } + return true; +} }; // namespace commands diff --git a/engine/cli/commands/model_start_cmd.h b/engine/cli/commands/model_start_cmd.h index 519db0f0d..124ef463d 100644 --- a/engine/cli/commands/model_start_cmd.h +++ b/engine/cli/commands/model_start_cmd.h @@ -2,6 +2,7 @@ #include #include +#include "json/json.h" namespace commands { @@ -10,5 +11,8 @@ class ModelStartCmd { bool Exec(const std::string& host, int port, const std::string& model_handle, const std::unordered_map& options, bool print_success_log = true); + private: + bool UpdateConfig(Json::Value& data, const std::string& key, + const std::string& value); }; } // namespace commands diff --git a/engine/services/model_service.cc b/engine/services/model_service.cc index cc1f99bdc..3cfff5cb2 100644 --- a/engine/services/model_service.cc +++ b/engine/services/model_service.cc @@ -702,6 +702,8 @@ cpp::result ModelService::StartModel( config::YamlHandler yaml_handler; try { + constexpr const int kDefautlContextLength = 8192; + int max_model_context_length = kDefautlContextLength; Json::Value json_data; // Currently we don't support download vision models, so we need to bypass check if (!params_override.bypass_model_check()) { @@ -732,6 +734,8 @@ cpp::result ModelService::StartModel( json_data["system_prompt"] = mc.system_template; json_data["user_prompt"] = mc.user_template; json_data["ai_prompt"] = mc.ai_template; + json_data["ctx_len"] = std::min(kDefautlContextLength, mc.ctx_len); + max_model_context_length = mc.ctx_len; } else { bypass_stop_check_set_.insert(model_handle); } @@ -753,12 +757,14 @@ cpp::result ModelService::StartModel( ASSIGN_IF_PRESENT(json_data, params_override, cache_enabled); ASSIGN_IF_PRESENT(json_data, params_override, ngl); ASSIGN_IF_PRESENT(json_data, params_override, n_parallel); - ASSIGN_IF_PRESENT(json_data, params_override, ctx_len); ASSIGN_IF_PRESENT(json_data, params_override, cache_type); ASSIGN_IF_PRESENT(json_data, params_override, mmproj); ASSIGN_IF_PRESENT(json_data, params_override, model_path); #undef ASSIGN_IF_PRESENT - + if (params_override.ctx_len) { + json_data["ctx_len"] = + std::min(params_override.ctx_len.value(), max_model_context_length); + } CTL_INF(json_data.toStyledString()); auto may_fallback_res = MayFallbackToCpu(json_data["model_path"].asString(), json_data["ngl"].asInt(),