Skip to content

Commit

Permalink
Merge branch 'dev' into chore/convention
Browse files Browse the repository at this point in the history
  • Loading branch information
vansangpfiev authored Dec 26, 2024
2 parents 0139fd2 + fb72167 commit ca35c1f
Show file tree
Hide file tree
Showing 27 changed files with 4,572 additions and 192 deletions.
78 changes: 78 additions & 0 deletions docs/static/openapi/cortex.json
Original file line number Diff line number Diff line change
Expand Up @@ -2199,6 +2199,84 @@
"tags": ["Engines"]
}
},
"/v1/engines/{name}/releases/{version}": {
"get": {
"summary": "List variants for a specific engine version",
"description": "Lists all available variants (builds) for a specific version of an engine. Variants can include different CPU architectures (AVX, AVX2, AVX512), GPU support (CUDA, Vulkan), and operating systems (Windows, Linux, macOS).",
"parameters": [
{
"name": "name",
"in": "path",
"required": true,
"schema": {
"type": "string",
"enum": ["llama-cpp", "onnxruntime", "tensorrt-llm"],
"default": "llama-cpp"
},
"description": "The type of engine"
},
{
"name": "version",
"in": "path",
"required": true,
"schema": {
"type": "string"
},
"description": "The version of the engine"
},
{
"name": "show",
"in": "query",
"required": false,
"schema": {
"type": "string",
"enum": ["all", "compatible"],
"default": "all"
},
"description": "Filter the variants list. Use 'compatible' to show only variants compatible with the current system, or 'all' to show all available variants."
}
],
"responses": {
"200": {
"description": "Successfully retrieved variants list",
"content": {
"application/json": {
"schema": {
"type": "array",
"items": {
"type": "object",
"properties": {
"name": {
"type": "string",
"description": "The name of the variant, including OS, architecture, and capabilities",
"example": "linux-amd64-avx-cuda-11-7"
},
"created_at": {
"type": "string",
"format": "date-time",
"description": "Creation timestamp of the variant",
"example": "2024-11-13T04:51:16Z"
},
"size": {
"type": "integer",
"description": "Size of the variant in bytes",
"example": 151224604
},
"download_count": {
"type": "integer",
"description": "Number of times this variant has been downloaded",
"example": 0
}
}
}
}
}
}
}
},
"tags": ["Engines"]
}
},
"/v1/engines/{name}/releases/latest": {
"get": {
"summary": "Get latest release",
Expand Down
13 changes: 6 additions & 7 deletions engine/cli/commands/chat_completion_cmd.cc
Original file line number Diff line number Diff line change
Expand Up @@ -50,7 +50,6 @@ size_t WriteCallback(char* ptr, size_t size, size_t nmemb, void* userdata) {

return data_length;
}

} // namespace

void ChatCompletionCmd::Exec(const std::string& host, int port,
Expand Down Expand Up @@ -103,7 +102,7 @@ void ChatCompletionCmd::Exec(const std::string& host, int port,
return;
}

std::string url = "http://" + address + "/v1/chat/completions";
auto url = "http://" + address + "/v1/chat/completions";
curl_easy_setopt(curl, CURLOPT_URL, url.c_str());
curl_easy_setopt(curl, CURLOPT_POST, 1L);

Expand Down Expand Up @@ -151,18 +150,18 @@ void ChatCompletionCmd::Exec(const std::string& host, int port,
json_data["model"] = model_handle;
json_data["stream"] = true;

std::string json_payload = json_data.toStyledString();

curl_easy_setopt(curl, CURLOPT_POSTFIELDS, json_payload.c_str());
auto json_str = json_data.toStyledString();
curl_easy_setopt(curl, CURLOPT_POSTFIELDS, json_str.c_str());
curl_easy_setopt(curl, CURLOPT_POSTFIELDSIZE, json_str.length());
curl_easy_setopt(curl, CURLOPT_TCP_KEEPALIVE, 1L);

std::string ai_chat;
StreamingCallback callback;
callback.ai_chat = &ai_chat;

curl_easy_setopt(curl, CURLOPT_WRITEFUNCTION, WriteCallback);
curl_easy_setopt(curl, CURLOPT_WRITEDATA, &callback);

CURLcode res = curl_easy_perform(curl);
auto res = curl_easy_perform(curl);

if (res != CURLE_OK) {
CLI_LOG("CURL request failed: " << curl_easy_strerror(res));
Expand Down
2 changes: 1 addition & 1 deletion engine/cli/commands/model_status_cmd.cc
Original file line number Diff line number Diff line change
Expand Up @@ -25,7 +25,7 @@ bool ModelStatusCmd::IsLoaded(const std::string& host, int port,
auto res = curl_utils::SimpleGetJson(url.ToFullPath());
if (res.has_error()) {
auto root = json_helper::ParseJsonString(res.error());
CLI_LOG(root["message"].asString());
CTL_WRN(root["message"].asString());
return false;
}

Expand Down
29 changes: 29 additions & 0 deletions engine/common/model_metadata.h
Original file line number Diff line number Diff line change
@@ -0,0 +1,29 @@
#pragma once

#include <sstream>
#include "common/tokenizer.h"

/// Aggregates a model's version, tensor / metadata key-value counts and its
/// tokenizer settings, with a printable dump for logging and debugging.
struct ModelMetadata {
  // Numeric fields are zero-initialized so a default-constructed instance can
  // be passed to ToString() without reading indeterminate values.
  uint32_t version{0};
  uint64_t tensor_count{0};
  uint64_t metadata_kv_count{0};
  // May be null; ToString() prints "null" in that case.
  std::shared_ptr<Tokenizer> tokenizer;

  /// Returns a multi-line, human-readable representation of every field.
  /// When a tokenizer is attached, its own ToString() output is embedded on
  /// the lines following the "tokenizer: " label.
  std::string ToString() const {
    std::ostringstream ss;
    ss << "ModelMetadata {\n"
       << "version: " << version << "\n"
       << "tensor_count: " << tensor_count << "\n"
       << "metadata_kv_count: " << metadata_kv_count << "\n"
       << "tokenizer: ";

    if (tokenizer) {
      ss << "\n" << tokenizer->ToString();
    } else {
      ss << "null";
    }

    ss << "\n}";
    return ss.str();
  }
};
72 changes: 72 additions & 0 deletions engine/common/tokenizer.h
Original file line number Diff line number Diff line change
@@ -0,0 +1,72 @@
#pragma once

#include <sstream>
#include <string>

/// Abstract base holding the tokenizer settings shared by all tokenizer
/// flavours. Concrete tokenizers add their own fields and implement
/// ToString().
struct Tokenizer {
  std::string eos_token = "";
  bool add_eos_token = true;

  std::string bos_token = "";
  bool add_bos_token = true;

  std::string unknown_token = "";
  std::string padding_token = "";

  std::string chat_template = "";

  bool add_generation_prompt = true;

  /// Helper for derived classes: formats the common fields, one
  /// "name: value" pair per line. String values are wrapped in double quotes,
  /// booleans are printed bare as true/false. The result has no trailing
  /// newline.
  std::string BaseToString() const {
    std::ostringstream ss;
    ss << "eos_token: \"" << eos_token << "\"\n"
       << "add_eos_token: " << (add_eos_token ? "true" : "false") << "\n"
       << "bos_token: \"" << bos_token << "\"\n"
       << "add_bos_token: " << (add_bos_token ? "true" : "false") << "\n"
       << "unknown_token: \"" << unknown_token << "\"\n"
       << "padding_token: \"" << padding_token << "\"\n"
       << "chat_template: \"" << chat_template << "\"\n"
       // Boolean value: no closing quote, matching the other boolean fields.
       << "add_generation_prompt: "
       << (add_generation_prompt ? "true" : "false");
    return ss.str();
  }

  virtual ~Tokenizer() = default;

  /// Returns a human-readable dump of the concrete tokenizer.
  virtual std::string ToString() = 0;
};

/// Tokenizer variant that carries an additional `pre` string on top of the
/// common Tokenizer fields.
struct GgufTokenizer : public Tokenizer {
  std::string pre = "";

  ~GgufTokenizer() override = default;

  /// Renders the shared fields followed by `pre`, wrapped in a
  /// "GgufTokenizer { ... }" block.
  std::string ToString() override {
    std::string text = "GgufTokenizer {\n";
    // Fields inherited from Tokenizer.
    text += BaseToString();
    text += "\n";
    // Field specific to this tokenizer.
    text += "pre: \"";
    text += pre;
    text += "\"\n";
    text += "}";
    return text;
  }
};

struct SafeTensorTokenizer : public Tokenizer {
bool add_prefix_space = true;

~SafeTensorTokenizer() = default;

std::string ToString() override {
std::ostringstream ss;
ss << "SafeTensorTokenizer {\n";
// Add base class members
ss << BaseToString() << "\n";
// Add derived class members
ss << "add_prefix_space: " << (add_prefix_space ? "true" : "false") << "\n";
ss << "}";
return ss.str();
}
};
16 changes: 14 additions & 2 deletions engine/controllers/engines.cc
Original file line number Diff line number Diff line change
Expand Up @@ -131,7 +131,8 @@ void Engines::GetEngineReleases(
void Engines::GetEngineVariants(
const HttpRequestPtr& req,
std::function<void(const HttpResponsePtr&)>&& callback,
const std::string& engine, const std::string& version) const {
const std::string& engine, const std::string& version,
std::optional<std::string> show) const {
if (engine.empty()) {
Json::Value res;
res["message"] = "Engine name is required";
Expand All @@ -142,7 +143,18 @@ void Engines::GetEngineVariants(
return;
}

auto result = engine_service_->GetEngineVariants(engine, version);
auto show_value = show.value_or("all");
if (show_value != "all" && show_value != "compatible") {
Json::Value res;
res["message"] = "Invalid show value. Can either be `all` or `compatible`";
auto resp = cortex_utils::CreateCortexHttpJsonResponse(res);
resp->setStatusCode(k400BadRequest);
callback(resp);
return;
}

auto result = engine_service_->GetEngineVariants(engine, version,
show_value == "compatible");

auto normalize_version = string_utils::RemoveSubstring(version, "v");
Json::Value releases(Json::arrayValue);
Expand Down
12 changes: 5 additions & 7 deletions engine/controllers/engines.h
Original file line number Diff line number Diff line change
Expand Up @@ -55,13 +55,11 @@ class Engines : public drogon::HttpController<Engines, false> {
METHOD_ADD(Engines::GetEngineReleases, "/{1}/releases", Get);
ADD_METHOD_TO(Engines::GetEngineReleases, "/v1/engines/{1}/releases", Get);

METHOD_ADD(Engines::GetEngineVariants, "/{1}/releases/{2}", Get);
ADD_METHOD_TO(Engines::GetEngineVariants, "/v1/engines/{1}/releases/{2}",
Get);
ADD_METHOD_TO(Engines::GetEngineVariants,
"/v1/engines/{engine}/releases/{version}?show={show}", Get);

METHOD_ADD(Engines::GetLatestEngineVersion, "/{1}/releases/latest", Get);
ADD_METHOD_TO(Engines::GetLatestEngineVersion,
"/v1/engines/{1}/releases/latest", Get);
"/v1/engines/{engine}/releases/latest", Get);

METHOD_LIST_END

Expand Down Expand Up @@ -89,8 +87,8 @@ class Engines : public drogon::HttpController<Engines, false> {

void GetEngineVariants(const HttpRequestPtr& req,
std::function<void(const HttpResponsePtr&)>&& callback,
const std::string& engine,
const std::string& version) const;
const std::string& engine, const std::string& version,
std::optional<std::string> show) const;

void GetInstalledEngineVariants(
const HttpRequestPtr& req,
Expand Down
17 changes: 5 additions & 12 deletions engine/controllers/files.cc
Original file line number Diff line number Diff line change
Expand Up @@ -216,10 +216,8 @@ void Files::RetrieveFileContent(
return;
}

auto [buffer, size] = std::move(res.value());
auto resp = HttpResponse::newHttpResponse();
resp->setBody(std::string(buffer.get(), size));
resp->setContentTypeCode(CT_APPLICATION_OCTET_STREAM);
auto resp =
cortex_utils::CreateCortexContentResponse(std::move(res.value()));
callback(resp);
} else {
if (!msg_res->rel_path.has_value()) {
Expand All @@ -243,10 +241,8 @@ void Files::RetrieveFileContent(
return;
}

auto [buffer, size] = std::move(content_res.value());
auto resp = HttpResponse::newHttpResponse();
resp->setBody(std::string(buffer.get(), size));
resp->setContentTypeCode(CT_APPLICATION_OCTET_STREAM);
auto resp = cortex_utils::CreateCortexContentResponse(
std::move(content_res.value()));
callback(resp);
}
}
Expand All @@ -261,9 +257,6 @@ void Files::RetrieveFileContent(
return;
}

auto [buffer, size] = std::move(res.value());
auto resp = HttpResponse::newHttpResponse();
resp->setBody(std::string(buffer.get(), size));
resp->setContentTypeCode(CT_APPLICATION_OCTET_STREAM);
auto resp = cortex_utils::CreateCortexContentResponse(std::move(res.value()));
callback(resp);
}
9 changes: 8 additions & 1 deletion engine/controllers/server.cc
Original file line number Diff line number Diff line change
Expand Up @@ -3,7 +3,6 @@
#include "trantor/utils/Logger.h"
#include "utils/cortex_utils.h"
#include "utils/function_calling/common.h"
#include "utils/http_util.h"

using namespace inferences;

Expand All @@ -27,6 +26,14 @@ void server::ChatCompletion(
std::function<void(const HttpResponsePtr&)>&& callback) {
LOG_DEBUG << "Start chat completion";
auto json_body = req->getJsonObject();
if (json_body == nullptr) {
Json::Value ret;
ret["message"] = "Body can't be empty";
auto resp = cortex_utils::CreateCortexHttpJsonResponse(ret);
resp->setStatusCode(k400BadRequest);
callback(resp);
return;
}
bool is_stream = (*json_body).get("stream", false).asBool();
auto model_id = (*json_body).get("model", "invalid_model").asString();
auto engine_type = [this, &json_body]() -> std::string {
Expand Down
1 change: 1 addition & 0 deletions engine/main.cc
Original file line number Diff line number Diff line change
Expand Up @@ -159,6 +159,7 @@ void RunServer(std::optional<std::string> host, std::optional<int> port,
auto model_src_svc = std::make_shared<services::ModelSourceService>();
auto model_service = std::make_shared<ModelService>(
download_service, inference_svc, engine_service);
inference_svc->SetModelService(model_service);

auto file_watcher_srv = std::make_shared<FileWatcherService>(
model_dir_path.string(), model_service);
Expand Down
Loading

0 comments on commit ca35c1f

Please sign in to comment.