From f99bf0f00855d22d90b1d50e338613a93509dc38 Mon Sep 17 00:00:00 2001
From: tikikun
Date: Mon, 4 Dec 2023 17:53:18 +0700
Subject: [PATCH 1/2] add json schema for engine and model parameters

---
 .../specs/engineering/inference-parameters.md | 171 ++++++++++++++++++
 docs/sidebars.js                              |   1 +
 2 files changed, 172 insertions(+)
 create mode 100644 docs/docs/specs/engineering/inference-parameters.md

diff --git a/docs/docs/specs/engineering/inference-parameters.md b/docs/docs/specs/engineering/inference-parameters.md
new file mode 100644
index 0000000000..52eaa8a774
--- /dev/null
+++ b/docs/docs/specs/engineering/inference-parameters.md
@@ -0,0 +1,171 @@
+---
+title: "Inference Parameters"
+slug: /specs/inference-parameters
+description: Exhaustive JSON Schema reference for engine and model parameters
+---
+
+# model_parameters
+
+```json
+
+{
+  "$schema": "http://json-schema.org/draft-07/schema#",
+  "type": "object",
+  "required": ["messages"],
+  "properties": {
+    "messages": {
+      "type": "array",
+      "items": {
+        "type": "object"
+      }
+    },
+    "model": {
+      "type": "string"
+    },
+    "frequency_penalty": {
+      "type": ["number", "null"],
+      "minimum": -2.0,
+      "maximum": 2.0,
+      "default": 0
+    },
+    "logit_bias": {
+      "type": ["object", "null"],
+      "additionalProperties": {
+        "type": "number",
+        "minimum": -100,
+        "maximum": 100
+      },
+      "default": null
+    },
+    "max_tokens": {
+      "type": ["integer", "null"]
+    },
+    "n": {
+      "type": ["integer", "null"],
+      "default": 1
+    },
+    "presence_penalty": {
+      "type": ["number", "null"],
+      "minimum": -2.0,
+      "maximum": 2.0,
+      "default": 0
+    },
+    "response_format": {
+      "type": ["object", "null"],
+      "properties": {
+        "type": {
+          "type": "string"
+        }
+      }
+    },
+    "seed": {
+      "type": ["integer", "null"]
+    },
+    "stop": {
+      "type": ["string", "array", "null"],
+      "items": {
+        "type": "string"
+      }
+    },
+    "stream": {
+      "type": ["boolean", "null"],
+      "default": false
+    },
+    "temperature": {
+      "type": ["number", "null"],
+      "minimum": 0,
+      "maximum": 2,
+      "default": 1
+    },
+    "top_p": {
+      "type": ["number", "null"],
+      "minimum": 0,
+      "maximum": 1,
+      "default": 1
+    },
+    "tools": {
+      "type": ["array", "null"],
+      "items": {
+        "type": "object"
+      }
+    },
+    "tool_choice": {
+      "type": ["string", "object", "null"]
+    },
+    "user": {
+      "type": ["string", "null"]
+    },
+    "function_call": {
+      "type": ["string", "object", "null"],
+      "deprecated": true
+    },
+    "functions": {
+      "type": ["array", "null"],
+      "items": {
+        "type": "object"
+      },
+      "deprecated": true
+    }
+  }
+}
+
+```
+
+# nitro engine_parameters
+
+```json
+{
+  "$schema": "http://json-schema.org/draft-07/schema#",
+  "type": "object",
+  "properties": {
+    "pre_prompt": {
+      "type": "string",
+      "description": "The prompt to use for internal configuration."
+    },
+    "system_prompt": {
+      "type": "string",
+      "description": "The prefix for the system prompt."
+    },
+    "user_prompt": {
+      "type": "string",
+      "description": "The prefix for the user prompt."
+    },
+    "ai_prompt": {
+      "type": "string",
+      "description": "The prefix for the assistant prompt."
+    },
+    "ngl": {
+      "type": "integer",
+      "default": 100,
+      "minimum": 0,
+      "maximum": 100,
+      "description": "The number of layers to load onto the GPU for acceleration."
+    },
+    "ctx_len": {
+      "type": "integer",
+      "default": 2048,
+      "minimum": 128,
+      "maximum": 4096,
+      "description": "The context length for model operations; the maximum depends on the specific model used."
+    },
+    "n_parallel": {
+      "type": "integer",
+      "default": 1,
+      "description": "The number of parallel operations. Only set when continuous batching is enabled."
+    },
+    "cont_batching": {
+      "type": "boolean",
+      "default": false,
+      "description": "Whether to use continuous batching."
+    },
+    "cpu_threads": {
+      "type": "integer",
+      "description": "The number of threads for CPU-based inference."
+    },
+    "embedding": {
+      "type": "boolean",
+      "description": "Whether to enable embedding."
+    }
+  }
+}
+```
diff --git a/docs/sidebars.js b/docs/sidebars.js
index 384f47e9dd..e1d5da5c69 100644
--- a/docs/sidebars.js
+++ b/docs/sidebars.js
@@ -82,6 +82,7 @@ const sidebars = {
         "specs/engineering/chats",
         "specs/engineering/models",
         "specs/engineering/engine",
+        "specs/engineering/inference-parameters",
         "specs/engineering/threads",
         "specs/engineering/messages",
         "specs/engineering/assistants",

From 6ac5b0c5f0b9dbeee0f7d3893bec9a14b200d147 Mon Sep 17 00:00:00 2001
From: tikikun
Date: Tue, 5 Dec 2023 18:53:29 +0700
Subject: [PATCH 2/2] remove and add to retrieve model

---
 .../specs/engineering/inference-parameters.md | 171 ------------------
 docs/openapi/specs/models.yaml                | 125 ++++++++++---
 docs/sidebars.js                              |   1 -
 3 files changed, 96 insertions(+), 201 deletions(-)
 delete mode 100644 docs/docs/specs/engineering/inference-parameters.md

diff --git a/docs/docs/specs/engineering/inference-parameters.md b/docs/docs/specs/engineering/inference-parameters.md
deleted file mode 100644
index 52eaa8a774..0000000000
--- a/docs/docs/specs/engineering/inference-parameters.md
+++ /dev/null
@@ -1,171 +0,0 @@
----
-title: "Inference Parameters"
-slug: /specs/inference-parameters
-description: Exhaustive JSON Schema reference for engine and model parameters
----
-
-# model_parameters
-
-```json
-
-{
-  "$schema": "http://json-schema.org/draft-07/schema#",
-  "type": "object",
-  "required": ["messages"],
-  "properties": {
-    "messages": {
-      "type": "array",
-      "items": {
-        "type": "object"
-      }
-    },
-    "model": {
-      "type": "string"
-    },
-    "frequency_penalty": {
-      "type": ["number", "null"],
-      "minimum": -2.0,
-      "maximum": 2.0,
-      "default": 0
-    },
-    "logit_bias": {
-      "type": ["object", "null"],
-      "additionalProperties": {
-        "type": "number",
-        "minimum": -100,
-        "maximum": 100
-      },
-      "default": null
-    },
-    "max_tokens": {
-      "type": ["integer", "null"]
-    },
-    "n": {
-      "type": ["integer", "null"],
-      "default": 1
-    },
-    "presence_penalty": {
-      "type": ["number", "null"],
-      "minimum": -2.0,
-      "maximum": 2.0,
-      "default": 0
-    },
-    "response_format": {
-      "type": ["object", "null"],
-      "properties": {
-        "type": {
-          "type": "string"
-        }
-      }
-    },
-    "seed": {
-      "type": ["integer", "null"]
-    },
-    "stop": {
-      "type": ["string", "array", "null"],
-      "items": {
-        "type": "string"
-      }
-    },
-    "stream": {
-      "type": ["boolean", "null"],
-      "default": false
-    },
-    "temperature": {
-      "type": ["number", "null"],
-      "minimum": 0,
-      "maximum": 2,
-      "default": 1
-    },
-    "top_p": {
-      "type": ["number", "null"],
-      "minimum": 0,
-      "maximum": 1,
-      "default": 1
-    },
-    "tools": {
-      "type": ["array", "null"],
-      "items": {
-        "type": "object"
-      }
-    },
-    "tool_choice": {
-      "type": ["string", "object", "null"]
-    },
-    "user": {
-      "type": ["string", "null"]
-    },
-    "function_call": {
-      "type": ["string", "object", "null"],
-      "deprecated": true
-    },
-    "functions": {
-      "type": ["array", "null"],
-      "items": {
-        "type": "object"
-      },
-      "deprecated": true
-    }
-  }
-}
-
-```
-
-# nitro engine_parameters
-
-```json
-{
-  "$schema": "http://json-schema.org/draft-07/schema#",
-  "type": "object",
-  "properties": {
-    "pre_prompt": {
-      "type": "string",
-      "description": "The prompt to use for internal configuration."
-    },
-    "system_prompt": {
-      "type": "string",
-      "description": "The prefix for the system prompt."
-    },
-    "user_prompt": {
-      "type": "string",
-      "description": "The prefix for the user prompt."
-    },
-    "ai_prompt": {
-      "type": "string",
-      "description": "The prefix for the assistant prompt."
-    },
-    "ngl": {
-      "type": "integer",
-      "default": 100,
-      "minimum": 0,
-      "maximum": 100,
-      "description": "The number of layers to load onto the GPU for acceleration."
-    },
-    "ctx_len": {
-      "type": "integer",
-      "default": 2048,
-      "minimum": 128,
-      "maximum": 4096,
-      "description": "The context length for model operations; the maximum depends on the specific model used."
-    },
-    "n_parallel": {
-      "type": "integer",
-      "default": 1,
-      "description": "The number of parallel operations. Only set when continuous batching is enabled."
-    },
-    "cont_batching": {
-      "type": "boolean",
-      "default": false,
-      "description": "Whether to use continuous batching."
-    },
-    "cpu_threads": {
-      "type": "integer",
-      "description": "The number of threads for CPU-based inference."
-    },
-    "embedding": {
-      "type": "boolean",
-      "description": "Whether to enable embedding."
-    }
-  }
-}
-```
diff --git a/docs/openapi/specs/models.yaml b/docs/openapi/specs/models.yaml
index aa5cc4155c..97ced0b59c 100644
--- a/docs/openapi/specs/models.yaml
+++ b/docs/openapi/specs/models.yaml
@@ -169,53 +169,120 @@ components:
           format: uri
           description: "URL to the source of the model."
           example: "https://huggingface.co/TheBloke/zephyr-7B-beta-GGUF/blob/main/zephyr-7b-beta.Q4_K_M.gguf"
-        parameters:
+        engine_parameters:
           type: object
           properties:
+            pre_prompt:
+              type: string
+              description: "Predefined prompt used for setting up internal configurations."
+              default: ""
+              example: "Initial setup complete."
+            system_prompt:
+              type: string
+              description: "Prefix used for system-level prompts."
+              default: "SYSTEM: "
+            user_prompt:
+              type: string
+              description: "Prefix used for user prompts."
+              default: "USER: "
+            ai_prompt:
+              type: string
+              description: "Prefix used for assistant prompts."
+              default: "ASSISTANT: "
+            ngl:
+              type: integer
+              description: "Number of neural network layers loaded onto the GPU for acceleration."
+              minimum: 0
+              maximum: 100
+              default: 100
+              example: 100
             ctx_len:
               type: integer
-              description: "Context length."
+              description: "Context length for model operations; varies based on the specific model."
+              minimum: 128
+              maximum: 4096
+              default: 2048
+              example: 2048
+            n_parallel:
+              type: integer
+              description: "Number of parallel operations, relevant when continuous batching is enabled."
+              minimum: 1
+              maximum: 10
+              default: 1
+              example: 4
+            cont_batching:
+              type: boolean
+              description: "Indicates if continuous batching is used for processing."
+              default: false
+              example: false
+            cpu_threads:
+              type: integer
+              description: "Number of threads allocated for CPU-based inference."
+              minimum: 1
+              example: 8
+            embedding:
+              type: boolean
+              description: "Indicates if embedding layers are enabled in the model."
+              default: true
+              example: true
+        model_parameters:
+          type: object
+          properties:
+            ctx_len:
+              type: integer
+              description: "Maximum context length the model can handle."
+              minimum: 0
+              maximum: 4096
+              default: 2048
+              example: 2048
             ngl:
               type: integer
-              description: "Number of layers."
+              description: "Number of neural network layers loaded onto the GPU for acceleration."
+              minimum: 1
+              maximum: 100
+              default: 100
               example: 100
             embedding:
               type: boolean
-              description: "Indicates if embedding is enabled."
+              description: "Indicates if embedding layers are used."
+              default: true
               example: true
             n_parallel:
               type: integer
-              description: "Number of parallel processes."
+              description: "Number of parallel processes the model can run."
+              minimum: 1
+              maximum: 10
+              default: 1
               example: 4
-            # pre_prompt:
-            #   type: string
-            #   description: "Predefined prompt for initiating the chat."
-            #   example: "A chat between a curious user and an artificial intelligence"
-            # user_prompt:
-            #   type: string
-            #   description: "Format of user's prompt."
-            #   example: "USER: "
-            # ai_prompt:
-            #   type: string
-            #   description: "Format of AI's response."
-            #   example: "ASSISTANT: "
             temperature:
-              type: string
-              description: "Temperature setting for the model."
-              example: "0.7"
+              type: number
+              description: "Controls randomness in the model's responses; higher values produce more varied output."
+              minimum: 0.0
+              maximum: 2.0
+              default: 0.7
+              example: 0.7
             token_limit:
-              type: string
-              description: "Token limit for the model."
-              example: "2048"
+              type: integer
+              description: "Maximum number of tokens the model can generate in a single response."
+              minimum: 1
+              maximum: 4096
+              default: 2048
+              example: 2048
             top_k:
-              type: string
-              description: "Top-k setting for the model."
-              example: "0"
+              type: integer
+              description: "Limits the model to consider only the top k most likely next tokens at each step."
+              minimum: 0
+              maximum: 100
+              default: 0
+              example: 0
             top_p:
-              type: string
-              description: "Top-p setting for the model."
-              example: "1"
+              type: number
+              description: "Nucleus sampling parameter. The model considers the smallest set of tokens whose cumulative probability exceeds the top_p value."
+              minimum: 0.0
+              maximum: 1.0
+              default: 1.0
+              example: 1.0
+
         metadata:
           type: object
           properties:
diff --git a/docs/sidebars.js b/docs/sidebars.js
index e1d5da5c69..384f47e9dd 100644
--- a/docs/sidebars.js
+++ b/docs/sidebars.js
@@ -82,7 +82,6 @@ const sidebars = {
         "specs/engineering/chats",
         "specs/engineering/models",
         "specs/engineering/engine",
-        "specs/engineering/inference-parameters",
         "specs/engineering/threads",
         "specs/engineering/messages",
         "specs/engineering/assistants",
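For reference, a request body built against the `model_parameters` schema above can be checked programmatically. The following is a minimal sketch, not part of either patch: it validates an illustrative request against an abbreviated copy of the draft-07 schema using Ajv. The property subset, the request payload, and the placeholder model id are assumptions for the example.

```ts
import Ajv from "ajv"; // Ajv validates JSON Schema draft-07 by default

// Abbreviated copy of the model_parameters schema above;
// only a handful of properties are kept for brevity.
const modelParametersSchema = {
  $schema: "http://json-schema.org/draft-07/schema#",
  type: "object",
  required: ["messages"],
  properties: {
    messages: { type: "array", items: { type: "object" } },
    model: { type: "string" },
    temperature: { type: ["number", "null"], minimum: 0, maximum: 2, default: 1 },
    top_p: { type: ["number", "null"], minimum: 0, maximum: 1, default: 1 },
    stream: { type: ["boolean", "null"], default: false },
  },
};

const ajv = new Ajv();
const validate = ajv.compile(modelParametersSchema);

// Illustrative request body; "zephyr-7b-beta" is a placeholder model id.
const request = {
  messages: [{ role: "user", content: "Hello!" }],
  model: "zephyr-7b-beta",
  temperature: 0.7,
  stream: false,
};

if (!validate(request)) {
  console.error(validate.errors); // e.g. temperature outside [0, 2]
} else {
  console.log("request conforms to model_parameters");
}
```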
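Likewise, the nitro `engine_parameters` schema maps naturally onto a typed settings object. The sketch below carries the same caveat: the interface name and sample values are illustrative, while the field names, ranges, and the rule that `n_parallel` only takes effect when `cont_batching` is enabled come from the schema itself.

```ts
// Typed view of the nitro engine_parameters schema above.
interface EngineParameters {
  pre_prompt?: string;     // prompt used for internal configuration
  system_prompt?: string;  // prefix for the system prompt
  user_prompt?: string;    // prefix for the user prompt
  ai_prompt?: string;      // prefix for the assistant prompt
  ngl?: number;            // layers loaded onto the GPU, 0-100 (default 100)
  ctx_len?: number;        // context length, 128-4096 (model dependent)
  n_parallel?: number;     // parallel operations; only set with cont_batching
  cont_batching?: boolean; // continuous batching on/off (default false)
  cpu_threads?: number;    // threads for CPU-based inference
  embedding?: boolean;     // whether to enable embedding
}

// Sample settings: n_parallel is set only because cont_batching is enabled.
const settings: EngineParameters = {
  system_prompt: "SYSTEM: ",
  user_prompt: "USER: ",
  ai_prompt: "ASSISTANT: ",
  ngl: 100,
  ctx_len: 2048,
  cont_batching: true,
  n_parallel: 4,
  cpu_threads: 8,
};

console.log(JSON.stringify(settings, null, 2));
```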