From f99bf0f00855d22d90b1d50e338613a93509dc38 Mon Sep 17 00:00:00 2001
From: tikikun
Date: Mon, 4 Dec 2023 17:53:18 +0700
Subject: [PATCH 1/2] add json schema for engine and model parameters

---
 .../specs/engineering/inference-parameters.md | 171 ++++++++++++++++++
 docs/sidebars.js                              |   1 +
 2 files changed, 172 insertions(+)
 create mode 100644 docs/docs/specs/engineering/inference-parameters.md

diff --git a/docs/docs/specs/engineering/inference-parameters.md b/docs/docs/specs/engineering/inference-parameters.md
new file mode 100644
index 0000000000..52eaa8a774
--- /dev/null
+++ b/docs/docs/specs/engineering/inference-parameters.md
@@ -0,0 +1,171 @@
+---
+title: "Inference Parameters"
+slug: /specs/inference-parameters
+description: Exhaustive JSON Schema reference for engine and model parameters
+---
+
+# model_parameters
+
+```json
+
+{
+  "$schema": "http://json-schema.org/draft-07/schema#",
+  "type": "object",
+  "required": ["messages"],
+  "properties": {
+    "messages": {
+      "type": "array",
+      "items": {
+        "type": "object"
+      }
+    },
+    "model": {
+      "type": "string"
+    },
+    "frequency_penalty": {
+      "type": ["number", "null"],
+      "minimum": -2.0,
+      "maximum": 2.0,
+      "default": 0
+    },
+    "logit_bias": {
+      "type": ["object", "null"],
+      "additionalProperties": {
+        "type": "number",
+        "minimum": -100,
+        "maximum": 100
+      },
+      "default": null
+    },
+    "max_tokens": {
+      "type": ["integer", "null"]
+    },
+    "n": {
+      "type": ["integer", "null"],
+      "default": 1
+    },
+    "presence_penalty": {
+      "type": ["number", "null"],
+      "minimum": -2.0,
+      "maximum": 2.0,
+      "default": 0
+    },
+    "response_format": {
+      "type": ["object", "null"],
+      "properties": {
+        "type": {
+          "type": "string"
+        }
+      }
+    },
+    "seed": {
+      "type": ["integer", "null"]
+    },
+    "stop": {
+      "type": ["string", "array", "null"],
+      "items": {
+        "type": "string"
+      }
+    },
+    "stream": {
+      "type": ["boolean", "null"],
+      "default": false
+    },
+    "temperature": {
+      "type": ["number", "null"],
+      "minimum": 0,
+      "maximum": 2,
+      "default": 1
+    },
+    "top_p": {
+      "type": ["number", "null"],
+      "minimum": 0,
+      "maximum": 1,
+      "default": 1
+    },
+    "tools": {
+      "type": ["array", "null"],
+      "items": {
+        "type": "object"
+      }
+    },
+    "tool_choice": {
+      "type": ["string", "object", "null"]
+    },
+    "user": {
+      "type": ["string", "null"]
+    },
+    "function_call": {
+      "type": ["string", "object", "null"],
+      "deprecated": true
+    },
+    "functions": {
+      "type": ["array", "null"],
+      "items": {
+        "type": "object"
+      },
+      "deprecated": true
+    }
+  }
+}
+
+```
+
+# nitro engine_parameters
+
+```json
+{
+  "$schema": "http://json-schema.org/draft-07/schema#",
+  "type": "object",
+  "properties": {
+    "pre_prompt": {
+      "type": "string",
+      "description": "The prompt to use for internal configuration."
+    },
+    "system_prompt": {
+      "type": "string",
+      "description": "The prefix for the system prompt."
+    },
+    "user_prompt": {
+      "type": "string",
+      "description": "The prefix for the user prompt."
+    },
+    "ai_prompt": {
+      "type": "string",
+      "description": "The prefix for the assistant prompt."
+    },
+    "ngl": {
+      "type": "integer",
+      "default": 100,
+      "minimum": 0,
+      "maximum": 100,
+      "description": "The number of layers to load onto the GPU for acceleration."
+    },
+    "ctx_len": {
+      "type": "integer",
+      "default": 2048,
+      "minimum": 128,
+      "maximum": 4096,
+      "description": "The context length for model operations; the maximum depends on the specific model used."
+    },
+    "n_parallel": {
+      "type": "integer",
+      "default": 1,
+      "description": "The number of parallel operations. Only set when continuous batching is enabled."
+    },
+    "cont_batching": {
+      "type": "boolean",
+      "default": false,
+      "description": "Whether to use continuous batching."
+    },
+    "cpu_threads": {
+      "type": "integer",
+      "description": "The number of threads for CPU-based inference."
+    },
+    "embedding": {
+      "type": "boolean",
+      "description": "Whether to enable embedding."
+    }
+  }
+}
+```
diff --git a/docs/sidebars.js b/docs/sidebars.js
index 384f47e9dd..e1d5da5c69 100644
--- a/docs/sidebars.js
+++ b/docs/sidebars.js
@@ -82,6 +82,7 @@ const sidebars = {
         "specs/engineering/chats",
         "specs/engineering/models",
         "specs/engineering/engine",
+        "specs/engineering/inference-parameters",
         "specs/engineering/threads",
         "specs/engineering/messages",
         "specs/engineering/assistants",

From 6ac5b0c5f0b9dbeee0f7d3893bec9a14b200d147 Mon Sep 17 00:00:00 2001
From: tikikun
Date: Tue, 5 Dec 2023 18:53:29 +0700
Subject: [PATCH 2/2] remove and add to retrieve model

---
 .../specs/engineering/inference-parameters.md | 171 ------------------
 docs/openapi/specs/models.yaml                | 125 ++++++++++---
 docs/sidebars.js                              |   1 -
 3 files changed, 96 insertions(+), 201 deletions(-)
 delete mode 100644 docs/docs/specs/engineering/inference-parameters.md

diff --git a/docs/docs/specs/engineering/inference-parameters.md b/docs/docs/specs/engineering/inference-parameters.md
deleted file mode 100644
index 52eaa8a774..0000000000
--- a/docs/docs/specs/engineering/inference-parameters.md
+++ /dev/null
@@ -1,171 +0,0 @@
----
-title: "Inference Parameters"
-slug: /specs/inference-parameters
-description: Exhaustive JSON Schema reference for engine and model parameters
----
-
-# model_parameters
-
-```json
-
-{
-  "$schema": "http://json-schema.org/draft-07/schema#",
-  "type": "object",
-  "required": ["messages"],
-  "properties": {
-    "messages": {
-      "type": "array",
-      "items": {
-        "type": "object"
-      }
-    },
-    "model": {
-      "type": "string"
-    },
-    "frequency_penalty": {
-      "type": ["number", "null"],
-      "minimum": -2.0,
-      "maximum": 2.0,
-      "default": 0
-    },
-    "logit_bias": {
-      "type": ["object", "null"],
-      "additionalProperties": {
-        "type": "number",
-        "minimum": -100,
-        "maximum": 100
-      },
-      "default": null
-    },
-    "max_tokens": {
-      "type": ["integer", "null"]
-    },
-    "n": {
-      "type": ["integer", "null"],
-      "default": 1
-    },
-    "presence_penalty": {
-      "type": ["number", "null"],
-      "minimum": -2.0,
-      "maximum": 2.0,
-      "default": 0
-    },
-    "response_format": {
-      "type": ["object", "null"],
-      "properties": {
-        "type": {
-          "type": "string"
-        }
-      }
-    },
-    "seed": {
-      "type": ["integer", "null"]
-    },
-    "stop": {
-      "type": ["string", "array", "null"],
-      "items": {
-        "type": "string"
-      }
-    },
-    "stream": {
-      "type": ["boolean", "null"],
-      "default": false
-    },
-    "temperature": {
-      "type": ["number", "null"],
-      "minimum": 0,
-      "maximum": 2,
-      "default": 1
-    },
-    "top_p": {
-      "type": ["number", "null"],
-      "minimum": 0,
-      "maximum": 1,
-      "default": 1
-    },
-    "tools": {
-      "type": ["array", "null"],
-      "items": {
-        "type": "object"
-      }
-    },
-    "tool_choice": {
-      "type": ["string", "object", "null"]
-    },
-    "user": {
-      "type": ["string", "null"]
-    },
-    "function_call": {
-      "type": ["string", "object", "null"],
-      "deprecated": true
-    },
-    "functions": {
-      "type": ["array", "null"],
-      "items": {
-        "type": "object"
-      },
-      "deprecated": true
-    }
-  }
-}
-
-```
-
-# nitro engine_parameters
-
-```json
-{
-  "$schema": "http://json-schema.org/draft-07/schema#",
-  "type": "object",
-  "properties": {
-    "pre_prompt": {
-      "type": "string",
-      "description": "The prompt to use for internal configuration."
-    },
-    "system_prompt": {
-      "type": "string",
-      "description": "The prefix for the system prompt."
-    },
-    "user_prompt": {
-      "type": "string",
-      "description": "The prefix for the user prompt."
-    },
-    "ai_prompt": {
-      "type": "string",
-      "description": "The prefix for the assistant prompt."
-    },
-    "ngl": {
-      "type": "integer",
-      "default": 100,
-      "minimum": 0,
-      "maximum": 100,
-      "description": "The number of layers to load onto the GPU for acceleration."
-    },
-    "ctx_len": {
-      "type": "integer",
-      "default": 2048,
-      "minimum": 128,
-      "maximum": 4096,
-      "description": "The context length for model operations; the maximum depends on the specific model used."
-    },
-    "n_parallel": {
-      "type": "integer",
-      "default": 1,
-      "description": "The number of parallel operations. Only set when continuous batching is enabled."
-    },
-    "cont_batching": {
-      "type": "boolean",
-      "default": false,
-      "description": "Whether to use continuous batching."
-    },
-    "cpu_threads": {
-      "type": "integer",
-      "description": "The number of threads for CPU-based inference."
-    },
-    "embedding": {
-      "type": "boolean",
-      "description": "Whether to enable embedding."
-    }
-  }
-}
-```
diff --git a/docs/openapi/specs/models.yaml b/docs/openapi/specs/models.yaml
index aa5cc4155c..97ced0b59c 100644
--- a/docs/openapi/specs/models.yaml
+++ b/docs/openapi/specs/models.yaml
@@ -169,53 +169,120 @@ components:
           format: uri
           description: "URL to the source of the model."
           example: "https://huggingface.co/TheBloke/zephyr-7B-beta-GGUF/blob/main/zephyr-7b-beta.Q4_K_M.gguf"
-        parameters:
+        engine_parameters:
           type: object
           properties:
+            pre_prompt:
+              type: string
+              description: "Predefined prompt used for setting up internal configurations."
+              default: ""
+              example: "Initial setup complete."
+            system_prompt:
+              type: string
+              description: "Prefix used for system-level prompts."
+              default: "SYSTEM: "
+            user_prompt:
+              type: string
+              description: "Prefix used for user prompts."
+              default: "USER: "
+            ai_prompt:
+              type: string
+              description: "Prefix used for assistant prompts."
+              default: "ASSISTANT: "
+            ngl:
+              type: integer
+              description: "Number of neural network layers loaded onto the GPU for acceleration."
+              minimum: 0
+              maximum: 100
+              default: 100
+              example: 100
             ctx_len:
               type: integer
-              description: "Context length."
+              description: "Context length for model operations; varies based on the specific model."
+              minimum: 128
+              maximum: 4096
+              default: 2048
+              example: 2048
+            n_parallel:
+              type: integer
+              description: "Number of parallel operations, relevant when continuous batching is enabled."
+              minimum: 1
+              maximum: 10
+              default: 1
+              example: 4
+            cont_batching:
+              type: boolean
+              description: "Indicates if continuous batching is used for processing."
+              default: false
+              example: false
+            cpu_threads:
+              type: integer
+              description: "Number of threads allocated for CPU-based inference."
+              minimum: 1
+              example: 8
+            embedding:
+              type: boolean
+              description: "Indicates if embedding layers are enabled in the model."
+              default: true
+              example: true
+        model_parameters:
+          type: object
+          properties:
+            ctx_len:
+              type: integer
+              description: "Maximum context length the model can handle."
+              minimum: 0
+              maximum: 4096
+              default: 2048
+              example: 2048
             ngl:
               type: integer
-              description: "Number of layers."
+              description: "Number of neural network layers loaded onto the GPU for acceleration."
+              minimum: 1
+              maximum: 100
+              default: 100
               example: 100
             embedding:
               type: boolean
-              description: "Indicates if embedding is enabled."
+              description: "Indicates if embedding layers are used."
+              default: true
               example: true
             n_parallel:
               type: integer
-              description: "Number of parallel processes."
+              description: "Number of parallel processes the model can run."
+              minimum: 1
+              maximum: 10
+              default: 1
               example: 4
-            # pre_prompt:
-            #   type: string
-            #   description: "Predefined prompt for initiating the chat."
-            #   example: "A chat between a curious user and an artificial intelligence"
-            # user_prompt:
-            #   type: string
-            #   description: "Format of user's prompt."
-            #   example: "USER: "
-            # ai_prompt:
-            #   type: string
-            #   description: "Format of AI's response."
-            #   example: "ASSISTANT: "
             temperature:
-              type: string
-              description: "Temperature setting for the model."
-              example: "0.7"
+              type: number
+              description: "Controls randomness in the model's responses; higher values produce more varied output."
+              minimum: 0.0
+              maximum: 2.0
+              default: 0.7
+              example: 0.7
             token_limit:
-              type: string
-              description: "Token limit for the model."
-              example: "2048"
+              type: integer
+              description: "Maximum number of tokens the model can generate in a single response."
+              minimum: 1
+              maximum: 4096
+              default: 2048
+              example: 2048
             top_k:
-              type: string
-              description: "Top-k setting for the model."
-              example: "0"
+              type: integer
+              description: "Limits the model to consider only the top k most likely next tokens at each step."
+              minimum: 0
+              maximum: 100
+              default: 0
+              example: 0
             top_p:
-              type: string
-              description: "Top-p setting for the model."
-              example: "1"
+              type: number
+              description: "Nucleus sampling parameter. The model considers the smallest set of tokens whose cumulative probability exceeds the top_p value."
+              minimum: 0.0
+              maximum: 1.0
+              default: 1.0
+              example: 1.0
+
         metadata:
           type: object
           properties:
diff --git a/docs/sidebars.js b/docs/sidebars.js
index e1d5da5c69..384f47e9dd 100644
--- a/docs/sidebars.js
+++ b/docs/sidebars.js
@@ -82,7 +82,6 @@ const sidebars = {
         "specs/engineering/chats",
         "specs/engineering/models",
         "specs/engineering/engine",
-        "specs/engineering/inference-parameters",
         "specs/engineering/threads",
         "specs/engineering/messages",
         "specs/engineering/assistants",
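For reference, a request body built against the `model_parameters` schema above can be checked programmatically. The following is a minimal sketch, not part of either patch: it validates an illustrative request against an abbreviated copy of the draft-07 schema using Ajv. The property subset, the request payload, and the placeholder model id are assumptions for the example.

```ts
import Ajv from "ajv"; // Ajv validates JSON Schema draft-07 by default

// Abbreviated copy of the model_parameters schema above;
// only a handful of properties are kept for brevity.
const modelParametersSchema = {
  $schema: "http://json-schema.org/draft-07/schema#",
  type: "object",
  required: ["messages"],
  properties: {
    messages: { type: "array", items: { type: "object" } },
    model: { type: "string" },
    temperature: { type: ["number", "null"], minimum: 0, maximum: 2, default: 1 },
    top_p: { type: ["number", "null"], minimum: 0, maximum: 1, default: 1 },
    stream: { type: ["boolean", "null"], default: false },
  },
};

const ajv = new Ajv();
const validate = ajv.compile(modelParametersSchema);

// Illustrative request body; "zephyr-7b-beta" is a placeholder model id.
const request = {
  messages: [{ role: "user", content: "Hello!" }],
  model: "zephyr-7b-beta",
  temperature: 0.7,
  stream: false,
};

if (!validate(request)) {
  console.error(validate.errors); // e.g. temperature outside [0, 2]
} else {
  console.log("request conforms to model_parameters");
}
```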
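Likewise, the nitro `engine_parameters` schema maps naturally onto a typed settings object. The sketch below carries the same caveat: the interface name and sample values are illustrative, while the field names, ranges, and the rule that `n_parallel` only takes effect when `cont_batching` is enabled come from the schema itself.

```ts
// Typed view of the nitro engine_parameters schema above.
interface EngineParameters {
  pre_prompt?: string;     // prompt used for internal configuration
  system_prompt?: string;  // prefix for the system prompt
  user_prompt?: string;    // prefix for the user prompt
  ai_prompt?: string;      // prefix for the assistant prompt
  ngl?: number;            // layers loaded onto the GPU, 0-100 (default 100)
  ctx_len?: number;        // context length, 128-4096 (model dependent)
  n_parallel?: number;     // parallel operations; only set with cont_batching
  cont_batching?: boolean; // continuous batching on/off (default false)
  cpu_threads?: number;    // threads for CPU-based inference
  embedding?: boolean;     // whether to enable embedding
}

// Sample settings: n_parallel is set only because cont_batching is enabled.
const settings: EngineParameters = {
  system_prompt: "SYSTEM: ",
  user_prompt: "USER: ",
  ai_prompt: "ASSISTANT: ",
  ngl: 100,
  ctx_len: 2048,
  cont_batching: true,
  n_parallel: 4,
  cpu_threads: 8,
};

console.log(JSON.stringify(settings, null, 2));
```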