From 7ed697d874dc22d39f5faa7d7d6519ca9a504c88 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Thomas=20Rame=CC=81?=
Date: Wed, 20 Mar 2024 11:45:13 +0100
Subject: [PATCH] fix(llm): the token limit can be different on embeddings
 compared to the model limit

---
 src/features/llm-langchain.ts |  9 +++++++--
 src/gpt/index.ts              | 11 +++++++++++
 2 files changed, 18 insertions(+), 2 deletions(-)

diff --git a/src/features/llm-langchain.ts b/src/features/llm-langchain.ts
index 4b2156d..3477738 100644
--- a/src/features/llm-langchain.ts
+++ b/src/features/llm-langchain.ts
@@ -384,9 +384,12 @@ export class LangchainWithLocalVectorStoreLlmManager implements LlmManager {
       // Note: below we do `+1` to take in account the pontentially added document
       const currentTokensFingerprintOfBatching = Math.max(documentsChunks[currentChunk].length - 1, 0);
 
-      if (documentTokens.length >= this.gptInstance.modelTokenLimit) {
+      if (documentTokens.length >= this.gptInstance.embeddingsTokenLimit) {
         throw new Error('an initiative document should not be huge and triggering the llm limit');
-      } else if (currentChunkTokensCounter + documentTokens.length + (currentTokensFingerprintOfBatching + 1) >= this.gptInstance.modelTokenLimit) {
+      } else if (
+        currentChunkTokensCounter + documentTokens.length + (currentTokensFingerprintOfBatching + 1) >=
+        this.gptInstance.embeddingsTokenLimit
+      ) {
         // If adding this document to previous ones is over the tokens limit for, use a new chunk
         currentChunk += 1;
         documentsChunks.push([]);
@@ -474,6 +477,7 @@ CONTEXTE :
 
     // To help the LLM we give inside the context tools we are looking for
     // Since we cannot give the 8k+ tools from our database, we try to provide a subset meaningful according to extracted tech references we retrieved
+    // Note: we did not check the `embeddingsTokenLimit` since it has never been reached, if needed take example at documents computation to prepare chunks
     const rawToolsVectors = await this.toolsVectorStore.embeddings.embedDocuments(rawToolsFromAnalysis.filter((item) => item.trim() !== ''));
 
     const contextTools: string[] = [];
@@ -675,6 +679,7 @@ CONTEXTE :
   }
 
   public truncateContentBasedOnTokens(content: string, maximumTokens: number): string {
+    // Note the token limit we use is about the model, not for embeddings (adjust if needed)
     if (maximumTokens > this.gptInstance.modelTokenLimit) {
       console.warn(
         `the tokens truncate ceil specified (${maximumTokens}) is above the llm limit of ${this.gptInstance.modelTokenLimit} tokens, so defaulting to the latter`
diff --git a/src/gpt/index.ts b/src/gpt/index.ts
index a2bf081..5af1c0c 100644
--- a/src/gpt/index.ts
+++ b/src/gpt/index.ts
@@ -4,6 +4,7 @@ export interface GptSettings {
   model: string;
   countModel: TiktokenModel; // The counter does not understand precise GPT versions
   modelTokenLimit: number; // Precise token maximum can be found on https://www.scriptbyai.com/token-limit-openai-chatgpt/
+  embeddingsTokenLimit: number; // Didn't find a list but considering `16385` as default is good enough, and adjust if needed according to the provider
   per1000TokensCost: number; // This is about input tokens (since our outputs should be small, we don't consider them here)
 }
 
@@ -26,12 +27,14 @@ export const gptInstances: Record<string, GptSettings> = {
     model: 'gpt-3.5-turbo-1106',
     countModel: 'gpt-3.5-turbo',
     modelTokenLimit: 16385,
+    embeddingsTokenLimit: 16385,
     per1000TokensCost: 0.001,
   },
   v4: {
     model: 'gpt-4-1106-preview',
     countModel: 'gpt-4',
     modelTokenLimit: 16385,
+    embeddingsTokenLimit: 16385,
     per1000TokensCost: 0.01,
   },
   // MistralAI
@@ -39,18 +42,21 @@ export const gptInstances: Record<string, GptSettings> = {
     model: 'mistral-tiny', // mistral7b
     countModel: 'gpt-4',
     modelTokenLimit: 16385,
+    embeddingsTokenLimit: 16385,
     per1000TokensCost: 0.00014,
   },
   deprecatedMistralSmall: {
     model: 'mistral-small', // mixtral8x7b
     countModel: 'gpt-4',
     modelTokenLimit: 16385,
+    embeddingsTokenLimit: 16385,
     per1000TokensCost: 0.0006,
   },
   deprecatedMistralMedium: {
     model: 'mistral-medium', // ...
     countModel: 'gpt-4',
     modelTokenLimit: 16385,
+    embeddingsTokenLimit: 16385,
     per1000TokensCost: 0.0025,
   },
   mistral7b: {
@@ -58,6 +64,7 @@ export const gptInstances: Record<string, GptSettings> = {
     model: 'open-mistral-7b', // mistral7b
     countModel: 'gpt-4',
     modelTokenLimit: 32768,
+    embeddingsTokenLimit: 16385,
     per1000TokensCost: 0.0002,
   },
   mistral8x7b: {
@@ -65,24 +72,28 @@ export const gptInstances: Record<string, GptSettings> = {
     model: 'open-mixtral-8x7b', // mixtral8x7b
     countModel: 'gpt-4',
     modelTokenLimit: 32768,
+    embeddingsTokenLimit: 16385,
     per1000TokensCost: 0.00065,
   },
   mistralSmall: {
     model: 'mistral-small-latest',
     countModel: 'gpt-4',
     modelTokenLimit: 32768,
+    embeddingsTokenLimit: 16385,
     per1000TokensCost: 0.0055,
   },
   mistralMedium: {
     model: 'mistral-medium-latest',
     countModel: 'gpt-4',
     modelTokenLimit: 32768,
+    embeddingsTokenLimit: 16385,
     per1000TokensCost: 0.0075,
   },
   mistralLarge: {
     model: 'mistral-large-latest',
     countModel: 'gpt-4',
     modelTokenLimit: 32768,
+    embeddingsTokenLimit: 16385,
     per1000TokensCost: 0.022,
   },
 };
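
Note (illustration only, not part of the patch): a minimal sketch of the batching strategy the first hunk adjusts, grouping documents so each batch stays under `embeddingsTokenLimit` rather than `modelTokenLimit`. The function name and the injected `countTokens` counter are hypothetical placeholders (e.g. backed by a tiktoken encoder); the actual manager keeps its own token accounting and batching fingerprint.

// Sketch: chunk documents for an embeddings endpoint whose token limit
// may be lower than the chat model's limit.
export function chunkDocumentsForEmbeddings(
  documents: string[],
  embeddingsTokenLimit: number,
  countTokens: (text: string) => number
): string[][] {
  const chunks: string[][] = [[]];
  let currentChunkTokens = 0;

  for (const document of documents) {
    const documentTokens = countTokens(document);

    if (documentTokens >= embeddingsTokenLimit) {
      // A single oversized document cannot fit into any batch
      throw new Error('a single document should not exceed the embeddings token limit');
    } else if (currentChunkTokens + documentTokens >= embeddingsTokenLimit) {
      // Adding this document would overflow the embeddings limit, so start a new chunk
      chunks.push([]);
      currentChunkTokens = 0;
    }

    chunks[chunks.length - 1].push(document);
    currentChunkTokens += documentTokens;
  }

  return chunks;
}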