From 7ed697d874dc22d39f5faa7d7d6519ca9a504c88 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Thomas=20Rame=CC=81?=
Date: Wed, 20 Mar 2024 11:45:13 +0100
Subject: [PATCH] fix(llm): the token limit can be different on embeddings
 compared to the model limit

---
 src/features/llm-langchain.ts |  9 +++++++--
 src/gpt/index.ts              | 11 +++++++++++
 2 files changed, 18 insertions(+), 2 deletions(-)

diff --git a/src/features/llm-langchain.ts b/src/features/llm-langchain.ts
index 4b2156d..3477738 100644
--- a/src/features/llm-langchain.ts
+++ b/src/features/llm-langchain.ts
@@ -384,9 +384,12 @@ export class LangchainWithLocalVectorStoreLlmManager implements LlmManager {
       // Note: below we do `+1` to take in account the pontentially added document
       const currentTokensFingerprintOfBatching = Math.max(documentsChunks[currentChunk].length - 1, 0);
 
-      if (documentTokens.length >= this.gptInstance.modelTokenLimit) {
+      if (documentTokens.length >= this.gptInstance.embeddingsTokenLimit) {
         throw new Error('an initiative document should not be huge and triggering the llm limit');
-      } else if (currentChunkTokensCounter + documentTokens.length + (currentTokensFingerprintOfBatching + 1) >= this.gptInstance.modelTokenLimit) {
+      } else if (
+        currentChunkTokensCounter + documentTokens.length + (currentTokensFingerprintOfBatching + 1) >=
+        this.gptInstance.embeddingsTokenLimit
+      ) {
         // If adding this document to previous ones is over the tokens limit for, use a new chunk
         currentChunk += 1;
         documentsChunks.push([]);
@@ -474,6 +477,7 @@ CONTEXTE :
 
     // To help the LLM we give inside the context tools we are looking for
     // Since we cannot give the 8k+ tools from our database, we try to provide a subset meaningful according to extracted tech references we retrieved
+    // Note: we did not check the `embeddingsTokenLimit` since it has never been reached, if needed take example at documents computation to prepare chunks
     const rawToolsVectors = await this.toolsVectorStore.embeddings.embedDocuments(rawToolsFromAnalysis.filter((item) => item.trim() !== ''));
 
     const contextTools: string[] = [];
@@ -675,6 +679,7 @@ CONTEXTE :
   }
 
   public truncateContentBasedOnTokens(content: string, maximumTokens: number): string {
+    // Note the token limit we use is about the model, not for embeddings (adjust if needed)
     if (maximumTokens > this.gptInstance.modelTokenLimit) {
       console.warn(
         `the tokens truncate ceil specified (${maximumTokens}) is above the llm limit of ${this.gptInstance.modelTokenLimit} tokens, so defaulting to the latter`
diff --git a/src/gpt/index.ts b/src/gpt/index.ts
index a2bf081..5af1c0c 100644
--- a/src/gpt/index.ts
+++ b/src/gpt/index.ts
@@ -4,6 +4,7 @@ export interface GptSettings {
   model: string;
   countModel: TiktokenModel; // The counter does not understand precise GPT versions
   modelTokenLimit: number; // Precise token maximum can be found on https://www.scriptbyai.com/token-limit-openai-chatgpt/
+  embeddingsTokenLimit: number; // Didn't find a list but considering `16385` as default is good enough, and adjust if needed according to the provider
   per1000TokensCost: number; // This is about input tokens (since our outputs should be small, we don't consider them here)
 }
 
@@ -26,12 +27,14 @@ export const gptInstances: Record<string, GptSettings> = {
     model: 'gpt-3.5-turbo-1106',
     countModel: 'gpt-3.5-turbo',
     modelTokenLimit: 16385,
+    embeddingsTokenLimit: 16385,
     per1000TokensCost: 0.001,
   },
   v4: {
     model: 'gpt-4-1106-preview',
     countModel: 'gpt-4',
     modelTokenLimit: 16385,
+    embeddingsTokenLimit: 16385,
     per1000TokensCost: 0.01,
   },
   // MistralAI
@@ -39,18 +42,21 @@ export const gptInstances: Record<string, GptSettings> = {
     model: 'mistral-tiny', // mistral7b
     countModel: 'gpt-4',
     modelTokenLimit: 16385,
+    embeddingsTokenLimit: 16385,
     per1000TokensCost: 0.00014,
   },
   deprecatedMistralSmall: {
     model: 'mistral-small', // mixtral8x7b
     countModel: 'gpt-4',
     modelTokenLimit: 16385,
+    embeddingsTokenLimit: 16385,
     per1000TokensCost: 0.0006,
   },
   deprecatedMistralMedium: {
     model: 'mistral-medium', // ...
     countModel: 'gpt-4',
     modelTokenLimit: 16385,
+    embeddingsTokenLimit: 16385,
     per1000TokensCost: 0.0025,
   },
   mistral7b: {
@@ -58,6 +64,7 @@ export const gptInstances: Record<string, GptSettings> = {
     model: 'open-mistral-7b', // mistral7b
     countModel: 'gpt-4',
     modelTokenLimit: 32768,
+    embeddingsTokenLimit: 16385,
     per1000TokensCost: 0.0002,
   },
   mistral8x7b: {
@@ -65,24 +72,28 @@ export const gptInstances: Record<string, GptSettings> = {
     model: 'open-mixtral-8x7b', // mixtral8x7b
     countModel: 'gpt-4',
     modelTokenLimit: 32768,
+    embeddingsTokenLimit: 16385,
     per1000TokensCost: 0.00065,
   },
   mistralSmall: {
     model: 'mistral-small-latest',
     countModel: 'gpt-4',
     modelTokenLimit: 32768,
+    embeddingsTokenLimit: 16385,
     per1000TokensCost: 0.0055,
   },
   mistralMedium: {
     model: 'mistral-medium-latest',
     countModel: 'gpt-4',
     modelTokenLimit: 32768,
+    embeddingsTokenLimit: 16385,
     per1000TokensCost: 0.0075,
   },
   mistralLarge: {
     model: 'mistral-large-latest',
     countModel: 'gpt-4',
     modelTokenLimit: 32768,
+    embeddingsTokenLimit: 16385,
     per1000TokensCost: 0.022,
   },
 };
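
Note (illustration only, not part of the patch): a minimal sketch of the batching strategy the first hunk adjusts, grouping documents so each batch stays under `embeddingsTokenLimit` rather than `modelTokenLimit`. The function name and the injected `countTokens` counter are hypothetical placeholders (e.g. backed by a tiktoken encoder); the actual manager keeps its own token accounting and batching fingerprint.

// Sketch: chunk documents for an embeddings endpoint whose token limit
// may be lower than the chat model's limit.
export function chunkDocumentsForEmbeddings(
  documents: string[],
  embeddingsTokenLimit: number,
  countTokens: (text: string) => number
): string[][] {
  const chunks: string[][] = [[]];
  let currentChunkTokens = 0;

  for (const document of documents) {
    const documentTokens = countTokens(document);

    if (documentTokens >= embeddingsTokenLimit) {
      // A single oversized document cannot fit into any batch
      throw new Error('a single document should not exceed the embeddings token limit');
    } else if (currentChunkTokens + documentTokens >= embeddingsTokenLimit) {
      // Adding this document would overflow the embeddings limit, so start a new chunk
      chunks.push([]);
      currentChunkTokens = 0;
    }

    chunks[chunks.length - 1].push(document);
    currentChunkTokens += documentTokens;
  }

  return chunks;
}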