From 4d2b5b7a543cce56ca08ea486b7aec4eb7f9b254 Mon Sep 17 00:00:00 2001
From: Brace Sproul
Date: Tue, 23 Jul 2024 15:10:37 -0700
Subject: [PATCH] core[minor]: Add FakeVectorStore to core testing utils (#6186)

* core[minor]: Add FakeVectorStore to core testing utils

* chore: lint files
---
 langchain-core/src/utils/testing/index.ts | 196 +++++++++++++++++++++-
 1 file changed, 195 insertions(+), 1 deletion(-)

diff --git a/langchain-core/src/utils/testing/index.ts b/langchain-core/src/utils/testing/index.ts
index 6591b67f26c9..73e79f892fc1 100644
--- a/langchain-core/src/utils/testing/index.ts
+++ b/langchain-core/src/utils/testing/index.ts
@@ -2,6 +2,7 @@
 /* eslint-disable @typescript-eslint/no-explicit-any */
 /* eslint-disable @typescript-eslint/no-unused-vars */
 
+import { similarity as ml_distance_similarity } from "ml-distance";
 import { z } from "zod";
 import {
   BaseCallbackConfig,
@@ -34,12 +35,17 @@ import { BaseRetriever } from "../../retrievers/index.js";
 import { Runnable, RunnableLambda } from "../../runnables/base.js";
 import { StructuredTool, ToolParams } from "../../tools/index.js";
 import { BaseTracer, Run } from "../../tracers/base.js";
-import { Embeddings, EmbeddingsParams } from "../../embeddings.js";
+import {
+  Embeddings,
+  EmbeddingsInterface,
+  EmbeddingsParams,
+} from "../../embeddings.js";
 import {
   StructuredOutputMethodParams,
   BaseLanguageModelInput,
   StructuredOutputMethodOptions,
 } from "../../language_models/base.js";
+import { VectorStore } from "../../vectorstores.js";
 
 /**
  * Parser for comma-separated values. It splits the input text by commas
@@ -709,3 +715,191 @@ export class SingleRunExtractor extends BaseTracer {
     return this.runPromise;
   }
 }
+
+/**
+ * Interface representing a vector in memory. It includes the content
+ * (text), the corresponding embedding (vector), and any associated
+ * metadata.
+ */
+interface MemoryVector {
+  content: string;
+  embedding: number[];
+  // eslint-disable-next-line @typescript-eslint/no-explicit-any
+  metadata: Record<string, any>;
+}
+
+/**
+ * Interface for the arguments that can be passed to the
+ * `FakeVectorStore` constructor. It includes an optional `similarity`
+ * function.
+ */
+export interface FakeVectorStoreArgs {
+  similarity?: typeof ml_distance_similarity.cosine;
+}
+
+/**
+ * Class that extends `VectorStore` to store vectors in memory. Provides
+ * methods for adding documents, performing similarity searches, and
+ * creating instances from texts, documents, or an existing index.
+ */
+export class FakeVectorStore extends VectorStore {
+  declare FilterType: (doc: Document) => boolean;
+
+  memoryVectors: MemoryVector[] = [];
+
+  similarity: typeof ml_distance_similarity.cosine;
+
+  _vectorstoreType(): string {
+    return "memory";
+  }
+
+  constructor(
+    embeddings: EmbeddingsInterface,
+    { similarity, ...rest }: FakeVectorStoreArgs = {}
+  ) {
+    super(embeddings, rest);
+
+    this.similarity = similarity ?? ml_distance_similarity.cosine;
+  }
+
+  /**
+   * Method to add documents to the memory vector store. It extracts the
+   * text from each document, generates embeddings for them, and adds the
+   * resulting vectors to the store.
+   * @param documents Array of `Document` instances to be added to the store.
+   * @returns Promise that resolves when all documents have been added.
+   */
+  async addDocuments(documents: Document[]): Promise<void> {
+    const texts = documents.map(({ pageContent }) => pageContent);
+    return this.addVectors(
+      await this.embeddings.embedDocuments(texts),
+      documents
+    );
+  }
+
+  /**
+   * Method to add vectors to the memory vector store. It creates
+   * `MemoryVector` instances for each vector and document pair and adds
+   * them to the store.
+   * @param vectors Array of vectors to be added to the store.
+   * @param documents Array of `Document` instances corresponding to the vectors.
+   * @returns Promise that resolves when all vectors have been added.
+   */
+  async addVectors(vectors: number[][], documents: Document[]): Promise<void> {
+    const memoryVectors = vectors.map((embedding, idx) => ({
+      content: documents[idx].pageContent,
+      embedding,
+      metadata: documents[idx].metadata,
+    }));
+
+    this.memoryVectors = this.memoryVectors.concat(memoryVectors);
+  }
+
+  /**
+   * Method to perform a similarity search in the memory vector store. It
+   * scores each stored vector that passes `filter` against the query
+   * vector, sorts the scores from highest to lowest, and returns the top
+   * `k` documents along with their scores.
+   * @param query Query vector to compare against the vectors in the store.
+   * @param k Number of top results to return.
+   * @param filter Optional filter function to apply to the vectors before performing the search.
+   * @returns Promise that resolves with an array of tuples, each containing a `Document` and its similarity score.
+   */
+  async similaritySearchVectorWithScore(
+    query: number[],
+    k: number,
+    filter?: this["FilterType"]
+  ): Promise<[Document, number][]> {
+    const filterFunction = (memoryVector: MemoryVector) => {
+      if (!filter) {
+        return true;
+      }
+
+      const doc = new Document({
+        metadata: memoryVector.metadata,
+        pageContent: memoryVector.content,
+      });
+      return filter(doc);
+    };
+    const filteredMemoryVectors = this.memoryVectors.filter(filterFunction);
+    const searches = filteredMemoryVectors
+      .map((vector, index) => ({
+        similarity: this.similarity(query, vector.embedding),
+        index,
+      }))
+      .sort((a, b) => b.similarity - a.similarity)
+      .slice(0, k);
+
+    const result: [Document, number][] = searches.map((search) => [
+      new Document({
+        metadata: filteredMemoryVectors[search.index].metadata,
+        pageContent: filteredMemoryVectors[search.index].content,
+      }),
+      search.similarity,
+    ]);
+
+    return result;
+  }
+
+  /**
+   * Static method to create a `FakeVectorStore` instance from an array of
+   * texts. It creates a `Document` for each text and metadata pair, and
+   * adds them to the store.
+   * @param texts Array of texts to be added to the store.
+   * @param metadatas Per-text metadata array (matched by index), or one object shared by every text.
+   * @param embeddings `Embeddings` instance used to generate embeddings for the texts.
+   * @param dbConfig Optional `FakeVectorStoreArgs` to configure the `FakeVectorStore` instance.
+   * @returns Promise that resolves with a new `FakeVectorStore` instance.
+   */
+  static async fromTexts(
+    texts: string[],
+    metadatas: object[] | object,
+    embeddings: EmbeddingsInterface,
+    dbConfig?: FakeVectorStoreArgs
+  ): Promise<FakeVectorStore> {
+    const docs: Document[] = [];
+    for (let i = 0; i < texts.length; i += 1) {
+      const metadata = Array.isArray(metadatas) ? metadatas[i] : metadatas;
+      const newDoc = new Document({
+        pageContent: texts[i],
+        metadata,
+      });
+      docs.push(newDoc);
+    }
+    return FakeVectorStore.fromDocuments(docs, embeddings, dbConfig);
+  }
+
+  /**
+   * Static method to create a `FakeVectorStore` instance from an array of
+   * `Document` instances. It adds the documents to the store.
+   * @param docs Array of `Document` instances to be added to the store.
+   * @param embeddings `Embeddings` instance used to generate embeddings for the documents.
+   * @param dbConfig Optional `FakeVectorStoreArgs` to configure the `FakeVectorStore` instance.
+   * @returns Promise that resolves with a new `FakeVectorStore` instance.
+   */
+  static async fromDocuments(
+    docs: Document[],
+    embeddings: EmbeddingsInterface,
+    dbConfig?: FakeVectorStoreArgs
+  ): Promise<FakeVectorStore> {
+    const instance = new this(embeddings, dbConfig);
+    await instance.addDocuments(docs);
+    return instance;
+  }
+
+  /**
+   * Static method to create a `FakeVectorStore` instance from an existing
+   * index. It creates a new `FakeVectorStore` instance without adding any
+   * documents or vectors.
+   * @param embeddings `Embeddings` instance used to generate embeddings for the documents.
+   * @param dbConfig Optional `FakeVectorStoreArgs` to configure the `FakeVectorStore` instance.
+   * @returns Promise that resolves with a new `FakeVectorStore` instance.
+   */
+  static async fromExistingIndex(
+    embeddings: EmbeddingsInterface,
+    dbConfig?: FakeVectorStoreArgs
+  ): Promise<FakeVectorStore> {
+    const instance = new this(embeddings, dbConfig);
+    return instance;
+  }
+}