diff --git a/docs/core_docs/docs/integrations/vectorstores/pgvector.mdx b/docs/core_docs/docs/integrations/vectorstores/pgvector.mdx index 16ba9c29045b..64e578acff98 100644 --- a/docs/core_docs/docs/integrations/vectorstores/pgvector.mdx +++ b/docs/core_docs/docs/integrations/vectorstores/pgvector.mdx @@ -74,3 +74,24 @@ before using the constructor. import ConnectionReuseExample from "@examples/indexes/vector_stores/pgvector_vectorstore/pgvector_pool.ts"; {ConnectionReuseExample} + +### Create HNSW Index + +By default, the extension performs a sequential scan search, with 100% recall. You might consider creating an HNSW index for approximate nearest neighbor (ANN) search to speed up `similaritySearchVectorWithScore` execution time. To create the HNSW index on your vector column, use the `createHnswIndex()` method. + +The method parameters include: + +**dimensions**: Defines the number of dimensions in your vector data type, up to 2000. For example, use 1536 for OpenAI's `text-embedding-ada-002` and Amazon's `amazon.titan-embed-text-v1` models. + +**m?**: The max number of connections per layer (16 by default). Index build time improves with smaller values, while higher values can speed up search queries. + +**efConstruction?**: The size of the dynamic candidate list for constructing the graph (64 by default). A higher value can potentially improve the index quality at the cost of index build time. + +**distanceFunction?**: The name of the distance function to use; it is automatically selected based on the `distanceStrategy`. + +More info at the [`Pgvector GitHub project`](https://github.com/pgvector/pgvector?tab=readme-ov-file#hnsw) and the HNSW paper by Malkov Yu. A. and Yashunin D. A. (2020): 
[`Efficient and robust approximate nearest neighbor search using hierarchical navigable small world graphs`](https://arxiv.org/pdf/1603.09320) + +import HnswExample from "@examples/indexes/vector_stores/pgvector_vectorstore/pgvector_hnsw.ts"; + +{HnswExample} + diff --git a/examples/src/indexes/vector_stores/pgvector_vectorstore/pgvector_hnsw.ts b/examples/src/indexes/vector_stores/pgvector_vectorstore/pgvector_hnsw.ts new file mode 100644 index 000000000000..d3c66f3fdd9d --- /dev/null +++ b/examples/src/indexes/vector_stores/pgvector_vectorstore/pgvector_hnsw.ts @@ -0,0 +1,54 @@ +import { OpenAIEmbeddings } from "@langchain/openai"; +import { + DistanceStrategy, + PGVectorStore, +} from "@langchain/community/vectorstores/pgvector"; +import { PoolConfig } from "pg"; + +// First, follow set-up instructions at +// https://js.langchain.com/docs/modules/indexes/vector_stores/integrations/pgvector + +const config = { + postgresConnectionOptions: { + type: "postgres", + host: "127.0.0.1", + port: 5433, + user: "myuser", + password: "ChangeMe", + database: "api", + } as PoolConfig, + tableName: "testlangchain", + columns: { + idColumnName: "id", + vectorColumnName: "vector", + contentColumnName: "content", + metadataColumnName: "metadata", + }, + // supported distance strategies: cosine (default), innerProduct, or euclidean + distanceStrategy: "cosine" as DistanceStrategy, +}; + +const pgvectorStore = await PGVectorStore.initialize( + new OpenAIEmbeddings(), + config +); + +// create the index +await pgvectorStore.createHnswIndex({ + dimensions: 1536, + efConstruction: 64, + m: 16, +}); + +await pgvectorStore.addDocuments([ + { pageContent: "what's this", metadata: { a: 2, b: ["tag1", "tag2"] } }, + { pageContent: "Cat drinks milk", metadata: { a: 1, b: ["tag2"] } }, +]); + +const model = new OpenAIEmbeddings(); +const query = await model.embedQuery("water"); +const results = await pgvectorStore.similaritySearchVectorWithScore(query, 1); + +console.log(results); + +await 
/**
 * Creates an HNSW index on the vector column for approximate nearest
 * neighbor (ANN) search.
 *
 * The statement is `CREATE INDEX IF NOT EXISTS`, so repeating the call for
 * the same index name does not build a second index.
 *
 * NOTE(review): the index name is derived only from the vector column name
 * (`<vectorColumnName>_embedding_hnsw_idx`), not from the table name. Two
 * tables that share a vector column name would therefore map to the same
 * index name — confirm this cannot collide in your schema.
 *
 * @param config - Index options.
 * @param config.dimensions - Number of dimensions the vector column is cast
 *   to in the index expression, up to 2000. For example, use 1536 for
 *   OpenAI's text-embedding-ada-002 and Amazon's amazon.titan-embed-text-v1
 *   models.
 * @param config.m - The max number of connections per layer (16 by default).
 *   Index build time improves with smaller values, while higher values can
 *   speed up search queries.
 * @param config.efConstruction - The size of the dynamic candidate list for
 *   constructing the graph (64 by default). A higher value can potentially
 *   improve the index quality at the cost of index build time.
 * @param config.distanceFunction - Operator class to index with (for example
 *   `vector_cosine_ops`). When omitted or empty, it is selected from the
 *   store's `distanceStrategy`.
 * @returns Promise that resolves once the statement has been attempted. A
 *   failure reported by the database is logged with `console.error` and is
 *   not rethrown.
 * @throws Error if `distanceFunction` is not a plain SQL identifier, or if it
 *   is omitted and `distanceStrategy` is not a known strategy.
 */
async createHnswIndex(config: {
  dimensions: number;
  m?: number;
  efConstruction?: number;
  distanceFunction?: string;
}): Promise<void> {
  let idxDistanceFunction: string;

  if (config.distanceFunction) {
    // An explicit operator class wins over the strategy-derived one. It is
    // interpolated into the SQL text below, so only a bare identifier is
    // accepted.
    if (!/^[A-Za-z_][A-Za-z0-9_]*$/.test(config.distanceFunction)) {
      throw new Error(
        `Invalid distance function: ${config.distanceFunction}`
      );
    }
    idxDistanceFunction = config.distanceFunction;
  } else {
    switch (this.distanceStrategy) {
      case "cosine":
        idxDistanceFunction = "vector_cosine_ops";
        break;
      case "innerProduct":
        idxDistanceFunction = "vector_ip_ops";
        break;
      case "euclidean":
        idxDistanceFunction = "vector_l2_ops";
        break;
      default:
        throw new Error(
          `Unknown distance strategy: ${this.distanceStrategy}`
        );
    }
  }

  // Falsy values (including 0) fall back to the documented defaults.
  const m = config.m || 16;
  const efConstruction = config.efConstruction || 64;

  const createIndexQuery = `CREATE INDEX IF NOT EXISTS ${
    this.vectorColumnName
  }_embedding_hnsw_idx
      ON ${this.computedTableName} USING hnsw ((${
    this.vectorColumnName
  }::vector(${config.dimensions})) ${idxDistanceFunction})
      WITH (
          m=${m},
          ef_construction=${efConstruction}
      );`;

  try {
    await this.pool.query(createIndexQuery);
  } catch (e) {
    // Best effort: report the failure but let the caller continue, since the
    // store still works (with sequential scans) without the index.
    console.error(
      `Failed to create HNSW index on table ${this.computedTableName}, error: ${e}`
    );
  }
}
+ password: "ChangeMe", + database: "api", +} as PoolConfig; describe("PGVectorStore", () => { let pgvectorVectorStore: PGVectorStore; const tableName = "testlangchain"; beforeAll(async () => { - const config = { - postgresConnectionOptions: { - type: "postgres", - host: "127.0.0.1", - port: 5432, - user: "myuser", - password: "ChangeMe", - database: "api", - } as PoolConfig, + const config: PGVectorStoreArgs = { + postgresConnectionOptions, tableName: "testlangchain", // collectionTableName: "langchain_pg_collection", // collectionName: "langchain", @@ -29,7 +38,7 @@ describe("PGVectorStore", () => { }; pgvectorVectorStore = await PGVectorStore.initialize( - new OpenAIEmbeddings(), + embeddingsEngine, config ); }); @@ -297,14 +306,7 @@ describe("PGVectorStore with collection", () => { beforeAll(async () => { const config = { - postgresConnectionOptions: { - type: "postgres", - host: "127.0.0.1", - port: 5432, - user: "myuser", - password: "ChangeMe", - database: "api", - } as PoolConfig, + postgresConnectionOptions, tableName, collectionTableName, collectionName: "langchain", @@ -317,7 +319,7 @@ describe("PGVectorStore with collection", () => { }; pgvectorVectorStore = await PGVectorStore.initialize( - new OpenAIEmbeddings(), + embeddingsEngine, config ); }); @@ -535,13 +537,7 @@ describe("PGVectorStore with schema", () => { let pool: pg.Pool; beforeAll(async () => { - pool = new pg.Pool({ - host: "127.0.0.1", - port: 5432, - user: "myuser", - password: "ChangeMe", - database: "api", - }); + pool = new pg.Pool(postgresConnectionOptions); const config: PGVectorStoreArgs = { pool, @@ -560,7 +556,7 @@ describe("PGVectorStore with schema", () => { await pool.query(`CREATE SCHEMA IF NOT EXISTS ${schema}`); pgvectorVectorStore = await PGVectorStore.initialize( - new OpenAIEmbeddings(), + embeddingsEngine, config ); computedTableName = pgvectorVectorStore.computedTableName; @@ -773,3 +769,74 @@ describe("PGVectorStore with schema", () => { } }); }); + 
// Integration suite for PGVectorStore.createHnswIndex: checks that the index
// is present in pg_indexes and that similarity search still returns the
// expected document when the index exists. Requires a reachable Postgres
// instance described by `postgresConnectionOptions`.
describe("PGVectorStore with HNSW index", () => {
  let pgvectorVectorStore: PGVectorStore;
  const tableName = "testlangchain";
  // Dimension count passed to createHnswIndex throughout this suite.
  const dimensions = 1536;

  beforeAll(async () => {
    const config: PGVectorStoreArgs = {
      postgresConnectionOptions,
      tableName,
      columns: {
        idColumnName: "id",
        vectorColumnName: "vector",
        contentColumnName: "content",
        metadataColumnName: "metadata",
      },
      distanceStrategy: "cosine",
    };

    pgvectorVectorStore = await PGVectorStore.initialize(
      embeddingsEngine,
      config
    );

    // Create the index
    await pgvectorVectorStore.createHnswIndex({ dimensions });
  });

  afterEach(async () => {
    // Drop table, then recreate it (and its index) for the next test.
    await pgvectorVectorStore.pool.query(`DROP TABLE "${tableName}"`);
    await pgvectorVectorStore.ensureTableInDatabase();
    await pgvectorVectorStore.createHnswIndex({ dimensions });
  });

  afterAll(async () => {
    await pgvectorVectorStore.end();
  });

  test("Ensure table has HNSW index", async () => {
    const result = await pgvectorVectorStore.pool.query(
      `SELECT indexname, tablename, indexdef FROM pg_indexes where indexname='vector_embedding_hnsw_idx';`
    );

    // Assert the row count before touching rows[0], so a missing index fails
    // this expectation instead of throwing while reading an undefined row.
    expect(result.rowCount).toBe(1);
    expect(result.rows[0].indexdef).toContain("USING hnsw");
  });

  test("Test embeddings creation", async () => {
    const documents = [
      {
        pageContent: "hello",
        metadata: { a: 1 },
      },
      {
        pageContent: "Cat drinks milk",
        metadata: { a: 2 },
      },
      { pageContent: "hi", metadata: { a: 1 } },
    ];
    await pgvectorVectorStore.addDocuments(documents);

    const query = await embeddingsEngine.embedQuery("milk");
    const results = await pgvectorVectorStore.similaritySearchVectorWithScore(
      query,
      1
    );

    expect(results).toHaveLength(1);
    expect(results[0][0].pageContent).toEqual("Cat drinks milk");
  });
});