From ca5eeb3fb0458d46ef260c4685e0f6421bdce03c Mon Sep 17 00:00:00 2001 From: bracesproul Date: Tue, 25 Jun 2024 10:52:23 -0700 Subject: [PATCH 1/5] core[minor]: Add ID field to document --- langchain-core/src/documents/document.ts | 16 ++++++++++++++++ 1 file changed, 16 insertions(+) diff --git a/langchain-core/src/documents/document.ts b/langchain-core/src/documents/document.ts index 3e662303379a..fdec6773cc4d 100644 --- a/langchain-core/src/documents/document.ts +++ b/langchain-core/src/documents/document.ts @@ -5,6 +5,8 @@ export interface DocumentInput< pageContent: string; metadata?: Metadata; + + id?: string; } export interface DocumentInterface< @@ -14,6 +16,8 @@ export interface DocumentInterface< pageContent: string; metadata: Metadata; + + id?: string; } /** @@ -28,9 +32,21 @@ export class Document< metadata: Metadata; + /** + * An optional identifier for the document. + * + * Ideally this should be unique across the document collection and formatted + * as a UUID, but this will not be enforced. + * + * This field is optional at the moment, but may become a required field + * in the future (wil be assigned automatically if not provided). + */ + id?: string; + constructor(fields: DocumentInput) { this.pageContent = fields.pageContent !== undefined ? fields.pageContent.toString() : ""; this.metadata = fields.metadata ?? ({} as Metadata); + this.id = fields.id; } } From b11a6da56f7f6642b790efbcfa5d8942640eb9ee Mon Sep 17 00:00:00 2001 From: bracesproul Date: Tue, 25 Jun 2024 10:54:46 -0700 Subject: [PATCH 2/5] add more docstring --- langchain-core/src/documents/document.ts | 17 ++++++++++++++--- 1 file changed, 14 insertions(+), 3 deletions(-) diff --git a/langchain-core/src/documents/document.ts b/langchain-core/src/documents/document.ts index fdec6773cc4d..9c379417721a 100644 --- a/langchain-core/src/documents/document.ts +++ b/langchain-core/src/documents/document.ts @@ -6,6 +6,12 @@ export interface DocumentInput< metadata?: Metadata; + /** + * An optional identifier for the document. + * + * Ideally this should be unique across the document collection and formatted + * as a UUID, but this will not be enforced. + */ id?: string; } @@ -17,6 +23,12 @@ export interface DocumentInterface< metadata: Metadata; + /** + * An optional identifier for the document. + * + * Ideally this should be unique across the document collection and formatted + * as a UUID, but this will not be enforced. + */ id?: string; } @@ -32,14 +44,13 @@ export class Document< metadata: Metadata; + // This field is optional at the moment, but may become a required field + // in the future (wil be assigned automatically if not provided). /** * An optional identifier for the document. * * Ideally this should be unique across the document collection and formatted * as a UUID, but this will not be enforced. - * - * This field is optional at the moment, but may become a required field - * in the future (wil be assigned automatically if not provided). */ id?: string; From a62aee8cb2e1f48fe08be05f0d0c15f088e130bf Mon Sep 17 00:00:00 2001 From: bracesproul Date: Tue, 25 Jun 2024 10:57:23 -0700 Subject: [PATCH 3/5] chore: lint files --- langchain-core/src/documents/document.ts | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/langchain-core/src/documents/document.ts b/langchain-core/src/documents/document.ts index 9c379417721a..901c61ba23b5 100644 --- a/langchain-core/src/documents/document.ts +++ b/langchain-core/src/documents/document.ts @@ -8,8 +8,8 @@ export interface DocumentInput< /** * An optional identifier for the document. - * - * Ideally this should be unique across the document collection and formatted + * + * Ideally this should be unique across the document collection and formatted * as a UUID, but this will not be enforced. */ id?: string; @@ -25,8 +25,8 @@ export interface DocumentInterface< /** * An optional identifier for the document. - * - * Ideally this should be unique across the document collection and formatted + * + * Ideally this should be unique across the document collection and formatted * as a UUID, but this will not be enforced. */ id?: string; @@ -48,8 +48,8 @@ export class Document< // in the future (wil be assigned automatically if not provided). /** * An optional identifier for the document. - * - * Ideally this should be unique across the document collection and formatted + * + * Ideally this should be unique across the document collection and formatted * as a UUID, but this will not be enforced. */ id?: string; From d37d2f409fc6837562b06a0c1a7afd7835816c67 Mon Sep 17 00:00:00 2001 From: bracesproul Date: Tue, 25 Jun 2024 12:20:58 -0700 Subject: [PATCH 4/5] update typeorm id document field --- langchain-core/src/documents/document.ts | 5 +++-- libs/langchain-community/src/vectorstores/typeorm.ts | 2 -- 2 files changed, 3 insertions(+), 4 deletions(-) diff --git a/langchain-core/src/documents/document.ts b/langchain-core/src/documents/document.ts index 901c61ba23b5..3f17c0524dc9 100644 --- a/langchain-core/src/documents/document.ts +++ b/langchain-core/src/documents/document.ts @@ -44,8 +44,9 @@ export class Document< metadata: Metadata; - // This field is optional at the moment, but may become a required field - // in the future (wil be assigned automatically if not provided). + // The ID field is optional at the moment. + // It will likely become required in a future major release after + // it has been adopted by enough vectorstore implementations. /** * An optional identifier for the document. * diff --git a/libs/langchain-community/src/vectorstores/typeorm.ts b/libs/langchain-community/src/vectorstores/typeorm.ts index 68b70c931630..336ba59ccf85 100644 --- a/libs/langchain-community/src/vectorstores/typeorm.ts +++ b/libs/langchain-community/src/vectorstores/typeorm.ts @@ -23,8 +23,6 @@ export interface TypeORMVectorStoreArgs { */ export class TypeORMVectorStoreDocument extends Document { embedding: string; - - id?: string; } const defaultDocumentTableName = "documents"; From 84087c05049377fdd56a0dacccc7b5f163e10da9 Mon Sep 17 00:00:00 2001 From: bracesproul Date: Tue, 25 Jun 2024 13:36:10 -0700 Subject: [PATCH 5/5] fix tests --- langchain/src/document_loaders/tests/csv-blob.test.ts | 2 ++ langchain/src/document_loaders/tests/json-blob.test.ts | 4 ++++ langchain/src/document_loaders/tests/jsonl-blob.test.ts | 2 ++ 3 files changed, 8 insertions(+) diff --git a/langchain/src/document_loaders/tests/csv-blob.test.ts b/langchain/src/document_loaders/tests/csv-blob.test.ts index 5790ce883c66..9e7b9adbb839 100644 --- a/langchain/src/document_loaders/tests/csv-blob.test.ts +++ b/langchain/src/document_loaders/tests/csv-blob.test.ts @@ -46,6 +46,7 @@ test("Test CSV loader from blob", async () => { expect(docs.length).toBe(2); expect(docs[0]).toMatchInlineSnapshot(` Document { + "id": undefined, "metadata": { "blobType": "text/csv", "line": 1, @@ -57,6 +58,7 @@ test("Test CSV loader from blob", async () => { `); expect(docs[1]).toMatchInlineSnapshot(` Document { + "id": undefined, "metadata": { "blobType": "text/csv", "line": 2, diff --git a/langchain/src/document_loaders/tests/json-blob.test.ts b/langchain/src/document_loaders/tests/json-blob.test.ts index f29c25c78d2f..cc363e840bb0 100644 --- a/langchain/src/document_loaders/tests/json-blob.test.ts +++ b/langchain/src/document_loaders/tests/json-blob.test.ts @@ -39,6 +39,7 @@ test("Test JSON loader from blob", async () => { expect(docs.length).toBe(2); expect(docs[0]).toMatchInlineSnapshot(` Document { + "id": undefined, "metadata": { "blobType": "application/json", "line": 1, @@ -49,6 +50,7 @@ test("Test JSON loader from blob", async () => { `); expect(docs[1]).toMatchInlineSnapshot(` Document { + "id": undefined, "metadata": { "blobType": "application/json", "line": 2, @@ -87,6 +89,7 @@ test("Test JSON loader from blob", async () => { expect(docs.length).toBe(10); expect(docs[0]).toMatchInlineSnapshot(` Document { + "id": undefined, "metadata": { "blobType": "application/json", "line": 1, @@ -97,6 +100,7 @@ test("Test JSON loader from blob", async () => { `); expect(docs[1]).toMatchInlineSnapshot(` Document { + "id": undefined, "metadata": { "blobType": "application/json", "line": 2, diff --git a/langchain/src/document_loaders/tests/jsonl-blob.test.ts b/langchain/src/document_loaders/tests/jsonl-blob.test.ts index 57e96964ad72..2d1753d60ba8 100644 --- a/langchain/src/document_loaders/tests/jsonl-blob.test.ts +++ b/langchain/src/document_loaders/tests/jsonl-blob.test.ts @@ -40,6 +40,7 @@ test("Test JSONL loader from blob", async () => { expect(docs.length).toBe(2); expect(docs[0]).toMatchInlineSnapshot(` Document { + "id": undefined, "metadata": { "blobType": "application/jsonl+json", "line": 1, @@ -50,6 +51,7 @@ test("Test JSONL loader from blob", async () => { `); expect(docs[1]).toMatchInlineSnapshot(` Document { + "id": undefined, "metadata": { "blobType": "application/jsonl+json", "line": 2,