diff --git a/langchain/src/document_loaders/fs/unstructured.ts b/langchain/src/document_loaders/fs/unstructured.ts
index edb60008aca1..e3b62a9cfbc3 100644
--- a/langchain/src/document_loaders/fs/unstructured.ts
+++ b/langchain/src/document_loaders/fs/unstructured.ts
@@ -126,6 +126,11 @@ type UnstructuredDirectoryLoaderOptions = UnstructuredLoaderOptions & {
   unknown?: UnknownHandling;
 };
 
+type UnstructuredMemoryLoaderOptions = {
+  buffer: Buffer; // file contents; uploaded as the "files" form part
+  fileName: string; // file name attached to that form part
+};
+
 /**
  * @deprecated - Import from "@langchain/community/document_loaders/fs/unstructured" instead. This entrypoint will be removed in 0.3.0.
  *
@@ -139,6 +144,10 @@ type UnstructuredDirectoryLoaderOptions = UnstructuredLoaderOptions & {
 export class UnstructuredLoader extends BaseDocumentLoader {
   public filePath: string;
 
+  private buffer?: Buffer; // set only when constructed with the in-memory options object
+
+  private fileName?: string; // set alongside buffer; otherwise _partition derives it from filePath
+
   private apiUrl = "https://api.unstructured.io/general/v0/general";
 
   private apiKey?: string;
@@ -175,7 +184,9 @@ export class UnstructuredLoader extends BaseDocumentLoader {
   private maxCharacters?: number;
 
   constructor(
-    filePathOrLegacyApiUrl: string,
+    filePathOrLegacyApiUrlOrMemoryBuffer:
+      | string
+      | UnstructuredMemoryLoaderOptions,
     optionsOrLegacyFilePath: UnstructuredLoaderOptions | string = {}
   ) {
     super();
@@ -183,11 +194,20 @@ export class UnstructuredLoader extends BaseDocumentLoader {
     // Temporary shim to avoid breaking existing users
     // Remove when API keys are enforced by Unstructured and existing code will break anyway
     const isLegacySyntax = typeof optionsOrLegacyFilePath === "string";
-    if (isLegacySyntax) {
+    const isMemorySyntax =
+      typeof filePathOrLegacyApiUrlOrMemoryBuffer === "object";
+
+    if (isMemorySyntax) { // in-memory form; filePath is not assigned on this path
+      this.buffer = filePathOrLegacyApiUrlOrMemoryBuffer.buffer;
+      this.fileName = filePathOrLegacyApiUrlOrMemoryBuffer.fileName;
+    } else if (isLegacySyntax) {
       this.filePath = optionsOrLegacyFilePath;
-      this.apiUrl = filePathOrLegacyApiUrl;
+      this.apiUrl = filePathOrLegacyApiUrlOrMemoryBuffer;
     } else {
-      this.filePath = filePathOrLegacyApiUrl;
+      this.filePath = filePathOrLegacyApiUrlOrMemoryBuffer;
+    }
+
+    if (!isLegacySyntax) { // second argument is an options object (file-path and in-memory forms alike)
       const options = optionsOrLegacyFilePath;
       this.apiKey = options.apiKey;
       this.apiUrl = options.apiUrl ?? this.apiUrl;
@@ -209,14 +229,20 @@ export class UnstructuredLoader extends BaseDocumentLoader {
   }
 
   async _partition() {
-    const { readFile, basename } = await this.imports();
+    let { buffer } = this;
+    let { fileName } = this;
+
+    if (!buffer) { // no in-memory buffer supplied: read filePath from disk instead
+      const { readFile, basename } = await this.imports();
 
-    const buffer = await readFile(this.filePath);
-    const fileName = basename(this.filePath);
+      buffer = await readFile(this.filePath);
+      fileName = basename(this.filePath);
+
+      // Known limitation: the whole file is read into memory before upload.
+      // Consuming Documents in a streaming fashion still needs a lot of work
+      // anyway, so this is not a concern for now.
+    }
 
-    // I'm aware this reads the file into memory first, but we have lots of work
-    // to do on then consuming Documents in a streaming fashion anyway, so not
-    // worried about this for now.
     const formData = new FormData();
     formData.append("files", new Blob([buffer]), fileName);
     formData.append("strategy", this.strategy);
diff --git a/langchain/src/document_loaders/tests/unstructured.int.test.ts b/langchain/src/document_loaders/tests/unstructured.int.test.ts
index e30913e10a2d..b0b0712118a6 100644
--- a/langchain/src/document_loaders/tests/unstructured.int.test.ts
+++ b/langchain/src/document_loaders/tests/unstructured.int.test.ts
@@ -3,6 +3,7 @@
 
 import * as url from "node:url";
 import * as path from "node:path";
+import { readFile } from "node:fs/promises";
 import { test, expect } from "@jest/globals";
 import {
   UnstructuredDirectoryLoader,
@@ -29,6 +30,34 @@ test.skip("Test Unstructured base loader", async () => {
   }
 });
 
+test.skip("Test Unstructured base loader with buffer", async () => {
+  const filePath = path.resolve(
+    path.dirname(url.fileURLToPath(import.meta.url)),
+    "./example_data/example.txt"
+  );
+
+  const options = {
+    apiKey: process.env.UNSTRUCTURED_API_KEY!,
+  };
+
+  const buffer = await readFile(filePath); // read the fixture here and hand the bytes to the loader
+  const fileName = "example.txt"; // name passed to the loader together with the bytes
+
+  const loader = new UnstructuredLoader(
+    {
+      buffer,
+      fileName,
+    },
+    options
+  );
+  const docs = await loader.load();
+
+  expect(docs.length).toBe(3);
+  for (const doc of docs) {
+    expect(typeof doc.pageContent).toBe("string");
+  }
+});
+
 test.skip("Test Unstructured base loader with fast strategy", async () => {
   const filePath = path.resolve(
     path.dirname(url.fileURLToPath(import.meta.url)),
diff --git a/libs/langchain-community/src/document_loaders/fs/unstructured.ts b/libs/langchain-community/src/document_loaders/fs/unstructured.ts
index f9040b11110a..d069d306c33d 100644
--- a/libs/langchain-community/src/document_loaders/fs/unstructured.ts
+++ b/libs/langchain-community/src/document_loaders/fs/unstructured.ts
@@ -120,6 +120,11 @@ type UnstructuredDirectoryLoaderOptions = UnstructuredLoaderOptions & {
   unknown?: UnknownHandling;
 };
 
+type UnstructuredMemoryLoaderOptions = {
+  buffer: Buffer; // file contents; uploaded as the "files" form part
+  fileName: string; // file name attached to that form part
+};
+
 /**
  * A document loader that uses the Unstructured API to load unstructured
  * documents. It supports both the new syntax with options object and the
@@ -131,6 +136,10 @@ type UnstructuredDirectoryLoaderOptions = UnstructuredLoaderOptions & {
 export class UnstructuredLoader extends BaseDocumentLoader {
   public filePath: string;
 
+  private buffer?: Buffer; // set only when constructed with the in-memory options object
+
+  private fileName?: string; // set alongside buffer; otherwise _partition derives it from filePath
+
   private apiUrl = "https://api.unstructured.io/general/v0/general";
 
   private apiKey?: string;
@@ -167,7 +176,9 @@ export class UnstructuredLoader extends BaseDocumentLoader {
   private maxCharacters?: number;
 
   constructor(
-    filePathOrLegacyApiUrl: string,
+    filePathOrLegacyApiUrlOrMemoryBuffer:
+      | string
+      | UnstructuredMemoryLoaderOptions,
     optionsOrLegacyFilePath: UnstructuredLoaderOptions | string = {}
   ) {
     super();
@@ -175,11 +186,20 @@ export class UnstructuredLoader extends BaseDocumentLoader {
     // Temporary shim to avoid breaking existing users
     // Remove when API keys are enforced by Unstructured and existing code will break anyway
     const isLegacySyntax = typeof optionsOrLegacyFilePath === "string";
-    if (isLegacySyntax) {
+    const isMemorySyntax =
+      typeof filePathOrLegacyApiUrlOrMemoryBuffer === "object";
+
+    if (isMemorySyntax) { // in-memory form; filePath is not assigned on this path
+      this.buffer = filePathOrLegacyApiUrlOrMemoryBuffer.buffer;
+      this.fileName = filePathOrLegacyApiUrlOrMemoryBuffer.fileName;
+    } else if (isLegacySyntax) {
       this.filePath = optionsOrLegacyFilePath;
-      this.apiUrl = filePathOrLegacyApiUrl;
+      this.apiUrl = filePathOrLegacyApiUrlOrMemoryBuffer;
     } else {
-      this.filePath = filePathOrLegacyApiUrl;
+      this.filePath = filePathOrLegacyApiUrlOrMemoryBuffer;
+    }
+
+    if (!isLegacySyntax) { // second argument is an options object (file-path and in-memory forms alike)
       const options = optionsOrLegacyFilePath;
       this.apiKey =
         options.apiKey ?? getEnvironmentVariable("UNSTRUCTURED_API_KEY");
@@ -205,14 +225,20 @@ export class UnstructuredLoader extends BaseDocumentLoader {
   }
 
   async _partition() {
-    const { readFile, basename } = await this.imports();
+    let buffer = this.buffer;
+    let fileName = this.fileName;
+
+    if (!buffer) { // no in-memory buffer supplied: read filePath from disk instead
+      const { readFile, basename } = await this.imports();
 
-    const buffer = await readFile(this.filePath);
-    const fileName = basename(this.filePath);
+      buffer = await readFile(this.filePath);
+      fileName = basename(this.filePath);
+
+      // Known limitation: the whole file is read into memory before upload.
+      // Consuming Documents in a streaming fashion still needs a lot of work
+      // anyway, so this is not a concern for now.
+    }
 
-    // I'm aware this reads the file into memory first, but we have lots of work
-    // to do on then consuming Documents in a streaming fashion anyway, so not
-    // worried about this for now.
     const formData = new FormData();
     formData.append("files", new Blob([buffer]), fileName);
     formData.append("strategy", this.strategy);
diff --git a/libs/langchain-community/src/document_loaders/tests/unstructured.int.test.ts b/libs/langchain-community/src/document_loaders/tests/unstructured.int.test.ts
index e30913e10a2d..b0b0712118a6 100644
--- a/libs/langchain-community/src/document_loaders/tests/unstructured.int.test.ts
+++ b/libs/langchain-community/src/document_loaders/tests/unstructured.int.test.ts
@@ -3,6 +3,7 @@
 
 import * as url from "node:url";
 import * as path from "node:path";
+import { readFile } from "node:fs/promises";
 import { test, expect } from "@jest/globals";
 import {
   UnstructuredDirectoryLoader,
@@ -29,6 +30,34 @@ test.skip("Test Unstructured base loader", async () => {
   }
 });
 
+test.skip("Test Unstructured base loader with buffer", async () => {
+  const filePath = path.resolve(
+    path.dirname(url.fileURLToPath(import.meta.url)),
+    "./example_data/example.txt"
+  );
+
+  const options = {
+    apiKey: process.env.UNSTRUCTURED_API_KEY!,
+  };
+
+  const buffer = await readFile(filePath); // read the fixture here and hand the bytes to the loader
+  const fileName = "example.txt"; // name passed to the loader together with the bytes
+
+  const loader = new UnstructuredLoader(
+    {
+      buffer,
+      fileName,
+    },
+    options
+  );
+  const docs = await loader.load();
+
+  expect(docs.length).toBe(3);
+  for (const doc of docs) {
+    expect(typeof doc.pageContent).toBe("string");
+  }
+});
+
 test.skip("Test Unstructured base loader with fast strategy", async () => {
   const filePath = path.resolve(
     path.dirname(url.fileURLToPath(import.meta.url)),