From 62c5196a0811c4192296c29b7c056071d8f93653 Mon Sep 17 00:00:00 2001 From: bracesproul Date: Fri, 26 Jul 2024 12:19:43 -0700 Subject: [PATCH] unstructured[major]: Unstructured partner package --- libs/langchain-unstructured/.eslintrc.cjs | 74 +++++ libs/langchain-unstructured/.gitignore | 3 + libs/langchain-unstructured/.prettierrc | 19 ++ libs/langchain-unstructured/.release-it.json | 10 + libs/langchain-unstructured/LICENSE | 21 ++ libs/langchain-unstructured/jest.config.cjs | 21 ++ libs/langchain-unstructured/jest.env.cjs | 12 + .../langchain.config.js | 22 ++ libs/langchain-unstructured/package.json | 79 +++++ .../scripts/jest-setup-after-env.js | 9 + .../src/document_loaders.ts | 304 ++++++++++++++++++ libs/langchain-unstructured/src/index.ts | 1 + .../src/tests/document_loaders.int.test.ts | 5 + .../src/tests/document_loaders.test.ts | 53 +++ libs/langchain-unstructured/tsconfig.cjs.json | 8 + libs/langchain-unstructured/tsconfig.json | 23 ++ libs/langchain-unstructured/turbo.json | 11 + yarn.lock | 96 +++++- 18 files changed, 763 insertions(+), 8 deletions(-) create mode 100644 libs/langchain-unstructured/.eslintrc.cjs create mode 100644 libs/langchain-unstructured/.gitignore create mode 100644 libs/langchain-unstructured/.prettierrc create mode 100644 libs/langchain-unstructured/.release-it.json create mode 100644 libs/langchain-unstructured/LICENSE create mode 100644 libs/langchain-unstructured/jest.config.cjs create mode 100644 libs/langchain-unstructured/jest.env.cjs create mode 100644 libs/langchain-unstructured/langchain.config.js create mode 100644 libs/langchain-unstructured/package.json create mode 100644 libs/langchain-unstructured/scripts/jest-setup-after-env.js create mode 100644 libs/langchain-unstructured/src/document_loaders.ts create mode 100644 libs/langchain-unstructured/src/index.ts create mode 100644 libs/langchain-unstructured/src/tests/document_loaders.int.test.ts create mode 100644 
libs/langchain-unstructured/src/tests/document_loaders.test.ts create mode 100644 libs/langchain-unstructured/tsconfig.cjs.json create mode 100644 libs/langchain-unstructured/tsconfig.json create mode 100644 libs/langchain-unstructured/turbo.json diff --git a/libs/langchain-unstructured/.eslintrc.cjs b/libs/langchain-unstructured/.eslintrc.cjs new file mode 100644 index 000000000000..e3033ac0160c --- /dev/null +++ b/libs/langchain-unstructured/.eslintrc.cjs @@ -0,0 +1,74 @@ +module.exports = { + extends: [ + "airbnb-base", + "eslint:recommended", + "prettier", + "plugin:@typescript-eslint/recommended", + ], + parserOptions: { + ecmaVersion: 12, + parser: "@typescript-eslint/parser", + project: "./tsconfig.json", + sourceType: "module", + }, + plugins: ["@typescript-eslint", "no-instanceof"], + ignorePatterns: [ + ".eslintrc.cjs", + "scripts", + "node_modules", + "dist", + "dist-cjs", + "*.js", + "*.cjs", + "*.d.ts", + ], + rules: { + "no-process-env": 2, + "no-instanceof/no-instanceof": 2, + "@typescript-eslint/explicit-module-boundary-types": 0, + "@typescript-eslint/no-empty-function": 0, + "@typescript-eslint/no-shadow": 0, + "@typescript-eslint/no-empty-interface": 0, + "@typescript-eslint/no-use-before-define": ["error", "nofunc"], + "@typescript-eslint/no-unused-vars": ["warn", { args: "none" }], + "@typescript-eslint/no-floating-promises": "error", + "@typescript-eslint/no-misused-promises": "error", + camelcase: 0, + "class-methods-use-this": 0, + "import/extensions": [2, "ignorePackages"], + "import/no-extraneous-dependencies": [ + "error", + { devDependencies: ["**/*.test.ts"] }, + ], + "import/no-unresolved": 0, + "import/prefer-default-export": 0, + "keyword-spacing": "error", + "max-classes-per-file": 0, + "max-len": 0, + "no-await-in-loop": 0, + "no-bitwise": 0, + "no-console": 0, + "no-restricted-syntax": 0, + "no-shadow": 0, + "no-continue": 0, + "no-void": 0, + "no-underscore-dangle": 0, + "no-use-before-define": 0, + "no-useless-constructor": 0, + 
"no-return-await": 0, + "consistent-return": 0, + "no-else-return": 0, + "func-names": 0, + "no-lonely-if": 0, + "prefer-rest-params": 0, + "new-cap": ["error", { properties: false, capIsNew: false }], + }, + overrides: [ + { + files: ["**/*.test.ts"], + rules: { + "@typescript-eslint/no-unused-vars": "off", + }, + }, + ], +}; diff --git a/libs/langchain-unstructured/.gitignore b/libs/langchain-unstructured/.gitignore new file mode 100644 index 000000000000..ae701c7fe705 --- /dev/null +++ b/libs/langchain-unstructured/.gitignore @@ -0,0 +1,3 @@ +node_modules +dist +.yarn diff --git a/libs/langchain-unstructured/.prettierrc b/libs/langchain-unstructured/.prettierrc new file mode 100644 index 000000000000..ba08ff04f677 --- /dev/null +++ b/libs/langchain-unstructured/.prettierrc @@ -0,0 +1,19 @@ +{ + "$schema": "https://json.schemastore.org/prettierrc", + "printWidth": 80, + "tabWidth": 2, + "useTabs": false, + "semi": true, + "singleQuote": false, + "quoteProps": "as-needed", + "jsxSingleQuote": false, + "trailingComma": "es5", + "bracketSpacing": true, + "arrowParens": "always", + "requirePragma": false, + "insertPragma": false, + "proseWrap": "preserve", + "htmlWhitespaceSensitivity": "css", + "vueIndentScriptAndStyle": false, + "endOfLine": "lf" +} diff --git a/libs/langchain-unstructured/.release-it.json b/libs/langchain-unstructured/.release-it.json new file mode 100644 index 000000000000..522ee6abf705 --- /dev/null +++ b/libs/langchain-unstructured/.release-it.json @@ -0,0 +1,10 @@ +{ + "github": { + "release": true, + "autoGenerate": true, + "tokenRef": "GITHUB_TOKEN_RELEASE" + }, + "npm": { + "versionArgs": ["--workspaces-update=false"] + } +} diff --git a/libs/langchain-unstructured/LICENSE b/libs/langchain-unstructured/LICENSE new file mode 100644 index 000000000000..8cd8f501eb49 --- /dev/null +++ b/libs/langchain-unstructured/LICENSE @@ -0,0 +1,21 @@ +The MIT License + +Copyright (c) 2023 LangChain + +Permission is hereby granted, free of charge, to any 
person obtaining a copy +of this software and associated documentation files (the "Software"), to deal +in the Software without restriction, including without limitation the rights +to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +copies of the Software, and to permit persons to whom the Software is +furnished to do so, subject to the following conditions: + +The above copyright notice and this permission notice shall be included in +all copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN +THE SOFTWARE. \ No newline at end of file diff --git a/libs/langchain-unstructured/jest.config.cjs b/libs/langchain-unstructured/jest.config.cjs new file mode 100644 index 000000000000..994826496bc5 --- /dev/null +++ b/libs/langchain-unstructured/jest.config.cjs @@ -0,0 +1,21 @@ +/** @type {import('ts-jest').JestConfigWithTsJest} */ +module.exports = { + preset: "ts-jest/presets/default-esm", + testEnvironment: "./jest.env.cjs", + modulePathIgnorePatterns: ["dist/", "docs/"], + moduleNameMapper: { + "^(\\.{1,2}/.*)\\.js$": "$1", + }, + transform: { + "^.+\\.tsx?$": ["@swc/jest"], + }, + transformIgnorePatterns: [ + "/node_modules/", + "\\.pnp\\.[^\\/]+$", + "./scripts/jest-setup-after-env.js", + ], + setupFiles: ["dotenv/config"], + testTimeout: 20_000, + passWithNoTests: true, + collectCoverageFrom: ["src/**/*.ts"], +}; diff --git a/libs/langchain-unstructured/jest.env.cjs b/libs/langchain-unstructured/jest.env.cjs new file mode 100644 index 000000000000..2ccedccb8672 --- /dev/null +++ 
b/libs/langchain-unstructured/jest.env.cjs @@ -0,0 +1,12 @@ +const { TestEnvironment } = require("jest-environment-node"); + +class AdjustedTestEnvironmentToSupportFloat32Array extends TestEnvironment { + constructor(config, context) { + // Make `instanceof Float32Array` return true in tests + // to avoid https://github.com/xenova/transformers.js/issues/57 and https://github.com/jestjs/jest/issues/2549 + super(config, context); + this.global.Float32Array = Float32Array; + } +} + +module.exports = AdjustedTestEnvironmentToSupportFloat32Array; diff --git a/libs/langchain-unstructured/langchain.config.js b/libs/langchain-unstructured/langchain.config.js new file mode 100644 index 000000000000..46b1a2b31264 --- /dev/null +++ b/libs/langchain-unstructured/langchain.config.js @@ -0,0 +1,22 @@ +import { resolve, dirname } from "node:path"; +import { fileURLToPath } from "node:url"; + +/** + * @param {string} relativePath + * @returns {string} + */ +function abs(relativePath) { + return resolve(dirname(fileURLToPath(import.meta.url)), relativePath); +} + +export const config = { + internals: [/node\:/, /@langchain\/core\//], + entrypoints: { + index: "index", + }, + requiresOptionalDependency: [], + tsConfigPath: resolve("./tsconfig.json"), + cjsSource: "./dist-cjs", + cjsDestination: "./dist", + abs, +}; diff --git a/libs/langchain-unstructured/package.json b/libs/langchain-unstructured/package.json new file mode 100644 index 000000000000..b885114d30c7 --- /dev/null +++ b/libs/langchain-unstructured/package.json @@ -0,0 +1,79 @@ +{ + "name": "@langchain/unstructured", + "version": "0.0.0", + "description": "Sample integration for LangChain.js", + "type": "module", + "engines": { + "node": ">=18" + }, + "main": "./index.js", + "types": "./index.d.ts", + "repository": { + "type": "git", + "url": "git@github.com:langchain-ai/langchainjs.git" + }, + "homepage": "https://github.com/langchain-ai/langchainjs/tree/main/libs/langchain-unstructured/", + "scripts": { + "build": 
"yarn turbo:command build:internal --filter=@langchain/unstructured", + "build:internal": "yarn lc-build:v2 --create-entrypoints --pre --tree-shaking", + "lint:eslint": "NODE_OPTIONS=--max-old-space-size=4096 eslint --cache --ext .ts,.js src/", + "lint:dpdm": "dpdm --exit-code circular:1 --no-warning --no-tree src/*.ts src/**/*.ts", + "lint": "yarn lint:eslint && yarn lint:dpdm", + "lint:fix": "yarn lint:eslint --fix && yarn lint:dpdm", + "clean": "rm -rf .turbo dist/", + "prepack": "yarn build", + "test": "NODE_OPTIONS=--experimental-vm-modules jest --testPathIgnorePatterns=\\.int\\.test.ts --testTimeout 30000 --maxWorkers=50%", + "test:watch": "NODE_OPTIONS=--experimental-vm-modules jest --watch --testPathIgnorePatterns=\\.int\\.test.ts", + "test:single": "NODE_OPTIONS=--experimental-vm-modules yarn run jest --config jest.config.cjs --testTimeout 100000", + "test:int": "NODE_OPTIONS=--experimental-vm-modules jest --testPathPattern=\\.int\\.test.ts --testTimeout 100000 --maxWorkers=50%", + "format": "prettier --config .prettierrc --write \"src\"", + "format:check": "prettier --config .prettierrc --check \"src\"" + }, + "author": "LangChain", + "license": "MIT", + "dependencies": { + "@langchain/core": ">0.1.0 <0.3.0", + "unstructured-client": "^0.13.0" + }, + "devDependencies": { + "@jest/globals": "^29.5.0", + "@langchain/scripts": "~0.0.14", + "@swc/core": "^1.3.90", + "@swc/jest": "^0.2.29", + "@tsconfig/recommended": "^1.0.3", + "@typescript-eslint/eslint-plugin": "^6.12.0", + "@typescript-eslint/parser": "^6.12.0", + "dotenv": "^16.3.1", + "dpdm": "^3.12.0", + "eslint": "^8.33.0", + "eslint-config-airbnb-base": "^15.0.0", + "eslint-config-prettier": "^8.6.0", + "eslint-plugin-import": "^2.27.5", + "eslint-plugin-no-instanceof": "^1.0.1", + "eslint-plugin-prettier": "^4.2.1", + "jest": "^29.5.0", + "jest-environment-node": "^29.6.4", + "prettier": "^2.8.3", + "release-it": "^15.10.1", + "rollup": "^4.5.2", + "ts-jest": "^29.1.0", + "typescript": "<5.2.0" + }, 
+ "publishConfig": { + "access": "public" + }, + "exports": { + ".": { + "types": "./index.d.ts", + "import": "./index.js", + "require": "./index.cjs" + }, + "./package.json": "./package.json" + }, + "files": [ + "dist/", + "index.cjs", + "index.js", + "index.d.ts" + ] +} diff --git a/libs/langchain-unstructured/scripts/jest-setup-after-env.js b/libs/langchain-unstructured/scripts/jest-setup-after-env.js new file mode 100644 index 000000000000..7323083d0ea5 --- /dev/null +++ b/libs/langchain-unstructured/scripts/jest-setup-after-env.js @@ -0,0 +1,9 @@ +import { awaitAllCallbacks } from "@langchain/core/callbacks/promises"; +import { afterAll, jest } from "@jest/globals"; + +afterAll(awaitAllCallbacks); + +// Allow console.log to be disabled in tests +if (process.env.DISABLE_CONSOLE_LOGS === "true") { + console.log = jest.fn(); +} diff --git a/libs/langchain-unstructured/src/document_loaders.ts b/libs/langchain-unstructured/src/document_loaders.ts new file mode 100644 index 000000000000..8f684cefbb9a --- /dev/null +++ b/libs/langchain-unstructured/src/document_loaders.ts @@ -0,0 +1,304 @@ +import { SDKOptions, UnstructuredClient } from "unstructured-client"; +import * as fs from "node:fs"; +import * as path from "node:path"; +import { BaseDocumentLoader } from "@langchain/core/document_loaders/base"; +import { getEnvironmentVariable } from "@langchain/core/utils/env"; +import { Document, DocumentInterface } from "@langchain/core/documents"; +import { + PartitionParameters, + Strategy as StrategyEnum, +} from "unstructured-client/sdk/models/shared"; + +/** + * The strategy to use for partitioning PDF/image. + * Options are: + * - "fast" + * - "hi_res" + * - "auto" + * - "ocr_only" + * @default "auto" + */ +export type UnstructuredLoaderStrategy = + | "fast" + | "hi_res" + | "auto" + | "ocr_only"; + +/** + * Options for the UnstructuredMemoryLoader. + */ +export type UnstructuredMemoryLoaderOptions = + | { + /** + * The buffer containing the file content. 
+ */ + buffer: Buffer; + /** + * The name of the file when using a buffer. + */ + filePath: string; + } + | { + /** + * The path or list of paths to the file(s). + */ + filePath: string | string[]; + buffer?: never; + }; + +export interface UnstructuredLoaderOptions + extends SDKOptions, + Omit { + partitionViaApi?: boolean; + postProcessors?: ((str: string) => string)[]; + // SDK parameters + apiKey?: string; + client?: UnstructuredClient; + strategy?: UnstructuredLoaderStrategy; +} + +/** + * Represents an element returned by the Unstructured API. It has + * properties for the element type, text content, and metadata. + */ +type Element = { + type: string; + text: string; + // this is purposefully loosely typed + metadata: { + [key: string]: unknown; + }; +}; + +const _DEFAULT_URL = "https://api.unstructuredapp.io/general/v0/general"; + +// eslint-disable-next-line @typescript-eslint/no-explicit-any +export type UnstructuredDocumentMetadata = Record & { + category: string; +}; + +/** + * Unstructured document loader interface. + * + * Partition and load files using either the `unstructured-client` sdk and the + * Unstructured API or locally using the `unstructured` library. + * + * API: + * This package is configured to work with the Unstructured API by default. + * To use the Unstructured API, set + * `partitionViaApi: true` and define `apiKey`. If you are running the unstructured + * API locally, you can change the API URL by defining `serverURL` when you initialize the + * loader. The hosted Unstructured API requires an API key. See the links below to + * learn more about our API offerings and get an API key. + * + * Local: + * To partition files locally, you must have the `unstructured` package installed. + * You can install it with `pip install unstructured`. + * By default the file loader uses the Unstructured `partition` function and will + * automatically detect the file type. 
+ * + * In addition to document specific partition parameters, Unstructured has a rich set + * of "chunking" parameters for post-processing elements into more useful text segments + * for use cases such as Retrieval Augmented Generation (RAG). You can pass additional + * Unstructured partition parameters to the loader to configure different unstructured settings. + * + * Setup: + * Install the package: + * ```bash + * npm install @langchain/unstructured + * ``` + * Set the API key in your environment: + * ```bash + * export UNSTRUCTURED_API_KEY="your-api-key" + * ``` + * + * Instantiate: + * ```typescript + * import { UnstructuredLoader } from "@langchain/unstructured"; + * + * const loader = new UnstructuredLoader( + * { filePath: ["example.pdf", "fake.pdf"] }, + * { + * apiKey: process.env.UNSTRUCTURED_API_KEY, + * partitionViaApi: true, chunkingStrategy: "by_title", strategy: "fast", + * } + * ); + * ``` + * + * Load: + * ```typescript + * const docs = await loader.load(); + * + * console.log(docs[0].pageContent.slice(0, 100)); + * console.log(docs[0].metadata); + * ``` + * + * References + * ---------- + * https://docs.unstructured.io/api-reference/api-services/sdk + * https://docs.unstructured.io/api-reference/api-services/overview + * https://docs.unstructured.io/open-source/core-functionality/partitioning + * https://docs.unstructured.io/open-source/core-functionality/chunking + */ +export class UnstructuredLoader< + Metadata extends UnstructuredDocumentMetadata = UnstructuredDocumentMetadata +> extends BaseDocumentLoader { + client: UnstructuredClient; + + filePath?: string | string[]; + + buffer?: Buffer; + + partitionViaApi?: boolean; + + postProcessors?: ((str: string) => string)[]; + + strategy: UnstructuredLoaderStrategy = "auto"; + + unstructuredFields?: Omit; + + constructor( + fileOrBuffer: UnstructuredMemoryLoaderOptions, + fields?: UnstructuredLoaderOptions + ) { + super(); + const { + partitionViaApi, + postProcessors, + apiKey, + client, + strategy, + security, + 
httpClient, + server, + serverURL, + retryConfig, + timeoutMs, + ...unstructuredFields + } = { ...fields }; + + if (fileOrBuffer.filePath && fileOrBuffer.buffer) { + throw new Error( + "`filePath` and `buffer` cannot be defined simultaneously." + ); + } else if (!fileOrBuffer.filePath && !fileOrBuffer.buffer) { + throw new Error("Either `filePath` or `buffer` must be defined."); + } + + if (client) { + const disallowedParams: [string, unknown][] = [ + ["apiKey", apiKey], + ["serverURL", serverURL], + ]; + const badParams = disallowedParams + .filter(([_, value]) => value !== undefined) + .map(([param]) => param); + + if (badParams.length > 0) { + throw new Error( + `If you are passing a custom 'client', you cannot also pass these params: ${badParams.join( + ", " + )}.` + ); + } + this.client = client; + } else { + const unstructuredApiKey = + apiKey || getEnvironmentVariable("UNSTRUCTURED_API_KEY"); + const unstructuredUrl = + serverURL || getEnvironmentVariable("UNSTRUCTURED_URL") || _DEFAULT_URL; + + this.client = new UnstructuredClient({ + security: unstructuredApiKey + ? 
{ apiKeyAuth: unstructuredApiKey } + : security, + serverURL: unstructuredUrl, + retryConfig, + timeoutMs, + httpClient, + server, + }); + } + + this.filePath = fileOrBuffer?.filePath; + this.buffer = fileOrBuffer?.buffer; + this.partitionViaApi = partitionViaApi; + this.postProcessors = postProcessors; + this.strategy = strategy || this.strategy; + this.unstructuredFields = unstructuredFields; + } + + mapStrategyToEnum(): StrategyEnum { + switch (this.strategy) { + case "fast": + return StrategyEnum.Fast; + case "hi_res": + return StrategyEnum.HiRes; + case "ocr_only": + return StrategyEnum.OcrOnly; + case "auto": + default: + return StrategyEnum.Auto; + } + } + + async _partition(filePath: string): Promise { + let { buffer } = this; + const fileName = path.basename(filePath); + + if (!buffer) { + // Buffer is false, we must read the file + buffer = await fs.promises.readFile(filePath); + } + + const res = await this.client.general.partition({ + partitionParameters: { + ...this.unstructuredFields, + files: { + content: new Uint8Array(buffer), + fileName, + }, + strategy: this.mapStrategyToEnum(), + }, + }); + + if (!res.elements || res.elements.length === 0) { + throw new Error("No elements were returned from the Unstructured API."); + } + + return res.elements.filter( + (el) => "text" in el && typeof el.text === "string" + ) as Element[]; + } + + async load(): Promise[]> { + let elements: Element[]; + + if (Array.isArray(this.filePath)) { + // Handle multiple files. Call through an arrow function so `_partition` keeps its `this` binding (a bare method reference would run with `this` undefined). + elements = (await Promise.all(this.filePath.map((filePath) => this._partition(filePath)))).flat(); + } else if (this.filePath) { + elements = await this._partition(this.filePath); + } else { + throw new Error("filePath must be defined."); + } + + const documents: DocumentInterface[] = []; + for (const element of elements) { + const { metadata, text } = element; + if (typeof text === "string") { + documents.push( + new Document({ + pageContent: text, + metadata: { + ...metadata, + category: element.type, + } as Metadata, + 
}) + ); + } + } + + return documents; + } +} diff --git a/libs/langchain-unstructured/src/index.ts b/libs/langchain-unstructured/src/index.ts new file mode 100644 index 000000000000..ca8d30ea8cf5 --- /dev/null +++ b/libs/langchain-unstructured/src/index.ts @@ -0,0 +1 @@ +export * from "./document_loaders.js"; diff --git a/libs/langchain-unstructured/src/tests/document_loaders.int.test.ts b/libs/langchain-unstructured/src/tests/document_loaders.int.test.ts new file mode 100644 index 000000000000..7fce4ce53302 --- /dev/null +++ b/libs/langchain-unstructured/src/tests/document_loaders.int.test.ts @@ -0,0 +1,5 @@ +import { test } from "@jest/globals"; + +test("Test chat model", async () => { + // Your integration test here +}); diff --git a/libs/langchain-unstructured/src/tests/document_loaders.test.ts b/libs/langchain-unstructured/src/tests/document_loaders.test.ts new file mode 100644 index 000000000000..a522bfcb6686 --- /dev/null +++ b/libs/langchain-unstructured/src/tests/document_loaders.test.ts @@ -0,0 +1,53 @@ +import { test, expect } from "@jest/globals"; +import { UnstructuredClient } from "unstructured-client"; +import { UnstructuredLoader } from "../document_loaders.js"; + +test("Can not pass apiKey or serverURL if passing a custom client", () => { + const customClient = new UnstructuredClient(); + + expect(() => { + const loader = new UnstructuredLoader( + { + filePath: "filePath", + }, + { + client: customClient, + apiKey: "apiKey", + serverURL: "serverURL", + } + ); + if (loader) { + // Loader should never be true, since it should throw an error. + } + }).toThrowError(); + + expect(() => { + const loader = new UnstructuredLoader( + { + filePath: "filePath", + }, + { + client: customClient, + serverURL: "serverURL", + } + ); + if (loader) { + // Loader should never be true, since it should throw an error. 
+ } + }).toThrowError(); + + expect(() => { + const loader = new UnstructuredLoader( + { + filePath: "filePath", + }, + { + client: customClient, + apiKey: "apiKey", + } + ); + if (loader) { + // Loader should never be true, since it should throw an error. + } + }).toThrowError(); +}); diff --git a/libs/langchain-unstructured/tsconfig.cjs.json b/libs/langchain-unstructured/tsconfig.cjs.json new file mode 100644 index 000000000000..3b7026ea406c --- /dev/null +++ b/libs/langchain-unstructured/tsconfig.cjs.json @@ -0,0 +1,8 @@ +{ + "extends": "./tsconfig.json", + "compilerOptions": { + "module": "commonjs", + "declaration": false + }, + "exclude": ["node_modules", "dist", "docs", "**/tests"] +} diff --git a/libs/langchain-unstructured/tsconfig.json b/libs/langchain-unstructured/tsconfig.json new file mode 100644 index 000000000000..bc85d83b6229 --- /dev/null +++ b/libs/langchain-unstructured/tsconfig.json @@ -0,0 +1,23 @@ +{ + "extends": "@tsconfig/recommended", + "compilerOptions": { + "outDir": "../dist", + "rootDir": "./src", + "target": "ES2021", + "lib": ["ES2021", "ES2022.Object", "DOM"], + "module": "ES2020", + "moduleResolution": "nodenext", + "esModuleInterop": true, + "declaration": true, + "noImplicitReturns": true, + "noFallthroughCasesInSwitch": true, + "noUnusedLocals": true, + "noUnusedParameters": true, + "useDefineForClassFields": true, + "strictPropertyInitialization": false, + "allowJs": true, + "strict": true + }, + "include": ["src/**/*"], + "exclude": ["node_modules", "dist", "docs"] +} diff --git a/libs/langchain-unstructured/turbo.json b/libs/langchain-unstructured/turbo.json new file mode 100644 index 000000000000..d024cee15c81 --- /dev/null +++ b/libs/langchain-unstructured/turbo.json @@ -0,0 +1,11 @@ +{ + "extends": ["//"], + "pipeline": { + "build": { + "outputs": ["**/dist/**"] + }, + "build:internal": { + "dependsOn": ["^build:internal"] + } + } +} diff --git a/yarn.lock b/yarn.lock index 57851482b66f..7eec1706ebf1 100644 --- a/yarn.lock 
+++ b/yarn.lock @@ -12315,7 +12315,7 @@ __metadata: languageName: unknown linkType: soft -"@langchain/scripts@workspace:*, @langchain/scripts@workspace:libs/langchain-scripts, @langchain/scripts@~0.0.20": +"@langchain/scripts@workspace:*, @langchain/scripts@workspace:libs/langchain-scripts, @langchain/scripts@~0.0.14, @langchain/scripts@~0.0.20": version: 0.0.0-use.local resolution: "@langchain/scripts@workspace:libs/langchain-scripts" dependencies: @@ -12416,6 +12416,37 @@ __metadata: languageName: unknown linkType: soft +"@langchain/unstructured@workspace:libs/langchain-unstructured": + version: 0.0.0-use.local + resolution: "@langchain/unstructured@workspace:libs/langchain-unstructured" + dependencies: + "@jest/globals": ^29.5.0 + "@langchain/core": ">0.1.0 <0.3.0" + "@langchain/scripts": ~0.0.14 + "@swc/core": ^1.3.90 + "@swc/jest": ^0.2.29 + "@tsconfig/recommended": ^1.0.3 + "@typescript-eslint/eslint-plugin": ^6.12.0 + "@typescript-eslint/parser": ^6.12.0 + dotenv: ^16.3.1 + dpdm: ^3.12.0 + eslint: ^8.33.0 + eslint-config-airbnb-base: ^15.0.0 + eslint-config-prettier: ^8.6.0 + eslint-plugin-import: ^2.27.5 + eslint-plugin-no-instanceof: ^1.0.1 + eslint-plugin-prettier: ^4.2.1 + jest: ^29.5.0 + jest-environment-node: ^29.6.4 + prettier: ^2.8.3 + release-it: ^15.10.1 + rollup: ^4.5.2 + ts-jest: ^29.1.0 + typescript: <5.2.0 + unstructured-client: ^0.13.0 + languageName: unknown + linkType: soft + "@langchain/weaviate@workspace:*, @langchain/weaviate@workspace:libs/langchain-weaviate": version: 0.0.0-use.local resolution: "@langchain/weaviate@workspace:libs/langchain-weaviate" @@ -13333,6 +13364,24 @@ __metadata: languageName: node linkType: hard +"@pdf-lib/standard-fonts@npm:^1.0.0": + version: 1.0.0 + resolution: "@pdf-lib/standard-fonts@npm:1.0.0" + dependencies: + pako: ^1.0.6 + checksum: 7dc629b83862424a64b10c7ae34d789e0045a1a589f34a66a7f8e197f177cdb410969424e5d90f67b35c848db8b045cfa0a664941bdfb2d9b5413dbf44232981 + languageName: node + linkType: hard + 
+"@pdf-lib/upng@npm:^1.0.1": + version: 1.0.1 + resolution: "@pdf-lib/upng@npm:1.0.1" + dependencies: + pako: ^1.0.10 + checksum: acd8ac0974a3c2ed12c4e21d6340c4f77f8dde6727a74075b2faf69fb9dc4051b9e576479caf8e870f67d1bb37b953dfe50c4784892b466f01a29b55272d5e1f + languageName: node + linkType: hard + "@petamoriken/float16@npm:^3.8.6": version: 3.8.7 resolution: "@petamoriken/float16@npm:3.8.7" @@ -20609,6 +20658,13 @@ __metadata: languageName: node linkType: hard +"async@npm:^3.2.5": + version: 3.2.5 + resolution: "async@npm:3.2.5" + checksum: 5ec77f1312301dee02d62140a6b1f7ee0edd2a0f983b6fd2b0849b969f245225b990b47b8243e7b9ad16451a53e7f68e753700385b706198ced888beedba3af4 + languageName: node + linkType: hard + "asynciterator.prototype@npm:^1.0.0": version: 1.0.0 resolution: "asynciterator.prototype@npm:1.0.0" @@ -34555,6 +34611,13 @@ __metadata: languageName: node linkType: hard +"pako@npm:^1.0.10, pako@npm:^1.0.11, pako@npm:^1.0.6, pako@npm:~1.0.2, pako@npm:~1.0.5": + version: 1.0.11 + resolution: "pako@npm:1.0.11" + checksum: 1be2bfa1f807608c7538afa15d6f25baa523c30ec870a3228a89579e474a4d992f4293859524e46d5d87fd30fa17c5edf34dbef0671251d9749820b488660b16 + languageName: node + linkType: hard + "pako@npm:~0.2.0": version: 0.2.9 resolution: "pako@npm:0.2.9" @@ -34562,13 +34625,6 @@ __metadata: languageName: node linkType: hard -"pako@npm:~1.0.2, pako@npm:~1.0.5": - version: 1.0.11 - resolution: "pako@npm:1.0.11" - checksum: 1be2bfa1f807608c7538afa15d6f25baa523c30ec870a3228a89579e474a4d992f4293859524e46d5d87fd30fa17c5edf34dbef0671251d9749820b488660b16 - languageName: node - linkType: hard - "param-case@npm:^3.0.4": version: 3.0.4 resolution: "param-case@npm:3.0.4" @@ -34824,6 +34880,18 @@ __metadata: languageName: node linkType: hard +"pdf-lib@npm:^1.17.1": + version: 1.17.1 + resolution: "pdf-lib@npm:1.17.1" + dependencies: + "@pdf-lib/standard-fonts": ^1.0.0 + "@pdf-lib/upng": ^1.0.1 + pako: ^1.0.11 + tslib: ^1.11.1 + checksum: 
0dae766f23de60ff071368073990cca0d30fb5d104c50a17fee00f0659a491f66e45ce80b1bbfe254e6915a5bc9079f42501dfff2e37f8f76a8807d3e321b19a + languageName: node + linkType: hard + "pdf-parse@npm:1.1.1, pdf-parse@npm:^1.1.1": version: 1.1.1 resolution: "pdf-parse@npm:1.1.1" @@ -40614,6 +40682,18 @@ __metadata: languageName: node linkType: hard +"unstructured-client@npm:^0.13.0": + version: 0.13.0 + resolution: "unstructured-client@npm:0.13.0" + dependencies: + async: ^3.2.5 + pdf-lib: ^1.17.1 + peerDependencies: + zod: ">= 3" + checksum: d116bdf0e68fd96c77019512b6822015e2fdc3a081bb5649a8d184e0cc0973e22b66a2ab9cc7f1e8add337937bd481a7156c88ac3cf283376db2a905d8466912 + languageName: node + linkType: hard + "untildify@npm:^4.0.0": version: 4.0.0 resolution: "untildify@npm:4.0.0"