From 804a7db0c806404db3d3088150de73c16f4141a8 Mon Sep 17 00:00:00 2001 From: Minjun Kim Date: Sat, 23 Nov 2024 20:31:37 -0500 Subject: [PATCH] Added docs and reformatted the code --- .../document_loaders/web_loaders/oracleai.mdx | 73 ++ examples/src/document_loaders/oracleai.ts | 40 + libs/langchain-community/.gitignore | 4 + libs/langchain-community/langchain.config.js | 2 + libs/langchain-community/package.json | 21 +- .../tests/example_data/oracleai/example.html | 31 +- .../document_loaders/tests/oracleai.test.ts | 707 ++++++++---------- .../document_loaders/tests/oracleaiDB.test.js | 26 - .../src/document_loaders/web/oracleai.ts | 486 ++++++------ package.json | 8 +- yarn.lock | 104 +-- 11 files changed, 766 insertions(+), 736 deletions(-) create mode 100644 docs/core_docs/docs/integrations/document_loaders/web_loaders/oracleai.mdx create mode 100644 examples/src/document_loaders/oracleai.ts delete mode 100644 libs/langchain-community/src/document_loaders/tests/oracleaiDB.test.js diff --git a/docs/core_docs/docs/integrations/document_loaders/web_loaders/oracleai.mdx b/docs/core_docs/docs/integrations/document_loaders/web_loaders/oracleai.mdx new file mode 100644 index 000000000000..c7ef4769bd75 --- /dev/null +++ b/docs/core_docs/docs/integrations/document_loaders/web_loaders/oracleai.mdx @@ -0,0 +1,73 @@ +--- +hide_table_of_contents: true +--- + +# Oracle AI + +This example goes over how to load documents using Oracle AI Vector Search. + +## Setup + +You'll need to install the [oracledb](https://www.npmjs.com/package/oracledb) package: + +```bash npm2yarn +npm install @langchain/community @langchain/core oracledb +``` + +## Usage + +### Connect to Oracle Database +You'll need to provide the username, password, hostname and service_name: + +```typescript +import oracledb from 'oracledb'; + +let connection: oracledb.Connection; + +// Replace the placeholders with your information +const username = ""; +const password = ""; +const dsn = "/"; + +try { + connection = await oracledb.getConnection({ + user: username, + password:password, + connectString: dsn + }); + console.log("Connection Successful"); +} catch (err) { + console.error('Connection failed:', err); + throw err; +} +``` + +### Load Documents +As for loading documents, you have 3 options: +- Loading a local file. +- Loading from a local directory. +- Loading from the Oracle Database. + +When loading from the Oracle Database, you must provide the table's name, owner's name, and the name of the column to load. Optionally, you can provide extra column names to be included in the returned documents' metadata: + +```typescript +import { OracleDocLoader, OracleLoadFromType } from "@langchain/community/document_loaders/web/oracleai"; + +/* +// Loading a local file (replace with the path of the file you want to load.) +const loader = new OracleDocLoader(connection, , OracleLoadFromType.FILE); + + +// Loading from a local directory (replace with the path of the directory you want to load from.) +const loader = new OracleDocLoader(connection, , OracleLoadFromType.DIR); +*/ + +// Loading from Oracle Database table (replace the placeholders with your information, optionally add a [metadata_cols] parameter to include columns as metadata.) 
+const loader = new OracleDocLoader(connection, , OracleLoadFromType.TABLE, , ); + +// Load the docs +const docs = loader.load(); +console.log("Number of docs loaded:", docs.length); +console.log("Document-0:", docs[0].page_content); // content +``` + diff --git a/examples/src/document_loaders/oracleai.ts b/examples/src/document_loaders/oracleai.ts new file mode 100644 index 000000000000..c80fc818cdcf --- /dev/null +++ b/examples/src/document_loaders/oracleai.ts @@ -0,0 +1,40 @@ +import oracledb from 'oracledb'; +import { OracleDocLoader, OracleLoadFromType } from "@langchain/community/document_loaders/web/oracleai"; + +let connection: oracledb.Connection; + +// Replace the placeholders with your information +const username = ""; +const pwd = ""; +const dsn = "/"; + +try { + connection = await oracledb.getConnection({ + user: username, + password: pwd, + connectString: dsn + }); + console.log("Connection Successful"); +} catch (err) { + console.error('Connection failed:', err); + throw err; +} + +// Loading a local file (replace with the path of the file you want to load.) +const loader = new OracleDocLoader(connection, "src/document_loaders/example_data/bitcoin.pdf", OracleLoadFromType.FILE); + +/* +// Loading from a local directory (replace with the path of the directory you want to load from.) +const loader = new OracleDocLoader(connection, , OracleLoadFromType.DIR); + + +// Loading from Oracle Database table (replace the placeholders with your information, optionally add a [metadata_cols] parameter to include columns as metadata.) +const loader = new OracleDocLoader(connection, , OracleLoadFromType.TABLE, , ); +*/ + +// Load the docs +const docs = loader.load(); +console.log("Number of docs loaded:", docs.length); +console.log("Document-0:", docs[0].page_content); // content + + diff --git a/libs/langchain-community/.gitignore b/libs/langchain-community/.gitignore index e6ae5fa54a4f..79f805faafbd 100644 --- a/libs/langchain-community/.gitignore +++ b/libs/langchain-community/.gitignore @@ -922,6 +922,10 @@ document_loaders/web/notionapi.cjs document_loaders/web/notionapi.js document_loaders/web/notionapi.d.ts document_loaders/web/notionapi.d.cts +document_loaders/web/oracleai.cjs +document_loaders/web/oracleai.js +document_loaders/web/oracleai.d.ts +document_loaders/web/oracleai.d.cts document_loaders/web/pdf.cjs document_loaders/web/pdf.js document_loaders/web/pdf.d.ts diff --git a/libs/langchain-community/langchain.config.js b/libs/langchain-community/langchain.config.js index 4a402c6941e8..c2add9919f5e 100644 --- a/libs/langchain-community/langchain.config.js +++ b/libs/langchain-community/langchain.config.js @@ -286,6 +286,7 @@ export const config = { "document_loaders/web/github": "document_loaders/web/github", "document_loaders/web/taskade": "document_loaders/web/taskade", "document_loaders/web/notionapi": "document_loaders/web/notionapi", + "document_loaders/web/oracleai": "document_loaders/web/oracleai", "document_loaders/web/pdf": "document_loaders/web/pdf", "document_loaders/web/recursive_url": "document_loaders/web/recursive_url", "document_loaders/web/s3": "document_loaders/web/s3", @@ -505,6 +506,7 @@ export const config = { "document_loaders/web/pdf", "document_loaders/web/taskade", "document_loaders/web/notionapi", + "document_loaders/web/oracleai", "document_loaders/web/recursive_url", "document_loaders/web/s3", "document_loaders/web/sitemap", diff --git a/libs/langchain-community/package.json b/libs/langchain-community/package.json index 9220f5723b45..c34b967ab1f7 
100644 --- a/libs/langchain-community/package.json +++ b/libs/langchain-community/package.json @@ -39,9 +39,11 @@ "binary-extensions": "^2.2.0", "expr-eval": "^2.0.2", "flat": "^5.0.2", + "htmlparser2": "^9.1.0", "js-yaml": "^4.1.0", "langchain": ">=0.2.3 <0.3.0 || >=0.3.4 <0.4.0", "langsmith": "^0.2.0", + "oracledb": "^6.7.0", "uuid": "^10.0.0", "zod": "^3.22.3", "zod-to-json-schema": "^3.22.5" @@ -116,11 +118,11 @@ "@types/d3-dsv": "^3.0.7", "@types/flat": "^5.0.2", "@types/html-to-text": "^9", - "@types/jest": "^29.5.14", "@types/jsdom": "^21.1.1", "@types/jsonwebtoken": "^9", "@types/lodash": "^4", "@types/mozilla-readability": "^0.2.1", + "@types/oracledb": "^6", "@types/pdf-parse": "^1.1.1", "@types/pg": "^8.11.0", "@types/pg-copy-streams": "^1.2.2", @@ -179,7 +181,7 @@ "interface-datastore": "^8.2.11", "ioredis": "^5.3.2", "it-all": "^3.0.4", - "jest": "^29.7.0", + "jest": "^29.5.0", "jest-environment-node": "^29.6.4", "jsdom": "^22.1.0", "jsonwebtoken": "^9.0.2", @@ -209,7 +211,7 @@ "rollup": "^3.19.1", "sonix-speech-recognition": "^2.1.1", "srt-parser-2": "^1.2.3", - "ts-jest": "^29.2.5", + "ts-jest": "^29.1.0", "typeorm": "^0.3.20", "typescript": "~5.1.6", "typesense": "^1.5.3", @@ -2792,6 +2794,15 @@ "import": "./document_loaders/web/notionapi.js", "require": "./document_loaders/web/notionapi.cjs" }, + "./document_loaders/web/oracleai": { + "types": { + "import": "./document_loaders/web/oracleai.d.ts", + "require": "./document_loaders/web/oracleai.d.cts", + "default": "./document_loaders/web/oracleai.d.ts" + }, + "import": "./document_loaders/web/oracleai.js", + "require": "./document_loaders/web/oracleai.cjs" + }, "./document_loaders/web/pdf": { "types": { "import": "./document_loaders/web/pdf.d.ts", @@ -4026,6 +4037,10 @@ "document_loaders/web/notionapi.js", "document_loaders/web/notionapi.d.ts", "document_loaders/web/notionapi.d.cts", + "document_loaders/web/oracleai.cjs", + "document_loaders/web/oracleai.js", + "document_loaders/web/oracleai.d.ts", + "document_loaders/web/oracleai.d.cts", "document_loaders/web/pdf.cjs", "document_loaders/web/pdf.js", "document_loaders/web/pdf.d.ts", diff --git a/libs/langchain-community/src/document_loaders/tests/example_data/oracleai/example.html b/libs/langchain-community/src/document_loaders/tests/example_data/oracleai/example.html index fbfa6c5ce47c..7672eb6e9e13 100644 --- a/libs/langchain-community/src/document_loaders/tests/example_data/oracleai/example.html +++ b/libs/langchain-community/src/document_loaders/tests/example_data/oracleai/example.html @@ -1,25 +1,28 @@ - - - - - + + + + + Sample HTML Page - - + +
-  <header>
-    <h1>Welcome to My Sample HTML Page</h1>
-  </header>
+    <header>
+      <h1>Welcome to My Sample HTML Page</h1>
+    </header>
-  <main>
-    <h2>Introduction</h2>
-    <p>This is a small HTML file with a header, main content section, and a footer.</p>
-    <p>Feel free to modify and experiment with the code!</p>
-  </main>
+    <main>
+      <h2>Introduction</h2>
+      <p>
+        This is a small HTML file with a header, main content section, and a
+        footer.
+      </p>
+      <p>Feel free to modify and experiment with the code!</p>
+    </main>
-  <footer>
-    <p>Footer Content - © 2024</p>
-  </footer>
+    <footer>
+      <p>Footer Content - © 2024</p>
- - \ No newline at end of file + + diff --git a/libs/langchain-community/src/document_loaders/tests/oracleai.test.ts b/libs/langchain-community/src/document_loaders/tests/oracleai.test.ts index dad25a0b90ba..9a0125d6f27e 100644 --- a/libs/langchain-community/src/document_loaders/tests/oracleai.test.ts +++ b/libs/langchain-community/src/document_loaders/tests/oracleai.test.ts @@ -1,9 +1,13 @@ import { jest } from "@jest/globals"; -import { ParseOracleDocMetadata, OracleDocLoader, OracleLoadFromType, TableRow } from "../web/oracleai.js"; -import oracledb from "oracledb"; +import { Connection, Result } from "oracledb"; +import { + ParseOracleDocMetadata, + OracleDocLoader, + OracleLoadFromType, + TableRow, +} from "../web/oracleai.js"; describe("ParseOracleDocMetadata", () => { - jest.mock("oracledb"); let parser: ParseOracleDocMetadata; beforeEach(() => { @@ -63,409 +67,358 @@ describe("ParseOracleDocMetadata", () => { }); describe("OracleDocLoader", () => { - let doc_count: number; - let executeMock: jest.Mock<(sql: string, bindVars?: any) => {}> - let connMock: jest.Mocked; - let loader: OracleDocLoader; - const baseDirPath = "./src/document_loaders/tests/example_data/oracleai"; - const baseMockData = "MockData" - - beforeEach(() => { - doc_count = 0; - executeMock = jest.fn(); - - executeMock.mockImplementation(async (sql: string, bindVars?: {}) => { - if (bindVars) { - doc_count++; - return { - outBinds: { - mdata: { getData: jest.fn().mockImplementation( () => bindVars.blob.val.toString() ) }, - text: { getData: jest.fn().mockImplementation( () => baseMockData + doc_count ) } } - }; - } - else { - return { - rows: [['MockUser']] - }; - } - }); - - connMock = {execute: executeMock} as unknown as jest.Mocked; - }); + let executeMock: jest.Mock<(sql: string, bindVars?: any) => object>; + let connMock: jest.Mocked; + let loader: OracleDocLoader; + const baseDirPath = "./src/document_loaders/tests/example_data/oracleai"; + const baseMockData = "MockData"; - test("should load a single file properly", async () => { - loader = new OracleDocLoader(connMock, baseDirPath + "/example.html", OracleLoadFromType.FILE); - const res = await loader.load(); - console.log(res) - expect(res.length).toEqual(1); - expect(res[0].pageContent).toEqual(baseMockData + "1"); - expect(res[0].metadata.title).toBeTruthy(); - expect(res[0].metadata.title).toEqual("Sample HTML Page"); - expect(res[0].metadata.viewport).toBeTruthy(); - expect(res[0].metadata.viewport).toEqual("width=device-width, initial-scale=1.0"); - }); + beforeEach(() => { + executeMock = jest.fn(); + connMock = { execute: executeMock } as unknown as jest.Mocked; + }); - test("should load a directory properly", async () => { - loader = new OracleDocLoader(connMock, baseDirPath, OracleLoadFromType.DIR); - const res = await loader.load(); - - expect(res.length).toEqual(3); - for (let i = 0; i < res.length; i += 1) { - expect(res[i].pageContent).toEqual(baseMockData + (i+1)); - if (res[i].metadata.title) { - expect(res[i].metadata.title).toEqual("Sample HTML Page"); - expect(res[i].metadata.viewport).toBeTruthy(); - expect(res[i].metadata.viewport).toEqual("width=device-width, initial-scale=1.0"); - } - } + test("should load a single file properly", async () => { + executeMock.mockImplementation(async (sql: string, bindVars?: any) => { + if (bindVars) { + return { + outBinds: { + mdata: { + getData: jest + .fn() + .mockImplementation(() => bindVars.blob.val.toString()), + }, + text: { + getData: jest.fn().mockImplementation(() => baseMockData + 1), + 
}, + }, + }; + } else { + return { + rows: [["MockUser"]], + }; + } }); -}); -describe('OracleDocLoader - loadFromTable', () => { - let conn: Partial; - let executeMock: any; - - beforeEach(() => { - executeMock = jest.fn(); - conn = { - execute: executeMock, - }; + loader = new OracleDocLoader( + connMock, + baseDirPath + "/example.html", + OracleLoadFromType.FILE + ); + const res = await loader.load(); + console.log(res); + expect(res.length).toEqual(1); + expect(res[0].pageContent).toEqual(baseMockData + "1"); + expect(res[0].metadata.title).toBeTruthy(); + expect(res[0].metadata.title).toEqual("Sample HTML Page"); + expect(res[0].metadata.viewport).toBeTruthy(); + expect(res[0].metadata.viewport).toEqual( + "width=device-width, initial-scale=1.0" + ); + }); + + test("should load a directory properly", async () => { + let doc_count = 0; + executeMock.mockImplementation(async (sql: string, bindVars?: any) => { + if (bindVars) { + doc_count += 1; + return { + outBinds: { + mdata: { + getData: jest + .fn() + .mockImplementation(() => bindVars.blob.val.toString()), + }, + text: { + getData: jest + .fn() + .mockImplementation(() => baseMockData + doc_count), + }, + }, + }; + } else { + return { + rows: [["MockUser"]], + }; + } }); - - test('loadFromTable with valid parameters', async () => { - // Mock the execute method for the column type query - executeMock.mockResolvedValueOnce({ + + loader = new OracleDocLoader(connMock, baseDirPath, OracleLoadFromType.DIR); + const res = await loader.load(); + + expect(res.length).toEqual(3); + for (let i = 0; i < res.length; i += 1) { + expect(res[i].pageContent).toEqual(baseMockData + (i + 1)); + if (res[i].metadata.title) { + expect(res[i].metadata.title).toEqual("Sample HTML Page"); + expect(res[i].metadata.viewport).toBeTruthy(); + expect(res[i].metadata.viewport).toEqual( + "width=device-width, initial-scale=1.0" + ); + } + } + }); + + test("loadFromTable with valid parameters", async () => { + // Mock the execute method for the column type query + executeMock.mockImplementationOnce(() => { + return { rows: [ - { COLUMN_NAME: 'COL1', DATA_TYPE: 'VARCHAR2' }, - { COLUMN_NAME: 'COL2', DATA_TYPE: 'NUMBER' }, - { COLUMN_NAME: 'COL3', DATA_TYPE: 'DATE' }, + { COLUMN_NAME: "COL1", DATA_TYPE: "VARCHAR2" }, + { COLUMN_NAME: "COL2", DATA_TYPE: "NUMBER" }, + { COLUMN_NAME: "COL3", DATA_TYPE: "DATE" }, ], - } as oracledb.Result<{ COLUMN_NAME: string; DATA_TYPE: string }>); - - // Mock the execute method for getting username - executeMock.mockResolvedValueOnce({ - rows: [{ USER: 'TESTUSER' }], - } as oracledb.Result<{ USER: string }>); - - // Mock the execute method for the main query - executeMock.mockResolvedValueOnce({ + } as Result<{ COLUMN_NAME: string; DATA_TYPE: string }>; + }); + + // Mock the execute method for getting username + executeMock.mockImplementationOnce(() => { + return { + rows: [{ USER: "TESTUSER" }], + } as Result<{ USER: string }>; + }); + + // Mock the execute method for the main query + executeMock.mockImplementationOnce(() => { + return { rows: [ { - MDATA: { getData: jest.fn().mockImplementation( () => 'Title1' ) }, - TEXT: 'Text content 1', - ROWID: 'AAABBBCCC', - COL1: 'Value1', + MDATA: { + getData: jest + .fn() + .mockImplementation( + () => + 'Title1' + ), + }, + TEXT: "Text content 1", + ROWID: "AAABBBCCC", + COL1: "Value1", COL2: 123, - COL3: new Date('2021-01-01'), + COL3: new Date("2021-01-01"), }, { - MDATA: { getData: jest.fn().mockImplementation( () => 'Title2' ) }, - TEXT: 'Text content 2', - ROWID: 'AAABBBCCD', - 
COL1: 'Value2', + MDATA: { + getData: jest + .fn() + .mockImplementation( + () => + 'Title2' + ), + }, + TEXT: "Text content 2", + ROWID: "AAABBBCCD", + COL1: "Value2", COL2: 456, - COL3: new Date('2021-02-01'), + COL3: new Date("2021-02-01"), }, ], - }); - - const loader = new OracleDocLoader( - conn as oracledb.Connection, - 'MYTABLE', - OracleLoadFromType.TABLE, - 'MYSCHEMA', - 'MYCOLUMN', - ['COL1', 'COL2', 'COL3'] - ); - - const documents = await loader.load(); - - expect(documents).toHaveLength(2); - - expect(documents[0].pageContent).toBe('Text content 1'); - expect(documents[0].metadata).toEqual({ - title: 'Title1', - author: 'Author1', - _oid: expect.any(String), - _rowid: 'AAABBBCCC', - COL1: 'Value1', - COL2: 123, - COL3: new Date('2021-01-01'), - }); - - expect(documents[1].pageContent).toBe('Text content 2'); - expect(documents[1].metadata).toEqual({ - title: 'Title2', - author: 'Author2', - _oid: expect.any(String), - _rowid: 'AAABBBCCD', - COL1: 'Value2', - COL2: 456, - COL3: new Date('2021-02-01'), - }); - }); - - test('loadFromTable with missing owner', async () => { - const loader = new OracleDocLoader( - conn as oracledb.Connection, - 'MYTABLE', - OracleLoadFromType.TABLE, - undefined, // owner is missing - 'MYCOLUMN', - ['COL1'] - ); - - await expect(loader.load()).rejects.toThrow( - "Owner and column name must be specified for loading from a table" - ); - }); - - test('loadFromTable with missing column name', async () => { - const loader = new OracleDocLoader( - conn as oracledb.Connection, - 'MYTABLE', - OracleLoadFromType.TABLE, - 'MYSCHEMA', - undefined, // column name is missing - ['COL1'] - ); - - await expect(loader.load()).rejects.toThrow( - "Owner and column name must be specified for loading from a table" - ); + }; }); - - test('loadFromTable with mdata_cols exceeding 3 columns', async () => { - const loader = new OracleDocLoader( - conn as oracledb.Connection, - 'MYTABLE', - OracleLoadFromType.TABLE, - 'MYSCHEMA', - 'MYCOLUMN', - ['COL1', 'COL2', 'COL3', 'COL4'] // 4 columns, exceeding limit - ); - - await expect(loader.load()).rejects.toThrow( - "Exceeds the max number of columns you can request for metadata." 
- ); + + const loader = new OracleDocLoader( + connMock, + "MYTABLE", + OracleLoadFromType.TABLE, + "MYSCHEMA", + "MYCOLUMN", + ["COL1", "COL2", "COL3"] + ); + + const documents = await loader.load(); + + expect(documents).toHaveLength(2); + + expect(documents[0].pageContent).toBe("Text content 1"); + expect(documents[0].metadata).toEqual({ + title: "Title1", + author: "Author1", + _oid: expect.any(String), + _rowid: "AAABBBCCC", + COL1: "Value1", + COL2: 123, + COL3: new Date("2021-01-01"), }); - - test('loadFromTable with invalid column names in mdata_cols', async () => { - const loader = new OracleDocLoader( - conn as oracledb.Connection, - 'MYTABLE', - OracleLoadFromType.TABLE, - 'MYSCHEMA', - 'MYCOLUMN', - ['INVALID-COL1'] // invalid column name - ); - - await expect(loader.load()).rejects.toThrow( - "Invalid column name in mdata_cols: INVALID-COL1" - ); + + expect(documents[1].pageContent).toBe("Text content 2"); + expect(documents[1].metadata).toEqual({ + title: "Title2", + author: "Author2", + _oid: expect.any(String), + _rowid: "AAABBBCCD", + COL1: "Value2", + COL2: 456, + COL3: new Date("2021-02-01"), }); - - test('loadFromTable with mdata_cols containing unsupported data types', async () => { - // Mock the execute method for the column type query - executeMock.mockResolvedValueOnce({ + }); + + test("loadFromTable with missing owner", async () => { + const loader = new OracleDocLoader( + connMock, + "MYTABLE", + OracleLoadFromType.TABLE, + undefined, // owner is missing + "MYCOLUMN", + ["COL1"] + ); + + await expect(loader.load()).rejects.toThrow( + "Owner and column name must be specified for loading from a table" + ); + }); + + test("loadFromTable with missing column name", async () => { + const loader = new OracleDocLoader( + connMock, + "MYTABLE", + OracleLoadFromType.TABLE, + "MYSCHEMA", + undefined, // column name is missing + ["COL1"] + ); + + await expect(loader.load()).rejects.toThrow( + "Owner and column name must be specified for loading from a table" + ); + }); + + test("loadFromTable with mdata_cols exceeding 3 columns", async () => { + const loader = new OracleDocLoader( + connMock, + "MYTABLE", + OracleLoadFromType.TABLE, + "MYSCHEMA", + "MYCOLUMN", + ["COL1", "COL2", "COL3", "COL4"] // 4 columns, exceeding limit + ); + + await expect(loader.load()).rejects.toThrow( + "Exceeds the max number of columns you can request for metadata." 
+ ); + }); + + test("loadFromTable with invalid column names in mdata_cols", async () => { + const loader = new OracleDocLoader( + connMock, + "MYTABLE", + OracleLoadFromType.TABLE, + "MYSCHEMA", + "MYCOLUMN", + ["INVALID-COL1"] // invalid column name + ); + + await expect(loader.load()).rejects.toThrow( + "Invalid column name in mdata_cols: INVALID-COL1" + ); + }); + + test("loadFromTable with mdata_cols containing unsupported data types", async () => { + // Mock the execute method for the column type query + executeMock.mockImplementationOnce(() => { + return { rows: [ - { COLUMN_NAME: 'COL1', DATA_TYPE: 'CLOB' }, // Unsupported data type + { COLUMN_NAME: "COL1", DATA_TYPE: "CLOB" }, // Unsupported data type ], - } as oracledb.Result<{ COLUMN_NAME: string; DATA_TYPE: string }>); - - const loader = new OracleDocLoader( - conn as oracledb.Connection, - 'MYTABLE', - OracleLoadFromType.TABLE, - 'MYSCHEMA', - 'MYCOLUMN', - ['COL1'] - ); - - await expect(loader.load()).rejects.toThrow( - 'The datatype for the column COL1 is not supported' - ); + } as Result<{ COLUMN_NAME: string; DATA_TYPE: string }>; }); - - test('loadFromTable with empty table', async () => { - // Mock the execute method for the column type query - executeMock.mockResolvedValueOnce({ - rows: [ - { COLUMN_NAME: 'COL1', DATA_TYPE: 'VARCHAR2' }, - ], - } as oracledb.Result<{ COLUMN_NAME: string; DATA_TYPE: string }>); - - // Mock the execute method for getting username - executeMock.mockResolvedValueOnce({ - rows: [{ USER: 'TESTUSER' }], - } as oracledb.Result<{ USER: string }>); - - // Mock the execute method for the main query (empty result set) - executeMock.mockResolvedValueOnce({ + + const loader = new OracleDocLoader( + connMock, + "MYTABLE", + OracleLoadFromType.TABLE, + "MYSCHEMA", + "MYCOLUMN", + ["COL1"] + ); + + await expect(loader.load()).rejects.toThrow( + "The datatype for the column COL1 is not supported" + ); + }); + + test("loadFromTable with empty table", async () => { + // Mock the execute method for the column type query + executeMock.mockImplementationOnce(() => { + return { + rows: [{ COLUMN_NAME: "COL1", DATA_TYPE: "VARCHAR2" }], + } as Result<{ COLUMN_NAME: string; DATA_TYPE: string }>; + }); + + // Mock the execute method for getting username + executeMock.mockImplementationOnce(() => { + return { + rows: [{ USER: "TESTUSER" }], + } as Result<{ USER: string }>; + }); + + // Mock the execute method for the main query (empty result set) + executeMock.mockImplementationOnce(() => { + return { rows: [], - } as oracledb.Result); - - const loader = new OracleDocLoader( - conn as oracledb.Connection, - 'MYTABLE', - OracleLoadFromType.TABLE, - 'MYSCHEMA', - 'MYCOLUMN', - ['COL1'] - ); - - const documents = await loader.load(); - - expect(documents).toHaveLength(0); + } as Result; }); - - test('loadFromTable with null column data', async () => { - // Mock the execute method for the column type query - executeMock.mockResolvedValueOnce({ - rows: [ - { COLUMN_NAME: 'COL1', DATA_TYPE: 'VARCHAR2' }, - ], - } as oracledb.Result<{ COLUMN_NAME: string; DATA_TYPE: string }>); - - // Mock the execute method for getting username - executeMock.mockResolvedValueOnce({ - rows: [{ USER: 'TESTUSER' }], - } as oracledb.Result<{ USER: string }>); - - // Mock the execute method for the main query with null TEXT and MDATA - executeMock.mockResolvedValueOnce({ + + const loader = new OracleDocLoader( + connMock, + "MYTABLE", + OracleLoadFromType.TABLE, + "MYSCHEMA", + "MYCOLUMN", + ["COL1"] + ); + + const documents = await 
loader.load(); + + expect(documents).toHaveLength(0); + }); + + test("loadFromTable with null column data", async () => { + // Mock the execute method for the column type query + executeMock.mockImplementationOnce(() => { + return { + rows: [{ COLUMN_NAME: "COL1", DATA_TYPE: "VARCHAR2" }], + } as Result<{ COLUMN_NAME: string; DATA_TYPE: string }>; + }); + + // Mock the execute method for getting username + executeMock.mockImplementationOnce(() => { + return { + rows: [{ USER: "TESTUSER" }], + } as Result<{ USER: string }>; + }); + + // Mock the execute method for the main query with null TEXT and MDATA + executeMock.mockImplementationOnce(() => { + return { rows: [ { MDATA: null, TEXT: null, - ROWID: 'AAABBBCCC', - COL1: 'Value1', + ROWID: "AAABBBCCC", + COL1: "Value1", }, ], - } as oracledb.Result); - - const loader = new OracleDocLoader( - conn as oracledb.Connection, - 'MYTABLE', - OracleLoadFromType.TABLE, - 'MYSCHEMA', - 'MYCOLUMN', - ['COL1'] - ); - - const documents = await loader.load(); - - expect(documents).toHaveLength(1); - - expect(documents[0].pageContent).toBe(''); - expect(documents[0].metadata).toEqual({ - _oid: expect.any(String), - _rowid: 'AAABBBCCC', - COL1: 'Value1', - }); + } as Result; }); - }); - describe('OracleDocLoader - Integration Tests', () => { - let connection: oracledb.Connection; - const expectedDate1 = new Date('2021-01-01') - const expectedDate2 = new Date('2021-02-01') - - beforeAll(async () => { - try { - // Create a connection pool or a single connection - connection = await oracledb.getConnection({ - user: 'myuser', - password: 'mypassword', - connectString: 'localhost:1521/FREEPDB1', - }); - - // Drop the table if it exists - try { - await connection.execute(`DROP TABLE MYTABLE PURGE`); - } catch (err: any) { - // If the table doesn't exist, ignore the error - if (err.errorNum !== 942) { - // ORA-00942: table or view does not exist - throw err; - } - } - - // Set up the database schema and data - await connection.execute(` - CREATE TABLE MYTABLE ( - ID NUMBER GENERATED BY DEFAULT AS IDENTITY PRIMARY KEY, - MYCOLUMN CLOB, - COL1 VARCHAR2(100), - COL2 NUMBER, - COL3 DATE - ) - `); - - await connection.execute(` - INSERT INTO MYTABLE (MYCOLUMN, COL1, COL2, COL3) VALUES ( - 'Title1', - 'Value1', - 123, - :date1 - ) - `, {date1: expectedDate1}); - - await connection.execute(` - INSERT INTO MYTABLE (MYCOLUMN, COL1, COL2, COL3) VALUES ( - 'Title2', - 'Value2', - 456, - :date2 - ) - `, {date2: expectedDate2}); - - await connection.commit(); - } catch (err) { - console.error('Error during setup:', err); - throw err; // Rethrow the error to fail the tests if setup fails - } - }); - - afterAll(async () => { - try { - // Clean up the database - await connection.execute(`DROP TABLE MYTABLE PURGE`); - await connection.close(); - } catch (err) { - console.error('Error during teardown:', err); - // You might choose to ignore errors during teardown - } - }); - - test('loadFromTable with actual database connection', async () => { - const loader = new OracleDocLoader( - connection, - 'MYTABLE', - OracleLoadFromType.TABLE, - 'MYUSER', // Schema owner, replace with your actual username - 'MYCOLUMN', - ['COL1', 'COL2', 'COL3'] - ); - - const documents = await loader.load(); - - expect(documents).toHaveLength(2); - - expect(documents[0].metadata).toMatchObject({ - title: 'Title1', - author: 'Author1', - COL1: 'Value1', - COL2: 123, - COL3: expectedDate1, - }); - - expect(documents[1].metadata).toMatchObject({ - title: 'Title2', - author: 'Author2', - COL1: 'Value2', - 
COL2: 456, - COL3: expectedDate2, - }); + const loader = new OracleDocLoader( + connMock, + "MYTABLE", + OracleLoadFromType.TABLE, + "MYSCHEMA", + "MYCOLUMN", + ["COL1"] + ); + + const documents = await loader.load(); + + expect(documents).toHaveLength(1); + + expect(documents[0].pageContent).toBe(""); + expect(documents[0].metadata).toEqual({ + _oid: expect.any(String), + _rowid: "AAABBBCCC", + COL1: "Value1", }); - }); \ No newline at end of file + }); +}); diff --git a/libs/langchain-community/src/document_loaders/tests/oracleaiDB.test.js b/libs/langchain-community/src/document_loaders/tests/oracleaiDB.test.js deleted file mode 100644 index 85d83e3c01c4..000000000000 --- a/libs/langchain-community/src/document_loaders/tests/oracleaiDB.test.js +++ /dev/null @@ -1,26 +0,0 @@ -import oracledb from 'oracledb'; - -async function testConnection() { - try { - const connection = await oracledb.getConnection({ - user: 'myuser', // Replace with your actual username - password: 'mypassword', // Replace with your actual password - connectString: 'localhost:1521/FREEPDB1', - }); - console.log('Connection successful!'); - - // Execute a query against your table - const result = await connection.execute(` - SELECT ID, MYCOLUMN, COL1, COL2, COL3 - FROM MYTABLE - `); - - console.log('Query result:', result.rows); - - await connection.close(); - } catch (err) { - console.error('Connection failed:', err); - } -} - -testConnection(); diff --git a/libs/langchain-community/src/document_loaders/web/oracleai.ts b/libs/langchain-community/src/document_loaders/web/oracleai.ts index 2044e6b2ac0d..86866265ea1d 100644 --- a/libs/langchain-community/src/document_loaders/web/oracleai.ts +++ b/libs/langchain-community/src/document_loaders/web/oracleai.ts @@ -1,113 +1,114 @@ +import * as fs from "node:fs"; +import * as path from "node:path"; import { Document } from "@langchain/core/documents"; import { BaseDocumentLoader } from "@langchain/core/document_loaders/base"; import { Parser } from "htmlparser2"; +import { createHash } from "crypto"; import oracledb from "oracledb"; -import crypto from "crypto"; -import fs from "fs"; -import path from 'path'; - - interface Metadata { - [key: string]: string; + [key: string]: string; } interface OutBinds { - mdata: oracledb.Lob | null; - text: oracledb.Lob | null; + mdata: oracledb.Lob | null; + text: oracledb.Lob | null; } export interface TableRow { - MDATA?: string | null; - TEXT?: string | null; - ROWID?: string; - [key: string]: any; + MDATA?: string | null; + TEXT?: string | null; + ROWID?: string; + [key: string]: any; } export class ParseOracleDocMetadata { - private metadata: Metadata; - private match: boolean; - - constructor() { - this.metadata = {}; - this.match = false; - } + private metadata: Metadata; - private handleStartTag(tag: string, attrs: { name: string; value: string | null }[]) { - if (tag === "meta") { - let entry: string | undefined; - let content: string | null = null; + private match: boolean; - attrs.forEach(({ name, value }) => { - if (name === "name") entry = value ?? ""; - if (name === "content") content = value; - }); + constructor() { + this.metadata = {}; + this.match = false; + } - if (entry) { - this.metadata[entry] = content ?? 
"N/A"; - } - } else if (tag === "title") { - this.match = true; - } - } + private handleStartTag( + tag: string, + attrs: { name: string; value: string | null }[] + ) { + if (tag === "meta") { + let entry: string | undefined; + let content: string | null = null; + + attrs.forEach(({ name, value }) => { + if (name === "name") entry = value ?? ""; + if (name === "content") content = value; + }); - private handleData(data: string) { - if (this.match) { - this.metadata["title"] = data; - this.match = false; - } + if (entry) { + this.metadata[entry] = content ?? "N/A"; + } + } else if (tag === "title") { + this.match = true; } + } - public getMetadata(): Metadata { - return this.metadata; + private handleData(data: string) { + if (this.match) { + this.metadata.title = data; + this.match = false; } + } - public parse(htmlString: string): void { - // We add this method to incorperate the feed method of HTMLParser in Python - interface Attribute { - name: string; - value: string | null; - } - - interface ParserOptions { - onopentag: (name: string, attrs: Record) => void; - ontext: (text: string) => void; - } + public getMetadata(): Metadata { + return this.metadata; + } - const parser = new Parser( - { - onopentag: (name: string, attrs: Record) => - this.handleStartTag( - name, - Object.entries(attrs).map(([name, value]): Attribute => ({ - name, - value: value as string | null, - })) - ), - ontext: (text: string) => this.handleData(text), - } as ParserOptions, - { decodeEntities: true } - ); - parser.write(htmlString); - parser.end(); + public parse(htmlString: string): void { + // We add this method to incorperate the feed method of HTMLParser in Python + interface Attribute { + name: string; + value: string | null; } - -} + interface ParserOptions { + onopentag: (name: string, attrs: Record) => void; + ontext: (text: string) => void; + } + const parser = new Parser( + { + onopentag: (name: string, attrs: Record) => + this.handleStartTag( + name, + Object.entries(attrs).map( + ([name, value]): Attribute => ({ + name, + value: value as string | null, + }) + ) + ), + ontext: (text: string) => this.handleData(text), + } as ParserOptions, + { decodeEntities: true } + ); + parser.write(htmlString); + parser.end(); + } +} class OracleDocReader { static generateObjectId(inputString: string | null = null) { const outLength = 32; // Output length const hashLen = 8; // Hash value length - if (!inputString) { - inputString = Array.from( - { length: 16 }, - () => "abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ0123456789" - .charAt(Math.floor(Math.random() * 62)) + const idString = + inputString ?? 
+ Array.from({ length: 16 }, () => + "abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ0123456789".charAt( + Math.floor(Math.random() * 62) + ) ).join(""); - } // Timestamp const timestamp = Math.floor(Date.now() / 1000); @@ -115,16 +116,20 @@ class OracleDocReader { timestampBin.writeUInt32BE(timestamp); // Hash value - const hashValBin = crypto.createHash("sha256").update(inputString).digest(); + const hashValBin = createHash("sha256").update(idString).digest(); const truncatedHashVal = hashValBin.slice(0, hashLen); // Counter const counterBin = Buffer.alloc(4); - counterBin.writeUInt32BE(Math.floor(Math.random() * Math.pow(2, 32))); + counterBin.writeUInt32BE(Math.floor(Math.random() * 2 ** 32)); // Binary object ID - const objectId = Buffer.concat([timestampBin, truncatedHashVal, counterBin]); - let objectIdHex = objectId.toString("hex").padStart(outLength, "0"); + const objectId = Buffer.concat([ + timestampBin, + truncatedHashVal, + counterBin, + ]); + const objectIdHex = objectId.toString("hex").padStart(outLength, "0"); return objectIdHex.slice(0, outLength); } @@ -139,14 +144,17 @@ class OracleDocReader { try { // Read the file as binary data const data = await new Promise((resolve, reject) => { - fs.readFile(filePath, (err: NodeJS.ErrnoException | null, data: Buffer) => { - if (err) reject(err); - else resolve(data); - }); + fs.readFile( + filePath, + (err: NodeJS.ErrnoException | null, data: Buffer) => { + if (err) reject(err); + else resolve(data); + } + ); }); if (!data) { - return new Document({pageContent: "", metadata}); + return new Document({ pageContent: "", metadata }); } const bindVars = { @@ -190,42 +198,50 @@ class OracleDocReader { } // Execute a query to get the current session user - const userResult = await conn.execute( - `SELECT USER FROM dual` - ); + const userResult = await conn.execute(`SELECT USER FROM dual`); const username = userResult.rows?.[0]?.[0]; const docId = OracleDocReader.generateObjectId(`${username}$${filePath}`); - metadata["_oid"] = docId; - metadata["_file"] = filePath; + metadata._oid = docId; + metadata._file = filePath; textData = textData ?? 
""; - return new Document({pageContent: textData, metadata}) + return new Document({ pageContent: textData, metadata }); } catch (ex) { console.error(`An exception occurred: ${ex}`); console.error(`Skip processing ${filePath}`); return null; } } - } export enum OracleLoadFromType { FILE, DIR, TABLE, -}; +} export class OracleDocLoader extends BaseDocumentLoader { private conn: oracledb.Connection; + private loadFrom: string; + private loadFromType: OracleLoadFromType; + private owner?: string; + private colname?: string; + private mdata_cols?: string[]; - constructor(conn: oracledb.Connection, loadFrom: string, loadFromType: OracleLoadFromType, - owner?: string, colname?: string, mdata_cols?: string[]) { + constructor( + conn: oracledb.Connection, + loadFrom: string, + loadFromType: OracleLoadFromType, + owner?: string, + colname?: string, + mdata_cols?: string[] + ) { super(); this.conn = conn; this.loadFrom = loadFrom; @@ -236,15 +252,22 @@ export class OracleDocLoader extends BaseDocumentLoader { } public async load(): Promise { - const documents: Document[] = [] - const m_params = {"plaintext": "false"} + const documents: Document[] = []; + const m_params = { plaintext: "false" }; switch (this.loadFromType) { case OracleLoadFromType.FILE: - const filepath = this.loadFrom - const doc = await OracleDocReader.readFile(this.conn, filepath, m_params) - if (doc) - documents.push(doc); + try { + const filepath = this.loadFrom; + const doc = await OracleDocReader.readFile( + this.conn, + filepath, + m_params + ); + if (doc) documents.push(doc); + } catch (err) { + console.error("Error reading file:", err); + } break; case OracleLoadFromType.DIR: @@ -256,196 +279,197 @@ export class OracleDocLoader extends BaseDocumentLoader { const stats = await fs.promises.lstat(filepath); if (stats.isFile()) { - const doc = await OracleDocReader.readFile(this.conn, filepath, m_params) - if (doc) - documents.push(doc); + const doc = await OracleDocReader.readFile( + this.conn, + filepath, + m_params + ); + if (doc) documents.push(doc); } } } catch (err) { - console.error('Error reading directory:', err); + console.error("Error reading directory:", err); } break; case OracleLoadFromType.TABLE: - return await this.loadFromTable(m_params); - default: - throw new Error("Invalid type to load from"); - } - return documents - } - - private isValidIdentifier(identifier: string): boolean { - return /^[A-Za-z_][A-Za-z0-9_]*$/.test(identifier); - } - - private async getUsername(): Promise { - const result = await this.conn.execute<{ USER: string }>('SELECT USER FROM dual'); - return (result.rows?.[0]?.USER) || "unknown_user"; - } - - - private async loadFromTable(m_params: any): Promise { - const results: Document[] = []; - try { - if (!this.owner || !this.colname) { - throw new Error("Owner and column name must be specified for loading from a table"); - } + try { + if (!this.owner || !this.colname) { + throw new Error( + "Owner and column name must be specified for loading from a table" + ); + } - // Validate identifiers to prevent SQL injection - if (!this.isValidIdentifier(this.owner)) { + // Validate identifiers to prevent SQL injection + if (!this.isValidIdentifier(this.owner)) { throw new Error("Invalid owner name"); - } + } - if (!this.isValidIdentifier(this.loadFrom)) { + if (!this.isValidIdentifier(this.loadFrom)) { throw new Error("Invalid table name"); - } + } - if (!this.isValidIdentifier(this.colname)) { + if (!this.isValidIdentifier(this.colname)) { throw new Error("Invalid column name"); - } + } - let 
mdataColsSql = ", t.ROWID"; + let mdataColsSql = ", t.ROWID"; - if (this.mdata_cols) { + if (this.mdata_cols) { if (this.mdata_cols.length > 3) { - throw new Error("Exceeds the max number of columns you can request for metadata."); + throw new Error( + "Exceeds the max number of columns you can request for metadata." + ); } - + // **First, check if the column names are valid identifiers** for (const col of this.mdata_cols) { - if (!this.isValidIdentifier(col)) { - throw new Error(`Invalid column name in mdata_cols: ${col}`); - } + if (!this.isValidIdentifier(col)) { + throw new Error(`Invalid column name in mdata_cols: ${col}`); + } } // Execute a query to get column data types const colSql = ` - SELECT COLUMN_NAME, DATA_TYPE - FROM ALL_TAB_COLUMNS - WHERE OWNER = :ownername AND TABLE_NAME = :tablename - `; + SELECT COLUMN_NAME, DATA_TYPE + FROM ALL_TAB_COLUMNS + WHERE OWNER = :ownername AND TABLE_NAME = :tablename + `; const colBinds = { - ownername: this.owner.toUpperCase(), - tablename: this.loadFrom.toUpperCase(), + ownername: this.owner.toUpperCase(), + tablename: this.loadFrom.toUpperCase(), }; - const colResult = await this.conn.execute<{ COLUMN_NAME: string; DATA_TYPE: string }>( - colSql, - colBinds, - { outFormat: oracledb.OUT_FORMAT_OBJECT } - ); + const colResult = await this.conn.execute<{ + COLUMN_NAME: string; + DATA_TYPE: string; + }>(colSql, colBinds, { outFormat: oracledb.OUT_FORMAT_OBJECT }); const colRows = colResult.rows; if (!colRows) { - throw new Error("Failed to retrieve column information"); + throw new Error("Failed to retrieve column information"); } const colTypes: Record = {}; for (const row of colRows) { - const colName = row.COLUMN_NAME; - const dataType = row.DATA_TYPE; - colTypes[colName] = dataType; + const colName = row.COLUMN_NAME; + const dataType = row.DATA_TYPE; + colTypes[colName] = dataType; } for (const col of this.mdata_cols) { - if (!this.isValidIdentifier(col)) { - throw new Error(`Invalid column name in mdata_cols: ${col}`); - } - - const dataType = colTypes[col]; - if (!dataType) { - throw new Error(`Column ${col} not found in table ${this.loadFrom}`); - } - - if ( - ![ - "NUMBER", - "BINARY_DOUBLE", - "BINARY_FLOAT", - "LONG", - "DATE", - "TIMESTAMP", - "VARCHAR2", - ].includes(dataType) - ) { - throw new Error(`The datatype for the column ${col} is not supported`); - } + if (!this.isValidIdentifier(col)) { + throw new Error(`Invalid column name in mdata_cols: ${col}`); + } + + const dataType = colTypes[col]; + if (!dataType) { + throw new Error( + `Column ${col} not found in table ${this.loadFrom}` + ); + } + + if ( + ![ + "NUMBER", + "BINARY_DOUBLE", + "BINARY_FLOAT", + "LONG", + "DATE", + "TIMESTAMP", + "VARCHAR2", + ].includes(dataType) + ) { + throw new Error( + `The datatype for the column ${col} is not supported` + ); + } } for (const col of this.mdata_cols) { - mdataColsSql += `, t.${col}`; + mdataColsSql += `, t.${col}`; } - } + } - const mainSql = ` - SELECT dbms_vector_chain.utl_to_text(t.${this.colname}, json(:params)) AS MDATA, - dbms_vector_chain.utl_to_text(t.${this.colname}) AS TEXT - ${mdataColsSql} - FROM ${this.owner}.${this.loadFrom} t - `; + const mainSql = ` + SELECT dbms_vector_chain.utl_to_text(t.${this.colname}, json(:params)) AS MDATA, + dbms_vector_chain.utl_to_text(t.${this.colname}) AS TEXT + ${mdataColsSql} + FROM ${this.owner}.${this.loadFrom} t + `; - const mainBinds = { + const mainBinds = { params: JSON.stringify(m_params), - }; + }; - const options = { + const options = { outFormat: 
oracledb.OUT_FORMAT_OBJECT, - }; + }; - // Get the username - const userResult = await this.conn.execute<{ USER: string }>('SELECT USER FROM dual'); - const username = userResult.rows?.[0]?.USER || "unknown_user"; + // Get the username + const userResult = await this.conn.execute<{ USER: string }>( + "SELECT USER FROM dual" + ); + const username = userResult.rows?.[0]?.USER || "unknown_user"; - // Execute the main SQL query - const result = await this.conn.execute(mainSql, mainBinds, options); - const rows = result.rows as TableRow[]; + // Execute the main SQL query + const result = await this.conn.execute(mainSql, mainBinds, options); + const rows = result.rows as TableRow[]; - if (rows) { + if (rows) { for (const row of rows) { - let metadata: Record = {}; - - if (row["MDATA"]) { - const data = (await (row["MDATA"] as unknown as oracledb.Lob).getData()).toString(); - if ( - data.trim().startsWith("") - ) { - const parser = new ParseOracleDocMetadata(); - parser.parse(data); - metadata = { ...metadata, ...parser.getMetadata() }; - } + let metadata: Record = {}; + + if (row.MDATA) { + const data = ( + await (row.MDATA as unknown as oracledb.Lob).getData() + ).toString(); + if ( + data.trim().startsWith("") + ) { + const parser = new ParseOracleDocMetadata(); + parser.parse(data); + metadata = { ...metadata, ...parser.getMetadata() }; } + } - const docId = OracleDocReader.generateObjectId( - `${username}$${this.owner}$${this.loadFrom}$${this.colname}$${row["ROWID"]}` - ); + const docId = OracleDocReader.generateObjectId( + `${username}$${this.owner}$${this.loadFrom}$${this.colname}$${row.ROWID}` + ); - metadata["_oid"] = docId; - metadata["_rowid"] = row["ROWID"]; + metadata._oid = docId; + metadata._rowid = row.ROWID; - if (this.mdata_cols) { - for (const colName of this.mdata_cols) { - metadata[colName] = row[colName]; - } + if (this.mdata_cols) { + for (const colName of this.mdata_cols) { + metadata[colName] = row[colName]; } + } - const text = row["TEXT"] as string; + const text = row.TEXT as string; - if (text === null || text === undefined) { - results.push(new Document({ pageContent: "", metadata })); - } else { - results.push(new Document({ pageContent: text, metadata })); - } + if (text === null || text === undefined) { + documents.push(new Document({ pageContent: "", metadata })); + } else { + documents.push(new Document({ pageContent: text, metadata })); + } } + } + break; + } catch (ex) { + console.error(`An exception occurred: ${ex}`); + throw ex; } - - return results; - } catch (ex) { - console.error(`An exception occurred: ${ex}`); - throw ex; + default: + throw new Error("Invalid type to load from"); } + return documents; + } + + private isValidIdentifier(identifier: string): boolean { + return /^[A-Za-z_][A-Za-z0-9_]*$/.test(identifier); } - } diff --git a/package.json b/package.json index 09f148fa452c..d647515dcbcf 100644 --- a/package.json +++ b/package.json @@ -46,16 +46,13 @@ "license": "MIT", "devDependencies": { "@tsconfig/recommended": "^1.0.2", - "@types/jest": "^29.5.14", - "@types/oracledb": "^6", + "@types/jest": "^29.5.3", "@types/semver": "^7", "commander": "^11.1.0", "dotenv": "^16.0.3", - "jest": "^29.7.0", "lint-staged": "^13.1.1", "prettier": "^2.8.3", "semver": "^7.5.4", - "ts-jest": "^29.2.5", "turbo": "^1.13.3", "typescript": "~5.1.6" }, @@ -72,8 +69,5 @@ "eslint --cache --fix" ], "*.md": "prettier --config .prettierrc --write" - }, - "dependencies": { - "oracledb": "^6.7.0" } } diff --git a/yarn.lock b/yarn.lock index 5d565c6d8c9c..c0139a751072 100644 
--- a/yarn.lock +++ b/yarn.lock @@ -11758,11 +11758,11 @@ __metadata: "@types/d3-dsv": ^3.0.7 "@types/flat": ^5.0.2 "@types/html-to-text": ^9 - "@types/jest": ^29.5.14 "@types/jsdom": ^21.1.1 "@types/jsonwebtoken": ^9 "@types/lodash": ^4 "@types/mozilla-readability": ^0.2.1 + "@types/oracledb": ^6 "@types/pdf-parse": ^1.1.1 "@types/pg": ^8.11.0 "@types/pg-copy-streams": ^1.2.2 @@ -11819,12 +11819,13 @@ __metadata: hdb: 0.19.8 hnswlib-node: ^3.0.0 html-to-text: ^9.0.5 + htmlparser2: ^9.1.0 ibm-cloud-sdk-core: ^5.0.2 ignore: ^5.2.0 interface-datastore: ^8.2.11 ioredis: ^5.3.2 it-all: ^3.0.4 - jest: ^29.7.0 + jest: ^29.5.0 jest-environment-node: ^29.6.4 js-yaml: ^4.1.0 jsdom: ^22.1.0 @@ -11842,6 +11843,7 @@ __metadata: notion-to-md: ^3.1.0 officeparser: ^4.0.4 openai: "*" + oracledb: ^6.7.0 pdf-parse: 1.1.1 pg: ^8.11.0 pg-copy-streams: ^6.0.5 @@ -11857,7 +11859,7 @@ __metadata: rollup: ^3.19.1 sonix-speech-recognition: ^2.1.1 srt-parser-2: ^1.2.3 - ts-jest: ^29.2.5 + ts-jest: ^29.1.0 typeorm: ^0.3.20 typescript: ~5.1.6 typesense: ^1.5.3 @@ -19313,13 +19315,13 @@ __metadata: languageName: node linkType: hard -"@types/jest@npm:^29.5.14": - version: 29.5.14 - resolution: "@types/jest@npm:29.5.14" +"@types/jest@npm:^29.5.3": + version: 29.5.3 + resolution: "@types/jest@npm:29.5.3" dependencies: expect: ^29.0.0 pretty-format: ^29.0.0 - checksum: 18dba4623f26661641d757c63da2db45e9524c9be96a29ef713c703a9a53792df9ecee9f7365a0858ddbd6440d98fe6b65ca67895ca5884b73cbc7ffc11f3838 + checksum: e36bb92e0b9e5ea7d6f8832baa42f087fc1697f6cd30ec309a07ea4c268e06ec460f1f0cfd2581daf5eff5763475190ec1ad8ac6520c49ccfe4f5c0a48bfa676 languageName: node linkType: hard @@ -22529,7 +22531,7 @@ __metadata: languageName: node linkType: hard -"bs-logger@npm:0.x, bs-logger@npm:^0.2.6": +"bs-logger@npm:0.x": version: 0.2.6 resolution: "bs-logger@npm:0.2.6" dependencies: @@ -22979,7 +22981,7 @@ __metadata: languageName: node linkType: hard -"chalk@npm:^4.0.0, chalk@npm:^4.0.2, chalk@npm:^4.1.0, chalk@npm:^4.1.2": +"chalk@npm:^4.0.0, chalk@npm:^4.1.0, chalk@npm:^4.1.2": version: 4.1.2 resolution: "chalk@npm:4.1.2" dependencies: @@ -25983,17 +25985,6 @@ __metadata: languageName: node linkType: hard -"ejs@npm:^3.1.10": - version: 3.1.10 - resolution: "ejs@npm:3.1.10" - dependencies: - jake: ^10.8.5 - bin: - ejs: bin/cli.js - checksum: ce90637e9c7538663ae023b8a7a380b2ef7cc4096de70be85abf5a3b9641912dde65353211d05e24d56b1f242d71185c6d00e02cb8860701d571786d92c71f05 - languageName: node - linkType: hard - "electron-to-chromium@npm:^1.4.284": version: 1.4.322 resolution: "electron-to-chromium@npm:1.4.322" @@ -30315,6 +30306,18 @@ __metadata: languageName: node linkType: hard +"htmlparser2@npm:^9.1.0": + version: 9.1.0 + resolution: "htmlparser2@npm:9.1.0" + dependencies: + domelementtype: ^2.3.0 + domhandler: ^5.0.3 + domutils: ^3.1.0 + entities: ^4.5.0 + checksum: e5f8d5193967e4a500226f37bdf2c0f858cecb39dde14d0439f24bf2c461a4342778740d988fbaba652b0e4cb6052f7f2e99e69fc1a329a86c629032bb76e7c8 + languageName: node + linkType: hard + "http-cache-semantics@npm:^4.0.0, http-cache-semantics@npm:^4.1.0, http-cache-semantics@npm:^4.1.1": version: 4.1.1 resolution: "http-cache-semantics@npm:4.1.1" @@ -32002,20 +32005,6 @@ __metadata: languageName: node linkType: hard -"jake@npm:^10.8.5": - version: 10.9.2 - resolution: "jake@npm:10.9.2" - dependencies: - async: ^3.2.3 - chalk: ^4.0.2 - filelist: ^1.0.4 - minimatch: ^3.1.2 - bin: - jake: bin/cli.js - checksum: 
f2dc4a086b4f58446d02cb9be913c39710d9ea570218d7681bb861f7eeaecab7b458256c946aeaa7e548c5e0686cc293e6435501e4047174a3b6a504dcbfcaae - languageName: node - linkType: hard - "javascript-stringify@npm:^2.0.1": version: 2.1.0 resolution: "javascript-stringify@npm:2.1.0" @@ -32759,7 +32748,7 @@ __metadata: languageName: node linkType: hard -"jest@npm:^29.5.0, jest@npm:^29.7.0": +"jest@npm:^29.5.0": version: 29.7.0 resolution: "jest@npm:29.7.0" dependencies: @@ -33431,17 +33420,13 @@ __metadata: resolution: "langchainjs@workspace:." dependencies: "@tsconfig/recommended": ^1.0.2 - "@types/jest": ^29.5.14 - "@types/oracledb": ^6 + "@types/jest": ^29.5.3 "@types/semver": ^7 commander: ^11.1.0 dotenv: ^16.0.3 - jest: ^29.7.0 lint-staged: ^13.1.1 - oracledb: ^6.7.0 prettier: ^2.8.3 semver: ^7.5.4 - ts-jest: ^29.2.5 turbo: ^1.13.3 typescript: ~5.1.6 languageName: unknown @@ -34284,7 +34269,7 @@ __metadata: languageName: node linkType: hard -"make-error@npm:1.x, make-error@npm:^1.3.6": +"make-error@npm:1.x": version: 1.3.6 resolution: "make-error@npm:1.3.6" checksum: b86e5e0e25f7f777b77fabd8e2cbf15737972869d852a22b7e73c17623928fccb826d8e46b9951501d3f20e51ad74ba8c59ed584f610526a48f8ccf88aaec402 @@ -42108,43 +42093,6 @@ __metadata: languageName: node linkType: hard -"ts-jest@npm:^29.2.5": - version: 29.2.5 - resolution: "ts-jest@npm:29.2.5" - dependencies: - bs-logger: ^0.2.6 - ejs: ^3.1.10 - fast-json-stable-stringify: ^2.1.0 - jest-util: ^29.0.0 - json5: ^2.2.3 - lodash.memoize: ^4.1.2 - make-error: ^1.3.6 - semver: ^7.6.3 - yargs-parser: ^21.1.1 - peerDependencies: - "@babel/core": ">=7.0.0-beta.0 <8" - "@jest/transform": ^29.0.0 - "@jest/types": ^29.0.0 - babel-jest: ^29.0.0 - jest: ^29.0.0 - typescript: ">=4.3 <6" - peerDependenciesMeta: - "@babel/core": - optional: true - "@jest/transform": - optional: true - "@jest/types": - optional: true - babel-jest: - optional: true - esbuild: - optional: true - bin: - ts-jest: cli.js - checksum: d60d1e1d80936f6002b1bb27f7e062408bc733141b9d666565503f023c340a3196d506c836a4316c5793af81a5f910ab49bb9c13f66e2dc66de4e0f03851dbca - languageName: node - linkType: hard - "ts-md5@npm:^1.3.1": version: 1.3.1 resolution: "ts-md5@npm:1.3.1"