diff --git a/docs/core_docs/docs/integrations/document_loaders/web_loaders/oracleai.mdx b/docs/core_docs/docs/integrations/document_loaders/web_loaders/oracleai.mdx
new file mode 100644
index 000000000000..7ed4077ca2c7
--- /dev/null
+++ b/docs/core_docs/docs/integrations/document_loaders/web_loaders/oracleai.mdx
@@ -0,0 +1,69 @@
+# Oracle AI
+
+This example goes over how to load documents using Oracle AI Vector Search.
+
+## Setup
+
+You'll need to install the [oracledb](https://www.npmjs.com/package/oracledb) package:
+
+```bash npm2yarn
+npm install @langchain/community @langchain/core oracledb
+```
+
+## Usage
+
+### Connect to Oracle Database
+
+You'll need to provide the username, password, hostname, and service_name:
+
+```typescript
+import oracledb from "oracledb";
+
+let connection: oracledb.Connection;
+
+// Replace the placeholders with your information
+const username = "<username>";
+const password = "<password>";
+const dsn = "<hostname>/<service_name>";
+
+try {
+  connection = await oracledb.getConnection({
+    user: username,
+    password: password,
+    connectString: dsn,
+  });
+  console.log("Connection Successful");
+} catch (err) {
+  console.error("Connection failed:", err);
+}
+```
+
+### Load Documents
+
+You have three options for loading documents:
+
+- Loading a local file.
+- Loading from a local directory.
+- Loading from an Oracle Database table.
+
+When loading from the Oracle Database, you must provide the table's name, the owner's name, and the name of the column to load. Optionally, you can pass extra column names (up to three) to be included in the returned documents' metadata:
+
+```typescript
+import {
+  OracleDocLoader,
+  OracleLoadFromType,
+} from "@langchain/community/document_loaders/web/oracleai";
+
+let loader: OracleDocLoader;
+
+/*
+// Loading a local file (replace <file_path> with the path of the file you want to load)
+loader = new OracleDocLoader(connection, "<file_path>", OracleLoadFromType.FILE);
+
+// Loading from a local directory (replace <dir_path> with the path of the directory you want to load from)
+loader = new OracleDocLoader(connection, "<dir_path>", OracleLoadFromType.DIR);
+*/
+
+// Loading from an Oracle Database table (replace the placeholders with your information;
+// optionally pass a final metadata_cols array to include those columns as metadata)
+loader = new OracleDocLoader(
+  connection,
+  "<table_name>",
+  OracleLoadFromType.TABLE,
+  "<owner_name>",
+  "<column_name>"
+);
+
+// Load the docs
+const docs = await loader.load();
+console.log("Number of docs loaded:", docs.length);
+console.log("Document-0:", docs[0].pageContent); // content of the first document
+```
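+
+Each returned `Document` places the extracted text in `pageContent` and loader-provided fields in `metadata`, such as `_oid`, plus `_file` for file loads or `_rowid` (and any requested metadata columns) for table loads. Once you're done loading, you can inspect the results and release the connection:
+
+```typescript
+// Peek at the metadata the loader attached (the exact keys depend on what you loaded from)
+console.log("Document-0 metadata:", docs[0].metadata);
+
+// Release the connection when you're finished
+await connection.close();
+```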
diff --git a/libs/langchain-community/.gitignore b/libs/langchain-community/.gitignore
index 890c93717dea..e47aa99447a3 100644
--- a/libs/langchain-community/.gitignore
+++ b/libs/langchain-community/.gitignore
@@ -906,6 +906,10 @@ document_loaders/web/notionapi.cjs
 document_loaders/web/notionapi.js
 document_loaders/web/notionapi.d.ts
 document_loaders/web/notionapi.d.cts
+document_loaders/web/oracleai.cjs
+document_loaders/web/oracleai.js
+document_loaders/web/oracleai.d.ts
+document_loaders/web/oracleai.d.cts
 document_loaders/web/pdf.cjs
 document_loaders/web/pdf.js
 document_loaders/web/pdf.d.ts
diff --git a/libs/langchain-community/langchain.config.js b/libs/langchain-community/langchain.config.js
index 63b495f92f2c..f28f31eed74d 100644
--- a/libs/langchain-community/langchain.config.js
+++ b/libs/langchain-community/langchain.config.js
@@ -280,6 +280,7 @@ export const config = {
     "document_loaders/web/github": "document_loaders/web/github",
     "document_loaders/web/taskade": "document_loaders/web/taskade",
     "document_loaders/web/notionapi": "document_loaders/web/notionapi",
+    "document_loaders/web/oracleai": "document_loaders/web/oracleai",
     "document_loaders/web/pdf": "document_loaders/web/pdf",
     "document_loaders/web/recursive_url": "document_loaders/web/recursive_url",
     "document_loaders/web/s3": "document_loaders/web/s3",
@@ -494,6 +495,7 @@ export const config = {
     "document_loaders/web/pdf",
     "document_loaders/web/taskade",
     "document_loaders/web/notionapi",
+    "document_loaders/web/oracleai",
     "document_loaders/web/recursive_url",
     "document_loaders/web/s3",
     "document_loaders/web/sitemap",
diff --git a/libs/langchain-community/package.json b/libs/langchain-community/package.json
index 1ac1e06563ba..a2ed7890ef30 100644
--- a/libs/langchain-community/package.json
+++ b/libs/langchain-community/package.json
@@ -39,6 +39,7 @@
     "binary-extensions": "^2.2.0",
     "expr-eval": "^2.0.2",
     "flat": "^5.0.2",
+    "htmlparser2": "^9.1.0",
     "js-yaml": "^4.1.0",
     "langchain": ">=0.2.3 <0.3.0 || >=0.3.4 <0.4.0",
     "langsmith": "^0.2.0",
@@ -2753,6 +2754,15 @@
       "import": "./document_loaders/web/notionapi.js",
       "require": "./document_loaders/web/notionapi.cjs"
     },
+    "./document_loaders/web/oracleai": {
+      "types": {
+        "import": "./document_loaders/web/oracleai.d.ts",
+        "require": "./document_loaders/web/oracleai.d.cts",
+        "default": "./document_loaders/web/oracleai.d.ts"
+      },
+      "import": "./document_loaders/web/oracleai.js",
+      "require": "./document_loaders/web/oracleai.cjs"
+    },
     "./document_loaders/web/pdf": {
       "types": {
         "import": "./document_loaders/web/pdf.d.ts",
@@ -3971,6 +3981,10 @@
     "document_loaders/web/notionapi.js",
     "document_loaders/web/notionapi.d.ts",
     "document_loaders/web/notionapi.d.cts",
+    "document_loaders/web/oracleai.cjs",
+    "document_loaders/web/oracleai.js",
+    "document_loaders/web/oracleai.d.ts",
+    "document_loaders/web/oracleai.d.cts",
     "document_loaders/web/pdf.cjs",
     "document_loaders/web/pdf.js",
     "document_loaders/web/pdf.d.ts",
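The `htmlparser2` dependency added above backs `ParseOracleDocMetadata`, a small helper exported from the new entrypoint that pulls `<title>` and `<meta name=... content=...>` values out of the HTML produced by `dbms_vector_chain.utl_to_text`. A minimal sketch of how it behaves (mirroring the unit tests later in this patch; the HTML string is illustrative):

```typescript
import { ParseOracleDocMetadata } from "@langchain/community/document_loaders/web/oracleai";

const parser = new ParseOracleDocMetadata();
parser.parse(
  '<html><head><title>Sample HTML Page</title>' +
    '<meta name="viewport" content="width=device-width, initial-scale=1.0"></head></html>'
);

// Title text and <meta> name/content pairs end up in a flat metadata object.
console.log(parser.getMetadata());
// -> { title: "Sample HTML Page", viewport: "width=device-width, initial-scale=1.0" }
```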
diff --git a/libs/langchain-community/src/document_loaders/tests/example_data/oracleai/example.html b/libs/langchain-community/src/document_loaders/tests/example_data/oracleai/example.html
index fbfa6c5ce47c..7672eb6e9e13 100644
--- a/libs/langchain-community/src/document_loaders/tests/example_data/oracleai/example.html
+++ b/libs/langchain-community/src/document_loaders/tests/example_data/oracleai/example.html
@@ -1,25 +1,28 @@
[example.html test fixture: a small page titled "Sample HTML Page" with a viewport meta tag, a "Welcome to My Sample HTML Page" header, an "Introduction" section ("This is a small HTML file with a header, main content section, and a footer." / "Feel free to modify and experiment with the code!"), and a "Footer Content - © 2024" footer; the hunk only reformats the markup, rewrapping the long introduction paragraph and adding a trailing newline]
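The unit tests that follow never talk to a real database: they stub `oracledb`'s `Connection.execute` and hand the loader Lob-like objects whose `getData()` returns canned values. A minimal sketch of that pattern, written as if it sat next to `oracleai.test.ts` and run from the package root (the stubbed strings are illustrative):

```typescript
import { jest } from "@jest/globals";
import { Connection } from "oracledb";
import { OracleDocLoader, OracleLoadFromType } from "../web/oracleai.js";

test("loads a local file through a stubbed connection", async () => {
  // For a file load, execute() is called twice: once with bind variables to run
  // dbms_vector_chain.utl_to_text, and once without to fetch the session user.
  const executeMock = jest.fn(async (_sql: string, bindVars?: any) =>
    bindVars
      ? {
          outBinds: {
            mdata: { getData: async () => "<html><title>Stub</title></html>" },
            text: { getData: async () => "stubbed text" },
          },
        }
      : { rows: [["MOCK_USER"]] }
  );
  const connMock = { execute: executeMock } as unknown as Connection;

  const loader = new OracleDocLoader(
    connMock,
    "./src/document_loaders/tests/example_data/oracleai/example.html",
    OracleLoadFromType.FILE
  );

  const docs = await loader.load();
  expect(docs).toHaveLength(1);
  expect(docs[0].pageContent).toBe("stubbed text");
});
```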
- - \ No newline at end of file + + diff --git a/libs/langchain-community/src/document_loaders/tests/oracleai.test.ts b/libs/langchain-community/src/document_loaders/tests/oracleai.test.ts index 320d5cb4b4d6..9a0125d6f27e 100644 --- a/libs/langchain-community/src/document_loaders/tests/oracleai.test.ts +++ b/libs/langchain-community/src/document_loaders/tests/oracleai.test.ts @@ -1,363 +1,424 @@ import { jest } from "@jest/globals"; -import { ParseOracleDocMetadata, OracleDocLoader, OracleLoadFromType, TableRow } from "../web/oracleai.js"; -import oracledb from "oracledb"; +import { Connection, Result } from "oracledb"; +import { + ParseOracleDocMetadata, + OracleDocLoader, + OracleLoadFromType, + TableRow, +} from "../web/oracleai.js"; describe("ParseOracleDocMetadata", () => { - let parser: ParseOracleDocMetadata; + let parser: ParseOracleDocMetadata; - beforeEach(() => { - parser = new ParseOracleDocMetadata(); - }); + beforeEach(() => { + parser = new ParseOracleDocMetadata(); + }); - test("should parse title and meta tags correctly", () => { - const htmlString = "Sample Title"; - parser.parse(htmlString); - const metadata = parser.getMetadata(); - expect(metadata).toEqual({ - title: "Sample Title", - description: "Sample Content", - }); + test("should parse title and meta tags correctly", () => { + const htmlString = + "Sample Title"; + parser.parse(htmlString); + const metadata = parser.getMetadata(); + expect(metadata).toEqual({ + title: "Sample Title", + description: "Sample Content", }); + }); - test("should handle missing meta content gracefully", () => { - const htmlString = "Sample Title"; - parser.parse(htmlString); - const metadata = parser.getMetadata(); - expect(metadata).toEqual({ - title: "Sample Title", - description: "N/A", - }); + test("should handle missing meta content gracefully", () => { + const htmlString = + "Sample Title"; + parser.parse(htmlString); + const metadata = parser.getMetadata(); + expect(metadata).toEqual({ + title: "Sample Title", + description: "N/A", }); + }); - test("should handle multiple meta tags", () => { - const htmlString = "Sample Title"; - parser.parse(htmlString); - const metadata = parser.getMetadata(); - expect(metadata).toEqual({ - title: "Sample Title", - description: "Sample Content", - author: "John Doe", - }); + test("should handle multiple meta tags", () => { + const htmlString = + "Sample Title"; + parser.parse(htmlString); + const metadata = parser.getMetadata(); + expect(metadata).toEqual({ + title: "Sample Title", + description: "Sample Content", + author: "John Doe", }); + }); - test("should handle no title tag", () => { - const htmlString = ""; - parser.parse(htmlString); - const metadata = parser.getMetadata(); - expect(metadata).toEqual({ - description: "Sample Content", - }); + test("should handle no title tag", () => { + const htmlString = + ""; + parser.parse(htmlString); + const metadata = parser.getMetadata(); + expect(metadata).toEqual({ + description: "Sample Content", }); + }); - test("should handle empty html string", () => { - const htmlString = ""; - parser.parse(htmlString); - const metadata = parser.getMetadata(); - expect(metadata).toEqual({}); - }); + test("should handle empty html string", () => { + const htmlString = ""; + parser.parse(htmlString); + const metadata = parser.getMetadata(); + expect(metadata).toEqual({}); + }); }); describe("OracleDocLoader", () => { - let executeMock: jest.Mock<(sql: string, bindVars?: any) => {}> - let connMock: jest.Mocked; - let loader: OracleDocLoader; - const 
baseDirPath = "./src/document_loaders/tests/example_data/oracleai"; - const baseMockData = "MockData" - - beforeEach(() => { - executeMock = jest.fn(); - connMock = {execute: executeMock} as unknown as jest.Mocked; - }); + let executeMock: jest.Mock<(sql: string, bindVars?: any) => object>; + let connMock: jest.Mocked; + let loader: OracleDocLoader; + const baseDirPath = "./src/document_loaders/tests/example_data/oracleai"; + const baseMockData = "MockData"; + + beforeEach(() => { + executeMock = jest.fn(); + connMock = { execute: executeMock } as unknown as jest.Mocked; + }); - test("should load a single file properly", async () => { - executeMock.mockImplementation(async (sql: string, bindVars?: {}) => { - if (bindVars) { - return { - outBinds: { - mdata: { getData: jest.fn().mockImplementation( () => bindVars.blob.val.toString() ) }, - text: { getData: jest.fn().mockImplementation( () => baseMockData + 1 ) } } - }; - } - else { - return { - rows: [['MockUser']] - }; - } - }); - - loader = new OracleDocLoader(connMock, baseDirPath + "/example.html", OracleLoadFromType.FILE); - const res = await loader.load(); - console.log(res) - expect(res.length).toEqual(1); - expect(res[0].pageContent).toEqual(baseMockData + "1"); - expect(res[0].metadata.title).toBeTruthy(); - expect(res[0].metadata.title).toEqual("Sample HTML Page"); - expect(res[0].metadata.viewport).toBeTruthy(); - expect(res[0].metadata.viewport).toEqual("width=device-width, initial-scale=1.0"); + test("should load a single file properly", async () => { + executeMock.mockImplementation(async (sql: string, bindVars?: any) => { + if (bindVars) { + return { + outBinds: { + mdata: { + getData: jest + .fn() + .mockImplementation(() => bindVars.blob.val.toString()), + }, + text: { + getData: jest.fn().mockImplementation(() => baseMockData + 1), + }, + }, + }; + } else { + return { + rows: [["MockUser"]], + }; + } }); - test("should load a directory properly", async () => { - let doc_count = 0; - executeMock.mockImplementation(async (sql: string, bindVars?: {}) => { - if (bindVars) { - doc_count++; - return { - outBinds: { - mdata: { getData: jest.fn().mockImplementation( () => bindVars.blob.val.toString() ) }, - text: { getData: jest.fn().mockImplementation( () => baseMockData + doc_count ) } } - }; - } - else { - return { - rows: [['MockUser']] - }; - } - }); - - loader = new OracleDocLoader(connMock, baseDirPath, OracleLoadFromType.DIR); - const res = await loader.load(); - - expect(res.length).toEqual(3); - for (let i = 0; i < res.length; i += 1) { - expect(res[i].pageContent).toEqual(baseMockData + (i+1)); - if (res[i].metadata.title) { - expect(res[i].metadata.title).toEqual("Sample HTML Page"); - expect(res[i].metadata.viewport).toBeTruthy(); - expect(res[i].metadata.viewport).toEqual("width=device-width, initial-scale=1.0"); - } - } + loader = new OracleDocLoader( + connMock, + baseDirPath + "/example.html", + OracleLoadFromType.FILE + ); + const res = await loader.load(); + console.log(res); + expect(res.length).toEqual(1); + expect(res[0].pageContent).toEqual(baseMockData + "1"); + expect(res[0].metadata.title).toBeTruthy(); + expect(res[0].metadata.title).toEqual("Sample HTML Page"); + expect(res[0].metadata.viewport).toBeTruthy(); + expect(res[0].metadata.viewport).toEqual( + "width=device-width, initial-scale=1.0" + ); + }); + + test("should load a directory properly", async () => { + let doc_count = 0; + executeMock.mockImplementation(async (sql: string, bindVars?: any) => { + if (bindVars) { + doc_count += 1; + return { + 
outBinds: { + mdata: { + getData: jest + .fn() + .mockImplementation(() => bindVars.blob.val.toString()), + }, + text: { + getData: jest + .fn() + .mockImplementation(() => baseMockData + doc_count), + }, + }, + }; + } else { + return { + rows: [["MockUser"]], + }; + } }); - test('loadFromTable with valid parameters', async () => { - // Mock the execute method for the column type query - executeMock.mockImplementationOnce( () => { return { + loader = new OracleDocLoader(connMock, baseDirPath, OracleLoadFromType.DIR); + const res = await loader.load(); + + expect(res.length).toEqual(3); + for (let i = 0; i < res.length; i += 1) { + expect(res[i].pageContent).toEqual(baseMockData + (i + 1)); + if (res[i].metadata.title) { + expect(res[i].metadata.title).toEqual("Sample HTML Page"); + expect(res[i].metadata.viewport).toBeTruthy(); + expect(res[i].metadata.viewport).toEqual( + "width=device-width, initial-scale=1.0" + ); + } + } + }); + + test("loadFromTable with valid parameters", async () => { + // Mock the execute method for the column type query + executeMock.mockImplementationOnce(() => { + return { rows: [ - { COLUMN_NAME: 'COL1', DATA_TYPE: 'VARCHAR2' }, - { COLUMN_NAME: 'COL2', DATA_TYPE: 'NUMBER' }, - { COLUMN_NAME: 'COL3', DATA_TYPE: 'DATE' }, + { COLUMN_NAME: "COL1", DATA_TYPE: "VARCHAR2" }, + { COLUMN_NAME: "COL2", DATA_TYPE: "NUMBER" }, + { COLUMN_NAME: "COL3", DATA_TYPE: "DATE" }, ], - } as oracledb.Result<{ COLUMN_NAME: string; DATA_TYPE: string }>}); - - // Mock the execute method for getting username - executeMock.mockImplementationOnce( () => { return { - rows: [{ USER: 'TESTUSER' }], - } as oracledb.Result<{ USER: string }> }); - - // Mock the execute method for the main query - executeMock.mockImplementationOnce( () => { return { + } as Result<{ COLUMN_NAME: string; DATA_TYPE: string }>; + }); + + // Mock the execute method for getting username + executeMock.mockImplementationOnce(() => { + return { + rows: [{ USER: "TESTUSER" }], + } as Result<{ USER: string }>; + }); + + // Mock the execute method for the main query + executeMock.mockImplementationOnce(() => { + return { rows: [ { - MDATA: { getData: jest.fn().mockImplementation( () => 'Title1' ) }, - TEXT: 'Text content 1', - ROWID: 'AAABBBCCC', - COL1: 'Value1', + MDATA: { + getData: jest + .fn() + .mockImplementation( + () => + 'Title1' + ), + }, + TEXT: "Text content 1", + ROWID: "AAABBBCCC", + COL1: "Value1", COL2: 123, - COL3: new Date('2021-01-01'), + COL3: new Date("2021-01-01"), }, { - MDATA: { getData: jest.fn().mockImplementation( () => 'Title2' ) }, - TEXT: 'Text content 2', - ROWID: 'AAABBBCCD', - COL1: 'Value2', + MDATA: { + getData: jest + .fn() + .mockImplementation( + () => + 'Title2' + ), + }, + TEXT: "Text content 2", + ROWID: "AAABBBCCD", + COL1: "Value2", COL2: 456, - COL3: new Date('2021-02-01'), + COL3: new Date("2021-02-01"), }, ], - } }); - - const loader = new OracleDocLoader( - connMock, - 'MYTABLE', - OracleLoadFromType.TABLE, - 'MYSCHEMA', - 'MYCOLUMN', - ['COL1', 'COL2', 'COL3'] - ); - - const documents = await loader.load(); - - expect(documents).toHaveLength(2); - - expect(documents[0].pageContent).toBe('Text content 1'); - expect(documents[0].metadata).toEqual({ - title: 'Title1', - author: 'Author1', - _oid: expect.any(String), - _rowid: 'AAABBBCCC', - COL1: 'Value1', - COL2: 123, - COL3: new Date('2021-01-01'), - }); - - expect(documents[1].pageContent).toBe('Text content 2'); - expect(documents[1].metadata).toEqual({ - title: 'Title2', - author: 'Author2', - _oid: expect.any(String), - 
_rowid: 'AAABBBCCD', - COL1: 'Value2', - COL2: 456, - COL3: new Date('2021-02-01'), - }); - }); - - test('loadFromTable with missing owner', async () => { - const loader = new OracleDocLoader( - connMock, - 'MYTABLE', - OracleLoadFromType.TABLE, - undefined, // owner is missing - 'MYCOLUMN', - ['COL1'] - ); - - await expect(loader.load()).rejects.toThrow( - "Owner and column name must be specified for loading from a table" - ); + }; }); - - test('loadFromTable with missing column name', async () => { - const loader = new OracleDocLoader( - connMock, - 'MYTABLE', - OracleLoadFromType.TABLE, - 'MYSCHEMA', - undefined, // column name is missing - ['COL1'] - ); - - await expect(loader.load()).rejects.toThrow( - "Owner and column name must be specified for loading from a table" - ); - }); - - test('loadFromTable with mdata_cols exceeding 3 columns', async () => { - const loader = new OracleDocLoader( - connMock, - 'MYTABLE', - OracleLoadFromType.TABLE, - 'MYSCHEMA', - 'MYCOLUMN', - ['COL1', 'COL2', 'COL3', 'COL4'] // 4 columns, exceeding limit - ); - - await expect(loader.load()).rejects.toThrow( - "Exceeds the max number of columns you can request for metadata." - ); + + const loader = new OracleDocLoader( + connMock, + "MYTABLE", + OracleLoadFromType.TABLE, + "MYSCHEMA", + "MYCOLUMN", + ["COL1", "COL2", "COL3"] + ); + + const documents = await loader.load(); + + expect(documents).toHaveLength(2); + + expect(documents[0].pageContent).toBe("Text content 1"); + expect(documents[0].metadata).toEqual({ + title: "Title1", + author: "Author1", + _oid: expect.any(String), + _rowid: "AAABBBCCC", + COL1: "Value1", + COL2: 123, + COL3: new Date("2021-01-01"), }); - - test('loadFromTable with invalid column names in mdata_cols', async () => { - const loader = new OracleDocLoader( - connMock, - 'MYTABLE', - OracleLoadFromType.TABLE, - 'MYSCHEMA', - 'MYCOLUMN', - ['INVALID-COL1'] // invalid column name - ); - - await expect(loader.load()).rejects.toThrow( - "Invalid column name in mdata_cols: INVALID-COL1" - ); + + expect(documents[1].pageContent).toBe("Text content 2"); + expect(documents[1].metadata).toEqual({ + title: "Title2", + author: "Author2", + _oid: expect.any(String), + _rowid: "AAABBBCCD", + COL1: "Value2", + COL2: 456, + COL3: new Date("2021-02-01"), }); - - test('loadFromTable with mdata_cols containing unsupported data types', async () => { - // Mock the execute method for the column type query - executeMock.mockImplementationOnce( () => { return { + }); + + test("loadFromTable with missing owner", async () => { + const loader = new OracleDocLoader( + connMock, + "MYTABLE", + OracleLoadFromType.TABLE, + undefined, // owner is missing + "MYCOLUMN", + ["COL1"] + ); + + await expect(loader.load()).rejects.toThrow( + "Owner and column name must be specified for loading from a table" + ); + }); + + test("loadFromTable with missing column name", async () => { + const loader = new OracleDocLoader( + connMock, + "MYTABLE", + OracleLoadFromType.TABLE, + "MYSCHEMA", + undefined, // column name is missing + ["COL1"] + ); + + await expect(loader.load()).rejects.toThrow( + "Owner and column name must be specified for loading from a table" + ); + }); + + test("loadFromTable with mdata_cols exceeding 3 columns", async () => { + const loader = new OracleDocLoader( + connMock, + "MYTABLE", + OracleLoadFromType.TABLE, + "MYSCHEMA", + "MYCOLUMN", + ["COL1", "COL2", "COL3", "COL4"] // 4 columns, exceeding limit + ); + + await expect(loader.load()).rejects.toThrow( + "Exceeds the max number of columns you can 
request for metadata." + ); + }); + + test("loadFromTable with invalid column names in mdata_cols", async () => { + const loader = new OracleDocLoader( + connMock, + "MYTABLE", + OracleLoadFromType.TABLE, + "MYSCHEMA", + "MYCOLUMN", + ["INVALID-COL1"] // invalid column name + ); + + await expect(loader.load()).rejects.toThrow( + "Invalid column name in mdata_cols: INVALID-COL1" + ); + }); + + test("loadFromTable with mdata_cols containing unsupported data types", async () => { + // Mock the execute method for the column type query + executeMock.mockImplementationOnce(() => { + return { rows: [ - { COLUMN_NAME: 'COL1', DATA_TYPE: 'CLOB' }, // Unsupported data type + { COLUMN_NAME: "COL1", DATA_TYPE: "CLOB" }, // Unsupported data type ], - } as oracledb.Result<{ COLUMN_NAME: string; DATA_TYPE: string }> }); - - const loader = new OracleDocLoader( - connMock, - 'MYTABLE', - OracleLoadFromType.TABLE, - 'MYSCHEMA', - 'MYCOLUMN', - ['COL1'] - ); - - await expect(loader.load()).rejects.toThrow( - 'The datatype for the column COL1 is not supported' - ); + } as Result<{ COLUMN_NAME: string; DATA_TYPE: string }>; }); - - test('loadFromTable with empty table', async () => { - // Mock the execute method for the column type query - executeMock.mockImplementationOnce( () => { return { - rows: [ - { COLUMN_NAME: 'COL1', DATA_TYPE: 'VARCHAR2' }, - ], - } as oracledb.Result<{ COLUMN_NAME: string; DATA_TYPE: string }> }); - - // Mock the execute method for getting username - executeMock.mockImplementationOnce( () => { return { - rows: [{ USER: 'TESTUSER' }], - } as oracledb.Result<{ USER: string }>}); - - // Mock the execute method for the main query (empty result set) - executeMock.mockImplementationOnce( () => { return { + + const loader = new OracleDocLoader( + connMock, + "MYTABLE", + OracleLoadFromType.TABLE, + "MYSCHEMA", + "MYCOLUMN", + ["COL1"] + ); + + await expect(loader.load()).rejects.toThrow( + "The datatype for the column COL1 is not supported" + ); + }); + + test("loadFromTable with empty table", async () => { + // Mock the execute method for the column type query + executeMock.mockImplementationOnce(() => { + return { + rows: [{ COLUMN_NAME: "COL1", DATA_TYPE: "VARCHAR2" }], + } as Result<{ COLUMN_NAME: string; DATA_TYPE: string }>; + }); + + // Mock the execute method for getting username + executeMock.mockImplementationOnce(() => { + return { + rows: [{ USER: "TESTUSER" }], + } as Result<{ USER: string }>; + }); + + // Mock the execute method for the main query (empty result set) + executeMock.mockImplementationOnce(() => { + return { rows: [], - } as oracledb.Result }); - - const loader = new OracleDocLoader( - connMock, - 'MYTABLE', - OracleLoadFromType.TABLE, - 'MYSCHEMA', - 'MYCOLUMN', - ['COL1'] - ); - - const documents = await loader.load(); - - expect(documents).toHaveLength(0); + } as Result; }); - - test('loadFromTable with null column data', async () => { - // Mock the execute method for the column type query - executeMock.mockImplementationOnce( () => { return { - rows: [ - { COLUMN_NAME: 'COL1', DATA_TYPE: 'VARCHAR2' }, - ], - } as oracledb.Result<{ COLUMN_NAME: string; DATA_TYPE: string }> }); - - // Mock the execute method for getting username - executeMock.mockImplementationOnce( () => { return { - rows: [{ USER: 'TESTUSER' }], - } as oracledb.Result<{ USER: string }> }); - - // Mock the execute method for the main query with null TEXT and MDATA - executeMock.mockImplementationOnce( () => { return { + + const loader = new OracleDocLoader( + connMock, + "MYTABLE", + 
OracleLoadFromType.TABLE, + "MYSCHEMA", + "MYCOLUMN", + ["COL1"] + ); + + const documents = await loader.load(); + + expect(documents).toHaveLength(0); + }); + + test("loadFromTable with null column data", async () => { + // Mock the execute method for the column type query + executeMock.mockImplementationOnce(() => { + return { + rows: [{ COLUMN_NAME: "COL1", DATA_TYPE: "VARCHAR2" }], + } as Result<{ COLUMN_NAME: string; DATA_TYPE: string }>; + }); + + // Mock the execute method for getting username + executeMock.mockImplementationOnce(() => { + return { + rows: [{ USER: "TESTUSER" }], + } as Result<{ USER: string }>; + }); + + // Mock the execute method for the main query with null TEXT and MDATA + executeMock.mockImplementationOnce(() => { + return { rows: [ { MDATA: null, TEXT: null, - ROWID: 'AAABBBCCC', - COL1: 'Value1', + ROWID: "AAABBBCCC", + COL1: "Value1", }, ], - } as oracledb.Result }); - - const loader = new OracleDocLoader( - connMock, - 'MYTABLE', - OracleLoadFromType.TABLE, - 'MYSCHEMA', - 'MYCOLUMN', - ['COL1'] - ); - - const documents = await loader.load(); - - expect(documents).toHaveLength(1); - - expect(documents[0].pageContent).toBe(''); - expect(documents[0].metadata).toEqual({ - _oid: expect.any(String), - _rowid: 'AAABBBCCC', - COL1: 'Value1', - }); + } as Result; + }); + + const loader = new OracleDocLoader( + connMock, + "MYTABLE", + OracleLoadFromType.TABLE, + "MYSCHEMA", + "MYCOLUMN", + ["COL1"] + ); + + const documents = await loader.load(); + + expect(documents).toHaveLength(1); + + expect(documents[0].pageContent).toBe(""); + expect(documents[0].metadata).toEqual({ + _oid: expect.any(String), + _rowid: "AAABBBCCC", + COL1: "Value1", }); + }); }); diff --git a/libs/langchain-community/src/document_loaders/web/oracleai.ts b/libs/langchain-community/src/document_loaders/web/oracleai.ts index 2b0036c5cfc8..86866265ea1d 100644 --- a/libs/langchain-community/src/document_loaders/web/oracleai.ts +++ b/libs/langchain-community/src/document_loaders/web/oracleai.ts @@ -1,113 +1,114 @@ +import * as fs from "node:fs"; +import * as path from "node:path"; import { Document } from "@langchain/core/documents"; import { BaseDocumentLoader } from "@langchain/core/document_loaders/base"; import { Parser } from "htmlparser2"; +import { createHash } from "crypto"; import oracledb from "oracledb"; -import crypto from "crypto"; -import fs from "fs"; -import path from 'path'; - - interface Metadata { - [key: string]: string; + [key: string]: string; } interface OutBinds { - mdata: oracledb.Lob | null; - text: oracledb.Lob | null; + mdata: oracledb.Lob | null; + text: oracledb.Lob | null; } export interface TableRow { - MDATA?: string | null; - TEXT?: string | null; - ROWID?: string; - [key: string]: any; + MDATA?: string | null; + TEXT?: string | null; + ROWID?: string; + [key: string]: any; } export class ParseOracleDocMetadata { - private metadata: Metadata; - private match: boolean; - - constructor() { - this.metadata = {}; - this.match = false; - } + private metadata: Metadata; - private handleStartTag(tag: string, attrs: { name: string; value: string | null }[]) { - if (tag === "meta") { - let entry: string | undefined; - let content: string | null = null; + private match: boolean; - attrs.forEach(({ name, value }) => { - if (name === "name") entry = value ?? ""; - if (name === "content") content = value; - }); + constructor() { + this.metadata = {}; + this.match = false; + } - if (entry) { - this.metadata[entry] = content ?? 
"N/A"; - } - } else if (tag === "title") { - this.match = true; - } - } + private handleStartTag( + tag: string, + attrs: { name: string; value: string | null }[] + ) { + if (tag === "meta") { + let entry: string | undefined; + let content: string | null = null; + + attrs.forEach(({ name, value }) => { + if (name === "name") entry = value ?? ""; + if (name === "content") content = value; + }); - private handleData(data: string) { - if (this.match) { - this.metadata["title"] = data; - this.match = false; - } + if (entry) { + this.metadata[entry] = content ?? "N/A"; + } + } else if (tag === "title") { + this.match = true; } + } - public getMetadata(): Metadata { - return this.metadata; + private handleData(data: string) { + if (this.match) { + this.metadata.title = data; + this.match = false; } + } - public parse(htmlString: string): void { - // We add this method to incorperate the feed method of HTMLParser in Python - interface Attribute { - name: string; - value: string | null; - } - - interface ParserOptions { - onopentag: (name: string, attrs: Record) => void; - ontext: (text: string) => void; - } + public getMetadata(): Metadata { + return this.metadata; + } - const parser = new Parser( - { - onopentag: (name: string, attrs: Record) => - this.handleStartTag( - name, - Object.entries(attrs).map(([name, value]): Attribute => ({ - name, - value: value as string | null, - })) - ), - ontext: (text: string) => this.handleData(text), - } as ParserOptions, - { decodeEntities: true } - ); - parser.write(htmlString); - parser.end(); + public parse(htmlString: string): void { + // We add this method to incorperate the feed method of HTMLParser in Python + interface Attribute { + name: string; + value: string | null; } - -} + interface ParserOptions { + onopentag: (name: string, attrs: Record) => void; + ontext: (text: string) => void; + } + const parser = new Parser( + { + onopentag: (name: string, attrs: Record) => + this.handleStartTag( + name, + Object.entries(attrs).map( + ([name, value]): Attribute => ({ + name, + value: value as string | null, + }) + ) + ), + ontext: (text: string) => this.handleData(text), + } as ParserOptions, + { decodeEntities: true } + ); + parser.write(htmlString); + parser.end(); + } +} class OracleDocReader { static generateObjectId(inputString: string | null = null) { const outLength = 32; // Output length const hashLen = 8; // Hash value length - if (!inputString) { - inputString = Array.from( - { length: 16 }, - () => "abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ0123456789" - .charAt(Math.floor(Math.random() * 62)) + const idString = + inputString ?? 
+ Array.from({ length: 16 }, () => + "abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ0123456789".charAt( + Math.floor(Math.random() * 62) + ) ).join(""); - } // Timestamp const timestamp = Math.floor(Date.now() / 1000); @@ -115,16 +116,20 @@ class OracleDocReader { timestampBin.writeUInt32BE(timestamp); // Hash value - const hashValBin = crypto.createHash("sha256").update(inputString).digest(); + const hashValBin = createHash("sha256").update(idString).digest(); const truncatedHashVal = hashValBin.slice(0, hashLen); // Counter const counterBin = Buffer.alloc(4); - counterBin.writeUInt32BE(Math.floor(Math.random() * Math.pow(2, 32))); + counterBin.writeUInt32BE(Math.floor(Math.random() * 2 ** 32)); // Binary object ID - const objectId = Buffer.concat([timestampBin, truncatedHashVal, counterBin]); - let objectIdHex = objectId.toString("hex").padStart(outLength, "0"); + const objectId = Buffer.concat([ + timestampBin, + truncatedHashVal, + counterBin, + ]); + const objectIdHex = objectId.toString("hex").padStart(outLength, "0"); return objectIdHex.slice(0, outLength); } @@ -139,14 +144,17 @@ class OracleDocReader { try { // Read the file as binary data const data = await new Promise((resolve, reject) => { - fs.readFile(filePath, (err: NodeJS.ErrnoException | null, data: Buffer) => { - if (err) reject(err); - else resolve(data); - }); + fs.readFile( + filePath, + (err: NodeJS.ErrnoException | null, data: Buffer) => { + if (err) reject(err); + else resolve(data); + } + ); }); if (!data) { - return new Document({pageContent: "", metadata}); + return new Document({ pageContent: "", metadata }); } const bindVars = { @@ -190,42 +198,50 @@ class OracleDocReader { } // Execute a query to get the current session user - const userResult = await conn.execute( - `SELECT USER FROM dual` - ); + const userResult = await conn.execute(`SELECT USER FROM dual`); const username = userResult.rows?.[0]?.[0]; const docId = OracleDocReader.generateObjectId(`${username}$${filePath}`); - metadata["_oid"] = docId; - metadata["_file"] = filePath; + metadata._oid = docId; + metadata._file = filePath; textData = textData ?? 
""; - return new Document({pageContent: textData, metadata}) + return new Document({ pageContent: textData, metadata }); } catch (ex) { console.error(`An exception occurred: ${ex}`); console.error(`Skip processing ${filePath}`); return null; } } - } export enum OracleLoadFromType { FILE, DIR, TABLE, -}; +} export class OracleDocLoader extends BaseDocumentLoader { private conn: oracledb.Connection; + private loadFrom: string; + private loadFromType: OracleLoadFromType; + private owner?: string; + private colname?: string; + private mdata_cols?: string[]; - constructor(conn: oracledb.Connection, loadFrom: string, loadFromType: OracleLoadFromType, - owner?: string, colname?: string, mdata_cols?: string[]) { + constructor( + conn: oracledb.Connection, + loadFrom: string, + loadFromType: OracleLoadFromType, + owner?: string, + colname?: string, + mdata_cols?: string[] + ) { super(); this.conn = conn; this.loadFrom = loadFrom; @@ -236,15 +252,22 @@ export class OracleDocLoader extends BaseDocumentLoader { } public async load(): Promise { - const documents: Document[] = [] - const m_params = {"plaintext": "false"} + const documents: Document[] = []; + const m_params = { plaintext: "false" }; switch (this.loadFromType) { case OracleLoadFromType.FILE: - const filepath = this.loadFrom - const doc = await OracleDocReader.readFile(this.conn, filepath, m_params) - if (doc) - documents.push(doc); + try { + const filepath = this.loadFrom; + const doc = await OracleDocReader.readFile( + this.conn, + filepath, + m_params + ); + if (doc) documents.push(doc); + } catch (err) { + console.error("Error reading file:", err); + } break; case OracleLoadFromType.DIR: @@ -256,190 +279,197 @@ export class OracleDocLoader extends BaseDocumentLoader { const stats = await fs.promises.lstat(filepath); if (stats.isFile()) { - const doc = await OracleDocReader.readFile(this.conn, filepath, m_params) - if (doc) - documents.push(doc); + const doc = await OracleDocReader.readFile( + this.conn, + filepath, + m_params + ); + if (doc) documents.push(doc); } } } catch (err) { - console.error('Error reading directory:', err); + console.error("Error reading directory:", err); } break; case OracleLoadFromType.TABLE: - return await this.loadFromTable(m_params); - default: - throw new Error("Invalid type to load from"); - } - return documents - } - - private isValidIdentifier(identifier: string): boolean { - return /^[A-Za-z_][A-Za-z0-9_]*$/.test(identifier); - } - - private async loadFromTable(m_params: any): Promise { - const results: Document[] = []; - try { - if (!this.owner || !this.colname) { - throw new Error("Owner and column name must be specified for loading from a table"); - } + try { + if (!this.owner || !this.colname) { + throw new Error( + "Owner and column name must be specified for loading from a table" + ); + } - // Validate identifiers to prevent SQL injection - if (!this.isValidIdentifier(this.owner)) { + // Validate identifiers to prevent SQL injection + if (!this.isValidIdentifier(this.owner)) { throw new Error("Invalid owner name"); - } + } - if (!this.isValidIdentifier(this.loadFrom)) { + if (!this.isValidIdentifier(this.loadFrom)) { throw new Error("Invalid table name"); - } + } - if (!this.isValidIdentifier(this.colname)) { + if (!this.isValidIdentifier(this.colname)) { throw new Error("Invalid column name"); - } + } - let mdataColsSql = ", t.ROWID"; + let mdataColsSql = ", t.ROWID"; - if (this.mdata_cols) { + if (this.mdata_cols) { if (this.mdata_cols.length > 3) { - throw new Error("Exceeds the max number of 
columns you can request for metadata."); + throw new Error( + "Exceeds the max number of columns you can request for metadata." + ); } - + // **First, check if the column names are valid identifiers** for (const col of this.mdata_cols) { - if (!this.isValidIdentifier(col)) { - throw new Error(`Invalid column name in mdata_cols: ${col}`); - } + if (!this.isValidIdentifier(col)) { + throw new Error(`Invalid column name in mdata_cols: ${col}`); + } } // Execute a query to get column data types const colSql = ` - SELECT COLUMN_NAME, DATA_TYPE - FROM ALL_TAB_COLUMNS - WHERE OWNER = :ownername AND TABLE_NAME = :tablename - `; + SELECT COLUMN_NAME, DATA_TYPE + FROM ALL_TAB_COLUMNS + WHERE OWNER = :ownername AND TABLE_NAME = :tablename + `; const colBinds = { - ownername: this.owner.toUpperCase(), - tablename: this.loadFrom.toUpperCase(), + ownername: this.owner.toUpperCase(), + tablename: this.loadFrom.toUpperCase(), }; - const colResult = await this.conn.execute<{ COLUMN_NAME: string; DATA_TYPE: string }>( - colSql, - colBinds, - { outFormat: oracledb.OUT_FORMAT_OBJECT } - ); + const colResult = await this.conn.execute<{ + COLUMN_NAME: string; + DATA_TYPE: string; + }>(colSql, colBinds, { outFormat: oracledb.OUT_FORMAT_OBJECT }); const colRows = colResult.rows; if (!colRows) { - throw new Error("Failed to retrieve column information"); + throw new Error("Failed to retrieve column information"); } const colTypes: Record = {}; for (const row of colRows) { - const colName = row.COLUMN_NAME; - const dataType = row.DATA_TYPE; - colTypes[colName] = dataType; + const colName = row.COLUMN_NAME; + const dataType = row.DATA_TYPE; + colTypes[colName] = dataType; } for (const col of this.mdata_cols) { - if (!this.isValidIdentifier(col)) { - throw new Error(`Invalid column name in mdata_cols: ${col}`); - } - - const dataType = colTypes[col]; - if (!dataType) { - throw new Error(`Column ${col} not found in table ${this.loadFrom}`); - } - - if ( - ![ - "NUMBER", - "BINARY_DOUBLE", - "BINARY_FLOAT", - "LONG", - "DATE", - "TIMESTAMP", - "VARCHAR2", - ].includes(dataType) - ) { - throw new Error(`The datatype for the column ${col} is not supported`); - } + if (!this.isValidIdentifier(col)) { + throw new Error(`Invalid column name in mdata_cols: ${col}`); + } + + const dataType = colTypes[col]; + if (!dataType) { + throw new Error( + `Column ${col} not found in table ${this.loadFrom}` + ); + } + + if ( + ![ + "NUMBER", + "BINARY_DOUBLE", + "BINARY_FLOAT", + "LONG", + "DATE", + "TIMESTAMP", + "VARCHAR2", + ].includes(dataType) + ) { + throw new Error( + `The datatype for the column ${col} is not supported` + ); + } } for (const col of this.mdata_cols) { - mdataColsSql += `, t.${col}`; + mdataColsSql += `, t.${col}`; } - } + } - const mainSql = ` - SELECT dbms_vector_chain.utl_to_text(t.${this.colname}, json(:params)) AS MDATA, - dbms_vector_chain.utl_to_text(t.${this.colname}) AS TEXT - ${mdataColsSql} - FROM ${this.owner}.${this.loadFrom} t - `; + const mainSql = ` + SELECT dbms_vector_chain.utl_to_text(t.${this.colname}, json(:params)) AS MDATA, + dbms_vector_chain.utl_to_text(t.${this.colname}) AS TEXT + ${mdataColsSql} + FROM ${this.owner}.${this.loadFrom} t + `; - const mainBinds = { + const mainBinds = { params: JSON.stringify(m_params), - }; + }; - const options = { + const options = { outFormat: oracledb.OUT_FORMAT_OBJECT, - }; + }; - // Get the username - const userResult = await this.conn.execute<{ USER: string }>('SELECT USER FROM dual'); - const username = userResult.rows?.[0]?.USER || "unknown_user"; + 
// Get the username + const userResult = await this.conn.execute<{ USER: string }>( + "SELECT USER FROM dual" + ); + const username = userResult.rows?.[0]?.USER || "unknown_user"; - // Execute the main SQL query - const result = await this.conn.execute(mainSql, mainBinds, options); - const rows = result.rows as TableRow[]; + // Execute the main SQL query + const result = await this.conn.execute(mainSql, mainBinds, options); + const rows = result.rows as TableRow[]; - if (rows) { + if (rows) { for (const row of rows) { - let metadata: Record = {}; - - if (row["MDATA"]) { - const data = (await (row["MDATA"] as unknown as oracledb.Lob).getData()).toString(); - if ( - data.trim().startsWith("") - ) { - const parser = new ParseOracleDocMetadata(); - parser.parse(data); - metadata = { ...metadata, ...parser.getMetadata() }; - } + let metadata: Record = {}; + + if (row.MDATA) { + const data = ( + await (row.MDATA as unknown as oracledb.Lob).getData() + ).toString(); + if ( + data.trim().startsWith("") + ) { + const parser = new ParseOracleDocMetadata(); + parser.parse(data); + metadata = { ...metadata, ...parser.getMetadata() }; } + } - const docId = OracleDocReader.generateObjectId( - `${username}$${this.owner}$${this.loadFrom}$${this.colname}$${row["ROWID"]}` - ); + const docId = OracleDocReader.generateObjectId( + `${username}$${this.owner}$${this.loadFrom}$${this.colname}$${row.ROWID}` + ); - metadata["_oid"] = docId; - metadata["_rowid"] = row["ROWID"]; + metadata._oid = docId; + metadata._rowid = row.ROWID; - if (this.mdata_cols) { - for (const colName of this.mdata_cols) { - metadata[colName] = row[colName]; - } + if (this.mdata_cols) { + for (const colName of this.mdata_cols) { + metadata[colName] = row[colName]; } + } - const text = row["TEXT"] as string; + const text = row.TEXT as string; - if (text === null || text === undefined) { - results.push(new Document({ pageContent: "", metadata })); - } else { - results.push(new Document({ pageContent: text, metadata })); - } + if (text === null || text === undefined) { + documents.push(new Document({ pageContent: "", metadata })); + } else { + documents.push(new Document({ pageContent: text, metadata })); + } } + } + break; + } catch (ex) { + console.error(`An exception occurred: ${ex}`); + throw ex; } - - return results; - } catch (ex) { - console.error(`An exception occurred: ${ex}`); - throw ex; + default: + throw new Error("Invalid type to load from"); } + return documents; + } + + private isValidIdentifier(identifier: string): boolean { + return /^[A-Za-z_][A-Za-z0-9_]*$/.test(identifier); } - } diff --git a/yarn.lock b/yarn.lock index 04efbec24400..e871d2d7c36b 100644 --- a/yarn.lock +++ b/yarn.lock @@ -11580,6 +11580,7 @@ __metadata: hdb: 0.19.8 hnswlib-node: ^3.0.0 html-to-text: ^9.0.5 + htmlparser2: ^9.1.0 ibm-cloud-sdk-core: ^5.0.2 ignore: ^5.2.0 interface-datastore: ^8.2.11