diff --git a/docs/core_docs/docs/integrations/document_loaders/web_loaders/oracleai.mdx b/docs/core_docs/docs/integrations/document_loaders/web_loaders/oracleai.mdx
new file mode 100644
index 000000000000..c7ef4769bd75
--- /dev/null
+++ b/docs/core_docs/docs/integrations/document_loaders/web_loaders/oracleai.mdx
@@ -0,0 +1,73 @@
+---
+hide_table_of_contents: true
+---
+
+# Oracle AI
+
+This example goes over how to load documents using Oracle AI Vector Search.
+
+## Setup
+
+You'll need to install the [oracledb](https://www.npmjs.com/package/oracledb) package:
+
+```bash npm2yarn
+npm install @langchain/community @langchain/core oracledb
+```
+
+## Usage
+
+### Connect to Oracle Database
+You'll need to provide the username, password, hostname, and service name:
+
+```typescript
+import oracledb from 'oracledb';
+
+let connection: oracledb.Connection;
+
+// Replace the placeholders with your information
+const username = "";
+const password = "";
+const dsn = "<hostname>/<service_name>";
+
+try {
+  connection = await oracledb.getConnection({
+    user: username,
+    password: password,
+    connectString: dsn
+  });
+  console.log("Connection Successful");
+} catch (err) {
+  console.error('Connection failed:', err);
+  throw err;
+}
+```
+
+### Load Documents
+To load documents, you have three options:
+- Loading a local file.
+- Loading from a local directory.
+- Loading from the Oracle Database.
+
+When loading from the Oracle Database, you must provide the table name, the owner name, and the name of the column to load. Optionally, you can provide extra column names to be included in the returned documents' metadata:
+
+```typescript
+import { OracleDocLoader, OracleLoadFromType } from "@langchain/community/document_loaders/web/oracleai";
+
+/*
+// Loading a local file (replace <file_path> with the path of the file you want to load.)
+const loader = new OracleDocLoader(connection, "<file_path>", OracleLoadFromType.FILE);
+
+// Loading from a local directory (replace <directory_path> with the path of the directory you want to load from.)
+const loader = new OracleDocLoader(connection, "<directory_path>", OracleLoadFromType.DIR);
+*/
+
+// Loading from an Oracle Database table (replace the placeholders with your information, optionally add a [metadata_cols] parameter to include columns as metadata.)
+const loader = new OracleDocLoader(connection, "<table_name>", OracleLoadFromType.TABLE, "<owner_name>", "<column_name>");
+
+// Load the docs
+const docs = await loader.load();
+console.log("Number of docs loaded:", docs.length);
+console.log("Document-0:", docs[0].pageContent); // content
+```
+
diff --git a/examples/src/document_loaders/oracleai.ts b/examples/src/document_loaders/oracleai.ts
new file mode 100644
index 000000000000..c80fc818cdcf
--- /dev/null
+++ b/examples/src/document_loaders/oracleai.ts
@@ -0,0 +1,40 @@
+import oracledb from 'oracledb';
+import { OracleDocLoader, OracleLoadFromType } from "@langchain/community/document_loaders/web/oracleai";
+
+let connection: oracledb.Connection;
+
+// Replace the placeholders with your information
+const username = "";
+const pwd = "";
+const dsn = "<hostname>/<service_name>";
+
+try {
+  connection = await oracledb.getConnection({
+    user: username,
+    password: pwd,
+    connectString: dsn
+  });
+  console.log("Connection Successful");
+} catch (err) {
+  console.error('Connection failed:', err);
+  throw err;
+}
+
+// Loading a local file (replace with the path of the file you want to load.)
+const loader = new OracleDocLoader(connection, "src/document_loaders/example_data/bitcoin.pdf", OracleLoadFromType.FILE); + +/* +// Loading from a local directory (replace with the path of the directory you want to load from.) +const loader = new OracleDocLoader(connection, , OracleLoadFromType.DIR); + + +// Loading from Oracle Database table (replace the placeholders with your information, optionally add a [metadata_cols] parameter to include columns as metadata.) +const loader = new OracleDocLoader(connection, , OracleLoadFromType.TABLE, , ); +*/ + +// Load the docs +const docs = loader.load(); +console.log("Number of docs loaded:", docs.length); +console.log("Document-0:", docs[0].page_content); // content + + diff --git a/libs/langchain-community/.gitignore b/libs/langchain-community/.gitignore index e6ae5fa54a4f..79f805faafbd 100644 --- a/libs/langchain-community/.gitignore +++ b/libs/langchain-community/.gitignore @@ -922,6 +922,10 @@ document_loaders/web/notionapi.cjs document_loaders/web/notionapi.js document_loaders/web/notionapi.d.ts document_loaders/web/notionapi.d.cts +document_loaders/web/oracleai.cjs +document_loaders/web/oracleai.js +document_loaders/web/oracleai.d.ts +document_loaders/web/oracleai.d.cts document_loaders/web/pdf.cjs document_loaders/web/pdf.js document_loaders/web/pdf.d.ts diff --git a/libs/langchain-community/langchain.config.js b/libs/langchain-community/langchain.config.js index 4a402c6941e8..c2add9919f5e 100644 --- a/libs/langchain-community/langchain.config.js +++ b/libs/langchain-community/langchain.config.js @@ -286,6 +286,7 @@ export const config = { "document_loaders/web/github": "document_loaders/web/github", "document_loaders/web/taskade": "document_loaders/web/taskade", "document_loaders/web/notionapi": "document_loaders/web/notionapi", + "document_loaders/web/oracleai": "document_loaders/web/oracleai", "document_loaders/web/pdf": "document_loaders/web/pdf", "document_loaders/web/recursive_url": "document_loaders/web/recursive_url", "document_loaders/web/s3": "document_loaders/web/s3", @@ -505,6 +506,7 @@ export const config = { "document_loaders/web/pdf", "document_loaders/web/taskade", "document_loaders/web/notionapi", + "document_loaders/web/oracleai", "document_loaders/web/recursive_url", "document_loaders/web/s3", "document_loaders/web/sitemap", diff --git a/libs/langchain-community/package.json b/libs/langchain-community/package.json index 7b826ad1e106..c34b967ab1f7 100644 --- a/libs/langchain-community/package.json +++ b/libs/langchain-community/package.json @@ -39,9 +39,11 @@ "binary-extensions": "^2.2.0", "expr-eval": "^2.0.2", "flat": "^5.0.2", + "htmlparser2": "^9.1.0", "js-yaml": "^4.1.0", "langchain": ">=0.2.3 <0.3.0 || >=0.3.4 <0.4.0", "langsmith": "^0.2.0", + "oracledb": "^6.7.0", "uuid": "^10.0.0", "zod": "^3.22.3", "zod-to-json-schema": "^3.22.5" @@ -120,6 +122,7 @@ "@types/jsonwebtoken": "^9", "@types/lodash": "^4", "@types/mozilla-readability": "^0.2.1", + "@types/oracledb": "^6", "@types/pdf-parse": "^1.1.1", "@types/pg": "^8.11.0", "@types/pg-copy-streams": "^1.2.2", @@ -2791,6 +2794,15 @@ "import": "./document_loaders/web/notionapi.js", "require": "./document_loaders/web/notionapi.cjs" }, + "./document_loaders/web/oracleai": { + "types": { + "import": "./document_loaders/web/oracleai.d.ts", + "require": "./document_loaders/web/oracleai.d.cts", + "default": "./document_loaders/web/oracleai.d.ts" + }, + "import": "./document_loaders/web/oracleai.js", + "require": "./document_loaders/web/oracleai.cjs" + }, 
"./document_loaders/web/pdf": { "types": { "import": "./document_loaders/web/pdf.d.ts", @@ -4025,6 +4037,10 @@ "document_loaders/web/notionapi.js", "document_loaders/web/notionapi.d.ts", "document_loaders/web/notionapi.d.cts", + "document_loaders/web/oracleai.cjs", + "document_loaders/web/oracleai.js", + "document_loaders/web/oracleai.d.ts", + "document_loaders/web/oracleai.d.cts", "document_loaders/web/pdf.cjs", "document_loaders/web/pdf.js", "document_loaders/web/pdf.d.ts", diff --git a/libs/langchain-community/src/document_loaders/tests/example_data/oracleai/Jacob_Lee_Resume_2023.pdf b/libs/langchain-community/src/document_loaders/tests/example_data/oracleai/Jacob_Lee_Resume_2023.pdf new file mode 100644 index 000000000000..de0724b53771 Binary files /dev/null and b/libs/langchain-community/src/document_loaders/tests/example_data/oracleai/Jacob_Lee_Resume_2023.pdf differ diff --git a/libs/langchain-community/src/document_loaders/tests/example_data/oracleai/example.html b/libs/langchain-community/src/document_loaders/tests/example_data/oracleai/example.html new file mode 100644 index 000000000000..7672eb6e9e13 --- /dev/null +++ b/libs/langchain-community/src/document_loaders/tests/example_data/oracleai/example.html @@ -0,0 +1,28 @@ + + + + + + + + Sample HTML Page + + +
+  <body>
+    <header>
+      <h1>Welcome to My Sample HTML Page</h1>
+    </header>
+
+    <main>
+      <section>
+        <h2>Introduction</h2>
+        <p>
+          This is a small HTML file with a header, main content section, and a
+          footer.
+        </p>
+        <p>Feel free to modify and experiment with the code!</p>
+      </section>
+    </main>
+
+    <footer>
+      <p>Footer Content - © 2024</p>
+    </footer>
+ + diff --git a/libs/langchain-community/src/document_loaders/tests/example_data/oracleai/example.txt b/libs/langchain-community/src/document_loaders/tests/example_data/oracleai/example.txt new file mode 100644 index 000000000000..04861c126cfe --- /dev/null +++ b/libs/langchain-community/src/document_loaders/tests/example_data/oracleai/example.txt @@ -0,0 +1,4 @@ +Foo +Bar +Baz + diff --git a/libs/langchain-community/src/document_loaders/tests/oracleai.test.ts b/libs/langchain-community/src/document_loaders/tests/oracleai.test.ts new file mode 100644 index 000000000000..9a0125d6f27e --- /dev/null +++ b/libs/langchain-community/src/document_loaders/tests/oracleai.test.ts @@ -0,0 +1,424 @@ +import { jest } from "@jest/globals"; +import { Connection, Result } from "oracledb"; +import { + ParseOracleDocMetadata, + OracleDocLoader, + OracleLoadFromType, + TableRow, +} from "../web/oracleai.js"; + +describe("ParseOracleDocMetadata", () => { + let parser: ParseOracleDocMetadata; + + beforeEach(() => { + parser = new ParseOracleDocMetadata(); + }); + + test("should parse title and meta tags correctly", () => { + const htmlString = + "Sample Title"; + parser.parse(htmlString); + const metadata = parser.getMetadata(); + expect(metadata).toEqual({ + title: "Sample Title", + description: "Sample Content", + }); + }); + + test("should handle missing meta content gracefully", () => { + const htmlString = + "Sample Title"; + parser.parse(htmlString); + const metadata = parser.getMetadata(); + expect(metadata).toEqual({ + title: "Sample Title", + description: "N/A", + }); + }); + + test("should handle multiple meta tags", () => { + const htmlString = + "Sample Title"; + parser.parse(htmlString); + const metadata = parser.getMetadata(); + expect(metadata).toEqual({ + title: "Sample Title", + description: "Sample Content", + author: "John Doe", + }); + }); + + test("should handle no title tag", () => { + const htmlString = + ""; + parser.parse(htmlString); + const metadata = parser.getMetadata(); + expect(metadata).toEqual({ + description: "Sample Content", + }); + }); + + test("should handle empty html string", () => { + const htmlString = ""; + parser.parse(htmlString); + const metadata = parser.getMetadata(); + expect(metadata).toEqual({}); + }); +}); + +describe("OracleDocLoader", () => { + let executeMock: jest.Mock<(sql: string, bindVars?: any) => object>; + let connMock: jest.Mocked; + let loader: OracleDocLoader; + const baseDirPath = "./src/document_loaders/tests/example_data/oracleai"; + const baseMockData = "MockData"; + + beforeEach(() => { + executeMock = jest.fn(); + connMock = { execute: executeMock } as unknown as jest.Mocked; + }); + + test("should load a single file properly", async () => { + executeMock.mockImplementation(async (sql: string, bindVars?: any) => { + if (bindVars) { + return { + outBinds: { + mdata: { + getData: jest + .fn() + .mockImplementation(() => bindVars.blob.val.toString()), + }, + text: { + getData: jest.fn().mockImplementation(() => baseMockData + 1), + }, + }, + }; + } else { + return { + rows: [["MockUser"]], + }; + } + }); + + loader = new OracleDocLoader( + connMock, + baseDirPath + "/example.html", + OracleLoadFromType.FILE + ); + const res = await loader.load(); + console.log(res); + expect(res.length).toEqual(1); + expect(res[0].pageContent).toEqual(baseMockData + "1"); + expect(res[0].metadata.title).toBeTruthy(); + expect(res[0].metadata.title).toEqual("Sample HTML Page"); + expect(res[0].metadata.viewport).toBeTruthy(); + 
expect(res[0].metadata.viewport).toEqual( + "width=device-width, initial-scale=1.0" + ); + }); + + test("should load a directory properly", async () => { + let doc_count = 0; + executeMock.mockImplementation(async (sql: string, bindVars?: any) => { + if (bindVars) { + doc_count += 1; + return { + outBinds: { + mdata: { + getData: jest + .fn() + .mockImplementation(() => bindVars.blob.val.toString()), + }, + text: { + getData: jest + .fn() + .mockImplementation(() => baseMockData + doc_count), + }, + }, + }; + } else { + return { + rows: [["MockUser"]], + }; + } + }); + + loader = new OracleDocLoader(connMock, baseDirPath, OracleLoadFromType.DIR); + const res = await loader.load(); + + expect(res.length).toEqual(3); + for (let i = 0; i < res.length; i += 1) { + expect(res[i].pageContent).toEqual(baseMockData + (i + 1)); + if (res[i].metadata.title) { + expect(res[i].metadata.title).toEqual("Sample HTML Page"); + expect(res[i].metadata.viewport).toBeTruthy(); + expect(res[i].metadata.viewport).toEqual( + "width=device-width, initial-scale=1.0" + ); + } + } + }); + + test("loadFromTable with valid parameters", async () => { + // Mock the execute method for the column type query + executeMock.mockImplementationOnce(() => { + return { + rows: [ + { COLUMN_NAME: "COL1", DATA_TYPE: "VARCHAR2" }, + { COLUMN_NAME: "COL2", DATA_TYPE: "NUMBER" }, + { COLUMN_NAME: "COL3", DATA_TYPE: "DATE" }, + ], + } as Result<{ COLUMN_NAME: string; DATA_TYPE: string }>; + }); + + // Mock the execute method for getting username + executeMock.mockImplementationOnce(() => { + return { + rows: [{ USER: "TESTUSER" }], + } as Result<{ USER: string }>; + }); + + // Mock the execute method for the main query + executeMock.mockImplementationOnce(() => { + return { + rows: [ + { + MDATA: { + getData: jest + .fn() + .mockImplementation( + () => + 'Title1' + ), + }, + TEXT: "Text content 1", + ROWID: "AAABBBCCC", + COL1: "Value1", + COL2: 123, + COL3: new Date("2021-01-01"), + }, + { + MDATA: { + getData: jest + .fn() + .mockImplementation( + () => + 'Title2' + ), + }, + TEXT: "Text content 2", + ROWID: "AAABBBCCD", + COL1: "Value2", + COL2: 456, + COL3: new Date("2021-02-01"), + }, + ], + }; + }); + + const loader = new OracleDocLoader( + connMock, + "MYTABLE", + OracleLoadFromType.TABLE, + "MYSCHEMA", + "MYCOLUMN", + ["COL1", "COL2", "COL3"] + ); + + const documents = await loader.load(); + + expect(documents).toHaveLength(2); + + expect(documents[0].pageContent).toBe("Text content 1"); + expect(documents[0].metadata).toEqual({ + title: "Title1", + author: "Author1", + _oid: expect.any(String), + _rowid: "AAABBBCCC", + COL1: "Value1", + COL2: 123, + COL3: new Date("2021-01-01"), + }); + + expect(documents[1].pageContent).toBe("Text content 2"); + expect(documents[1].metadata).toEqual({ + title: "Title2", + author: "Author2", + _oid: expect.any(String), + _rowid: "AAABBBCCD", + COL1: "Value2", + COL2: 456, + COL3: new Date("2021-02-01"), + }); + }); + + test("loadFromTable with missing owner", async () => { + const loader = new OracleDocLoader( + connMock, + "MYTABLE", + OracleLoadFromType.TABLE, + undefined, // owner is missing + "MYCOLUMN", + ["COL1"] + ); + + await expect(loader.load()).rejects.toThrow( + "Owner and column name must be specified for loading from a table" + ); + }); + + test("loadFromTable with missing column name", async () => { + const loader = new OracleDocLoader( + connMock, + "MYTABLE", + OracleLoadFromType.TABLE, + "MYSCHEMA", + undefined, // column name is missing + ["COL1"] + ); + + await 
expect(loader.load()).rejects.toThrow( + "Owner and column name must be specified for loading from a table" + ); + }); + + test("loadFromTable with mdata_cols exceeding 3 columns", async () => { + const loader = new OracleDocLoader( + connMock, + "MYTABLE", + OracleLoadFromType.TABLE, + "MYSCHEMA", + "MYCOLUMN", + ["COL1", "COL2", "COL3", "COL4"] // 4 columns, exceeding limit + ); + + await expect(loader.load()).rejects.toThrow( + "Exceeds the max number of columns you can request for metadata." + ); + }); + + test("loadFromTable with invalid column names in mdata_cols", async () => { + const loader = new OracleDocLoader( + connMock, + "MYTABLE", + OracleLoadFromType.TABLE, + "MYSCHEMA", + "MYCOLUMN", + ["INVALID-COL1"] // invalid column name + ); + + await expect(loader.load()).rejects.toThrow( + "Invalid column name in mdata_cols: INVALID-COL1" + ); + }); + + test("loadFromTable with mdata_cols containing unsupported data types", async () => { + // Mock the execute method for the column type query + executeMock.mockImplementationOnce(() => { + return { + rows: [ + { COLUMN_NAME: "COL1", DATA_TYPE: "CLOB" }, // Unsupported data type + ], + } as Result<{ COLUMN_NAME: string; DATA_TYPE: string }>; + }); + + const loader = new OracleDocLoader( + connMock, + "MYTABLE", + OracleLoadFromType.TABLE, + "MYSCHEMA", + "MYCOLUMN", + ["COL1"] + ); + + await expect(loader.load()).rejects.toThrow( + "The datatype for the column COL1 is not supported" + ); + }); + + test("loadFromTable with empty table", async () => { + // Mock the execute method for the column type query + executeMock.mockImplementationOnce(() => { + return { + rows: [{ COLUMN_NAME: "COL1", DATA_TYPE: "VARCHAR2" }], + } as Result<{ COLUMN_NAME: string; DATA_TYPE: string }>; + }); + + // Mock the execute method for getting username + executeMock.mockImplementationOnce(() => { + return { + rows: [{ USER: "TESTUSER" }], + } as Result<{ USER: string }>; + }); + + // Mock the execute method for the main query (empty result set) + executeMock.mockImplementationOnce(() => { + return { + rows: [], + } as Result; + }); + + const loader = new OracleDocLoader( + connMock, + "MYTABLE", + OracleLoadFromType.TABLE, + "MYSCHEMA", + "MYCOLUMN", + ["COL1"] + ); + + const documents = await loader.load(); + + expect(documents).toHaveLength(0); + }); + + test("loadFromTable with null column data", async () => { + // Mock the execute method for the column type query + executeMock.mockImplementationOnce(() => { + return { + rows: [{ COLUMN_NAME: "COL1", DATA_TYPE: "VARCHAR2" }], + } as Result<{ COLUMN_NAME: string; DATA_TYPE: string }>; + }); + + // Mock the execute method for getting username + executeMock.mockImplementationOnce(() => { + return { + rows: [{ USER: "TESTUSER" }], + } as Result<{ USER: string }>; + }); + + // Mock the execute method for the main query with null TEXT and MDATA + executeMock.mockImplementationOnce(() => { + return { + rows: [ + { + MDATA: null, + TEXT: null, + ROWID: "AAABBBCCC", + COL1: "Value1", + }, + ], + } as Result; + }); + + const loader = new OracleDocLoader( + connMock, + "MYTABLE", + OracleLoadFromType.TABLE, + "MYSCHEMA", + "MYCOLUMN", + ["COL1"] + ); + + const documents = await loader.load(); + + expect(documents).toHaveLength(1); + + expect(documents[0].pageContent).toBe(""); + expect(documents[0].metadata).toEqual({ + _oid: expect.any(String), + _rowid: "AAABBBCCC", + COL1: "Value1", + }); + }); +}); diff --git a/libs/langchain-community/src/document_loaders/web/oracleai.ts 
b/libs/langchain-community/src/document_loaders/web/oracleai.ts new file mode 100644 index 000000000000..86866265ea1d --- /dev/null +++ b/libs/langchain-community/src/document_loaders/web/oracleai.ts @@ -0,0 +1,475 @@ +import * as fs from "node:fs"; +import * as path from "node:path"; +import { Document } from "@langchain/core/documents"; +import { BaseDocumentLoader } from "@langchain/core/document_loaders/base"; +import { Parser } from "htmlparser2"; +import { createHash } from "crypto"; +import oracledb from "oracledb"; + +interface Metadata { + [key: string]: string; +} + +interface OutBinds { + mdata: oracledb.Lob | null; + text: oracledb.Lob | null; +} + +export interface TableRow { + MDATA?: string | null; + TEXT?: string | null; + ROWID?: string; + [key: string]: any; +} + +export class ParseOracleDocMetadata { + private metadata: Metadata; + + private match: boolean; + + constructor() { + this.metadata = {}; + this.match = false; + } + + private handleStartTag( + tag: string, + attrs: { name: string; value: string | null }[] + ) { + if (tag === "meta") { + let entry: string | undefined; + let content: string | null = null; + + attrs.forEach(({ name, value }) => { + if (name === "name") entry = value ?? ""; + if (name === "content") content = value; + }); + + if (entry) { + this.metadata[entry] = content ?? "N/A"; + } + } else if (tag === "title") { + this.match = true; + } + } + + private handleData(data: string) { + if (this.match) { + this.metadata.title = data; + this.match = false; + } + } + + public getMetadata(): Metadata { + return this.metadata; + } + + public parse(htmlString: string): void { + // We add this method to incorperate the feed method of HTMLParser in Python + interface Attribute { + name: string; + value: string | null; + } + + interface ParserOptions { + onopentag: (name: string, attrs: Record) => void; + ontext: (text: string) => void; + } + + const parser = new Parser( + { + onopentag: (name: string, attrs: Record) => + this.handleStartTag( + name, + Object.entries(attrs).map( + ([name, value]): Attribute => ({ + name, + value: value as string | null, + }) + ) + ), + ontext: (text: string) => this.handleData(text), + } as ParserOptions, + { decodeEntities: true } + ); + parser.write(htmlString); + parser.end(); + } +} + +class OracleDocReader { + static generateObjectId(inputString: string | null = null) { + const outLength = 32; // Output length + const hashLen = 8; // Hash value length + + const idString = + inputString ?? 
+ Array.from({ length: 16 }, () => + "abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ0123456789".charAt( + Math.floor(Math.random() * 62) + ) + ).join(""); + + // Timestamp + const timestamp = Math.floor(Date.now() / 1000); + const timestampBin = Buffer.alloc(4); + timestampBin.writeUInt32BE(timestamp); + + // Hash value + const hashValBin = createHash("sha256").update(idString).digest(); + const truncatedHashVal = hashValBin.slice(0, hashLen); + + // Counter + const counterBin = Buffer.alloc(4); + counterBin.writeUInt32BE(Math.floor(Math.random() * 2 ** 32)); + + // Binary object ID + const objectId = Buffer.concat([ + timestampBin, + truncatedHashVal, + counterBin, + ]); + const objectIdHex = objectId.toString("hex").padStart(outLength, "0"); + + return objectIdHex.slice(0, outLength); + } + + static async readFile( + conn: oracledb.Connection, + filePath: string, + params: Record + ): Promise { + let metadata: Metadata = {}; + + try { + // Read the file as binary data + const data = await new Promise((resolve, reject) => { + fs.readFile( + filePath, + (err: NodeJS.ErrnoException | null, data: Buffer) => { + if (err) reject(err); + else resolve(data); + } + ); + }); + + if (!data) { + return new Document({ pageContent: "", metadata }); + } + + const bindVars = { + blob: { dir: oracledb.BIND_IN, type: oracledb.DB_TYPE_BLOB, val: data }, + pref: { dir: oracledb.BIND_IN, val: JSON.stringify(params) }, + mdata: { dir: oracledb.BIND_OUT, type: oracledb.DB_TYPE_CLOB }, + text: { dir: oracledb.BIND_OUT, type: oracledb.DB_TYPE_CLOB }, + }; + + // Execute the PL/SQL block + const result = await conn.execute( + ` + declare + input blob; + begin + input := :blob; + :mdata := dbms_vector_chain.utl_to_text(input, json(:pref)); + :text := dbms_vector_chain.utl_to_text(input); + end;`, + bindVars + ); + + const outBinds = result.outBinds as OutBinds; + const mdataLob = outBinds.mdata; + const textLob = outBinds.text; + + // Read and parse metadata + let docData = await mdataLob?.getData(); + let textData = await textLob?.getData(); + + docData = docData ? docData.toString() : ""; + textData = textData ? textData.toString() : ""; + + if ( + docData.startsWith("") + ) { + const parser = new ParseOracleDocMetadata(); + parser.parse(docData); + metadata = parser.getMetadata(); + } + + // Execute a query to get the current session user + const userResult = await conn.execute(`SELECT USER FROM dual`); + + const username = userResult.rows?.[0]?.[0]; + const docId = OracleDocReader.generateObjectId(`${username}$${filePath}`); + metadata._oid = docId; + metadata._file = filePath; + + textData = textData ?? 
""; + return new Document({ pageContent: textData, metadata }); + } catch (ex) { + console.error(`An exception occurred: ${ex}`); + console.error(`Skip processing ${filePath}`); + return null; + } + } +} + +export enum OracleLoadFromType { + FILE, + DIR, + TABLE, +} + +export class OracleDocLoader extends BaseDocumentLoader { + private conn: oracledb.Connection; + + private loadFrom: string; + + private loadFromType: OracleLoadFromType; + + private owner?: string; + + private colname?: string; + + private mdata_cols?: string[]; + + constructor( + conn: oracledb.Connection, + loadFrom: string, + loadFromType: OracleLoadFromType, + owner?: string, + colname?: string, + mdata_cols?: string[] + ) { + super(); + this.conn = conn; + this.loadFrom = loadFrom; + this.loadFromType = loadFromType; + this.owner = owner; + this.colname = colname; + this.mdata_cols = mdata_cols; + } + + public async load(): Promise { + const documents: Document[] = []; + const m_params = { plaintext: "false" }; + + switch (this.loadFromType) { + case OracleLoadFromType.FILE: + try { + const filepath = this.loadFrom; + const doc = await OracleDocReader.readFile( + this.conn, + filepath, + m_params + ); + if (doc) documents.push(doc); + } catch (err) { + console.error("Error reading file:", err); + } + break; + + case OracleLoadFromType.DIR: + try { + const dirname = this.loadFrom; + const files = await fs.promises.readdir(dirname); + for (const file of files) { + const filepath = path.join(dirname, file); + const stats = await fs.promises.lstat(filepath); + + if (stats.isFile()) { + const doc = await OracleDocReader.readFile( + this.conn, + filepath, + m_params + ); + if (doc) documents.push(doc); + } + } + } catch (err) { + console.error("Error reading directory:", err); + } + break; + + case OracleLoadFromType.TABLE: + try { + if (!this.owner || !this.colname) { + throw new Error( + "Owner and column name must be specified for loading from a table" + ); + } + + // Validate identifiers to prevent SQL injection + if (!this.isValidIdentifier(this.owner)) { + throw new Error("Invalid owner name"); + } + + if (!this.isValidIdentifier(this.loadFrom)) { + throw new Error("Invalid table name"); + } + + if (!this.isValidIdentifier(this.colname)) { + throw new Error("Invalid column name"); + } + + let mdataColsSql = ", t.ROWID"; + + if (this.mdata_cols) { + if (this.mdata_cols.length > 3) { + throw new Error( + "Exceeds the max number of columns you can request for metadata." 
+ ); + } + + // **First, check if the column names are valid identifiers** + for (const col of this.mdata_cols) { + if (!this.isValidIdentifier(col)) { + throw new Error(`Invalid column name in mdata_cols: ${col}`); + } + } + + // Execute a query to get column data types + const colSql = ` + SELECT COLUMN_NAME, DATA_TYPE + FROM ALL_TAB_COLUMNS + WHERE OWNER = :ownername AND TABLE_NAME = :tablename + `; + + const colBinds = { + ownername: this.owner.toUpperCase(), + tablename: this.loadFrom.toUpperCase(), + }; + + const colResult = await this.conn.execute<{ + COLUMN_NAME: string; + DATA_TYPE: string; + }>(colSql, colBinds, { outFormat: oracledb.OUT_FORMAT_OBJECT }); + + const colRows = colResult.rows; + + if (!colRows) { + throw new Error("Failed to retrieve column information"); + } + + const colTypes: Record = {}; + for (const row of colRows) { + const colName = row.COLUMN_NAME; + const dataType = row.DATA_TYPE; + colTypes[colName] = dataType; + } + + for (const col of this.mdata_cols) { + if (!this.isValidIdentifier(col)) { + throw new Error(`Invalid column name in mdata_cols: ${col}`); + } + + const dataType = colTypes[col]; + if (!dataType) { + throw new Error( + `Column ${col} not found in table ${this.loadFrom}` + ); + } + + if ( + ![ + "NUMBER", + "BINARY_DOUBLE", + "BINARY_FLOAT", + "LONG", + "DATE", + "TIMESTAMP", + "VARCHAR2", + ].includes(dataType) + ) { + throw new Error( + `The datatype for the column ${col} is not supported` + ); + } + } + + for (const col of this.mdata_cols) { + mdataColsSql += `, t.${col}`; + } + } + + const mainSql = ` + SELECT dbms_vector_chain.utl_to_text(t.${this.colname}, json(:params)) AS MDATA, + dbms_vector_chain.utl_to_text(t.${this.colname}) AS TEXT + ${mdataColsSql} + FROM ${this.owner}.${this.loadFrom} t + `; + + const mainBinds = { + params: JSON.stringify(m_params), + }; + + const options = { + outFormat: oracledb.OUT_FORMAT_OBJECT, + }; + + // Get the username + const userResult = await this.conn.execute<{ USER: string }>( + "SELECT USER FROM dual" + ); + const username = userResult.rows?.[0]?.USER || "unknown_user"; + + // Execute the main SQL query + const result = await this.conn.execute(mainSql, mainBinds, options); + const rows = result.rows as TableRow[]; + + if (rows) { + for (const row of rows) { + let metadata: Record = {}; + + if (row.MDATA) { + const data = ( + await (row.MDATA as unknown as oracledb.Lob).getData() + ).toString(); + if ( + data.trim().startsWith("") + ) { + const parser = new ParseOracleDocMetadata(); + parser.parse(data); + metadata = { ...metadata, ...parser.getMetadata() }; + } + } + + const docId = OracleDocReader.generateObjectId( + `${username}$${this.owner}$${this.loadFrom}$${this.colname}$${row.ROWID}` + ); + + metadata._oid = docId; + metadata._rowid = row.ROWID; + + if (this.mdata_cols) { + for (const colName of this.mdata_cols) { + metadata[colName] = row[colName]; + } + } + + const text = row.TEXT as string; + + if (text === null || text === undefined) { + documents.push(new Document({ pageContent: "", metadata })); + } else { + documents.push(new Document({ pageContent: text, metadata })); + } + } + } + break; + } catch (ex) { + console.error(`An exception occurred: ${ex}`); + throw ex; + } + default: + throw new Error("Invalid type to load from"); + } + return documents; + } + + private isValidIdentifier(identifier: string): boolean { + return /^[A-Za-z_][A-Za-z0-9_]*$/.test(identifier); + } +} diff --git a/yarn.lock b/yarn.lock index 5e6b6ed60a57..c0139a751072 100644 --- a/yarn.lock +++ b/yarn.lock 
@@ -11762,6 +11762,7 @@ __metadata: "@types/jsonwebtoken": ^9 "@types/lodash": ^4 "@types/mozilla-readability": ^0.2.1 + "@types/oracledb": ^6 "@types/pdf-parse": ^1.1.1 "@types/pg": ^8.11.0 "@types/pg-copy-streams": ^1.2.2 @@ -11818,6 +11819,7 @@ __metadata: hdb: 0.19.8 hnswlib-node: ^3.0.0 html-to-text: ^9.0.5 + htmlparser2: ^9.1.0 ibm-cloud-sdk-core: ^5.0.2 ignore: ^5.2.0 interface-datastore: ^8.2.11 @@ -11841,6 +11843,7 @@ __metadata: notion-to-md: ^3.1.0 officeparser: ^4.0.4 openai: "*" + oracledb: ^6.7.0 pdf-parse: 1.1.1 pg: ^8.11.0 pg-copy-streams: ^6.0.5 @@ -19614,6 +19617,15 @@ __metadata: languageName: node linkType: hard +"@types/oracledb@npm:^6": + version: 6.5.2 + resolution: "@types/oracledb@npm:6.5.2" + dependencies: + "@types/node": "*" + checksum: 02abec363e8ca1455310e930826095461c2b1e01ca7031aed99f5f0a029ee236a0a4df9c5e6d97e8757ef8e3e8531a1ce906cd65636acf355e72527bb96d4003 + languageName: node + linkType: hard + "@types/pad-left@npm:2.1.1": version: 2.1.1 resolution: "@types/pad-left@npm:2.1.1" @@ -30294,6 +30306,18 @@ __metadata: languageName: node linkType: hard +"htmlparser2@npm:^9.1.0": + version: 9.1.0 + resolution: "htmlparser2@npm:9.1.0" + dependencies: + domelementtype: ^2.3.0 + domhandler: ^5.0.3 + domutils: ^3.1.0 + entities: ^4.5.0 + checksum: e5f8d5193967e4a500226f37bdf2c0f858cecb39dde14d0439f24bf2c461a4342778740d988fbaba652b0e4cb6052f7f2e99e69fc1a329a86c629032bb76e7c8 + languageName: node + linkType: hard + "http-cache-semantics@npm:^4.0.0, http-cache-semantics@npm:^4.1.0, http-cache-semantics@npm:^4.1.1": version: 4.1.1 resolution: "http-cache-semantics@npm:4.1.1" @@ -36375,6 +36399,13 @@ __metadata: languageName: node linkType: hard +"oracledb@npm:^6.7.0": + version: 6.7.0 + resolution: "oracledb@npm:6.7.0" + checksum: f4424e30afc85256a09a23a0772e59ce551a6cbc9d12559d572a944e1278d9420fc537b5783617955e187904a6f6319ed58ef8ae7319253cdcebb01a15d8250a + languageName: node + linkType: hard + "os-name@npm:5.1.0": version: 5.1.0 resolution: "os-name@npm:5.1.0"