diff --git a/examples/src/document_transformers/mozilla_readability.ts b/examples/src/document_transformers/mozilla_readability.ts index b3ac3c2b155af..22b19e463a926 100644 --- a/examples/src/document_transformers/mozilla_readability.ts +++ b/examples/src/document_transformers/mozilla_readability.ts @@ -1,8 +1,8 @@ -import { CheerioWebBaseLoader } from "@langchain/community/document_loaders/web/cheerio"; +import { HTMLWebBaseLoader } from "@langchain/community/document_loaders/web/html"; import { MozillaReadabilityTransformer } from "@langchain/community/document_transformers/mozilla_readability"; import { RecursiveCharacterTextSplitter } from "@langchain/textsplitters"; -const loader = new CheerioWebBaseLoader( +const loader = new HTMLWebBaseLoader( "https://news.ycombinator.com/item?id=34817881" ); @@ -11,7 +11,7 @@ const docs = await loader.load(); const splitter = RecursiveCharacterTextSplitter.fromLanguage("html"); const transformer = new MozillaReadabilityTransformer(); -const sequence = splitter.pipe(transformer); +const sequence = transformer.pipe(splitter); const newDocuments = await sequence.invoke(docs); diff --git a/libs/langchain-community/.gitignore b/libs/langchain-community/.gitignore index 890c93717dea2..db7b3b0ee3712 100644 --- a/libs/langchain-community/.gitignore +++ b/libs/langchain-community/.gitignore @@ -862,6 +862,10 @@ document_loaders/web/cheerio.cjs document_loaders/web/cheerio.js document_loaders/web/cheerio.d.ts document_loaders/web/cheerio.d.cts +document_loaders/web/html.cjs +document_loaders/web/html.js +document_loaders/web/html.d.ts +document_loaders/web/html.d.cts document_loaders/web/puppeteer.cjs document_loaders/web/puppeteer.js document_loaders/web/puppeteer.d.ts diff --git a/libs/langchain-community/langchain.config.js b/libs/langchain-community/langchain.config.js index 63b495f92f2c5..ee2951bd705a4 100644 --- a/libs/langchain-community/langchain.config.js +++ b/libs/langchain-community/langchain.config.js @@ -268,6 +268,7 
@@ export const config = { "document_loaders/web/azure_blob_storage_file", "document_loaders/web/browserbase": "document_loaders/web/browserbase", "document_loaders/web/cheerio": "document_loaders/web/cheerio", + "document_loaders/web/html": "document_loaders/web/html", "document_loaders/web/puppeteer": "document_loaders/web/puppeteer", "document_loaders/web/playwright": "document_loaders/web/playwright", "document_loaders/web/college_confidential": diff --git a/libs/langchain-community/package.json b/libs/langchain-community/package.json index a1f60050f981a..0842120f72785 100644 --- a/libs/langchain-community/package.json +++ b/libs/langchain-community/package.json @@ -2652,6 +2652,15 @@ "import": "./document_loaders/web/cheerio.js", "require": "./document_loaders/web/cheerio.cjs" }, + "./document_loaders/web/html": { + "types": { + "import": "./document_loaders/web/html.d.ts", + "require": "./document_loaders/web/html.d.cts", + "default": "./document_loaders/web/html.d.ts" + }, + "import": "./document_loaders/web/html.js", + "require": "./document_loaders/web/html.cjs" + }, "./document_loaders/web/puppeteer": { "types": { "import": "./document_loaders/web/puppeteer.d.ts", @@ -3925,6 +3934,10 @@ "document_loaders/web/cheerio.js", "document_loaders/web/cheerio.d.ts", "document_loaders/web/cheerio.d.cts", + "document_loaders/web/html.cjs", + "document_loaders/web/html.js", + "document_loaders/web/html.d.ts", + "document_loaders/web/html.d.cts", "document_loaders/web/puppeteer.cjs", "document_loaders/web/puppeteer.js", "document_loaders/web/puppeteer.d.ts", diff --git a/libs/langchain-community/src/document_loaders/tests/html.int.test.ts b/libs/langchain-community/src/document_loaders/tests/html.int.test.ts new file mode 100644 index 0000000000000..f9de34862b491 --- /dev/null +++ b/libs/langchain-community/src/document_loaders/tests/html.int.test.ts @@ -0,0 +1,22 @@ +import { expect, test } from "@jest/globals"; +import { HTMLWebBaseLoader } from "../web/html.js"; + 
+test("Test HTML web scraper loader", async () => {
+  // Load a live page and check the expected headline is in the returned HTML.
+  const url = "https://news.ycombinator.com/item?id=34817881";
+  const [firstDoc] = await new HTMLWebBaseLoader(url).load();
+  const expectedTitle = "What Lights the Universe’s Standard Candles?";
+
+  expect(firstDoc.pageContent).toContain(expectedTitle);
+});
+
+test("Test HTML web scraper loader with textDecoder", async () => {
+  // Supply an explicit GBK TextDecoder and check that the decoded HTML
+  // contains the expected Chinese text.
+  const options = { textDecoder: new TextDecoder("gbk") };
+  const url = "https://corp.163.com/gb/about/management.html";
+  const [firstDoc] = await new HTMLWebBaseLoader(url, options).load();
+  const trimmedContent = firstDoc.pageContent.trim();
+
+  expect(trimmedContent).toContain("网易");
+});
diff --git a/libs/langchain-community/src/document_loaders/web/cheerio.ts b/libs/langchain-community/src/document_loaders/web/cheerio.ts index abdd5a7e15b2a..d8781f03e36c4 100644 --- a/libs/langchain-community/src/document_loaders/web/cheerio.ts +++ b/libs/langchain-community/src/document_loaders/web/cheerio.ts @@ -7,36 +7,21 @@ import type { import { Document } from "@langchain/core/documents"; import { AsyncCaller, - AsyncCallerParams, } from "@langchain/core/utils/async_caller"; import { BaseDocumentLoader } from "@langchain/core/document_loaders/base"; -import type { DocumentLoader } from "@langchain/core/document_loaders/base"; +import { WebBaseLoaderParams, WebBaseLoader } from "./web_base_loader.js"; /** * Represents the parameters for configuring the CheerioWebBaseLoader. It - * extends the AsyncCallerParams interface and adds additional parameters - * specific to web-based loaders. + * extends the WebBaseLoaderParams interface and adds additional parameters + * specific to loading with Cheerio. */ -export interface WebBaseLoaderParams extends AsyncCallerParams { - /** - * The timeout in milliseconds for the fetch request. Defaults to 10s. - */ - timeout?: number; - +export interface CheerioWebBaseLoaderParams extends WebBaseLoaderParams { /** * The selector to use to extract the text from the document. Defaults to * "body".
*/ selector?: SelectorType; - - /** - * The text decoder to use to decode the response. Defaults to UTF-8. - */ - textDecoder?: TextDecoder; - /** - * The headers to use in the fetch request. - */ - headers?: HeadersInit; } /** @@ -45,14 +30,14 @@ export interface WebBaseLoaderParams extends AsyncCallerParams { * web-based documents using Cheerio. * @example * ```typescript - * const loader = new CheerioWebBaseLoader("https:exampleurl.com"); + * const loader = new CheerioWebBaseLoader("https://exampleurl.com"); * const docs = await loader.load(); * console.log({ docs }); * ``` */ export class CheerioWebBaseLoader extends BaseDocumentLoader - implements DocumentLoader + implements WebBaseLoader { timeout: number; @@ -64,7 +49,7 @@ export class CheerioWebBaseLoader headers?: HeadersInit; - constructor(public webPath: string, fields?: WebBaseLoaderParams) { + constructor(public webPath: string, fields?: CheerioWebBaseLoaderParams) { super(); const { timeout, selector, textDecoder, headers, ...rest } = fields ?? {}; this.timeout = timeout ?? 10000;
diff --git a/libs/langchain-community/src/document_loaders/web/html.ts b/libs/langchain-community/src/document_loaders/web/html.ts
new file mode 100644
index 0000000000000..e86ff37834b2a
--- /dev/null
+++ b/libs/langchain-community/src/document_loaders/web/html.ts
@@ -0,0 +1,67 @@
+import {
+  AsyncCaller,
+} from "@langchain/core/utils/async_caller";
+import { BaseDocumentLoader } from "@langchain/core/document_loaders/base";
+import { Document } from "@langchain/core/documents";
+import { WebBaseLoaderParams, WebBaseLoader } from "./web_base_loader.js";
+
+/**
+ * A web-based document loader that fetches a URL and returns the response
+ * body, as-is, as the page content of a single Document.
+ * @example
+ * ```typescript
+ * const loader = new HTMLWebBaseLoader("https://exampleurl.com");
+ * const docs = await loader.load();
+ * console.log({ docs });
+ * ```
+ */
+export class HTMLWebBaseLoader
+  extends BaseDocumentLoader
+  implements WebBaseLoader
+{
+  /** Fetch timeout in milliseconds; a falsy value means no abort signal. */
+  timeout: number;
+
+  /** Caller the fetch goes through; built from the leftover `fields`. */
+  caller: AsyncCaller;
+
+  /** Optional decoder applied to the raw response bytes. */
+  textDecoder?: TextDecoder;
+
+  /** Optional headers sent with the fetch request. */
+  headers?: HeadersInit;
+
+  /**
+   * @param webPath The URL to fetch.
+   * @param fields Optional timeout (defaults to 10000 ms), text decoder and
+   * headers; every other field is passed to the AsyncCaller constructor.
+   */
+  constructor(public webPath: string, fields?: WebBaseLoaderParams) {
+    super();
+    const { timeout, textDecoder, headers, ...rest } = fields ?? {};
+    this.timeout = timeout ?? 10000;
+    this.caller = new AsyncCaller(rest);
+    this.textDecoder = textDecoder;
+    this.headers = headers;
+  }
+
+  /**
+   * Fetches `webPath` and wraps the response body in a Document.
+   * @returns An array holding one Document whose `pageContent` is the
+   * response body.
+   */
+  async load(): Promise<Document[]> {
+    const response = await this.caller.call(fetch, this.webPath, {
+      signal: this.timeout ? AbortSignal.timeout(this.timeout) : undefined,
+      headers: this.headers,
+    });
+
+    // Decode the raw bytes when a decoder was supplied; otherwise fall back
+    // to the response's own text() decoding.
+    const html =
+      this.textDecoder?.decode(await response.arrayBuffer()) ??
+      (await response.text());
+
+    return [new Document({ pageContent: html })];
+  }
+}
diff --git a/libs/langchain-community/src/document_loaders/web/sitemap.ts b/libs/langchain-community/src/document_loaders/web/sitemap.ts index aa6a6e41cb33e..4ff935ca42aab 100644 --- a/libs/langchain-community/src/document_loaders/web/sitemap.ts +++ b/libs/langchain-community/src/document_loaders/web/sitemap.ts @@ -1,13 +1,13 @@ import { Document, DocumentInterface } from "@langchain/core/documents"; import { chunkArray } from "@langchain/core/utils/chunk_array"; -import { CheerioWebBaseLoader, WebBaseLoaderParams } from "./cheerio.js"; +import { CheerioWebBaseLoader, CheerioWebBaseLoaderParams } from "./cheerio.js"; /** * Interface representing the parameters for initializing a SitemapLoader. * @interface SitemapLoaderParams * @extends WebBaseLoaderParams */ -export interface SitemapLoaderParams extends WebBaseLoaderParams { +export interface SitemapLoaderParams extends CheerioWebBaseLoaderParams { /** * @property {(string | RegExp)[] | undefined} filterUrls - A list of regexes. Only URLs that match one of the filter URLs will be loaded. * WARNING: The filter URLs are interpreted as regular expressions. Escape special characters if needed.
diff --git a/libs/langchain-community/src/document_loaders/web/web_base_loader.ts b/libs/langchain-community/src/document_loaders/web/web_base_loader.ts
new file mode 100644
index 0000000000000..3bded81823505
--- /dev/null
+++ b/libs/langchain-community/src/document_loaders/web/web_base_loader.ts
@@ -0,0 +1,45 @@
+import type {
+  AsyncCaller,
+  AsyncCallerParams,
+} from "@langchain/core/utils/async_caller";
+import type { DocumentLoader } from "@langchain/core/document_loaders/base";
+
+/**
+ * Represents the parameters for configuring WebBaseLoaders. It extends the
+ * AsyncCallerParams interface and adds additional parameters specific to
+ * web-based loaders.
+ */
+export interface WebBaseLoaderParams extends AsyncCallerParams {
+  /**
+   * The timeout in milliseconds for the fetch request. Defaults to 10s.
+   */
+  timeout?: number;
+
+  /**
+   * The text decoder to use to decode the response. Defaults to UTF-8.
+   */
+  textDecoder?: TextDecoder;
+
+  /**
+   * The headers to use in the fetch request.
+   */
+  headers?: HeadersInit;
+}
+
+/**
+ * The shape shared by web-based document loaders: a DocumentLoader that
+ * also exposes its request configuration as properties.
+ */
+export interface WebBaseLoader extends DocumentLoader {
+  /** The timeout in milliseconds for the fetch request. */
+  timeout: number;
+
+  /** The AsyncCaller instance the loader makes its calls through. */
+  caller: AsyncCaller;
+
+  /** The text decoder used to decode the response, if one was given. */
+  textDecoder?: TextDecoder;
+
+  /** The headers to use in the fetch request, if any were given. */
+  headers?: HeadersInit;
+}
diff --git a/libs/langchain-community/src/document_transformers/mozilla_readability.ts b/libs/langchain-community/src/document_transformers/mozilla_readability.ts index a26b42a6d6c77..e8003c3d05143 100644 --- a/libs/langchain-community/src/document_transformers/mozilla_readability.ts +++ b/libs/langchain-community/src/document_transformers/mozilla_readability.ts @@ -11,7 +11,7 @@ import { * main content from a web page.
* @example * ```typescript - * const loader = new CheerioWebBaseLoader("https://example.com/article"); + * const loader = new HTMLWebBaseLoader("https://example.com/article"); * const docs = await loader.load(); * * const splitter = new RecursiveCharacterTextSplitter({ @@ -20,7 +20,7 @@ import { * const transformer = new MozillaReadabilityTransformer(); * - * // The sequence processes the loaded documents through the splitter and then the transformer. - * const sequence = splitter.pipe(transformer); + * // The sequence processes the loaded documents through the transformer and then the splitter. + * const sequence = transformer.pipe(splitter); * * // Invoke the sequence to transform the documents into a more readable format. * const newDocuments = await sequence.invoke(docs); diff --git a/libs/langchain-community/src/load/import_map.ts b/libs/langchain-community/src/load/import_map.ts index 5bbd9e4d0a01b..865d9eb3ab769 100644 --- a/libs/langchain-community/src/load/import_map.ts +++ b/libs/langchain-community/src/load/import_map.ts @@ -71,6 +71,7 @@ export * as stores__message__in_memory from "../stores/message/in_memory.js"; export * as memory__chat_memory from "../memory/chat_memory.js"; export * as indexes__base from "../indexes/base.js"; export * as indexes__memory from "../indexes/memory.js"; +export * as document_loaders__web__html from "../document_loaders/web/html.js"; export * as document_loaders__web__searchapi from "../document_loaders/web/searchapi.js"; export * as document_loaders__web__serpapi from "../document_loaders/web/serpapi.js"; export * as document_loaders__web__sort_xyz_blockchain from "../document_loaders/web/sort_xyz_blockchain.js";