From 9484851e33e0d0a0a1c94f966a3e0699b37f8d48 Mon Sep 17 00:00:00 2001 From: Steven Thomas Date: Tue, 11 Jun 2024 15:35:21 -0400 Subject: [PATCH 1/3] Add puppeteer agent tool --- .../src/tools/puppeteer.ts | 177 ++++++++++++++++++ .../src/tools/tests/puppeteer.test.ts | 31 +++ 2 files changed, 208 insertions(+) create mode 100644 libs/langchain-community/src/tools/puppeteer.ts create mode 100644 libs/langchain-community/src/tools/tests/puppeteer.test.ts diff --git a/libs/langchain-community/src/tools/puppeteer.ts b/libs/langchain-community/src/tools/puppeteer.ts new file mode 100644 index 000000000000..f99a9dcc0057 --- /dev/null +++ b/libs/langchain-community/src/tools/puppeteer.ts @@ -0,0 +1,177 @@ +import {launch} from "puppeteer"; + +import type { BaseLanguageModelInterface } from "@langchain/core/language_models/base"; +import { Tool, ToolParams } from "@langchain/core/tools"; +import { RunnableSequence } from "@langchain/core/runnables"; +import { StringOutputParser } from "@langchain/core/output_parsers"; +import type { EmbeddingsInterface } from "@langchain/core/embeddings"; + +import { + RecursiveCharacterTextSplitter, + TextSplitter, +} from "@langchain/textsplitters"; + +import { MemoryVectorStore } from "langchain/vectorstores/memory"; +import { formatDocumentsAsString } from "langchain/util/document"; +import { Document } from "langchain/document"; +import {load} from "cheerio"; + +export const parseInputs = (inputs: string): [string, string] => { + const [baseUrl, task] = inputs.split(",").map((input) => { + let t = input.trim(); + t = t.startsWith('"') ? t.slice(1) : t; + t = t.endsWith('"') ? t.slice(0, -1) : t; + t = t.endsWith("/") ? t.slice(0, -1) : t; + return t.trim(); + }); + + return [baseUrl, task]; +}; + +export const getRelevantHtml = async ( + html: string, +): Promise => { + const $ = load(html); + + const tagsToRemove = ['script', 'svg', 'style'] + + for (const tag of tagsToRemove) { + await $(tag).remove(); + }; + + return $('body').html()?.trim().replace(/\n+/g, " ") ?? ''; +}; + +export const getHtml = async (baseUrl: string, headers: Headers = DEFAULT_HEADERS) => { + const browser = await launch({ + args: ["--no-sandbox", "--disable-setuid-sandbox"], + ignoreDefaultArgs: ["--disable-extensions"], + }); + + const page = await browser.newPage(); + + await page.setExtraHTTPHeaders(headers); + await page.goto(baseUrl, { waitUntil: "networkidle0" }); + + const bodyHtml = await page.content(); + await browser.close(); + + return bodyHtml; +}; + +const DEFAULT_HEADERS = { + Accept: + "text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,*/*;q=0.8", + "Accept-Encoding": "gzip, deflate", + "Accept-Language": "en-US,en;q=0.5", + + Referer: "https://www.google.com/", + "Sec-Fetch-Dest": "document", + "Sec-Fetch-Mode": "navigate", + "Sec-Fetch-Site": "cross-site", + "Upgrade-Insecure-Requests": "1", + "User-Agent": + "Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:109.0) Gecko/20100101 Firefox/111.0", +}; + +// eslint-disable-next-line @typescript-eslint/no-explicit-any +type Headers = Record; + +/** + * Defines the arguments that can be passed to the WebBrowser constructor. + * It extends the ToolParams interface and includes properties for a + * language model, embeddings, HTTP headers, and a text splitter. + */ +export interface PuppeteerBrowserArgs extends ToolParams { + model: BaseLanguageModelInterface; + + embeddings: EmbeddingsInterface; + + headers?: Headers; + + textSplitter?: TextSplitter; +} + +class PuppeteerBrowser extends Tool { + static lc_name() { + return "PuppeteerBrowser"; + } + + get lc_namespace() { + return [...super.lc_namespace, "puppeteer_browser"]; + } + + private model: BaseLanguageModelInterface; + + private embeddings: EmbeddingsInterface; + + private headers: Headers; + + private textSplitter: TextSplitter; + + constructor({ model, headers, embeddings, textSplitter }: PuppeteerBrowserArgs) { + super(...arguments); + + this.model = model; + this.embeddings = embeddings; + this.headers = headers ?? DEFAULT_HEADERS; + + this.textSplitter = + textSplitter ?? + new RecursiveCharacterTextSplitter({ + chunkSize: 2000, + chunkOverlap: 200, + }); + } + + async _call(inputs: string) { + const [baseUrl, task] = parseInputs(inputs); + const doSummary = !task; + + let text; + try { + const html = await getHtml(baseUrl, this.headers); + text = await getRelevantHtml(html); + } catch (e) { + if (e) { + return e.toString(); + } + return "There was a problem connecting to the site"; + } + + const texts = await this.textSplitter.splitText(text); + + let context; + if (doSummary) { + context = texts.slice(0, 4).join("\n"); + } else { + const docs = texts.map( + (pageContent: string) => + new Document({ + pageContent, + metadata: [], + }) + ); + + const vectorStore = await MemoryVectorStore.fromDocuments( + docs, + this.embeddings + ); + const results = await vectorStore.similaritySearch(task, 4); + context = formatDocumentsAsString(results); + } + + const input = `Text:${context}\n\nI need ${ + doSummary ? "a summary" : task + } from the above text, also provide up to 5 markdown links from within that would be of interest (always including URL and text). Links should be provided, if present, in markdown syntax as a list under the heading "Relevant Links:".`; + + const chain = RunnableSequence.from([this.model, new StringOutputParser()]); + return chain.invoke(input); + } + + name = "puppeteer-browser"; + + description = `Useful for when you need to find something on a webpage. Input should be a comma separated list of "ONE valid URL including protocol","What you want to find on the page or empty string for a summary"`; +} + +export { PuppeteerBrowser }; diff --git a/libs/langchain-community/src/tools/tests/puppeteer.test.ts b/libs/langchain-community/src/tools/tests/puppeteer.test.ts new file mode 100644 index 000000000000..4e23d5b6bb21 --- /dev/null +++ b/libs/langchain-community/src/tools/tests/puppeteer.test.ts @@ -0,0 +1,31 @@ +import { test, expect } from "@jest/globals"; +import { getRelevantHtml, parseInputs } from "../puppeteer.js"; + + +test("getRelevantHtml should extract relevant parts of the html", async () => { + const html = "
ahhHello, world!
"; + + expect(getRelevantHtml(html)).toBe("
Hello, world!
"); +}); + +test("parseInputs", () => { + expect(parseInputs(`"https://supermagictaste.com",""`)).toEqual([ + "https://supermagictaste.com", + "", + ]); + expect( + parseInputs(`"https://supermagictaste.com","word of the day"`) + ).toEqual(["https://supermagictaste.com", "word of the day"]); + expect(parseInputs(`"https://supermagictaste.com","`)).toEqual([ + "https://supermagictaste.com", + "", + ]); + expect(parseInputs(`"https://supermagictaste.com",`)).toEqual([ + "https://supermagictaste.com", + "", + ]); + expect(parseInputs(`"https://supermagictaste.com"`)).toEqual([ + "https://supermagictaste.com", + undefined, + ]); +}); From 8493d8d884ea5be8c64ddeb0c88acfadd78194aa Mon Sep 17 00:00:00 2001 From: Steven Thomas Date: Tue, 11 Jun 2024 15:38:22 -0400 Subject: [PATCH 2/3] Add puppeteer tool to package.json --- libs/langchain-community/package.json | 9 +++++++++ 1 file changed, 9 insertions(+) diff --git a/libs/langchain-community/package.json b/libs/langchain-community/package.json index 98290c67561e..c0d87997d8cc 100644 --- a/libs/langchain-community/package.json +++ b/libs/langchain-community/package.json @@ -880,6 +880,15 @@ "import": "./tools/ifttt.js", "require": "./tools/ifttt.cjs" }, + "./tools/puppeteer": { + "types": { + "import": "./tools/puppeteer.d.ts", + "require": "./tools/puppeteer.d.cts", + "default": "./tools/puppeteer.d.ts" + }, + "import": "./tools/puppeteer.js", + "require": "./tools/puppeteer.cjs" + }, "./tools/searchapi": { "types": { "import": "./tools/searchapi.d.ts", From b8ce7e922835a441974d59c9253e097bef7dd6da Mon Sep 17 00:00:00 2001 From: jacoblee93 Date: Thu, 13 Jun 2024 14:14:17 -0700 Subject: [PATCH 3/3] Adds entrypoints --- libs/langchain-community/.gitignore | 4 ++ libs/langchain-community/langchain.config.js | 2 + libs/langchain-community/package.json | 4 ++ .../src/load/import_constants.ts | 1 + .../src/tools/puppeteer.ts | 46 ++++++++++--------- .../src/tools/tests/puppeteer.test.ts | 8 ++-- 6 files changed, 41 insertions(+), 24 deletions(-) diff --git a/libs/langchain-community/.gitignore b/libs/langchain-community/.gitignore index bad275d50cf4..242f3e88538b 100644 --- a/libs/langchain-community/.gitignore +++ b/libs/langchain-community/.gitignore @@ -78,6 +78,10 @@ tools/ifttt.cjs tools/ifttt.js tools/ifttt.d.ts tools/ifttt.d.cts +tools/puppeteer.cjs +tools/puppeteer.js +tools/puppeteer.d.ts +tools/puppeteer.d.cts tools/searchapi.cjs tools/searchapi.js tools/searchapi.d.ts diff --git a/libs/langchain-community/langchain.config.js b/libs/langchain-community/langchain.config.js index d5c4c6b15eb3..cd49ca34d26e 100644 --- a/libs/langchain-community/langchain.config.js +++ b/libs/langchain-community/langchain.config.js @@ -51,6 +51,7 @@ export const config = { "tools/google_places": "tools/google_places", "tools/google_routes": "tools/google_routes", "tools/ifttt": "tools/ifttt", + "tools/puppeteer": "tools/puppeteer", "tools/searchapi": "tools/searchapi", "tools/searxng_search": "tools/searxng_search", "tools/serpapi": "tools/serpapi", @@ -322,6 +323,7 @@ export const config = { "tools/discord", "tools/gmail", "tools/google_calendar", + "tools/puppeteer", "agents/toolkits/aws_sfn", "callbacks/handlers/llmonitor", "callbacks/handlers/lunary", diff --git a/libs/langchain-community/package.json b/libs/langchain-community/package.json index c0d87997d8cc..2c6e0d6981b7 100644 --- a/libs/langchain-community/package.json +++ b/libs/langchain-community/package.json @@ -3088,6 +3088,10 @@ "tools/ifttt.js", "tools/ifttt.d.ts", "tools/ifttt.d.cts", + "tools/puppeteer.cjs", + "tools/puppeteer.js", + "tools/puppeteer.d.ts", + "tools/puppeteer.d.cts", "tools/searchapi.cjs", "tools/searchapi.js", "tools/searchapi.d.ts", diff --git a/libs/langchain-community/src/load/import_constants.ts b/libs/langchain-community/src/load/import_constants.ts index 8497a6b7405b..3df010df8b10 100644 --- a/libs/langchain-community/src/load/import_constants.ts +++ b/libs/langchain-community/src/load/import_constants.ts @@ -7,6 +7,7 @@ export const optionalImportEntrypoints: string[] = [ "langchain_community/tools/discord", "langchain_community/tools/gmail", "langchain_community/tools/google_calendar", + "langchain_community/tools/puppeteer", "langchain_community/agents/toolkits/aws_sfn", "langchain_community/embeddings/bedrock", "langchain_community/embeddings/cloudflare_workersai", diff --git a/libs/langchain-community/src/tools/puppeteer.ts b/libs/langchain-community/src/tools/puppeteer.ts index f99a9dcc0057..81f20ddebc76 100644 --- a/libs/langchain-community/src/tools/puppeteer.ts +++ b/libs/langchain-community/src/tools/puppeteer.ts @@ -1,6 +1,7 @@ -import {launch} from "puppeteer"; +import { launch } from "puppeteer"; +import { load } from "cheerio"; -import type { BaseLanguageModelInterface } from "@langchain/core/language_models/base"; +import type { LanguageModelLike } from "@langchain/core/language_models/base"; import { Tool, ToolParams } from "@langchain/core/tools"; import { RunnableSequence } from "@langchain/core/runnables"; import { StringOutputParser } from "@langchain/core/output_parsers"; @@ -14,7 +15,6 @@ import { import { MemoryVectorStore } from "langchain/vectorstores/memory"; import { formatDocumentsAsString } from "langchain/util/document"; import { Document } from "langchain/document"; -import {load} from "cheerio"; export const parseInputs = (inputs: string): [string, string] => { const [baseUrl, task] = inputs.split(",").map((input) => { @@ -28,21 +28,22 @@ export const parseInputs = (inputs: string): [string, string] => { return [baseUrl, task]; }; -export const getRelevantHtml = async ( - html: string, -): Promise => { +export const getRelevantHtml = async (html: string): Promise => { const $ = load(html); - const tagsToRemove = ['script', 'svg', 'style'] + const tagsToRemove = ["script", "svg", "style"]; for (const tag of tagsToRemove) { await $(tag).remove(); - }; + } - return $('body').html()?.trim().replace(/\n+/g, " ") ?? ''; + return $("body").html()?.trim().replace(/\n+/g, " ") ?? ""; }; -export const getHtml = async (baseUrl: string, headers: Headers = DEFAULT_HEADERS) => { +export const getHtml = async ( + baseUrl: string, + headers: Headers = DEFAULT_HEADERS +) => { const browser = await launch({ args: ["--no-sandbox", "--disable-setuid-sandbox"], ignoreDefaultArgs: ["--disable-extensions"], @@ -83,7 +84,7 @@ type Headers = Record; * language model, embeddings, HTTP headers, and a text splitter. */ export interface PuppeteerBrowserArgs extends ToolParams { - model: BaseLanguageModelInterface; + model: LanguageModelLike; embeddings: EmbeddingsInterface; @@ -92,16 +93,20 @@ export interface PuppeteerBrowserArgs extends ToolParams { textSplitter?: TextSplitter; } -class PuppeteerBrowser extends Tool { +export class PuppeteerBrowser extends Tool { + name = "puppeteer-browser"; + + description = `Useful for when you need to find something on a webpage. Input should be a comma separated list of "ONE valid URL including protocol","What you want to find on the page or empty string for a summary"`; + static lc_name() { return "PuppeteerBrowser"; } get lc_namespace() { - return [...super.lc_namespace, "puppeteer_browser"]; + return [...super.lc_namespace, "puppeteer"]; } - private model: BaseLanguageModelInterface; + private model: LanguageModelLike; private embeddings: EmbeddingsInterface; @@ -109,7 +114,12 @@ class PuppeteerBrowser extends Tool { private textSplitter: TextSplitter; - constructor({ model, headers, embeddings, textSplitter }: PuppeteerBrowserArgs) { + constructor({ + model, + headers, + embeddings, + textSplitter, + }: PuppeteerBrowserArgs) { super(...arguments); this.model = model; @@ -168,10 +178,4 @@ class PuppeteerBrowser extends Tool { const chain = RunnableSequence.from([this.model, new StringOutputParser()]); return chain.invoke(input); } - - name = "puppeteer-browser"; - - description = `Useful for when you need to find something on a webpage. Input should be a comma separated list of "ONE valid URL including protocol","What you want to find on the page or empty string for a summary"`; } - -export { PuppeteerBrowser }; diff --git a/libs/langchain-community/src/tools/tests/puppeteer.test.ts b/libs/langchain-community/src/tools/tests/puppeteer.test.ts index 4e23d5b6bb21..ad9a3e5dad7b 100644 --- a/libs/langchain-community/src/tools/tests/puppeteer.test.ts +++ b/libs/langchain-community/src/tools/tests/puppeteer.test.ts @@ -1,11 +1,13 @@ import { test, expect } from "@jest/globals"; import { getRelevantHtml, parseInputs } from "../puppeteer.js"; - test("getRelevantHtml should extract relevant parts of the html", async () => { - const html = "
ahhHello, world!
"; + const html = + "
ahhHello, world!
"; - expect(getRelevantHtml(html)).toBe("
Hello, world!
"); + expect(getRelevantHtml(html)).toBe( + "
Hello, world!
" + ); }); test("parseInputs", () => {