Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

community[minor]: Add puppeteer agent tool #5731

Open
wants to merge 4 commits into
base: main
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
4 changes: 4 additions & 0 deletions libs/langchain-community/.gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -78,6 +78,10 @@ tools/ifttt.cjs
tools/ifttt.js
tools/ifttt.d.ts
tools/ifttt.d.cts
tools/puppeteer.cjs
tools/puppeteer.js
tools/puppeteer.d.ts
tools/puppeteer.d.cts
tools/searchapi.cjs
tools/searchapi.js
tools/searchapi.d.ts
Expand Down
2 changes: 2 additions & 0 deletions libs/langchain-community/langchain.config.js
Original file line number Diff line number Diff line change
Expand Up @@ -51,6 +51,7 @@ export const config = {
"tools/google_places": "tools/google_places",
"tools/google_routes": "tools/google_routes",
"tools/ifttt": "tools/ifttt",
"tools/puppeteer": "tools/puppeteer",
"tools/searchapi": "tools/searchapi",
"tools/searxng_search": "tools/searxng_search",
"tools/serpapi": "tools/serpapi",
Expand Down Expand Up @@ -324,6 +325,7 @@ export const config = {
"tools/discord",
"tools/gmail",
"tools/google_calendar",
"tools/puppeteer",
"agents/toolkits/aws_sfn",
"callbacks/handlers/llmonitor",
"callbacks/handlers/lunary",
Expand Down
13 changes: 13 additions & 0 deletions libs/langchain-community/package.json
Original file line number Diff line number Diff line change
Expand Up @@ -880,6 +880,15 @@
"import": "./tools/ifttt.js",
"require": "./tools/ifttt.cjs"
},
"./tools/puppeteer": {
"types": {
"import": "./tools/puppeteer.d.ts",
"require": "./tools/puppeteer.d.cts",
"default": "./tools/puppeteer.d.ts"
},
"import": "./tools/puppeteer.js",
"require": "./tools/puppeteer.cjs"
},
"./tools/searchapi": {
"types": {
"import": "./tools/searchapi.d.ts",
Expand Down Expand Up @@ -3097,6 +3106,10 @@
"tools/ifttt.js",
"tools/ifttt.d.ts",
"tools/ifttt.d.cts",
"tools/puppeteer.cjs",
"tools/puppeteer.js",
"tools/puppeteer.d.ts",
"tools/puppeteer.d.cts",
"tools/searchapi.cjs",
"tools/searchapi.js",
"tools/searchapi.d.ts",
Expand Down
1 change: 1 addition & 0 deletions libs/langchain-community/src/load/import_constants.ts
Original file line number Diff line number Diff line change
Expand Up @@ -7,6 +7,7 @@ export const optionalImportEntrypoints: string[] = [
"langchain_community/tools/discord",
"langchain_community/tools/gmail",
"langchain_community/tools/google_calendar",
"langchain_community/tools/puppeteer",
"langchain_community/agents/toolkits/aws_sfn",
"langchain_community/embeddings/bedrock",
"langchain_community/embeddings/cloudflare_workersai",
Expand Down
181 changes: 181 additions & 0 deletions libs/langchain-community/src/tools/puppeteer.ts
Original file line number Diff line number Diff line change
@@ -0,0 +1,181 @@
import { launch } from "puppeteer";
import { load } from "cheerio";

import type { LanguageModelLike } from "@langchain/core/language_models/base";
import { Tool, ToolParams } from "@langchain/core/tools";
import { RunnableSequence } from "@langchain/core/runnables";
import { StringOutputParser } from "@langchain/core/output_parsers";
import type { EmbeddingsInterface } from "@langchain/core/embeddings";

import {
RecursiveCharacterTextSplitter,
TextSplitter,
} from "@langchain/textsplitters";

import { MemoryVectorStore } from "langchain/vectorstores/memory";
import { formatDocumentsAsString } from "langchain/util/document";
import { Document } from "langchain/document";

/**
 * Splits the raw tool input into a base URL and a task.
 *
 * The expected input shape is `"<url>","<task>"`. Only the FIRST comma is
 * treated as the separator, so a task that itself contains commas is kept
 * intact instead of being truncated at its first comma.
 *
 * Each part is trimmed, stripped of one leading and one trailing double
 * quote, and stripped of one trailing slash.
 *
 * @param inputs Raw comma separated input string.
 * @returns A `[baseUrl, task]` pair. When the input contains no comma at
 * all, the second element is `undefined` at runtime.
 */
export const parseInputs = (inputs: string): [string, string] => {
  const clean = (raw: string): string => {
    let t = raw.trim();
    t = t.startsWith('"') ? t.slice(1) : t;
    t = t.endsWith('"') ? t.slice(0, -1) : t;
    t = t.endsWith("/") ? t.slice(0, -1) : t;
    return t.trim();
  };

  // Split on the first comma only: everything after it belongs to the task.
  const separatorIndex = inputs.indexOf(",");
  const parts: string[] =
    separatorIndex === -1
      ? [inputs]
      : [inputs.slice(0, separatorIndex), inputs.slice(separatorIndex + 1)];

  const [baseUrl, task] = parts.map(clean);

  return [baseUrl, task];
};

/**
 * Reduces an HTML document to the inner HTML of its `<body>` with
 * `<script>`, `<svg>` and `<style>` elements removed.
 *
 * @param html Full HTML source of the page.
 * @returns The trimmed body HTML with each run of newlines replaced by a
 * single space, or an empty string when the body has no HTML.
 */
export const getRelevantHtml = async (html: string): Promise<string> => {
  const $ = load(html);

  const tagsToRemove = ["script", "svg", "style"];

  // Cheerio's remove() is synchronous, so there is nothing to await; one
  // combined selector drops every unwanted element in a single pass.
  $(tagsToRemove.join(", ")).remove();

  return $("body").html()?.trim().replace(/\n+/g, " ") ?? "";
};

// eslint-disable-next-line @typescript-eslint/no-explicit-any
type Headers = Record<string, any>;

/**
 * Request headers sent with the page navigation when the caller supplies
 * none. Declared before `getHtml` so it is defined ahead of its first use.
 */
const DEFAULT_HEADERS = {
  Accept:
    "text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,*/*;q=0.8",
  "Accept-Encoding": "gzip, deflate",
  "Accept-Language": "en-US,en;q=0.5",

  Referer: "https://www.google.com/",
  "Sec-Fetch-Dest": "document",
  "Sec-Fetch-Mode": "navigate",
  "Sec-Fetch-Site": "cross-site",
  "Upgrade-Insecure-Requests": "1",
  "User-Agent":
    "Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:109.0) Gecko/20100101 Firefox/111.0",
};

/**
 * Launches a Puppeteer browser, navigates to `baseUrl` with the given extra
 * HTTP headers, and returns the page content.
 *
 * The browser is closed in a `finally` block, so it is also shut down when
 * opening the page, setting the headers, or navigating throws.
 *
 * @param baseUrl URL to navigate to.
 * @param headers Extra HTTP headers for the page; defaults to
 * `DEFAULT_HEADERS`.
 * @returns The content of the page as returned by `page.content()`.
 */
export const getHtml = async (
  baseUrl: string,
  headers: Headers = DEFAULT_HEADERS
) => {
  const browser = await launch({
    args: ["--no-sandbox", "--disable-setuid-sandbox"],
    ignoreDefaultArgs: ["--disable-extensions"],
  });

  try {
    const page = await browser.newPage();

    await page.setExtraHTTPHeaders(headers);
    await page.goto(baseUrl, { waitUntil: "networkidle0" });

    return await page.content();
  } finally {
    // Always release the browser, even when navigation fails.
    await browser.close();
  }
};
Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

This can be `Record<string, string>`.


/**
 * Defines the arguments that can be passed to the PuppeteerBrowser
 * constructor. It extends the ToolParams interface and includes properties
 * for a language model, embeddings, HTTP headers, and a text splitter.
 */
export interface PuppeteerBrowserArgs extends ToolParams {
  /** Language model invoked with the final prompt built from the page text. */
  model: LanguageModelLike;

  /** Embeddings used to build the in-memory vector store for task searches. */
  embeddings: EmbeddingsInterface;

  /** Extra HTTP headers for the page request; DEFAULT_HEADERS when omitted. */
  headers?: Headers;

  /**
   * Splitter applied to the extracted page HTML. When omitted, a
   * RecursiveCharacterTextSplitter with chunkSize 2000 and chunkOverlap 200
   * is used.
   */
  textSplitter?: TextSplitter;
}

/**
 * Tool that loads a web page through Puppeteer and asks a language model to
 * either summarize the page or extract what the caller is looking for.
 *
 * Input is a single string of the form `"<url>","<task>"`; an empty or
 * missing task requests a summary.
 */
export class PuppeteerBrowser extends Tool {
  name = "puppeteer-browser";

  description = `Useful for when you need to find something on a webpage. Input should be a comma separated list of "ONE valid URL including protocol","What you want to find on the page or empty string for a summary"`;

  static lc_name() {
    return "PuppeteerBrowser";
  }

  get lc_namespace() {
    return [...super.lc_namespace, "puppeteer"];
  }

  private model: LanguageModelLike;

  private embeddings: EmbeddingsInterface;

  private headers: Headers;

  private textSplitter: TextSplitter;

  /**
   * @param args Model and embeddings are required; headers fall back to
   * DEFAULT_HEADERS and the text splitter to a RecursiveCharacterTextSplitter
   * (chunkSize 2000, chunkOverlap 200).
   */
  constructor({
    model,
    headers,
    embeddings,
    textSplitter,
  }: PuppeteerBrowserArgs) {
    super(...arguments);

    this.model = model;
    this.embeddings = embeddings;
    this.headers = headers ?? DEFAULT_HEADERS;

    this.textSplitter =
      textSplitter ??
      new RecursiveCharacterTextSplitter({
        chunkSize: 2000,
        chunkOverlap: 200,
      });
  }

  /**
   * Fetches the page, selects the relevant text, and queries the model.
   *
   * NOTE(review): a reviewer suggested extending StructuredTool with a
   * schema instead of parsing comma separated values; that would change the
   * tool's input contract, so it is left as a follow-up.
   *
   * @param inputs Raw tool input, parsed by `parseInputs`.
   * @returns The model's answer as a string, or the stringified error when
   * the page could not be loaded.
   */
  async _call(inputs: string) {
    const [baseUrl, task] = parseInputs(inputs);
    const doSummary = !task;

    let text: string;
    try {
      const html = await getHtml(baseUrl, this.headers);
      text = await getRelevantHtml(html);
    } catch (e) {
      // Report the failure to the agent as tool output rather than throwing.
      if (e) {
        return String(e);
      }
      return "There was a problem connecting to the site";
    }

    const texts = await this.textSplitter.splitText(text);

    let context: string;
    if (doSummary) {
      // A summary only looks at the first four chunks of the page.
      context = texts.slice(0, 4).join("\n");
    } else {
      const docs = texts.map(
        (pageContent: string) =>
          new Document({
            pageContent,
            // Metadata is an object keyed by string, not an array.
            metadata: {},
          })
      );

      const vectorStore = await MemoryVectorStore.fromDocuments(
        docs,
        this.embeddings
      );
      // Keep the four chunks most similar to the requested task.
      const results = await vectorStore.similaritySearch(task, 4);
      context = formatDocumentsAsString(results);
    }

    const input = `Text:${context}\n\nI need ${
      doSummary ? "a summary" : task
    } from the above text, also provide up to 5 markdown links from within that would be of interest (always including URL and text). Links should be provided, if present, in markdown syntax as a list under the heading "Relevant Links:".`;

    const chain = RunnableSequence.from([this.model, new StringOutputParser()]);
    return chain.invoke(input);
  }
}
33 changes: 33 additions & 0 deletions libs/langchain-community/src/tools/tests/puppeteer.test.ts
Original file line number Diff line number Diff line change
@@ -0,0 +1,33 @@
import { test, expect } from "@jest/globals";
import { getRelevantHtml, parseInputs } from "../puppeteer.js";

// Verifies that script, svg and style elements are stripped from the body.
test("getRelevantHtml should extract relevant parts of the html", async () => {
  const html =
    "<html><body><div><svg>ahh</svg><strong>Hello,</strong> world!<style>*{background-color: pink;}</style><script>Remove me!</script></div></body>";

  // getRelevantHtml is async: await it so the assertion compares the
  // resolved string rather than the pending Promise.
  expect(await getRelevantHtml(html)).toBe(
    "<div><strong>Hello,</strong> world!</div>"
  );
});

// Table-driven check of parseInputs over well-formed and truncated inputs.
test("parseInputs", () => {
  const site = "https://supermagictaste.com";

  // Each row is [raw tool input, expected [baseUrl, task] pair].
  const cases: Array<[string, [string, string | undefined]]> = [
    [`"https://supermagictaste.com",""`, [site, ""]],
    [`"https://supermagictaste.com","word of the day"`, [site, "word of the day"]],
    [`"https://supermagictaste.com","`, [site, ""]],
    [`"https://supermagictaste.com",`, [site, ""]],
    [`"https://supermagictaste.com"`, [site, undefined]],
  ];

  for (const [raw, expected] of cases) {
    expect(parseInputs(raw)).toEqual(expected);
  }
});
Loading