diff --git a/deno.json b/deno.json
index 35e488d9e0e0..367aeecd135f 100644
--- a/deno.json
+++ b/deno.json
@@ -22,7 +22,6 @@
"zod": "npm:/zod",
"zod-to-json-schema": "npm:/zod-to-json-schema",
"node-llama-cpp": "npm:/node-llama-cpp",
- "ml-distance": "npm:/ml-distance",
"pdf-parse": "npm:/pdf-parse",
"peggy": "npm:/peggy",
"readline": "https://deno.land/x/readline@v1.1.0/mod.ts",
@@ -30,6 +29,14 @@
"youtubei.js": "npm:/youtubei.js",
"youtube-transcript": "npm:/youtube-transcript",
"neo4j-driver": "npm:/neo4j-driver",
- "axios": "npm:/axios"
+ "axios": "npm:/axios",
+ "@mendable/firecrawl-js": "npm:/@mendable/firecrawl-js",
+ "@aws-crypto/sha256-js": "npm:/@aws-crypto/sha256-js",
+ "@aws-sdk/credential-provider-node": "npm:/@aws-sdk/credential-provider-node",
+ "@smithy/protocol-http": "npm:/@smithy/protocol-http",
+ "@smithy/signature-v4": "npm:/@smithy/signature-v4",
+ "@smithy/eventstream-codec": "npm:/@smithy/eventstream-codec",
+ "@smithy/util-utf8": "npm:/@smithy/util-utf8",
+ "@aws-sdk/types": "npm:/@aws-sdk/types"
}
}
\ No newline at end of file
diff --git a/docs/core_docs/.gitignore b/docs/core_docs/.gitignore
index df8abd01615f..3b2f9ca94e78 100644
--- a/docs/core_docs/.gitignore
+++ b/docs/core_docs/.gitignore
@@ -34,6 +34,26 @@ yarn-error.log*
/.quarto/
# AUTO_GENERATED_DOCS
+docs/tutorials/rag.md
+docs/tutorials/rag.mdx
+docs/tutorials/query_analysis.md
+docs/tutorials/query_analysis.mdx
+docs/tutorials/qa_chat_history.md
+docs/tutorials/qa_chat_history.mdx
+docs/tutorials/pdf_qa.md
+docs/tutorials/pdf_qa.mdx
+docs/tutorials/local_rag.md
+docs/tutorials/local_rag.mdx
+docs/tutorials/llm_chain.md
+docs/tutorials/llm_chain.mdx
+docs/tutorials/graph.md
+docs/tutorials/graph.mdx
+docs/tutorials/extraction.md
+docs/tutorials/extraction.mdx
+docs/tutorials/classification.md
+docs/tutorials/classification.mdx
+docs/tutorials/chatbot.md
+docs/tutorials/chatbot.mdx
docs/how_to/trim_messages.md
docs/how_to/trim_messages.mdx
docs/how_to/tools_prompting.md
@@ -188,27 +208,29 @@ docs/how_to/assign.md
docs/how_to/assign.mdx
docs/how_to/agent_executor.md
docs/how_to/agent_executor.mdx
-docs/tutorials/rag.md
-docs/tutorials/rag.mdx
-docs/tutorials/query_analysis.md
-docs/tutorials/query_analysis.mdx
-docs/tutorials/qa_chat_history.md
-docs/tutorials/qa_chat_history.mdx
-docs/tutorials/pdf_qa.md
-docs/tutorials/pdf_qa.mdx
-docs/tutorials/local_rag.md
-docs/tutorials/local_rag.mdx
-docs/tutorials/llm_chain.md
-docs/tutorials/llm_chain.mdx
-docs/tutorials/graph.md
-docs/tutorials/graph.mdx
-docs/tutorials/extraction.md
-docs/tutorials/extraction.mdx
-docs/tutorials/classification.md
-docs/tutorials/classification.mdx
-docs/tutorials/chatbot.md
-docs/tutorials/chatbot.mdx
docs/integrations/llms/mistral.md
docs/integrations/llms/mistral.mdx
+docs/integrations/chat/togetherai.md
+docs/integrations/chat/togetherai.mdx
+docs/integrations/chat/openai.md
+docs/integrations/chat/openai.mdx
+docs/integrations/chat/ollama.md
+docs/integrations/chat/ollama.mdx
docs/integrations/chat/mistral.md
-docs/integrations/chat/mistral.mdx
\ No newline at end of file
+docs/integrations/chat/mistral.mdx
+docs/integrations/chat/groq.md
+docs/integrations/chat/groq.mdx
+docs/integrations/chat/google_vertex_ai.md
+docs/integrations/chat/google_vertex_ai.mdx
+docs/integrations/chat/google_generativeai.md
+docs/integrations/chat/google_generativeai.mdx
+docs/integrations/chat/fireworks.md
+docs/integrations/chat/fireworks.mdx
+docs/integrations/chat/cohere.md
+docs/integrations/chat/cohere.mdx
+docs/integrations/chat/azure.md
+docs/integrations/chat/azure.mdx
+docs/integrations/chat/anthropic.md
+docs/integrations/chat/anthropic.mdx
+docs/integrations/document_loaders/web_loaders/web_cheerio.md
+docs/integrations/document_loaders/web_loaders/web_cheerio.mdx
\ No newline at end of file
diff --git a/docs/core_docs/docs/integrations/chat/google_generativeai.ipynb b/docs/core_docs/docs/integrations/chat/google_generativeai.ipynb
index af47303b2917..192339ddce01 100644
--- a/docs/core_docs/docs/integrations/chat/google_generativeai.ipynb
+++ b/docs/core_docs/docs/integrations/chat/google_generativeai.ipynb
@@ -326,23 +326,19 @@
"```{=mdx}\n",
"\n",
":::caution\n",
+ "\n",
"The Google GenerativeAI API does not allow tool schemas to contain an object with unknown properties.\n",
"\n",
- "For example, the following Zod schema will throw an error:\n",
+ "For example, the following Zod schemas will throw an error:\n",
"\n",
- "```typescript\n",
- "const schema = z.object({\n",
- " properties: z.record(z.unknown()), // Not allowed\n",
- "});\n",
- "```\n",
+ "`const invalidSchema = z.object({ properties: z.record(z.unknown()) });`\n",
"\n",
- "or\n",
+ "and\n",
"\n",
- "```typescript\n",
- "const schema = z.record(z.unknown()); // Not allowed\n",
- "```\n",
+ "`const invalidSchema2 = z.record(z.unknown());`\n",
"\n",
"Instead, you should explicitly define the properties of the object field.\n",
+ "\n",
":::\n",
"\n",
"```\n"
diff --git a/docs/core_docs/docs/integrations/document_loaders/file_loaders/csv.ipynb b/docs/core_docs/docs/integrations/document_loaders/file_loaders/csv.ipynb
new file mode 100644
index 000000000000..5f0f34c143d5
--- /dev/null
+++ b/docs/core_docs/docs/integrations/document_loaders/file_loaders/csv.ipynb
@@ -0,0 +1,226 @@
+{
+ "cells": [
+ {
+ "cell_type": "raw",
+ "metadata": {
+ "vscode": {
+ "languageId": "raw"
+ }
+ },
+ "source": [
+ "---\n",
+ "sidebar_label: CSV\n",
+ "sidebar_class_name: node-only\n",
+ "---"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "# CSVLoader\n",
+ "\n",
+ "```{=mdx}\n",
+ "\n",
+ ":::tip Compatibility\n",
+ "\n",
+ "Only available on Node.js.\n",
+ "\n",
+ ":::\n",
+ "\n",
+ "```\n",
+ "\n",
+ "This notebook provides a quick overview for getting started with `CSVLoader` [document loaders](/docs/concepts/#document-loaders). For detailed documentation of all `CSVLoader` features and configurations head to the [API reference](https://api.js.langchain.com/classes/langchain_community_document_loaders_fs_csv.CSVLoader.html).\n",
+ "\n",
+ "This example goes over how to load data from CSV files. The second argument is the `column` name to extract from the CSV file. One document will be created for each row in the CSV file. When `column` is not specified, each row is converted into a key/value pair with each key/value pair output on a new line in the document's `pageContent`. When `column` is specified, one document is created for each row, and the value of the specified column is used as the document's `pageContent`.\n",
+ "\n",
+ "## Overview\n",
+ "### Integration details\n",
+ "\n",
+ "| Class | Package | Compatibility | Local | [PY support](https://python.langchain.com/docs/integrations/document_loaders/csv)| \n",
+ "| :--- | :--- | :---: | :---: | :---: |\n",
+ "| [CSVLoader](https://api.js.langchain.com/classes/langchain_community_document_loaders_fs_csv.CSVLoader.html) | [@langchain/community](https://api.js.langchain.com/modules/langchain_community_document_loaders_fs_csv.html) | Node-only | ✅ | ✅ |\n",
+ "\n",
+ "## Setup\n",
+ "\n",
+ "To access `CSVLoader` document loader you'll need to install the `@langchain/community` integration, along with the `d3-dsv@2` peer dependency.\n",
+ "\n",
+ "### Installation\n",
+ "\n",
+ "The LangChain CSVLoader integration lives in the `@langchain/community` integration package.\n",
+ "\n",
+ "```{=mdx}\n",
+ "import IntegrationInstallTooltip from \"@mdx_components/integration_install_tooltip.mdx\";\n",
+ "import Npm2Yarn from \"@theme/Npm2Yarn\";\n",
+ "\n",
+ "\n",
+ "\n",
+ "\n",
+ " @langchain/community d3-dsv@2\n",
+ "\n",
+ "\n",
+ "```"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "## Instantiation\n",
+ "\n",
+ "Now we can instantiate our model object and load documents:"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 8,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "import { CSVLoader } from \"@langchain/community/document_loaders/fs/csv\"\n",
+ "\n",
+ "const exampleCsvPath = \"../../../../../../langchain/src/document_loaders/tests/example_data/example_separator.csv\";\n",
+ "\n",
+ "const loader = new CSVLoader(exampleCsvPath)"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "## Load"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 6,
+ "metadata": {},
+ "outputs": [
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "Document {\n",
+ " pageContent: 'id|html: 1|\"Corruption discovered at the core of the Banking Clan!\"',\n",
+ " metadata: {\n",
+ " source: '../../../../../../langchain/src/document_loaders/tests/example_data/example_separator.csv',\n",
+ " line: 1\n",
+ " },\n",
+ " id: undefined\n",
+ "}\n"
+ ]
+ }
+ ],
+ "source": [
+ "const docs = await loader.load()\n",
+ "docs[0]"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 7,
+ "metadata": {},
+ "outputs": [
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "{\n",
+ " source: '../../../../../../langchain/src/document_loaders/tests/example_data/example_separator.csv',\n",
+ " line: 1\n",
+ "}\n"
+ ]
+ }
+ ],
+ "source": [
+ "console.log(docs[0].metadata)"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "## Usage, extracting a single column\n",
+ "\n",
+ "Example CSV file:\n",
+ "\n",
+ "```csv\n",
+ "id|html\n",
+ "1|\"Corruption discovered at the core of the Banking Clan!\"\n",
+ "2|\"Reunited, Rush Clovis and Senator Amidala\"\n",
+ "3|\"discover the full extent of the deception.\"\n",
+ "4|\"Anakin Skywalker is sent to the rescue!\"\n",
+ "```"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 14,
+ "metadata": {},
+ "outputs": [
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "Document {\n",
+ " pageContent: 'Corruption discovered at the core of the Banking Clan!',\n",
+ " metadata: {\n",
+ " source: '../../../../../../langchain/src/document_loaders/tests/example_data/example_separator.csv',\n",
+ " line: 1\n",
+ " },\n",
+ " id: undefined\n",
+ "}\n"
+ ]
+ }
+ ],
+ "source": [
+ "import { CSVLoader } from \"@langchain/community/document_loaders/fs/csv\";\n",
+ "\n",
+ "const singleColumnLoader = new CSVLoader(\n",
+ " exampleCsvPath,\n",
+ " {\n",
+ " column: \"html\",\n",
+ " separator:\"|\"\n",
+ " }\n",
+ ");\n",
+ "\n",
+ "const singleColumnDocs = await singleColumnLoader.load();\n",
+ "console.log(singleColumnDocs[0]);"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "## API reference\n",
+ "\n",
+ "For detailed documentation of all CSVLoader features and configurations head to the API reference: https://api.js.langchain.com/classes/langchain_community_document_loaders_fs_csv.CSVLoader.html"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": []
+ }
+ ],
+ "metadata": {
+ "kernelspec": {
+ "display_name": "TypeScript",
+ "language": "typescript",
+ "name": "tslab"
+ },
+ "language_info": {
+ "codemirror_mode": {
+ "mode": "typescript",
+ "name": "javascript",
+ "typescript": true
+ },
+ "file_extension": ".ts",
+ "mimetype": "text/typescript",
+ "name": "typescript",
+ "version": "3.7.2"
+ }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 4
+}
diff --git a/docs/core_docs/docs/integrations/document_loaders/file_loaders/csv.mdx b/docs/core_docs/docs/integrations/document_loaders/file_loaders/csv.mdx
deleted file mode 100644
index e9adf18540a8..000000000000
--- a/docs/core_docs/docs/integrations/document_loaders/file_loaders/csv.mdx
+++ /dev/null
@@ -1,90 +0,0 @@
-# CSV files
-
-This example goes over how to load data from CSV files. The second argument is the `column` name to extract from the CSV file. One document will be created for each row in the CSV file. When `column` is not specified, each row is converted into a key/value pair with each key/value pair outputted to a new line in the document's `pageContent`. When `column` is specified, one document is created for each row, and the value of the specified column is used as the document's pageContent.
-
-## Setup
-
-```bash npm2yarn
-npm install d3-dsv@2
-```
-
-## Usage, extracting all columns
-
-Example CSV file:
-
-```csv
-id,text
-1,This is a sentence.
-2,This is another sentence.
-```
-
-Example code:
-
-```typescript
-import { CSVLoader } from "@langchain/community/document_loaders/fs/csv";
-
-const loader = new CSVLoader("src/document_loaders/example_data/example.csv");
-
-const docs = await loader.load();
-/*
-[
- Document {
- "metadata": {
- "line": 1,
- "source": "src/document_loaders/example_data/example.csv",
- },
- "pageContent": "id: 1
-text: This is a sentence.",
- },
- Document {
- "metadata": {
- "line": 2,
- "source": "src/document_loaders/example_data/example.csv",
- },
- "pageContent": "id: 2
-text: This is another sentence.",
- },
-]
-*/
-```
-
-## Usage, extracting a single column
-
-Example CSV file:
-
-```csv
-id,text
-1,This is a sentence.
-2,This is another sentence.
-```
-
-Example code:
-
-```typescript
-import { CSVLoader } from "@langchain/community/document_loaders/fs/csv";
-
-const loader = new CSVLoader(
- "src/document_loaders/example_data/example.csv",
- "text"
-);
-
-const docs = await loader.load();
-/*
-[
- Document {
- "metadata": {
- "line": 1,
- "source": "src/document_loaders/example_data/example.csv",
- },
- "pageContent": "This is a sentence.",
- },
- Document {
- "metadata": {
- "line": 2,
- "source": "src/document_loaders/example_data/example.csv",
- },
- "pageContent": "This is another sentence.",
- },
-]
-*/
-```
diff --git a/docs/core_docs/docs/integrations/document_loaders/file_loaders/directory.ipynb b/docs/core_docs/docs/integrations/document_loaders/file_loaders/directory.ipynb
new file mode 100644
index 000000000000..3d19d94677d2
--- /dev/null
+++ b/docs/core_docs/docs/integrations/document_loaders/file_loaders/directory.ipynb
@@ -0,0 +1,192 @@
+{
+ "cells": [
+ {
+ "cell_type": "raw",
+ "metadata": {},
+ "source": [
+ "---\n",
+ "sidebar_label: DirectoryLoader\n",
+ "sidebar_class_name: node-only\n",
+ "---"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "# DirectoryLoader\n",
+ "\n",
+ "```{=mdx}\n",
+ "\n",
+ ":::tip Compatibility\n",
+ "\n",
+ "Only available on Node.js.\n",
+ "\n",
+ ":::\n",
+ "\n",
+ "```\n",
+ "\n",
+ "This notebook provides a quick overview for getting started with `DirectoryLoader` [document loaders](/docs/concepts/#document-loaders). For detailed documentation of all `DirectoryLoader` features and configurations head to the [API reference](https://api.js.langchain.com/classes/langchain_document_loaders_fs_directory.DirectoryLoader.html).\n",
+ "\n",
+ "This example goes over how to load data from folders with multiple files. The second argument is a map of file extensions to loader factories. Each file will be passed to the matching loader, and the resulting documents will be concatenated together.\n",
+ "\n",
+ "Example folder:\n",
+ "\n",
+ "```text\n",
+ "src/document_loaders/example_data/example/\n",
+ "├── example.json\n",
+ "├── example.jsonl\n",
+ "├── example.txt\n",
+ "└── example.csv\n",
+ "```\n",
+ "\n",
+ "## Overview\n",
+ "### Integration details\n",
+ "\n",
+ "| Class | Package | Compatibility | Local | PY support | \n",
+ "| :--- | :--- | :---: | :---: | :---: |\n",
+ "| [DirectoryLoader](https://api.js.langchain.com/classes/langchain_document_loaders_fs_directory.DirectoryLoader.html) | [langchain](https://api.js.langchain.com/modules/langchain_document_loaders_fs_directory.html) | Node-only | ✅ | ✅ |\n",
+ "\n",
+ "## Setup\n",
+ "\n",
+ "To access `DirectoryLoader` document loader you'll need to install the `langchain` package.\n",
+ "\n",
+ "### Installation\n",
+ "\n",
+ "The LangChain DirectoryLoader integration lives in the `langchain` package:\n",
+ "\n",
+ "```{=mdx}\n",
+ "import IntegrationInstallTooltip from \"@mdx_components/integration_install_tooltip.mdx\";\n",
+ "import Npm2Yarn from \"@theme/Npm2Yarn\";\n",
+ "\n",
+ "\n",
+ "\n",
+ "\n",
+ " langchain\n",
+ "\n",
+ "\n",
+ "```"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "## Instantiation\n",
+ "\n",
+ "Now we can instantiate our model object and load documents:"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 1,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "import { DirectoryLoader } from \"langchain/document_loaders/fs/directory\";\n",
+ "import {\n",
+ " JSONLoader,\n",
+ " JSONLinesLoader,\n",
+ "} from \"langchain/document_loaders/fs/json\";\n",
+ "import { TextLoader } from \"langchain/document_loaders/fs/text\";\n",
+ "import { CSVLoader } from \"@langchain/community/document_loaders/fs/csv\";\n",
+ "\n",
+ "const loader = new DirectoryLoader(\n",
+ " \"../../../../../../examples/src/document_loaders/example_data\",\n",
+ " {\n",
+ " \".json\": (path) => new JSONLoader(path, \"/texts\"),\n",
+ " \".jsonl\": (path) => new JSONLinesLoader(path, \"/html\"),\n",
+ " \".txt\": (path) => new TextLoader(path),\n",
+ " \".csv\": (path) => new CSVLoader(path, \"text\"),\n",
+ " }\n",
+ ");"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "## Load"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 2,
+ "metadata": {},
+ "outputs": [
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "Document {\n",
+ " pageContent: 'Foo\\nBar\\nBaz\\n\\n',\n",
+ " metadata: {\n",
+ " source: '/Users/bracesproul/code/lang-chain-ai/langchainjs/examples/src/document_loaders/example_data/example.txt'\n",
+ " },\n",
+ " id: undefined\n",
+ "}\n"
+ ]
+ }
+ ],
+ "source": [
+ "const docs = await loader.load()\n",
+ "// disable console.warn calls\n",
+ "console.warn = () => {}\n",
+ "docs[0]"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 7,
+ "metadata": {},
+ "outputs": [
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "{\n",
+ " source: '/Users/bracesproul/code/lang-chain-ai/langchainjs/examples/src/document_loaders/example_data/example.txt'\n",
+ "}\n"
+ ]
+ }
+ ],
+ "source": [
+ "console.log(docs[0].metadata)"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "## API reference\n",
+ "\n",
+ "For detailed documentation of all DirectoryLoader features and configurations head to the API reference: https://api.js.langchain.com/classes/langchain_document_loaders_fs_directory.DirectoryLoader.html"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": []
+ }
+ ],
+ "metadata": {
+ "kernelspec": {
+ "display_name": "TypeScript",
+ "language": "typescript",
+ "name": "tslab"
+ },
+ "language_info": {
+ "codemirror_mode": {
+ "mode": "typescript",
+ "name": "javascript",
+ "typescript": true
+ },
+ "file_extension": ".ts",
+ "mimetype": "text/typescript",
+ "name": "typescript",
+ "version": "3.7.2"
+ }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 4
+}
diff --git a/docs/core_docs/docs/integrations/document_loaders/file_loaders/directory.mdx b/docs/core_docs/docs/integrations/document_loaders/file_loaders/directory.mdx
deleted file mode 100644
index a0c3f67ad700..000000000000
--- a/docs/core_docs/docs/integrations/document_loaders/file_loaders/directory.mdx
+++ /dev/null
@@ -1,42 +0,0 @@
----
-sidebar_position: 1
-hide_table_of_contents: true
----
-
-# Folders with multiple files
-
-This example goes over how to load data from folders with multiple files. The second argument is a map of file extensions to loader factories. Each file will be passed to the matching loader, and the resulting documents will be concatenated together.
-
-Example folder:
-
-```text
-src/document_loaders/example_data/example/
-├── example.json
-├── example.jsonl
-├── example.txt
-└── example.csv
-```
-
-Example code:
-
-```typescript
-import { DirectoryLoader } from "langchain/document_loaders/fs/directory";
-import {
- JSONLoader,
- JSONLinesLoader,
-} from "langchain/document_loaders/fs/json";
-import { TextLoader } from "langchain/document_loaders/fs/text";
-import { CSVLoader } from "@langchain/community/document_loaders/fs/csv";
-
-const loader = new DirectoryLoader(
- "src/document_loaders/example_data/example",
- {
- ".json": (path) => new JSONLoader(path, "/texts"),
- ".jsonl": (path) => new JSONLinesLoader(path, "/html"),
- ".txt": (path) => new TextLoader(path),
- ".csv": (path) => new CSVLoader(path, "text"),
- }
-);
-const docs = await loader.load();
-console.log({ docs });
-```
diff --git a/docs/core_docs/docs/integrations/document_loaders/file_loaders/pdf.ipynb b/docs/core_docs/docs/integrations/document_loaders/file_loaders/pdf.ipynb
new file mode 100644
index 000000000000..ac0092586134
--- /dev/null
+++ b/docs/core_docs/docs/integrations/document_loaders/file_loaders/pdf.ipynb
@@ -0,0 +1,502 @@
+{
+ "cells": [
+ {
+ "cell_type": "raw",
+ "metadata": {},
+ "source": [
+ "---\n",
+ "sidebar_label: PDFLoader\n",
+ "sidebar_class_name: node-only\n",
+ "---"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "# PDFLoader\n",
+ "\n",
+ "```{=mdx}\n",
+ "\n",
+ ":::tip Compatibility\n",
+ "\n",
+ "Only available on Node.js.\n",
+ "\n",
+ ":::\n",
+ "\n",
+ "```\n",
+ "\n",
+ "This notebook provides a quick overview for getting started with `PDFLoader` [document loaders](/docs/concepts/#document-loaders). For detailed documentation of all `PDFLoader` features and configurations head to the [API reference](https://api.js.langchain.com/classes/langchain_community_document_loaders_fs_pdf.PDFLoader.html).\n",
+ "\n",
+ "## Overview\n",
+ "### Integration details\n",
+ "\n",
+ "| Class | Package | Compatibility | Local | PY support | \n",
+ "| :--- | :--- | :---: | :---: | :---: |\n",
+ "| [PDFLoader](https://api.js.langchain.com/classes/langchain_community_document_loaders_fs_pdf.PDFLoader.html) | [@langchain/community](https://api.js.langchain.com/modules/langchain_community_document_loaders_fs_pdf.html) | Node-only | ✅ | 🟠 (See note below) |\n",
+ "\n",
+ "> The Python package has many PDF loaders to choose from. See [this link](https://python.langchain.com/docs/integrations/document_loaders/) for a full list of Python document loaders.\n",
+ "\n",
+ "## Setup\n",
+ "\n",
+ "To access `PDFLoader` document loader you'll need to install the `@langchain/community` integration, along with the `pdf-parse` package.\n",
+ "\n",
+ "### Credentials\n",
+ "\n",
+ "No credentials are required to use the `PDFLoader`.\n",
+ "\n",
+ "### Installation\n",
+ "\n",
+ "The LangChain PDFLoader integration lives in the `@langchain/community` package:\n",
+ "\n",
+ "```{=mdx}\n",
+ "import IntegrationInstallTooltip from \"@mdx_components/integration_install_tooltip.mdx\";\n",
+ "import Npm2Yarn from \"@theme/Npm2Yarn\";\n",
+ "\n",
+ "\n",
+ "\n",
+ "\n",
+ " @langchain/community pdf-parse\n",
+ "\n",
+ "\n",
+ "```"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "## Instantiation\n",
+ "\n",
+ "Now we can instantiate our model object and load documents:"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 10,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "import { PDFLoader } from \"@langchain/community/document_loaders/fs/pdf\"\n",
+ "\n",
+ "const nike10kPdfPath = \"../../../../data/nke-10k-2023.pdf\"\n",
+ "\n",
+ "const loader = new PDFLoader(nike10kPdfPath)"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "## Load"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 4,
+ "metadata": {},
+ "outputs": [
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "Document {\n",
+ " pageContent: 'Table of Contents\\n' +\n",
+ " 'UNITED STATES\\n' +\n",
+ " 'SECURITIES AND EXCHANGE COMMISSION\\n' +\n",
+ " 'Washington, D.C. 20549\\n' +\n",
+ " 'FORM 10-K\\n' +\n",
+ " '(Mark One)\\n' +\n",
+ " '☑ ANNUAL REPORT PURSUANT TO SECTION 13 OR 15(D) OF THE SECURITIES EXCHANGE ACT OF 1934\\n' +\n",
+ " 'FOR THE FISCAL YEAR ENDED MAY 31, 2023\\n' +\n",
+ " 'OR\\n' +\n",
+ " '☐ TRANSITION REPORT PURSUANT TO SECTION 13 OR 15(D) OF THE SECURITIES EXCHANGE ACT OF 1934\\n' +\n",
+ " 'FOR THE TRANSITION PERIOD FROM TO .\\n' +\n",
+ " 'Commission File No. 1-10635\\n' +\n",
+ " 'NIKE, Inc.\\n' +\n",
+ " '(Exact name of Registrant as specified in its charter)\\n' +\n",
+ " 'Oregon93-0584541\\n' +\n",
+ " '(State or other jurisdiction of incorporation)(IRS Employer Identification No.)\\n' +\n",
+ " 'One Bowerman Drive, Beaverton, Oregon 97005-6453\\n' +\n",
+ " '(Address of principal executive offices and zip code)\\n' +\n",
+ " '(503) 671-6453\\n' +\n",
+ " \"(Registrant's telephone number, including area code)\\n\" +\n",
+ " 'SECURITIES REGISTERED PURSUANT TO SECTION 12(B) OF THE ACT:\\n' +\n",
+ " 'Class B Common StockNKENew York Stock Exchange\\n' +\n",
+ " '(Title of each class)(Trading symbol)(Name of each exchange on which registered)\\n' +\n",
+ " 'SECURITIES REGISTERED PURSUANT TO SECTION 12(G) OF THE ACT:\\n' +\n",
+ " 'NONE\\n' +\n",
+ " 'Indicate by check mark:YESNO\\n' +\n",
+ " '•if the registrant is a well-known seasoned issuer, as defined in Rule 405 of the Securities Act.þ ̈\\n' +\n",
+ " '•if the registrant is not required to file reports pursuant to Section 13 or Section 15(d) of the Act. ̈þ\\n' +\n",
+ " '•whether the registrant (1) has filed all reports required to be filed by Section 13 or 15(d) of the Securities Exchange Act of 1934 during the preceding\\n' +\n",
+ " '12 months (or for such shorter period that the registrant was required to file such reports), and (2) has been subject to such filing requirements for the\\n' +\n",
+ " 'past 90 days.\\n' +\n",
+ " 'þ ̈\\n' +\n",
+ " '•whether the registrant has submitted electronically every Interactive Data File required to be submitted pursuant to Rule 405 of Regulation S-T\\n' +\n",
+ " '(§232.405 of this chapter) during the preceding 12 months (or for such shorter period that the registrant was required to submit such files).\\n' +\n",
+ " 'þ ̈\\n' +\n",
+ " '•whether the registrant is a large accelerated filer, an accelerated filer, a non-accelerated filer, a smaller reporting company or an emerging growth company. See the definitions of “large accelerated filer,”\\n' +\n",
+ " '“accelerated filer,” “smaller reporting company,” and “emerging growth company” in Rule 12b-2 of the Exchange Act.\\n' +\n",
+ " 'Large accelerated filerþAccelerated filer☐Non-accelerated filer☐Smaller reporting company☐Emerging growth company☐\\n' +\n",
+ " '•if an emerging growth company, if the registrant has elected not to use the extended transition period for complying with any new or revised financial\\n' +\n",
+ " 'accounting standards provided pursuant to Section 13(a) of the Exchange Act.\\n' +\n",
+ " ' ̈\\n' +\n",
+ " \"•whether the registrant has filed a report on and attestation to its management's assessment of the effectiveness of its internal control over financial\\n\" +\n",
+ " 'reporting under Section 404(b) of the Sarbanes-Oxley Act (15 U.S.C. 7262(b)) by the registered public accounting firm that prepared or issued its audit\\n' +\n",
+ " 'report.\\n' +\n",
+ " 'þ\\n' +\n",
+ " '•if securities are registered pursuant to Section 12(b) of the Act, whether the financial statements of the registrant included in the filing reflect the\\n' +\n",
+ " 'correction of an error to previously issued financial statements.\\n' +\n",
+ " ' ̈\\n' +\n",
+ " '•whether any of those error corrections are restatements that required a recovery analysis of incentive-based compensation received by any of the\\n' +\n",
+ " \"registrant's executive officers during the relevant recovery period pursuant to § 240.10D-1(b).\\n\" +\n",
+ " ' ̈\\n' +\n",
+ " '•\\n' +\n",
+ " 'whether the registrant is a shell company (as defined in Rule 12b-2 of the Act).☐þ\\n' +\n",
+ " \"As of November 30, 2022, the aggregate market values of the Registrant's Common Stock held by non-affiliates were:\\n\" +\n",
+ " 'Class A$7,831,564,572 \\n' +\n",
+ " 'Class B136,467,702,472 \\n' +\n",
+ " '$144,299,267,044 ',\n",
+ " metadata: {\n",
+ " source: '../../../../data/nke-10k-2023.pdf',\n",
+ " pdf: {\n",
+ " version: '1.10.100',\n",
+ " info: [Object],\n",
+ " metadata: null,\n",
+ " totalPages: 107\n",
+ " },\n",
+ " loc: { pageNumber: 1 }\n",
+ " },\n",
+ " id: undefined\n",
+ "}\n"
+ ]
+ }
+ ],
+ "source": [
+ "const docs = await loader.load()\n",
+ "docs[0]"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 5,
+ "metadata": {},
+ "outputs": [
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "{\n",
+ " source: '../../../../data/nke-10k-2023.pdf',\n",
+ " pdf: {\n",
+ " version: '1.10.100',\n",
+ " info: {\n",
+ " PDFFormatVersion: '1.4',\n",
+ " IsAcroFormPresent: false,\n",
+ " IsXFAPresent: false,\n",
+ " Title: '0000320187-23-000039',\n",
+ " Author: 'EDGAR Online, a division of Donnelley Financial Solutions',\n",
+ " Subject: 'Form 10-K filed on 2023-07-20 for the period ending 2023-05-31',\n",
+ " Keywords: '0000320187-23-000039; ; 10-K',\n",
+ " Creator: 'EDGAR Filing HTML Converter',\n",
+ " Producer: 'EDGRpdf Service w/ EO.Pdf 22.0.40.0',\n",
+ " CreationDate: \"D:20230720162200-04'00'\",\n",
+ " ModDate: \"D:20230720162208-04'00'\"\n",
+ " },\n",
+ " metadata: null,\n",
+ " totalPages: 107\n",
+ " },\n",
+ " loc: { pageNumber: 1 }\n",
+ "}\n"
+ ]
+ }
+ ],
+ "source": [
+ "console.log(docs[0].metadata)"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "## Usage, one document per file"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 8,
+ "metadata": {},
+ "outputs": [
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "Table of Contents\n",
+ "UNITED STATES\n",
+ "SECURITIES AND EXCHANGE COMMISSION\n",
+ "Washington, D.C. 20549\n",
+ "FORM 10-K\n",
+ "\n"
+ ]
+ }
+ ],
+ "source": [
+ "import { PDFLoader } from \"@langchain/community/document_loaders/fs/pdf\";\n",
+ "\n",
+ "const singleDocPerFileLoader = new PDFLoader(nike10kPdfPath, {\n",
+ " splitPages: false,\n",
+ "});\n",
+ "\n",
+ "const singleDoc = await singleDocPerFileLoader.load();\n",
+ "console.log(singleDoc[0].pageContent.slice(0, 100))"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "## Usage, custom `pdfjs` build\n",
+ "\n",
+ "By default we use the `pdfjs` build bundled with `pdf-parse`, which is compatible with most environments, including Node.js and modern browsers. If you want to use a more recent version of `pdfjs-dist` or if you want to use a custom build of `pdfjs-dist`, you can do so by providing a custom `pdfjs` function that returns a promise that resolves to the `PDFJS` object.\n",
+ "\n",
+ "In the following example we use the \"legacy\" (see [pdfjs docs](https://github.com/mozilla/pdf.js/wiki/Frequently-Asked-Questions#which-browsersenvironments-are-supported)) build of `pdfjs-dist`, which includes several polyfills not included in the default build.\n",
+ "\n",
+ "```{=mdx}\n",
+ "\n",
+ " pdfjs-dist\n",
+ "\n",
+ "\n",
+ "```\n"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "import { PDFLoader } from \"@langchain/community/document_loaders/fs/pdf\";\n",
+ "\n",
+ "const customBuildLoader = new PDFLoader(nike10kPdfPath, {\n",
+ " // you may need to add `.then(m => m.default)` to the end of the import\n",
+ " // @lc-ts-ignore\n",
+ " pdfjs: () => import(\"pdfjs-dist/legacy/build/pdf.js\"),\n",
+ "});"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "## Eliminating extra spaces\n",
+ "\n",
+ "PDFs come in many varieties, which makes reading them a challenge. The loader parses individual text elements and joins them together with a space by default, but\n",
+ "if you are seeing excessive spaces, this may not be the desired behavior. In that case, you can override the separator with an empty string like this:\n"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 12,
+ "metadata": {},
+ "outputs": [
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "(Mark One)\n",
+ "☑ ANNUAL REPORT PURSUANT TO SECTION 13 OR 15(D) OF THE SECURITIES EXCHANGE ACT OF 1934\n",
+ "FOR THE FISCAL YEAR ENDED MAY 31, 2023\n",
+ "OR\n",
+ "☐ TRANSITI\n"
+ ]
+ }
+ ],
+ "source": [
+ "import { PDFLoader } from \"@langchain/community/document_loaders/fs/pdf\";\n",
+ "\n",
+ "const noExtraSpacesLoader = new PDFLoader(nike10kPdfPath, {\n",
+ " parsedItemSeparator: \"\",\n",
+ "});\n",
+ "\n",
+ "const noExtraSpacesDocs = await noExtraSpacesLoader.load();\n",
+ "console.log(noExtraSpacesDocs[0].pageContent.slice(100, 250))"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "## Loading directories"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 17,
+ "metadata": {},
+ "outputs": [
+ {
+ "name": "stderr",
+ "output_type": "stream",
+ "text": [
+ "Unknown file type: Star_Wars_The_Clone_Wars_S06E07_Crisis_at_the_Heart.srt\n",
+ "Unknown file type: example.txt\n",
+ "Unknown file type: notion.md\n",
+ "Unknown file type: bad_frontmatter.md\n",
+ "Unknown file type: frontmatter.md\n",
+ "Unknown file type: no_frontmatter.md\n",
+ "Unknown file type: no_metadata.md\n",
+ "Unknown file type: tags_and_frontmatter.md\n",
+ "Unknown file type: test.mp3\n"
+ ]
+ },
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "Document {\n",
+ " pageContent: 'Bitcoin: A Peer-to-Peer Electronic Cash System\\n' +\n",
+ " 'Satoshi Nakamoto\\n' +\n",
+ " 'satoshin@gmx.com\\n' +\n",
+ " 'www.bitcoin.org\\n' +\n",
+ " 'Abstract. A purely peer-to-peer version of electronic cash would allow online \\n' +\n",
+ " 'payments to be sent directly from one party to another without going through a \\n' +\n",
+ " 'financial institution. Digital signatures provide part of the solution, but the main \\n' +\n",
+ " 'benefits are lost if a trusted third party is still required to prevent double-spending. \\n' +\n",
+ " 'We propose a solution to the double-spending problem using a peer-to-peer network. \\n' +\n",
+ " 'The network timestamps transactions by hashing them into an ongoing chain of \\n' +\n",
+ " 'hash-based proof-of-work, forming a record that cannot be changed without redoing \\n' +\n",
+ " 'the proof-of-work. The longest chain not only serves as proof of the sequence of \\n' +\n",
+ " 'events witnessed, but proof that it came from the largest pool of CPU power. As \\n' +\n",
+ " 'long as a majority of CPU power is controlled by nodes that are not cooperating to \\n' +\n",
+ " \"attack the network, they'll generate the longest chain and outpace attackers. The \\n\" +\n",
+ " 'network itself requires minimal structure. Messages are broadcast on a best effort \\n' +\n",
+ " 'basis, and nodes can leave and rejoin the network at will, accepting the longest \\n' +\n",
+ " 'proof-of-work chain as proof of what happened while they were gone.\\n' +\n",
+ " '1.Introduction\\n' +\n",
+ " 'Commerce on the Internet has come to rely almost exclusively on financial institutions serving as \\n' +\n",
+ " 'trusted third parties to process electronic payments. While the system works well enough for \\n' +\n",
+ " 'most transactions, it still suffers from the inherent weaknesses of the trust based model. \\n' +\n",
+ " 'Completely non-reversible transactions are not really possible, since financial institutions cannot \\n' +\n",
+ " 'avoid mediating disputes. The cost of mediation increases transaction costs, limiting the \\n' +\n",
+ " 'minimum practical transaction size and cutting off the possibility for small casual transactions, \\n' +\n",
+ " 'and there is a broader cost in the loss of ability to make non-reversible payments for non-\\n' +\n",
+ " 'reversible services. With the possibility of reversal, the need for trust spreads. Merchants must \\n' +\n",
+ " 'be wary of their customers, hassling them for more information than they would otherwise need. \\n' +\n",
+ " 'A certain percentage of fraud is accepted as unavoidable. These costs and payment uncertainties \\n' +\n",
+ " 'can be avoided in person by using physical currency, but no mechanism exists to make payments \\n' +\n",
+ " 'over a communications channel without a trusted party.\\n' +\n",
+ " 'What is needed is an electronic payment system based on cryptographic proof instead of trust, \\n' +\n",
+ " 'allowing any two willing parties to transact directly with each other without the need for a trusted \\n' +\n",
+ " 'third party. Transactions that are computationally impractical to reverse would protect sellers \\n' +\n",
+ " 'from fraud, and routine escrow mechanisms could easily be implemented to protect buyers. In \\n' +\n",
+ " 'this paper, we propose a solution to the double-spending problem using a peer-to-peer distributed \\n' +\n",
+ " 'timestamp server to generate computational proof of the chronological order of transactions. The \\n' +\n",
+ " 'system is secure as long as honest nodes collectively control more CPU power than any \\n' +\n",
+ " 'cooperating group of attacker nodes.\\n' +\n",
+ " '1',\n",
+ " metadata: {\n",
+ " source: '/Users/bracesproul/code/lang-chain-ai/langchainjs/examples/src/document_loaders/example_data/bitcoin.pdf',\n",
+ " pdf: {\n",
+ " version: '1.10.100',\n",
+ " info: [Object],\n",
+ " metadata: null,\n",
+ " totalPages: 9\n",
+ " },\n",
+ " loc: { pageNumber: 1 }\n",
+ " },\n",
+ " id: undefined\n",
+ "}\n",
+ "Document {\n",
+ " pageContent: 'Bitcoin: A Peer-to-Peer Electronic Cash System\\n' +\n",
+ " 'Satoshi Nakamoto\\n' +\n",
+ " 'satoshin@gmx.com\\n' +\n",
+ " 'www.bitcoin.org\\n' +\n",
+ " 'Abstract. A purely peer-to-peer version of electronic cash would allow online \\n' +\n",
+ " 'payments to be sent directly from one party to another without going through a \\n' +\n",
+ " 'financial institution. Digital signatures provide part of the solution, but the main \\n' +\n",
+ " 'benefits are lost if a trusted third party is still required to prevent double-spending. \\n' +\n",
+ " 'We propose a solution to the double-spending problem using a peer-to-peer network. \\n' +\n",
+ " 'The network timestamps transactions by hashing them into an ongoing chain of \\n' +\n",
+ " 'hash-based proof-of-work, forming a record that cannot be changed without redoing \\n' +\n",
+ " 'the proof-of-work. The longest chain not only serves as proof of the sequence of \\n' +\n",
+ " 'events witnessed, but proof that it came from the largest pool of CPU power. As \\n' +\n",
+ " 'long as a majority of CPU power is controlled by nodes that are not cooperating to',\n",
+ " metadata: {\n",
+ " source: '/Users/bracesproul/code/lang-chain-ai/langchainjs/examples/src/document_loaders/example_data/bitcoin.pdf',\n",
+ " pdf: {\n",
+ " version: '1.10.100',\n",
+ " info: [Object],\n",
+ " metadata: null,\n",
+ " totalPages: 9\n",
+ " },\n",
+ " loc: { pageNumber: 1, lines: [Object] }\n",
+ " },\n",
+ " id: undefined\n",
+ "}\n"
+ ]
+ }
+ ],
+ "source": [
+ "import { DirectoryLoader } from \"langchain/document_loaders/fs/directory\";\n",
+ "import { PDFLoader } from \"@langchain/community/document_loaders/fs/pdf\";\n",
+ "import { RecursiveCharacterTextSplitter } from \"@langchain/textsplitters\";\n",
+ "\n",
+ "const exampleDataPath = \"../../../../../../examples/src/document_loaders/example_data/\";\n",
+ "\n",
+ "/* Load all PDFs within the specified directory */\n",
+ "const directoryLoader = new DirectoryLoader(\n",
+ " exampleDataPath,\n",
+ " {\n",
+ " \".pdf\": (path: string) => new PDFLoader(path),\n",
+ " }\n",
+ ");\n",
+ "\n",
+ "const directoryDocs = await directoryLoader.load();\n",
+ "\n",
+ "console.log(directoryDocs[0]);\n",
+ "\n",
+ "/* Additional steps : Split text into chunks with any TextSplitter. You can then use it as context or save it to memory afterwards. */\n",
+ "const textSplitter = new RecursiveCharacterTextSplitter({\n",
+ " chunkSize: 1000,\n",
+ " chunkOverlap: 200,\n",
+ "});\n",
+ "\n",
+ "const splitDocs = await textSplitter.splitDocuments(directoryDocs);\n",
+ "console.log(splitDocs[0]);\n"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "## API reference\n",
+ "\n",
+ "For detailed documentation of all PDFLoader features and configurations head to the API reference: https://api.js.langchain.com/classes/langchain_community_document_loaders_fs_pdf.PDFLoader.html"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": []
+ }
+ ],
+ "metadata": {
+ "kernelspec": {
+ "display_name": "TypeScript",
+ "language": "typescript",
+ "name": "tslab"
+ },
+ "language_info": {
+ "codemirror_mode": {
+ "mode": "typescript",
+ "name": "javascript",
+ "typescript": true
+ },
+ "file_extension": ".ts",
+ "mimetype": "text/typescript",
+ "name": "typescript",
+ "version": "3.7.2"
+ }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 4
+}
diff --git a/docs/core_docs/docs/integrations/document_loaders/file_loaders/pdf.mdx b/docs/core_docs/docs/integrations/document_loaders/file_loaders/pdf.mdx
deleted file mode 100644
index 9e92902d452a..000000000000
--- a/docs/core_docs/docs/integrations/document_loaders/file_loaders/pdf.mdx
+++ /dev/null
@@ -1,72 +0,0 @@
-# PDF files
-
-This example goes over how to load data from PDF files. By default, one document will be created for each page in the PDF file, you can change this behavior by setting the `splitPages` option to `false`.
-
-## Setup
-
-```bash npm2yarn
-npm install pdf-parse
-```
-
-## Usage, one document per page
-
-```typescript
-import { PDFLoader } from "@langchain/community/document_loaders/fs/pdf";
-
-const loader = new PDFLoader("src/document_loaders/example_data/example.pdf");
-
-const docs = await loader.load();
-```
-
-## Usage, one document per file
-
-```typescript
-import { PDFLoader } from "@langchain/community/document_loaders/fs/pdf";
-
-const loader = new PDFLoader("src/document_loaders/example_data/example.pdf", {
- splitPages: false,
-});
-
-const docs = await loader.load();
-```
-
-## Usage, custom `pdfjs` build
-
-By default we use the `pdfjs` build bundled with `pdf-parse`, which is compatible with most environments, including Node.js and modern browsers. If you want to use a more recent version of `pdfjs-dist` or if you want to use a custom build of `pdfjs-dist`, you can do so by providing a custom `pdfjs` function that returns a promise that resolves to the `PDFJS` object.
-
-In the following example we use the "legacy" (see [pdfjs docs](https://github.com/mozilla/pdf.js/wiki/Frequently-Asked-Questions#which-browsersenvironments-are-supported)) build of `pdfjs-dist`, which includes several polyfills not included in the default build.
-
-```bash npm2yarn
-npm install pdfjs-dist
-```
-
-```typescript
-import { PDFLoader } from "@langchain/community/document_loaders/fs/pdf";
-
-const loader = new PDFLoader("src/document_loaders/example_data/example.pdf", {
- // you may need to add `.then(m => m.default)` to the end of the import
- pdfjs: () => import("pdfjs-dist/legacy/build/pdf.js"),
-});
-```
-
-## Eliminating extra spaces
-
-PDFs come in many varieties, which makes reading them a challenge. The loader parses individual text elements and joins them together with a space by default, but
-if you are seeing excessive spaces, this may not be the desired behavior. In that case, you can override the separator with an empty string like this:
-
-```typescript
-import { PDFLoader } from "@langchain/community/document_loaders/fs/pdf";
-
-const loader = new PDFLoader("src/document_loaders/example_data/example.pdf", {
- parsedItemSeparator: "",
-});
-
-const docs = await loader.load();
-```
-
-## Loading directories
-
-import CodeBlock from "@theme/CodeBlock";
-import MemoryExample from "@examples/document_loaders/pdf_directory.ts";
-
-{MemoryExample}
diff --git a/docs/core_docs/docs/integrations/document_loaders/file_loaders/text.ipynb b/docs/core_docs/docs/integrations/document_loaders/file_loaders/text.ipynb
new file mode 100644
index 000000000000..bf6c6de8d823
--- /dev/null
+++ b/docs/core_docs/docs/integrations/document_loaders/file_loaders/text.ipynb
@@ -0,0 +1,164 @@
+{
+ "cells": [
+ {
+ "cell_type": "raw",
+ "metadata": {},
+ "source": [
+ "---\n",
+ "sidebar_label: TextLoader\n",
+ "sidebar_class_name: node-only\n",
+ "---"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "# TextLoader\n",
+ "\n",
+ "```{=mdx}\n",
+ "\n",
+ ":::tip Compatibility\n",
+ "\n",
+ "Only available on Node.js.\n",
+ "\n",
+ ":::\n",
+ "\n",
+ "```\n",
+ "\n",
+ "This notebook provides a quick overview for getting started with `TextLoader` [document loaders](/docs/concepts/#document-loaders). For detailed documentation of all `TextLoader` features and configurations head to the [API reference](https://api.js.langchain.com/classes/langchain_document_loaders_fs_text.TextLoader.html).\n",
+ "\n",
+ "## Overview\n",
+ "### Integration details\n",
+ "\n",
+ "| Class | Package | Compatibility | Local | PY support | \n",
+ "| :--- | :--- | :---: | :---: | :---: |\n",
+ "| [TextLoader](https://api.js.langchain.com/classes/langchain_document_loaders_fs_text.TextLoader.html) | [langchain](https://api.js.langchain.com/modules/langchain_document_loaders_fs_text.html) | Node-only | ✅ | ❌ |\n",
+ "\n",
+ "## Setup\n",
+ "\n",
+ "To access `TextLoader` document loader you'll need to install the `langchain` package.\n",
+ "\n",
+ "### Installation\n",
+ "\n",
+ "The LangChain TextLoader integration lives in the `langchain` package:\n",
+ "\n",
+ "```{=mdx}\n",
+ "import IntegrationInstallTooltip from \"@mdx_components/integration_install_tooltip.mdx\";\n",
+ "import Npm2Yarn from \"@theme/Npm2Yarn\";\n",
+ "\n",
+ "\n",
+ "\n",
+ "\n",
+ " langchain\n",
+ "\n",
+ "\n",
+ "```"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "## Instantiation\n",
+ "\n",
+ "Now we can instantiate our model object and load documents:"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 4,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "import { TextLoader } from \"langchain/document_loaders/fs/text\"\n",
+ "\n",
+ "const loader = new TextLoader(\"../../../../../../examples/src/document_loaders/example_data/example.txt\")"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "## Load"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 5,
+ "metadata": {},
+ "outputs": [
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "Document {\n",
+ " pageContent: 'Foo\\nBar\\nBaz\\n\\n',\n",
+ " metadata: {\n",
+ " source: '../../../../../../examples/src/document_loaders/example_data/example.txt'\n",
+ " },\n",
+ " id: undefined\n",
+ "}\n"
+ ]
+ }
+ ],
+ "source": [
+ "const docs = await loader.load()\n",
+ "docs[0]"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 6,
+ "metadata": {},
+ "outputs": [
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "{\n",
+ " source: '../../../../../../examples/src/document_loaders/example_data/example.txt'\n",
+ "}\n"
+ ]
+ }
+ ],
+ "source": [
+ "console.log(docs[0].metadata)"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "## API reference\n",
+ "\n",
+ "For detailed documentation of all TextLoader features and configurations head to the API reference: https://api.js.langchain.com/classes/langchain_document_loaders_fs_text.TextLoader.html"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": []
+ }
+ ],
+ "metadata": {
+ "kernelspec": {
+ "display_name": "TypeScript",
+ "language": "typescript",
+ "name": "tslab"
+ },
+ "language_info": {
+ "codemirror_mode": {
+ "mode": "typescript",
+ "name": "javascript",
+ "typescript": true
+ },
+ "file_extension": ".ts",
+ "mimetype": "text/typescript",
+ "name": "typescript",
+ "version": "3.7.2"
+ }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 4
+}
diff --git a/docs/core_docs/docs/integrations/document_loaders/file_loaders/text.mdx b/docs/core_docs/docs/integrations/document_loaders/file_loaders/text.mdx
deleted file mode 100644
index d20d7c1942d2..000000000000
--- a/docs/core_docs/docs/integrations/document_loaders/file_loaders/text.mdx
+++ /dev/null
@@ -1,15 +0,0 @@
----
-hide_table_of_contents: true
----
-
-# Text files
-
-This example goes over how to load data from text files.
-
-```typescript
-import { TextLoader } from "langchain/document_loaders/fs/text";
-
-const loader = new TextLoader("src/document_loaders/example_data/example.txt");
-
-const docs = await loader.load();
-```
diff --git a/docs/core_docs/docs/integrations/document_loaders/file_loaders/unstructured.ipynb b/docs/core_docs/docs/integrations/document_loaders/file_loaders/unstructured.ipynb
new file mode 100644
index 000000000000..6004fabb0f8a
--- /dev/null
+++ b/docs/core_docs/docs/integrations/document_loaders/file_loaders/unstructured.ipynb
@@ -0,0 +1,243 @@
+{
+ "cells": [
+ {
+ "cell_type": "raw",
+ "metadata": {
+ "vscode": {
+ "languageId": "raw"
+ }
+ },
+ "source": [
+ "---\n",
+ "sidebar_label: Unstructured\n",
+ "sidebar_class_name: node-only\n",
+ "---"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "# UnstructuredLoader\n",
+ "\n",
+ "```{=mdx}\n",
+ "\n",
+ ":::tip Compatibility\n",
+ "\n",
+ "Only available on Node.js.\n",
+ "\n",
+ ":::\n",
+ "\n",
+ "```\n",
+ "\n",
+ "This notebook provides a quick overview for getting started with `UnstructuredLoader` [document loaders](/docs/concepts/#document-loaders). For detailed documentation of all `UnstructuredLoader` features and configurations head to the [API reference](https://api.js.langchain.com/classes/langchain_community_document_loaders_fs_unstructured.UnstructuredLoader.html).\n",
+ "\n",
+ "## Overview\n",
+ "### Integration details\n",
+ "\n",
+ "| Class | Package | Compatibility | Local | [PY support](https://python.langchain.com/docs/integrations/document_loaders/unstructured_file) | \n",
+ "| :--- | :--- | :---: | :---: | :---: |\n",
+ "| [UnstructuredLoader](https://api.js.langchain.com/classes/langchain_community_document_loaders_fs_unstructured.UnstructuredLoader.html) | [@langchain/community](https://api.js.langchain.com/modules/langchain_community_document_loaders_fs_unstructured.html) | Node-only | ✅ | ✅ |\n",
+ "\n",
+ "## Setup\n",
+ "\n",
+ "To access `UnstructuredLoader` document loader you'll need to install the `@langchain/community` integration package, and create an Unstructured account and get an API key.\n",
+ "\n",
+ "### Local\n",
+ "\n",
+ "You can run Unstructured locally in your computer using Docker. To do so, you need to have Docker installed. You can find the instructions to install Docker [here](https://docs.docker.com/get-docker/).\n",
+ "\n",
+ "```bash\n",
+ "docker run -p 8000:8000 -d --rm --name unstructured-api downloads.unstructured.io/unstructured-io/unstructured-api:latest --port 8000 --host 0.0.0.0\n",
+ "```\n",
+ "\n",
+ "### Credentials\n",
+ "\n",
+ "Head to [unstructured.io](https://unstructured.io/api-key-hosted) to sign up to Unstructured and generate an API key. Once you've done this set the `UNSTRUCTURED_API_KEY` environment variable:\n",
+ "\n",
+ "```bash\n",
+ "export UNSTRUCTURED_API_KEY=\"your-api-key\"\n",
+ "```\n",
+ "\n",
+ "### Installation\n",
+ "\n",
+ "The LangChain UnstructuredLoader integration lives in the `@langchain/community` package:\n",
+ "\n",
+ "```{=mdx}\n",
+ "import IntegrationInstallTooltip from \"@mdx_components/integration_install_tooltip.mdx\";\n",
+ "import Npm2Yarn from \"@theme/Npm2Yarn\";\n",
+ "\n",
+ "\n",
+ "\n",
+ "\n",
+ " @langchain/community\n",
+ "\n",
+ "\n",
+ "```"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "## Instantiation\n",
+ "\n",
+ "Now we can instantiate our model object and load documents:"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 1,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "import { UnstructuredLoader } from \"@langchain/community/document_loaders/fs/unstructured\"\n",
+ "\n",
+ "const loader = new UnstructuredLoader(\"../../../../../../examples/src/document_loaders/example_data/notion.md\")"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "## Load"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 2,
+ "metadata": {},
+ "outputs": [
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "Document {\n",
+ " pageContent: '# Testing the notion markdownloader',\n",
+ " metadata: {\n",
+ " filename: 'notion.md',\n",
+ " languages: [ 'eng' ],\n",
+ " filetype: 'text/plain',\n",
+ " category: 'NarrativeText'\n",
+ " },\n",
+ " id: undefined\n",
+ "}\n"
+ ]
+ }
+ ],
+ "source": [
+ "const docs = await loader.load()\n",
+ "docs[0]"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 3,
+ "metadata": {},
+ "outputs": [
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "{\n",
+ " filename: 'notion.md',\n",
+ " languages: [ 'eng' ],\n",
+ " filetype: 'text/plain',\n",
+ " category: 'NarrativeText'\n",
+ "}\n"
+ ]
+ }
+ ],
+ "source": [
+ "console.log(docs[0].metadata)"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "## Directories\n",
+ "\n",
+ "You can also load all of the files in the directory using [`UnstructuredDirectoryLoader`](https://v02.api.js.langchain.com/classes/langchain_document_loaders_fs_unstructured.UnstructuredDirectoryLoader.html), which inherits from [`DirectoryLoader`](/docs/integrations/document_loaders/file_loaders/directory):\n"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 5,
+ "metadata": {},
+ "outputs": [
+ {
+ "name": "stderr",
+ "output_type": "stream",
+ "text": [
+ "Unknown file type: Star_Wars_The_Clone_Wars_S06E07_Crisis_at_the_Heart.srt\n",
+ "Unknown file type: test.mp3\n"
+ ]
+ },
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "directoryDocs.length: 247\n",
+ "Document {\n",
+ " pageContent: 'Bitcoin: A Peer-to-Peer Electronic Cash System',\n",
+ " metadata: {\n",
+ " filetype: 'application/pdf',\n",
+ " languages: [ 'eng' ],\n",
+ " page_number: 1,\n",
+ " filename: 'bitcoin.pdf',\n",
+ " category: 'Title'\n",
+ " },\n",
+ " id: undefined\n",
+ "}\n"
+ ]
+ }
+ ],
+ "source": [
+ "import { UnstructuredDirectoryLoader } from \"@langchain/community/document_loaders/fs/unstructured\";\n",
+ "\n",
+ "const directoryLoader = new UnstructuredDirectoryLoader(\n",
+ " \"../../../../../../examples/src/document_loaders/example_data/\",\n",
+ " {}\n",
+ ");\n",
+ "const directoryDocs = await directoryLoader.load();\n",
+ "console.log(\"directoryDocs.length: \", directoryDocs.length);\n",
+ "console.log(directoryDocs[0])\n"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "## API reference\n",
+ "\n",
+ "For detailed documentation of all UnstructuredLoader features and configurations head to the API reference: https://api.js.langchain.com/classes/langchain_community_document_loaders_fs_unstructured.UnstructuredLoader.html"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": []
+ }
+ ],
+ "metadata": {
+ "kernelspec": {
+ "display_name": "TypeScript",
+ "language": "typescript",
+ "name": "tslab"
+ },
+ "language_info": {
+ "codemirror_mode": {
+ "mode": "typescript",
+ "name": "javascript",
+ "typescript": true
+ },
+ "file_extension": ".ts",
+ "mimetype": "text/typescript",
+ "name": "typescript",
+ "version": "3.7.2"
+ }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 4
+}
diff --git a/docs/core_docs/docs/integrations/document_loaders/file_loaders/unstructured.mdx b/docs/core_docs/docs/integrations/document_loaders/file_loaders/unstructured.mdx
deleted file mode 100644
index 7c82029f16de..000000000000
--- a/docs/core_docs/docs/integrations/document_loaders/file_loaders/unstructured.mdx
+++ /dev/null
@@ -1,32 +0,0 @@
----
-hide_table_of_contents: true
----
-
-# Unstructured
-
-This example covers how to use [Unstructured.io](https://unstructured.io/) to load files of many types. Unstructured currently supports loading of text files, powerpoints, html, pdfs, images, and more.
-
-## Setup
-
-You can run Unstructured locally in your computer using Docker. To do so, you need to have Docker installed. You can find the instructions to install Docker [here](https://docs.docker.com/get-docker/).
-
-```bash
-docker run -p 8000:8000 -d --rm --name unstructured-api downloads.unstructured.io/unstructured-io/unstructured-api:latest --port 8000 --host 0.0.0.0
-```
-
-## Usage
-
-Once Unstructured is running, you can use it to load files from your computer. You can use the following code to load a file from your computer.
-
-import CodeBlock from "@theme/CodeBlock";
-import Example from "@examples/document_loaders/unstructured.ts";
-
-{Example}
-
-## Directories
-
-You can also load all of the files in the directory using [`UnstructuredDirectoryLoader`](https://v02.api.js.langchain.com/classes/langchain_document_loaders_fs_unstructured.UnstructuredDirectoryLoader.html), which inherits from [`DirectoryLoader`](/docs/integrations/document_loaders/file_loaders/directory):
-
-import DirectoryExample from "@examples/document_loaders/unstructured_directory.ts";
-
-{DirectoryExample}
diff --git a/docs/core_docs/docs/integrations/document_loaders/web_loaders/firecrawl.ipynb b/docs/core_docs/docs/integrations/document_loaders/web_loaders/firecrawl.ipynb
new file mode 100644
index 000000000000..ab81ec8b86c6
--- /dev/null
+++ b/docs/core_docs/docs/integrations/document_loaders/web_loaders/firecrawl.ipynb
@@ -0,0 +1,221 @@
+{
+ "cells": [
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "---\n",
+ "sidebar_label: FireCrawl\n",
+ "\n",
+ "---"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "# FireCrawlLoader\n",
+ "\n",
+ "This notebook provides a quick overview for getting started with [FireCrawlLoader](/docs/integrations/document_loaders/). For detailed documentation of all FireCrawlLoader features and configurations head to the [API reference](https://api.js.langchain.com/classes/langchain_community_document_loaders_web_firecrawl.FireCrawlLoader.html).\n",
+ "\n",
+ "## Overview\n",
+ "### Integration details\n",
+ "\n",
+ "| Class | Package | Local | Serializable | [PY support](https://python.langchain.com/docs/integrations/document_loaders/firecrawl)|\n",
+ "| :--- | :--- | :---: | :---: | :---: |\n",
+ "| [FireCrawlLoader](https://api.js.langchain.com/classes/langchain_community_document_loaders_web_firecrawl.FireCrawlLoader.html) | [@langchain/community](https://api.js.langchain.com/modules/langchain_community_document_loaders_web_firecrawl.html) | 🟠 (see details below) | beta | ✅ | \n",
+ "### Loader features\n",
+ "| Source | Web Loader | Node Envs Only\n",
+ "| :---: | :---: | :---: | \n",
+ "| FireCrawlLoader | ✅ | ❌ | \n",
+ "\n",
+ "[FireCrawl](https://firecrawl.dev) crawls and convert any website into LLM-ready data. It crawls all accessible sub-pages and give you clean markdown and metadata for each. No sitemap required.\n",
+ "\n",
+ "FireCrawl handles complex tasks such as reverse proxies, caching, rate limits, and content blocked by JavaScript. Built by the [mendable.ai](https://mendable.ai) team.\n",
+ "\n",
+ "This guide shows how to scrap and crawl entire websites and load them using the `FireCrawlLoader` in LangChain.\n",
+ "\n",
+ "## Setup\n",
+ "\n",
+ "To access `FireCrawlLoader` document loader you'll need to install the `@langchain/community` integration, and the `@mendable/firecrawl-js` package. Then create a **[FireCrawl](https://firecrawl.dev)** account and get an API key.\n",
+ "\n",
+ "### Credentials\n",
+ "\n",
+ "Sign up and get your free [FireCrawl API key](https://firecrawl.dev) to start. FireCrawl offers 300 free credits to get you started, and it's [open-source](https://github.com/mendableai/firecrawl) in case you want to self-host.\n",
+ "\n",
+ "Once you've done this set the `FIRECRAWL_API_KEY` environment variable:\n",
+ "\n",
+ "```bash\n",
+ "export FIRECRAWL_API_KEY=\"your-api-key\"\n",
+ "```\n",
+ "\n",
+ "If you want to get automated tracing of your model calls you can also set your [LangSmith](https://docs.smith.langchain.com/) API key by uncommenting below:\n",
+ "\n",
+ "```bash\n",
+ "# export LANGCHAIN_TRACING_V2=\"true\"\n",
+ "# export LANGCHAIN_API_KEY=\"your-api-key\"\n",
+ "```\n",
+ "\n",
+ "### Installation\n",
+ "\n",
+ "The LangChain FireCrawlLoader integration lives in the `@langchain/community` package:\n",
+ "\n",
+ "```{=mdx}\n",
+ "import IntegrationInstallTooltip from \"@mdx_components/integration_install_tooltip.mdx\";\n",
+ "import Npm2Yarn from \"@theme/Npm2Yarn\";\n",
+ "\n",
+ "\n",
+ "\n",
+ "\n",
+ " @langchain/community @mendable/firecrawl-js\n",
+ "\n",
+ "\n",
+ "```"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "## Instantiation\n",
+ "\n",
+ "Here's an example of how to use the `FireCrawlLoader` to load web search results:\n",
+ "\n",
+ "Firecrawl offers 2 modes: `scrape` and `crawl`. In `scrape` mode, Firecrawl will only scrape the page you provide. In `crawl` mode, Firecrawl will crawl the entire website.\n",
+ "\n",
+ "Now we can instantiate our model object and load documents:"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "import \"@mendable/firecrawl-js\";\n",
+ "import { FireCrawlLoader } from \"@langchain/community/document_loaders/web/firecrawl\"\n",
+ "\n",
+ "const loader = new FireCrawlLoader({\n",
+ " url: \"https://firecrawl.dev\", // The URL to scrape\n",
+ " apiKey: \"...\", // Optional, defaults to `FIRECRAWL_API_KEY` in your env.\n",
+ " mode: \"scrape\", // The mode to run the crawler in. Can be \"scrape\" for single urls or \"crawl\" for all accessible subpages\n",
+ " params: {\n",
+ " // optional parameters based on Firecrawl API docs\n",
+ " // For API documentation, visit https://docs.firecrawl.dev\n",
+ " },\n",
+ "})"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "## Load"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 3,
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/plain": [
+ "Document {\n",
+ " pageContent: \u001b[32m\"Introducing [Smart Crawl!](https://www.firecrawl.dev/smart-crawl)\\n\"\u001b[39m +\n",
+ " \u001b[32m\" Join the waitlist to turn any web\"\u001b[39m... 18721 more characters,\n",
+ " metadata: {\n",
+ " title: \u001b[32m\"Home - Firecrawl\"\u001b[39m,\n",
+ " description: \u001b[32m\"Firecrawl crawls and converts any website into clean markdown.\"\u001b[39m,\n",
+ " keywords: \u001b[32m\"Firecrawl,Markdown,Data,Mendable,Langchain\"\u001b[39m,\n",
+ " robots: \u001b[32m\"follow, index\"\u001b[39m,\n",
+ " ogTitle: \u001b[32m\"Firecrawl\"\u001b[39m,\n",
+ " ogDescription: \u001b[32m\"Turn any website into LLM-ready data.\"\u001b[39m,\n",
+ " ogUrl: \u001b[32m\"https://www.firecrawl.dev/\"\u001b[39m,\n",
+ " ogImage: \u001b[32m\"https://www.firecrawl.dev/og.png?123\"\u001b[39m,\n",
+ " ogLocaleAlternate: [],\n",
+ " ogSiteName: \u001b[32m\"Firecrawl\"\u001b[39m,\n",
+ " sourceURL: \u001b[32m\"https://firecrawl.dev\"\u001b[39m,\n",
+ " pageStatusCode: \u001b[33m500\u001b[39m\n",
+ " },\n",
+ " id: \u001b[90mundefined\u001b[39m\n",
+ "}"
+ ]
+ },
+ "execution_count": 3,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "const docs = await loader.load()\n",
+ "docs[0]"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 4,
+ "metadata": {},
+ "outputs": [
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "{\n",
+ " title: \"Home - Firecrawl\",\n",
+ " description: \"Firecrawl crawls and converts any website into clean markdown.\",\n",
+ " keywords: \"Firecrawl,Markdown,Data,Mendable,Langchain\",\n",
+ " robots: \"follow, index\",\n",
+ " ogTitle: \"Firecrawl\",\n",
+ " ogDescription: \"Turn any website into LLM-ready data.\",\n",
+ " ogUrl: \"https://www.firecrawl.dev/\",\n",
+ " ogImage: \"https://www.firecrawl.dev/og.png?123\",\n",
+ " ogLocaleAlternate: [],\n",
+ " ogSiteName: \"Firecrawl\",\n",
+ " sourceURL: \"https://firecrawl.dev\",\n",
+ " pageStatusCode: 500\n",
+ "}\n"
+ ]
+ }
+ ],
+ "source": [
+ "console.log(docs[0].metadata)"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "## Additional Parameters\n",
+ "\n",
+ "For `params` you can pass any of the params according to the [Firecrawl documentation](https://docs.firecrawl.dev)."
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "## API reference\n",
+ "\n",
+ "For detailed documentation of all FireCrawlLoader features and configurations head to the API reference: https://api.js.langchain.com/classes/langchain_community_document_loaders_web_firecrawl.FireCrawlLoader.html"
+ ]
+ }
+ ],
+ "metadata": {
+ "kernelspec": {
+ "display_name": "Deno",
+ "language": "typescript",
+ "name": "deno"
+ },
+ "language_info": {
+ "file_extension": ".ts",
+ "mimetype": "text/x.typescript",
+ "name": "typescript",
+ "nb_converter": "script",
+ "pygments_lexer": "typescript",
+ "version": "5.3.3"
+ }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 4
+}
diff --git a/docs/core_docs/docs/integrations/document_loaders/web_loaders/firecrawl.mdx b/docs/core_docs/docs/integrations/document_loaders/web_loaders/firecrawl.mdx
deleted file mode 100644
index 59fecb799db3..000000000000
--- a/docs/core_docs/docs/integrations/document_loaders/web_loaders/firecrawl.mdx
+++ /dev/null
@@ -1,38 +0,0 @@
----
-hide_table_of_contents: true
----
-
-# Firecrawl
-
-This guide shows how to use [Firecrawl](https://firecrawl.dev) with LangChain to load web data into an LLM-ready format using Firecrawl.
-
-## Overview
-
-[FireCrawl](https://firecrawl.dev) crawls and convert any website into LLM-ready data. It crawls all accessible subpages and give you clean markdown and metadata for each. No sitemap required.
-
-FireCrawl handles complex tasks such as reverse proxies, caching, rate limits, and content blocked by JavaScript. Built by the [mendable.ai](https://mendable.ai) team.
-
-This guide shows how to scrap and crawl entire websites and load them using the `FireCrawlLoader` in LangChain.
-
-## Setup
-
-Sign up and get your free [FireCrawl API key](https://firecrawl.dev) to start. FireCrawl offers 300 free credits to get you started, and it's [open-source](https://github.com/mendableai/firecrawl) in case you want to self-host.
-
-## Usage
-
-Here's an example of how to use the `FireCrawlLoader` to load web search results:
-
-Firecrawl offers 2 modes: `scrape` and `crawl`. In `scrape` mode, Firecrawl will only scrape the page you provide. In `crawl` mode, Firecrawl will crawl the entire website.
-
-import CodeBlock from "@theme/CodeBlock";
-import Example from "@examples/document_loaders/firecrawl.ts";
-
-```bash npm2yarn
-npm install @mendable/firecrawl-js
-```
-
-{Example}
-
-### Additional Parameters
-
-For `params` you can pass any of the params according to the [Firecrawl documentation](https://docs.firecrawl.dev).
diff --git a/docs/core_docs/docs/integrations/document_loaders/web_loaders/pdf.ipynb b/docs/core_docs/docs/integrations/document_loaders/web_loaders/pdf.ipynb
new file mode 100644
index 000000000000..812ed2961124
--- /dev/null
+++ b/docs/core_docs/docs/integrations/document_loaders/web_loaders/pdf.ipynb
@@ -0,0 +1,323 @@
+{
+ "cells": [
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "---\n",
+ "sidebar_label: PDF files\n",
+ "\n",
+ "---"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "# WebPDFLoader\n",
+ "\n",
+ "This notebook provides a quick overview for getting started with [WebPDFLoader](/docs/integrations/document_loaders/). For detailed documentation of all WebPDFLoader features and configurations head to the [API reference](https://api.js.langchain.com/classes/langchain_community_document_loaders_web_pdf.WebPDFLoader.html).\n",
+ "\n",
+ "## Overview\n",
+ "### Integration details\n",
+ "\n",
+ "| Class | Package | Local | Serializable | PY support |\n",
+ "| :--- | :--- | :---: | :---: | :---: |\n",
+ "| [WebPDFLoader](https://api.js.langchain.com/classes/langchain_community_document_loaders_web_pdf.WebPDFLoader.html) | [@langchain/community](https://api.js.langchain.com/modules/langchain_community_document_loaders_web_pdf.html) | ✅ | beta | ❌ | \n",
+ "### Loader features\n",
+ "| Source | Web Loader | Node Envs Only\n",
+ "| :---: | :---: | :---: | \n",
+ "| WebPDFLoader | ✅ | ❌ | \n",
+ "\n",
+ "You can use this version of the popular PDFLoader in web environments.\n",
+    "By default, one document will be created for each page in the PDF file; you can change this behavior by setting the `splitPages` option to `false`.\n",
+ "\n",
+ "## Setup\n",
+ "\n",
+ "To access `WebPDFLoader` document loader you'll need to install the `@langchain/community` integration, along with the `pdf-parse` package:\n",
+ "\n",
+ "### Credentials\n",
+ "\n",
+ "If you want to get automated tracing of your model calls you can also set your [LangSmith](https://docs.smith.langchain.com/) API key by uncommenting below:\n",
+ "\n",
+ "```bash\n",
+ "# export LANGCHAIN_TRACING_V2=\"true\"\n",
+ "# export LANGCHAIN_API_KEY=\"your-api-key\"\n",
+ "```\n",
+ "\n",
+ "### Installation\n",
+ "\n",
+ "The LangChain WebPDFLoader integration lives in the `@langchain/community` package:\n",
+ "\n",
+ "```{=mdx}\n",
+ "import IntegrationInstallTooltip from \"@mdx_components/integration_install_tooltip.mdx\";\n",
+ "import Npm2Yarn from \"@theme/Npm2Yarn\";\n",
+ "\n",
+ "\n",
+ "\n",
+ "\n",
+ " @langchain/community pdf-parse\n",
+ "\n",
+ "\n",
+ "```"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "## Instantiation\n",
+ "\n",
+ "Now we can instantiate our model object and load documents:\n",
+ "\n",
+    "You can pass optional parameters such as `splitPages` or `parsedItemSeparator` to the constructor; these are covered in the sections below."
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 1,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "import fs from \"fs/promises\";\n",
+ "import { WebPDFLoader } from \"@langchain/community/document_loaders/web/pdf\"\n",
+ "\n",
+ "const nike10kPDFPath = \"../../../../data/nke-10k-2023.pdf\";\n",
+ "\n",
+ "// Read the file as a buffer\n",
+ "const buffer = await fs.readFile(nike10kPDFPath);\n",
+ "\n",
+ "// Create a Blob from the buffer\n",
+ "const nike10kPDFBlob = new Blob([buffer], { type: 'application/pdf' });\n",
+ "\n",
+ "const loader = new WebPDFLoader(nike10kPDFBlob, {\n",
+ " // required params = ...\n",
+ " // optional params = ...\n",
+ "})"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "## Load"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 2,
+ "metadata": {},
+ "outputs": [
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "Document {\n",
+ " pageContent: 'Table of Contents\\n' +\n",
+ " 'UNITED STATES\\n' +\n",
+ " 'SECURITIES AND EXCHANGE COMMISSION\\n' +\n",
+ " 'Washington, D.C. 20549\\n' +\n",
+ " 'FORM 10-K\\n' +\n",
+ " '(Mark One)\\n' +\n",
+ " '☑ ANNUAL REPORT PURSUANT TO SECTION 13 OR 15(D) OF THE SECURITIES EXCHANGE ACT OF 1934\\n' +\n",
+ " 'FOR THE FISCAL YEAR ENDED MAY 31, 2023\\n' +\n",
+ " 'OR\\n' +\n",
+ " '☐ TRANSITION REPORT PURSUANT TO SECTION 13 OR 15(D) OF THE SECURITIES EXCHANGE ACT OF 1934\\n' +\n",
+ " 'FOR THE TRANSITION PERIOD FROM TO .\\n' +\n",
+ " 'Commission File No. 1-10635\\n' +\n",
+ " 'NIKE, Inc.\\n' +\n",
+ " '(Exact name of Registrant as specified in its charter)\\n' +\n",
+ " 'Oregon93-0584541\\n' +\n",
+ " '(State or other jurisdiction of incorporation)(IRS Employer Identification No.)\\n' +\n",
+ " 'One Bowerman Drive, Beaverton, Oregon 97005-6453\\n' +\n",
+ " '(Address of principal executive offices and zip code)\\n' +\n",
+ " '(503) 671-6453\\n' +\n",
+ " \"(Registrant's telephone number, including area code)\\n\" +\n",
+ " 'SECURITIES REGISTERED PURSUANT TO SECTION 12(B) OF THE ACT:\\n' +\n",
+ " 'Class B Common StockNKENew York Stock Exchange\\n' +\n",
+ " '(Title of each class)(Trading symbol)(Name of each exchange on which registered)\\n' +\n",
+ " 'SECURITIES REGISTERED PURSUANT TO SECTION 12(G) OF THE ACT:\\n' +\n",
+ " 'NONE\\n' +\n",
+ " 'Indicate by check mark:YESNO\\n' +\n",
+ " '•if the registrant is a well-known seasoned issuer, as defined in Rule 405 of the Securities Act.þ ̈\\n' +\n",
+ " '•if the registrant is not required to file reports pursuant to Section 13 or Section 15(d) of the Act. ̈þ\\n' +\n",
+ " '•whether the registrant (1) has filed all reports required to be filed by Section 13 or 15(d) of the Securities Exchange Act of 1934 during the preceding\\n' +\n",
+ " '12 months (or for such shorter period that the registrant was required to file such reports), and (2) has been subject to such filing requirements for the\\n' +\n",
+ " 'past 90 days.\\n' +\n",
+ " 'þ ̈\\n' +\n",
+ " '•whether the registrant has submitted electronically every Interactive Data File required to be submitted pursuant to Rule 405 of Regulation S-T\\n' +\n",
+ " '(§232.405 of this chapter) during the preceding 12 months (or for such shorter period that the registrant was required to submit such files).\\n' +\n",
+ " 'þ ̈\\n' +\n",
+ " '•whether the registrant is a large accelerated filer, an accelerated filer, a non-accelerated filer, a smaller reporting company or an emerging growth company. See the definitions of “large accelerated filer,”\\n' +\n",
+ " '“accelerated filer,” “smaller reporting company,” and “emerging growth company” in Rule 12b-2 of the Exchange Act.\\n' +\n",
+ " 'Large accelerated filerþAccelerated filer☐Non-accelerated filer☐Smaller reporting company☐Emerging growth company☐\\n' +\n",
+ " '•if an emerging growth company, if the registrant has elected not to use the extended transition period for complying with any new or revised financial\\n' +\n",
+ " 'accounting standards provided pursuant to Section 13(a) of the Exchange Act.\\n' +\n",
+ " ' ̈\\n' +\n",
+ " \"•whether the registrant has filed a report on and attestation to its management's assessment of the effectiveness of its internal control over financial\\n\" +\n",
+ " 'reporting under Section 404(b) of the Sarbanes-Oxley Act (15 U.S.C. 7262(b)) by the registered public accounting firm that prepared or issued its audit\\n' +\n",
+ " 'report.\\n' +\n",
+ " 'þ\\n' +\n",
+ " '•if securities are registered pursuant to Section 12(b) of the Act, whether the financial statements of the registrant included in the filing reflect the\\n' +\n",
+ " 'correction of an error to previously issued financial statements.\\n' +\n",
+ " ' ̈\\n' +\n",
+ " '•whether any of those error corrections are restatements that required a recovery analysis of incentive-based compensation received by any of the\\n' +\n",
+ " \"registrant's executive officers during the relevant recovery period pursuant to § 240.10D-1(b).\\n\" +\n",
+ " ' ̈\\n' +\n",
+ " '•\\n' +\n",
+ " 'whether the registrant is a shell company (as defined in Rule 12b-2 of the Act).☐þ\\n' +\n",
+ " \"As of November 30, 2022, the aggregate market values of the Registrant's Common Stock held by non-affiliates were:\\n\" +\n",
+ " 'Class A$7,831,564,572 \\n' +\n",
+ " 'Class B136,467,702,472 \\n' +\n",
+ " '$144,299,267,044 ',\n",
+ " metadata: {\n",
+ " pdf: {\n",
+ " version: '1.10.100',\n",
+ " info: [Object],\n",
+ " metadata: null,\n",
+ " totalPages: 107\n",
+ " },\n",
+ " loc: { pageNumber: 1 }\n",
+ " },\n",
+ " id: undefined\n",
+ "}\n"
+ ]
+ }
+ ],
+ "source": [
+ "const docs = await loader.load()\n",
+ "docs[0]"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 3,
+ "metadata": {},
+ "outputs": [
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "{\n",
+ " pdf: {\n",
+ " version: '1.10.100',\n",
+ " info: {\n",
+ " PDFFormatVersion: '1.4',\n",
+ " IsAcroFormPresent: false,\n",
+ " IsXFAPresent: false,\n",
+ " Title: '0000320187-23-000039',\n",
+ " Author: 'EDGAR Online, a division of Donnelley Financial Solutions',\n",
+ " Subject: 'Form 10-K filed on 2023-07-20 for the period ending 2023-05-31',\n",
+ " Keywords: '0000320187-23-000039; ; 10-K',\n",
+ " Creator: 'EDGAR Filing HTML Converter',\n",
+ " Producer: 'EDGRpdf Service w/ EO.Pdf 22.0.40.0',\n",
+ " CreationDate: \"D:20230720162200-04'00'\",\n",
+ " ModDate: \"D:20230720162208-04'00'\"\n",
+ " },\n",
+ " metadata: null,\n",
+ " totalPages: 107\n",
+ " },\n",
+ " loc: { pageNumber: 1 }\n",
+ "}\n"
+ ]
+ }
+ ],
+ "source": [
+ "console.log(docs[0].metadata)"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "## Usage, custom `pdfjs` build\n",
+ "\n",
+ "By default we use the `pdfjs` build bundled with `pdf-parse`, which is compatible with most environments, including Node.js and modern browsers. If you want to use a more recent version of `pdfjs-dist` or if you want to use a custom build of `pdfjs-dist`, you can do so by providing a custom `pdfjs` function that returns a promise that resolves to the `PDFJS` object.\n",
+ "\n",
+ "In the following example we use the \"legacy\" (see [pdfjs docs](https://github.com/mozilla/pdf.js/wiki/Frequently-Asked-Questions#which-browsersenvironments-are-supported)) build of `pdfjs-dist`, which includes several polyfills not included in the default build.\n",
+ "\n",
+ "```{=mdx}\n",
+ "\n",
+ " pdfjs-dist\n",
+ "\n",
+ "\n",
+ "```"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "import { WebPDFLoader } from \"@langchain/community/document_loaders/web/pdf\";\n",
+ "\n",
+ "const blob = new Blob(); // e.g. from a file input\n",
+ "\n",
+ "const customBuildLoader = new WebPDFLoader(blob, {\n",
+ " // you may need to add `.then(m => m.default)` to the end of the import\n",
+ " // @lc-ts-ignore\n",
+ " pdfjs: () => import(\"pdfjs-dist/legacy/build/pdf.js\"),\n",
+ "});"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "## Eliminating extra spaces\n",
+ "\n",
+ "PDFs come in many varieties, which makes reading them a challenge. The loader parses individual text elements and joins them together with a space by default, but\n",
+ "if you are seeing excessive spaces, this may not be the desired behavior. In that case, you can override the separator with an empty string like this:"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "import { WebPDFLoader } from \"@langchain/community/document_loaders/web/pdf\";\n",
+ "\n",
+ "// new Blob(); e.g. from a file input\n",
+ "const eliminatingExtraSpacesLoader = new WebPDFLoader(new Blob(), {\n",
+ " parsedItemSeparator: \"\",\n",
+ "});"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "## API reference\n",
+ "\n",
+ "For detailed documentation of all WebPDFLoader features and configurations head to the API reference: https://api.js.langchain.com/classes/langchain_community_document_loaders_web_pdf.WebPDFLoader.html"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": []
+ }
+ ],
+ "metadata": {
+ "kernelspec": {
+ "display_name": "TypeScript",
+ "language": "typescript",
+ "name": "tslab"
+ },
+ "language_info": {
+ "codemirror_mode": {
+ "mode": "typescript",
+ "name": "javascript",
+ "typescript": true
+ },
+ "file_extension": ".ts",
+ "mimetype": "text/typescript",
+ "name": "typescript",
+ "version": "3.7.2"
+ }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 4
+}
diff --git a/docs/core_docs/docs/integrations/document_loaders/web_loaders/pdf.mdx b/docs/core_docs/docs/integrations/document_loaders/web_loaders/pdf.mdx
deleted file mode 100644
index 64e96247765c..000000000000
--- a/docs/core_docs/docs/integrations/document_loaders/web_loaders/pdf.mdx
+++ /dev/null
@@ -1,53 +0,0 @@
-# PDF files
-
-You can use this version of the popular PDFLoader in web environments.
-By default, one document will be created for each page in the PDF file, you can change this behavior by setting the `splitPages` option to `false`.
-
-## Setup
-
-```bash npm2yarn
-npm install pdf-parse
-```
-
-## Usage
-
-import CodeBlock from "@theme/CodeBlock";
-import Example from "@examples/document_loaders/web_pdf.ts";
-
-{Example}
-
-## Usage, custom `pdfjs` build
-
-By default we use the `pdfjs` build bundled with `pdf-parse`, which is compatible with most environments, including Node.js and modern browsers. If you want to use a more recent version of `pdfjs-dist` or if you want to use a custom build of `pdfjs-dist`, you can do so by providing a custom `pdfjs` function that returns a promise that resolves to the `PDFJS` object.
-
-In the following example we use the "legacy" (see [pdfjs docs](https://github.com/mozilla/pdf.js/wiki/Frequently-Asked-Questions#which-browsersenvironments-are-supported)) build of `pdfjs-dist`, which includes several polyfills not included in the default build.
-
-```bash npm2yarn
-npm install pdfjs-dist
-```
-
-```typescript
-import { WebPDFLoader } from "@langchain/community/document_loaders/web/pdf";
-
-const blob = new Blob(); // e.g. from a file input
-
-const loader = new WebPDFLoader(blob, {
- // you may need to add `.then(m => m.default)` to the end of the import
- pdfjs: () => import("pdfjs-dist/legacy/build/pdf.js"),
-});
-```
-
-## Eliminating extra spaces
-
-PDFs come in many varieties, which makes reading them a challenge. The loader parses individual text elements and joins them together with a space by default, but
-if you are seeing excessive spaces, this may not be the desired behavior. In that case, you can override the separator with an empty string like this:
-
-```typescript
-import { WebPDFLoader } from "@langchain/community/document_loaders/web/pdf";
-
-const blob = new Blob(); // e.g. from a file input
-
-const loader = new WebPDFLoader(blob, {
- parsedItemSeparator: "",
-});
-```
diff --git a/docs/core_docs/docs/integrations/document_loaders/web_loaders/recursive_url_loader.ipynb b/docs/core_docs/docs/integrations/document_loaders/web_loaders/recursive_url_loader.ipynb
new file mode 100644
index 000000000000..ec13013b245c
--- /dev/null
+++ b/docs/core_docs/docs/integrations/document_loaders/web_loaders/recursive_url_loader.ipynb
@@ -0,0 +1,449 @@
+{
+ "cells": [
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "---\n",
+ "sidebar_label: RecursiveUrlLoader\n",
+ "sidebar_class_name: node-only\n",
+ "---"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "# RecursiveUrlLoader\n",
+ "\n",
+ "```{=mdx}\n",
+ "\n",
+ ":::tip Compatibility\n",
+ "\n",
+ "Only available on Node.js.\n",
+ "\n",
+ ":::\n",
+ "\n",
+ "```\n",
+ "\n",
+ "This notebook provides a quick overview for getting started with [RecursiveUrlLoader](/docs/integrations/document_loaders/). For detailed documentation of all RecursiveUrlLoader features and configurations head to the [API reference](https://api.js.langchain.com/classes/langchain_community_document_loaders_web_recursive_url.RecursiveUrlLoader.html).\n",
+ "\n",
+ "## Overview\n",
+ "### Integration details\n",
+ "\n",
+ "| Class | Package | Local | Serializable | PY support |\n",
+ "| :--- | :--- | :---: | :---: | :---: |\n",
+ "| [RecursiveUrlLoader](https://api.js.langchain.com/classes/langchain_community_document_loaders_web_recursive_url.RecursiveUrlLoader.html) | [@langchain/community](https://api.js.langchain.com/modules/langchain_community_document_loaders_web_recursive_url.html) | ✅ | beta | ❌ | \n",
+ "### Loader features\n",
+ "| Source | Web Loader | Node Envs Only\n",
+ "| :---: | :---: | :---: | \n",
+ "| RecursiveUrlLoader | ✅ | ✅ | \n",
+ "\n",
+    "When loading content from a website, we may want to process and load all URLs on a page.\n",
+ "\n",
+ "For example, let's look at the [LangChain.js introduction](/docs/introduction) docs.\n",
+ "\n",
+ "This has many interesting child pages that we may want to load, split, and later retrieve in bulk.\n",
+ "\n",
+ "The challenge is traversing the tree of child pages and assembling a list!\n",
+ "\n",
+ "We do this using the `RecursiveUrlLoader`.\n",
+ "\n",
+ "This also gives us the flexibility to exclude some children, customize the extractor, and more.\n",
+ "\n",
+ "## Setup\n",
+ "\n",
+ "To access `RecursiveUrlLoader` document loader you'll need to install the `@langchain/community` integration, and the [`jsdom`](https://www.npmjs.com/package/jsdom) package.\n",
+ "\n",
+ "### Credentials\n",
+ "\n",
+ "If you want to get automated tracing of your model calls you can also set your [LangSmith](https://docs.smith.langchain.com/) API key by uncommenting below:\n",
+ "\n",
+ "```bash\n",
+ "# export LANGCHAIN_TRACING_V2=\"true\"\n",
+ "# export LANGCHAIN_API_KEY=\"your-api-key\"\n",
+ "```\n",
+ "\n",
+ "### Installation\n",
+ "\n",
+ "The LangChain RecursiveUrlLoader integration lives in the `@langchain/community` package:\n",
+ "\n",
+ "```{=mdx}\n",
+ "import IntegrationInstallTooltip from \"@mdx_components/integration_install_tooltip.mdx\";\n",
+ "import Npm2Yarn from \"@theme/Npm2Yarn\";\n",
+ "\n",
+ "\n",
+ "\n",
+ "\n",
+ " @langchain/community jsdom\n",
+ "\n",
+ "\n",
+ "We also suggest adding a package like [`html-to-text`](https://www.npmjs.com/package/html-to-text) or\n",
+ "[`@mozilla/readability`](https://www.npmjs.com/package/@mozilla/readability) for extracting the raw text from the page.\n",
+ "\n",
+ "\n",
+ " html-to-text\n",
+ "\n",
+ "\n",
+ "```"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "## Instantiation\n",
+ "\n",
+ "Now we can instantiate our model object and load documents:"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 1,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "import { RecursiveUrlLoader } from \"@langchain/community/document_loaders/web/recursive_url\"\n",
+ "import { compile } from \"html-to-text\";\n",
+ "\n",
+ "const compiledConvert = compile({ wordwrap: 130 }); // returns (text: string) => string;\n",
+ "\n",
+ "const loader = new RecursiveUrlLoader(\"https://langchain.com/\", {\n",
+ " extractor: compiledConvert,\n",
+ " maxDepth: 1,\n",
+ " excludeDirs: [\"/docs/api/\"],\n",
+ "})"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "## Load"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 2,
+ "metadata": {},
+ "outputs": [
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "{\n",
+ " pageContent: '\\n' +\n",
+ " '/\\n' +\n",
+ " 'Products\\n' +\n",
+ " '\\n' +\n",
+ " 'LangChain [/langchain]LangSmith [/langsmith]LangGraph [/langgraph]\\n' +\n",
+ " 'Methods\\n' +\n",
+ " '\\n' +\n",
+ " 'Retrieval [/retrieval]Agents [/agents]Evaluation [/evaluation]\\n' +\n",
+ " 'Resources\\n' +\n",
+ " '\\n' +\n",
+ " 'Blog [https://blog.langchain.dev/]Case Studies [/case-studies]Use Case Inspiration [/use-cases]Experts [/experts]Changelog\\n' +\n",
+ " '[https://changelog.langchain.com/]\\n' +\n",
+ " 'Docs\\n' +\n",
+ " '\\n' +\n",
+ " 'LangChain Docs [https://python.langchain.com/v0.2/docs/introduction/]LangSmith Docs [https://docs.smith.langchain.com/]\\n' +\n",
+ " 'Company\\n' +\n",
+ " '\\n' +\n",
+ " 'About [/about]Careers [/careers]\\n' +\n",
+ " 'Pricing [/pricing]\\n' +\n",
+ " 'Get a demo [/contact-sales]\\n' +\n",
+ " 'Sign up [https://smith.langchain.com/]\\n' +\n",
+ " '\\n' +\n",
+ " '\\n' +\n",
+ " '\\n' +\n",
+ " '\\n' +\n",
+ " 'LangChain’s suite of products supports developers along each step of the LLM application lifecycle.\\n' +\n",
+ " '\\n' +\n",
+ " '\\n' +\n",
+ " 'APPLICATIONS THAT CAN REASON. POWERED BY LANGCHAIN.\\n' +\n",
+ " '\\n' +\n",
+ " 'Get a demo [/contact-sales]Sign up for free [https://smith.langchain.com/]\\n' +\n",
+ " '\\n' +\n",
+ " '\\n' +\n",
+ " '\\n' +\n",
+ " 'FROM STARTUPS TO GLOBAL ENTERPRISES,\\n' +\n",
+ " 'AMBITIOUS BUILDERS CHOOSE\\n' +\n",
+ " 'LANGCHAIN PRODUCTS.\\n' +\n",
+ " '\\n' +\n",
+ " '[https://cdn.prod.website-files.com/65b8cd72835ceeacd4449a53/65ca3b7c22746faa78338532_logo_Ally.svg][https://cdn.prod.website-files.com/65b8cd72835ceeacd4449a53/65ca3b7c08e67bb7eefba4c2_logo_Rakuten.svg][https://cdn.prod.website-files.com/65b8cd72835ceeacd4449a53/65ca3b7c576fdde32d03c1a0_logo_Elastic.svg][https://cdn.prod.website-files.com/65b8cd72835ceeacd4449a53/65ca3b7c6d5592036dae24e5_logo_BCG.svg][https://cdn.prod.website-files.com/65b8cd72835ceeacd4449a53/667f19528c3557c2c19c3086_the-home-depot-2%201.png][https://cdn.prod.website-files.com/65b8cd72835ceeacd4449a53/65ca3b7cbcf6473519b06d84_logo_IDEO.svg][https://cdn.prod.website-files.com/65b8cd72835ceeacd4449a53/65ca3b7cb5f96dcc100ee3b7_logo_Zapier.svg][https://cdn.prod.website-files.com/65b8cd72835ceeacd4449a53/6606183e52d49bc369acc76c_mdy_logo_rgb_moodysblue.png][https://cdn.prod.website-files.com/65b8cd72835ceeacd4449a53/65ca3b7c8ad7db6ed6ec611e_logo_Adyen.svg][https://cdn.prod.website-files.com/65b8cd72835ceeacd4449a53/65ca3b7c737d50036a62768b_logo_Infor.svg][https://cdn.prod.website-files.com/65b8cd72835ceeacd4449a53/667f59d98444a5f98aabe21c_acxiom-vector-logo-2022%201.png][https://cdn.prod.website-files.com/65b8cd72835ceeacd4449a53/65ca3b7c09a158ffeaab0bd2_logo_Replit.svg][https://cdn.prod.website-files.com/65b8cd72835ceeacd4449a53/65ca3b7c9d2b23d292a0cab0_logo_Retool.svg][https://cdn.prod.website-files.com/65b8cd72835ceeacd4449a53/65ca3b7c44e67a3d0a996bf3_logo_Databricks.svg][https://cdn.prod.website-files.com/65b8cd72835ceeacd4449a53/667f5a1299d6ba453c78a849_image%20(19).png][https://cdn.prod.website-files.com/65b8cd72835ceeacd4449a53/65ca3b7c63af578816bafcc3_logo_Instacart.svg][https://cdn.prod.website-files.com/65b8cd72835ceeacd4449a53/665dc1dabc940168384d9596_podium%20logo.svg]\\n' +\n",
+ " '\\n' +\n",
+ " 'Build\\n' +\n",
+ " '\\n' +\n",
+ " 'LangChain is a framework to build with LLMs by chaining interoperable components. LangGraph is the framework for building\\n' +\n",
+ " 'controllable agentic workflows.\\n' +\n",
+ " '\\n' +\n",
+ " '\\n' +\n",
+ " '\\n' +\n",
+ " 'Run\\n' +\n",
+ " '\\n' +\n",
+ " 'Deploy your LLM applications at scale with LangGraph Cloud, our infrastructure purpose-built for agents.\\n' +\n",
+ " '\\n' +\n",
+ " '\\n' +\n",
+ " '\\n' +\n",
+ " 'Manage\\n' +\n",
+ " '\\n' +\n",
+ " \"Debug, collaborate, test, and monitor your LLM app in LangSmith - whether it's built with a LangChain framework or not. \\n\" +\n",
+ " '\\n' +\n",
+ " '\\n' +\n",
+ " '\\n' +\n",
+ " '\\n' +\n",
+ " 'BUILD YOUR APP WITH LANGCHAIN\\n' +\n",
+ " '\\n' +\n",
+ " 'Build context-aware, reasoning applications with LangChain’s flexible framework that leverages your company’s data and APIs.\\n' +\n",
+ " 'Future-proof your application by making vendor optionality part of your LLM infrastructure design.\\n' +\n",
+ " '\\n' +\n",
+ " 'Learn more about LangChain\\n' +\n",
+ " '\\n' +\n",
+ " '[/langchain]\\n' +\n",
+ " '\\n' +\n",
+ " '\\n' +\n",
+ " 'RUN AT SCALE WITH LANGGRAPH CLOUD\\n' +\n",
+ " '\\n' +\n",
+ " 'Deploy your LangGraph app with LangGraph Cloud for fault-tolerant scalability - including support for async background jobs,\\n' +\n",
+ " 'built-in persistence, and distributed task queues.\\n' +\n",
+ " '\\n' +\n",
+ " 'Learn more about LangGraph\\n' +\n",
+ " '\\n' +\n",
+ " '[/langgraph]\\n' +\n",
+ " '[https://cdn.prod.website-files.com/65b8cd72835ceeacd4449a53/667c6d7284e58f4743a430e6_Langgraph%20UI-home-2.webp]\\n' +\n",
+ " '\\n' +\n",
+ " '\\n' +\n",
+ " 'MANAGE LLM PERFORMANCE WITH LANGSMITH\\n' +\n",
+ " '\\n' +\n",
+ " 'Ship faster with LangSmith’s debug, test, deploy, and monitoring workflows. Don’t rely on “vibes” – add engineering rigor to your\\n' +\n",
+ " 'LLM-development workflow, whether you’re building with LangChain or not.\\n' +\n",
+ " '\\n' +\n",
+ " 'Learn more about LangSmith\\n' +\n",
+ " '\\n' +\n",
+ " '[/langsmith]\\n' +\n",
+ " '\\n' +\n",
+ " '\\n' +\n",
+ " 'HEAR FROM OUR HAPPY CUSTOMERS\\n' +\n",
+ " '\\n' +\n",
+ " 'LangChain, LangGraph, and LangSmith help teams of all sizes, across all industries - from ambitious startups to established\\n' +\n",
+ " 'enterprises.\\n' +\n",
+ " '\\n' +\n",
+ " '[https://cdn.prod.website-files.com/65b8cd72835ceeacd4449a53/65c5308aee06d9826765c897_Retool_logo%201.png]\\n' +\n",
+ " '\\n' +\n",
+ " '“LangSmith helped us improve the accuracy and performance of Retool’s fine-tuned models. Not only did we deliver a better product\\n' +\n",
+ " 'by iterating with LangSmith, but we’re shipping new AI features to our users in a fraction of the time it would have taken without\\n' +\n",
+ " 'it.”\\n' +\n",
+ " '\\n' +\n",
+ " '[https://cdn.prod.website-files.com/65b8cd72835ceeacd4449a53/65c5308abdd2dbbdde5a94a1_Jamie%20Cuffe.png]\\n' +\n",
+ " 'Jamie Cuffe\\n' +\n",
+ " 'Head of Self-Serve and New Products\\n' +\n",
+ " '[https://cdn.prod.website-files.com/65b8cd72835ceeacd4449a53/65c5308a04d37cf7d3eb1341_Rakuten_Global_Brand_Logo.png]\\n' +\n",
+ " '\\n' +\n",
+ " '“By combining the benefits of LangSmith and standing on the shoulders of a gigantic open-source community, we’re able to identify\\n' +\n",
+ " 'the right approaches of using LLMs in an enterprise-setting faster.”\\n' +\n",
+ " '\\n' +\n",
+ " '[https://cdn.prod.website-files.com/65b8cd72835ceeacd4449a53/65c5308a8b6137d44c621cb4_Yusuke%20Kaji.png]\\n' +\n",
+ " 'Yusuke Kaji\\n' +\n",
+ " 'General Manager of AI\\n' +\n",
+ " '[https://cdn.prod.website-files.com/65b8cd72835ceeacd4449a53/65c5308aea1371b447cc4af9_elastic-ar21.png]\\n' +\n",
+ " '\\n' +\n",
+ " '“Working with LangChain and LangSmith on the Elastic AI Assistant had a significant positive impact on the overall pace and\\n' +\n",
+ " 'quality of the development and shipping experience. We couldn’t have achieved the product experience delivered to our customers\\n' +\n",
+ " 'without LangChain, and we couldn’t have done it at the same pace without LangSmith.”\\n' +\n",
+ " '\\n' +\n",
+ " '[https://cdn.prod.website-files.com/65b8cd72835ceeacd4449a53/65c5308a4095d5a871de7479_James%20Spiteri.png]\\n' +\n",
+ " 'James Spiteri\\n' +\n",
+ " 'Director of Security Products\\n' +\n",
+ " '[https://cdn.prod.website-files.com/65b8cd72835ceeacd4449a53/65c530539f4824b828357352_Logo_de_Fintual%201.png]\\n' +\n",
+ " '\\n' +\n",
+ " '“As soon as we heard about LangSmith, we moved our entire development stack onto it. We could have built evaluation, testing and\\n' +\n",
+ " 'monitoring tools in house, but with LangSmith it took us 10x less time to get a 1000x better tool.”\\n' +\n",
+ " '\\n' +\n",
+ " '[https://cdn.prod.website-files.com/65b8cd72835ceeacd4449a53/65c53058acbff86f4c2dcee2_jose%20pena.png]\\n' +\n",
+ " 'Jose Peña\\n' +\n",
+ " 'Senior Manager\\n' +\n",
+ " '\\n' +\n",
+ " '\\n' +\n",
+ " '\\n' +\n",
+ " '\\n' +\n",
+ " 'THE REFERENCE ARCHITECTURE ENTERPRISES ADOPT FOR SUCCESS.\\n' +\n",
+ " '\\n' +\n",
+ " 'LangChain’s suite of products can be used independently or stacked together for multiplicative impact – guiding you through\\n' +\n",
+ " 'building, running, and managing your LLM apps.\\n' +\n",
+ " '\\n' +\n",
+ " '[https://cdn.prod.website-files.com/65b8cd72835ceeacd4449a53/6695b116b0b60c78fd4ef462_15.07.24%20-Updated%20stack%20diagram%20-%20lightfor%20website-3.webp][https://cdn.prod.website-files.com/65b8cd72835ceeacd4449a53/667d392696fc0bc3e17a6d04_New%20LC%20stack%20-%20light-2.webp]\\n' +\n",
+ " '15M+\\n' +\n",
+ " 'Monthly Downloads\\n' +\n",
+ " '100K+\\n' +\n",
+ " 'Apps Powered\\n' +\n",
+ " '75K+\\n' +\n",
+ " 'GitHub Stars\\n' +\n",
+ " '3K+\\n' +\n",
+ " 'Contributors\\n' +\n",
+ " '\\n' +\n",
+ " '\\n' +\n",
+ " 'THE BIGGEST DEVELOPER COMMUNITY IN GENAI\\n' +\n",
+ " '\\n' +\n",
+ " 'Learn alongside the 1M+ developers who are pushing the industry forward.\\n' +\n",
+ " '\\n' +\n",
+ " 'Explore LangChain\\n' +\n",
+ " '\\n' +\n",
+ " '[/langchain]\\n' +\n",
+ " '\\n' +\n",
+ " '\\n' +\n",
+ " 'GET STARTED WITH THE LANGSMITH PLATFORM TODAY\\n' +\n",
+ " '\\n' +\n",
+ " 'Get a demo [/contact-sales]Sign up for free [https://smith.langchain.com/]\\n' +\n",
+ " '[https://cdn.prod.website-files.com/65b8cd72835ceeacd4449a53/65ccf12801bc39bf912a58f3_Home%20C.webp]\\n' +\n",
+ " '\\n' +\n",
+ " 'Teams building with LangChain are driving operational efficiency, increasing discovery & personalization, and delivering premium\\n' +\n",
+ " 'products that generate revenue.\\n' +\n",
+ " '\\n' +\n",
+ " 'Discover Use Cases\\n' +\n",
+ " '\\n' +\n",
+ " '[/use-cases]\\n' +\n",
+ " '\\n' +\n",
+ " '\\n' +\n",
+ " 'GET INSPIRED BY COMPANIES WHO HAVE DONE IT.\\n' +\n",
+ " '\\n' +\n",
+ " '[https://cdn.prod.website-files.com/65b8cd72835ceeacd4449a53/65bcd7ee85507bdf350399c3_Ally_Financial%201.svg]\\n' +\n",
+ " 'Financial Services\\n' +\n",
+ " '\\n' +\n",
+ " '[https://blog.langchain.dev/ally-financial-collaborates-with-langchain-to-deliver-critical-coding-module-to-mask-personal-identifying-information-in-a-compliant-and-safe-manner/]\\n' +\n",
+ " '[https://cdn.prod.website-files.com/65b8cd72835ceeacd4449a53/65bcd8b3ae4dc901daa3037a_Adyen_Corporate_Logo%201.svg]\\n' +\n",
+ " 'FinTech\\n' +\n",
+ " '\\n' +\n",
+ " '[https://blog.langchain.dev/llms-accelerate-adyens-support-team-through-smart-ticket-routing-and-support-agent-copilot/]\\n' +\n",
+ " '[https://cdn.prod.website-files.com/65b8cd72835ceeacd4449a53/65c534b3fa387379c0f4ebff_elastic-ar21%20(1).png]\\n' +\n",
+ " 'Technology\\n' +\n",
+ " '\\n' +\n",
+ " '[https://blog.langchain.dev/langchain-partners-with-elastic-to-launch-the-elastic-ai-assistant/]\\n' +\n",
+ " '\\n' +\n",
+ " '\\n' +\n",
+ " 'LANGSMITH IS THE ENTERPRISE DEVOPS PLATFORM BUILT FOR LLMS.\\n' +\n",
+ " '\\n' +\n",
+ " 'Explore LangSmith\\n' +\n",
+ " '\\n' +\n",
+ " '[/langsmith]\\n' +\n",
+ " 'Gain visibility to make trade offs between cost, latency, and quality.\\n' +\n",
+ " 'Increase developer productivity.\\n' +\n",
+ " 'Eliminate manual, error-prone testing.\\n' +\n",
+ " 'Reduce hallucinations and improve reliability.\\n' +\n",
+ " 'Enterprise deployment options to keep data secure.\\n' +\n",
+ " '\\n' +\n",
+ " '\\n' +\n",
+ " 'READY TO START SHIPPING RELIABLE GENAI APPS FASTER?\\n' +\n",
+ " '\\n' +\n",
+ " 'Get started with LangChain, LangGraph, and LangSmith to enhance your LLM app development, from prototype to production.\\n' +\n",
+ " '\\n' +\n",
+ " 'Get a demo [/contact-sales]Sign up for free [https://smith.langchain.com/]\\n' +\n",
+ " 'Products\\n' +\n",
+ " 'LangChain [/langchain]LangSmith [/langsmith]LangGraph [/langgraph]Agents [/agents]Evaluation [/evaluation]Retrieval [/retrieval]\\n' +\n",
+ " 'Resources\\n' +\n",
+ " 'Python Docs [https://python.langchain.com/]JS/TS Docs [https://js.langchain.com/docs/get_started/introduction/]GitHub\\n' +\n",
+ " '[https://github.com/langchain-ai]Integrations [https://python.langchain.com/v0.2/docs/integrations/platforms/]Templates\\n' +\n",
+ " '[https://templates.langchain.com/]Changelog [https://changelog.langchain.com/]LangSmith Trust Portal\\n' +\n",
+ " '[https://trust.langchain.com/]\\n' +\n",
+ " 'Company\\n' +\n",
+ " 'About [/about]Blog [https://blog.langchain.dev/]Twitter [https://twitter.com/LangChainAI]LinkedIn\\n' +\n",
+ " '[https://www.linkedin.com/company/langchain/]YouTube [https://www.youtube.com/@LangChain]Community [/join-community]Marketing\\n' +\n",
+ " 'Assets [https://drive.google.com/drive/folders/17xybjzmVBdsQA-VxouuGLxF6bDsHDe80?usp=sharing]\\n' +\n",
+ " 'Sign up for our newsletter to stay up to date\\n' +\n",
+ " 'Thank you! Your submission has been received!\\n' +\n",
+ " 'Oops! Something went wrong while submitting the form.\\n' +\n",
+ " '[https://cdn.prod.website-files.com/65b8cd72835ceeacd4449a53/65c6a38f9c53ec71f5fc73de_langchain-word.svg]\\n' +\n",
+ " 'All systems operational\\n' +\n",
+ " '[https://status.smith.langchain.com/]Privacy Policy [/'... 111 more characters,\n",
+ " metadata: {\n",
+ " source: 'https://langchain.com/',\n",
+ " title: 'LangChain',\n",
+ " description: 'LangChain’s suite of products supports developers along each step of their development journey.',\n",
+ " language: 'en'\n",
+ " }\n",
+ "}\n"
+ ]
+ }
+ ],
+ "source": [
+ "const docs = await loader.load()\n",
+ "docs[0]"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 3,
+ "metadata": {},
+ "outputs": [
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "{\n",
+ " source: 'https://langchain.com/',\n",
+ " title: 'LangChain',\n",
+ " description: 'LangChain’s suite of products supports developers along each step of their development journey.',\n",
+ " language: 'en'\n",
+ "}\n"
+ ]
+ }
+ ],
+ "source": [
+ "console.log(docs[0].metadata)"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "## Options\n",
+ "\n",
+ "```typescript\n",
+ "interface Options {\n",
+ " excludeDirs?: string[]; // webpage directories to exclude.\n",
+    "  extractor?: (text: string) => string; // a function to extract the text of the document from the webpage. It is recommended to use tools like html-to-text to extract the text. By default, it just returns the page as it is.\n",
+    "  maxDepth?: number; // the maximum depth to crawl. By default, it is set to 2. If you need to crawl the whole website, set it to a number that is large enough to do the job.\n",
+    "  timeout?: number; // the timeout for each request, in milliseconds. By default, it is set to 10000 (10 seconds).\n",
+ " preventOutside?: boolean; // whether to prevent crawling outside the root url. By default, it is set to true.\n",
+ " callerOptions?: AsyncCallerConstructorParams; // the options to call the AsyncCaller for example setting max concurrency (default is 64)\n",
+ "}\n",
+ "```\n",
+ "\n",
+    "However, since it's hard to perform a perfect filter, you may still see some irrelevant results. You can filter the returned documents yourself if needed. Most of the time, the returned results are good enough."
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "## API reference\n",
+ "\n",
+ "For detailed documentation of all RecursiveUrlLoader features and configurations head to the API reference: https://api.js.langchain.com/classes/langchain_community_document_loaders_web_recursive_url.RecursiveUrlLoader.html"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": []
+ }
+ ],
+ "metadata": {
+ "kernelspec": {
+ "display_name": "TypeScript",
+ "language": "typescript",
+ "name": "tslab"
+ },
+ "language_info": {
+ "codemirror_mode": {
+ "mode": "typescript",
+ "name": "javascript",
+ "typescript": true
+ },
+ "file_extension": ".ts",
+ "mimetype": "text/typescript",
+ "name": "typescript",
+ "version": "3.7.2"
+ }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 4
+}
diff --git a/docs/core_docs/docs/integrations/document_loaders/web_loaders/recursive_url_loader.mdx b/docs/core_docs/docs/integrations/document_loaders/web_loaders/recursive_url_loader.mdx
deleted file mode 100644
index ddcb358c3056..000000000000
--- a/docs/core_docs/docs/integrations/document_loaders/web_loaders/recursive_url_loader.mdx
+++ /dev/null
@@ -1,67 +0,0 @@
----
-sidebar_class_name: node-only
-hide_table_of_contents: true
----
-
-# Recursive URL Loader
-
-When loading content from a website, we may want to process load all URLs on a page.
-
-For example, let's look at the [LangChain.js introduction](/docs/introduction) docs.
-
-This has many interesting child pages that we may want to load, split, and later retrieve in bulk.
-
-The challenge is traversing the tree of child pages and assembling a list!
-
-We do this using the RecursiveUrlLoader.
-
-This also gives us the flexibility to exclude some children, customize the extractor, and more.
-
-## Setup
-
-To get started, you'll need to install the [`jsdom`](https://www.npmjs.com/package/jsdom) package:
-
-```bash npm2yarn
-npm i jsdom
-```
-
-We also suggest adding a package like [`html-to-text`](https://www.npmjs.com/package/html-to-text) or
-[`@mozilla/readability`](https://www.npmjs.com/package/@mozilla/readability) for extracting the raw text from the page.
-
-```bash npm2yarn
-npm i html-to-text
-```
-
-## Usage
-
-```typescript
-import { compile } from "html-to-text";
-import { RecursiveUrlLoader } from "@langchain/community/document_loaders/web/recursive_url";
-
-const url = "/docs/introduction";
-
-const compiledConvert = compile({ wordwrap: 130 }); // returns (text: string) => string;
-
-const loader = new RecursiveUrlLoader(url, {
- extractor: compiledConvert,
- maxDepth: 1,
- excludeDirs: ["/docs/api/"],
-});
-
-const docs = await loader.load();
-```
-
-## Options
-
-```typescript
-interface Options {
- excludeDirs?: string[]; // webpage directories to exclude.
- extractor?: (text: string) => string; // a function to extract the text of the document from the webpage, by default it returns the page as it is. It is recommended to use tools like html-to-text to extract the text. By default, it just returns the page as it is.
- maxDepth?: number; // the maximum depth to crawl. By default, it is set to 2. If you need to crawl the whole website, set it to a number that is large enough would simply do the job.
- timeout?: number; // the timeout for each request, in the unit of seconds. By default, it is set to 10000 (10 seconds).
- preventOutside?: boolean; // whether to prevent crawling outside the root url. By default, it is set to true.
- callerOptions?: AsyncCallerConstructorParams; // the options to call the AsyncCaller for example setting max concurrency (default is 64)
-}
-```
-
-However, since it's hard to perform a perfect filter, you may still see some irrelevant results in the results. You can perform a filter on the returned documents by yourself, if it's needed. Most of the time, the returned results are good enough.
diff --git a/docs/core_docs/docs/integrations/document_loaders/web_loaders/web_cheerio.ipynb b/docs/core_docs/docs/integrations/document_loaders/web_loaders/web_cheerio.ipynb
index b12e3a8e5a00..488205129b1d 100644
--- a/docs/core_docs/docs/integrations/document_loaders/web_loaders/web_cheerio.ipynb
+++ b/docs/core_docs/docs/integrations/document_loaders/web_loaders/web_cheerio.ipynb
@@ -5,7 +5,7 @@
"metadata": {},
"source": [
"---\n",
- "sidebar_label: CheerioWebBaseLoader\n",
+ "sidebar_label: Cheerio\n",
"---"
]
},
@@ -36,8 +36,6 @@
"\n",
"## Setup\n",
"\n",
- "- TODO: Update with relevant info.\n",
- "\n",
"To access `CheerioWebBaseLoader` document loader you'll need to install the `@langchain/community` integration package, along with the `cheerio` peer dependency.\n",
"\n",
"### Credentials\n",
@@ -72,9 +70,7 @@
"source": [
"## Instantiation\n",
"\n",
- "Now we can instantiate our model object and load documents:\n",
- "\n",
- "- TODO: Update model instantiation with relevant params."
+ "Now we can instantiate our model object and load documents:"
]
},
{
diff --git a/docs/core_docs/docs/integrations/document_loaders/web_loaders/web_puppeteer.ipynb b/docs/core_docs/docs/integrations/document_loaders/web_loaders/web_puppeteer.ipynb
new file mode 100644
index 000000000000..332ded7ae820
--- /dev/null
+++ b/docs/core_docs/docs/integrations/document_loaders/web_loaders/web_puppeteer.ipynb
@@ -0,0 +1,543 @@
+{
+ "cells": [
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "---\n",
+ "sidebar_label: Puppeteer\n",
+ "sidebar_class_name: node-only\n",
+ "---"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "# PuppeteerWebBaseLoader\n",
+ "\n",
+ "```{=mdx}\n",
+ ":::tip Compatibility\n",
+ "\n",
+ "Only available on Node.js.\n",
+ "\n",
+ ":::\n",
+ "```\n",
+ "\n",
+ "This notebook provides a quick overview for getting started with [PuppeteerWebBaseLoader](/docs/integrations/document_loaders/). For detailed documentation of all PuppeteerWebBaseLoader features and configurations head to the [API reference](https://api.js.langchain.com/classes/langchain_community_document_loaders_web_puppeteer.PuppeteerWebBaseLoader.html).\n",
+ "\n",
+ "Puppeteer is a Node.js library that provides a high-level API for controlling headless Chrome or Chromium. You can use Puppeteer to automate web page interactions, including extracting data from dynamic web pages that require JavaScript to render.\n",
+ "\n",
+    "If you want a lighter-weight solution, and the webpages you want to load do not require JavaScript to render, you can use the [CheerioWebBaseLoader](/docs/integrations/document_loaders/web_loaders/web_cheerio) instead.\n",
+ "\n",
+ "## Overview\n",
+ "### Integration details\n",
+ "\n",
+ "| Class | Package | Local | Serializable | PY support |\n",
+ "| :--- | :--- | :---: | :---: | :---: |\n",
+ "| [PuppeteerWebBaseLoader](https://api.js.langchain.com/classes/langchain_community_document_loaders_web_puppeteer.PuppeteerWebBaseLoader.html) | [@langchain/community](https://api.js.langchain.com/modules/langchain_community_document_loaders_web_puppeteer.html) | ✅ | beta | ❌ | \n",
+ "### Loader features\n",
+ "| Source | Web Loader | Node Envs Only\n",
+ "| :---: | :---: | :---: | \n",
+ "| PuppeteerWebBaseLoader | ✅ | ✅ | \n",
+ "\n",
+ "## Setup\n",
+ "\n",
+ "To access `PuppeteerWebBaseLoader` document loader you'll need to install the `@langchain/community` integration package, along with the `puppeteer` peer dependency.\n",
+ "\n",
+ "### Credentials\n",
+ "\n",
+ "If you want to get automated tracing of your model calls you can also set your [LangSmith](https://docs.smith.langchain.com/) API key by uncommenting below:\n",
+ "\n",
+ "```bash\n",
+ "# export LANGCHAIN_TRACING_V2=\"true\"\n",
+ "# export LANGCHAIN_API_KEY=\"your-api-key\"\n",
+ "```\n",
+ "\n",
+ "### Installation\n",
+ "\n",
+ "The LangChain PuppeteerWebBaseLoader integration lives in the `@langchain/community` package:\n",
+ "\n",
+ "```{=mdx}\n",
+ "import IntegrationInstallTooltip from \"@mdx_components/integration_install_tooltip.mdx\";\n",
+ "import Npm2Yarn from \"@theme/Npm2Yarn\";\n",
+ "\n",
+ "\n",
+ "\n",
+ "\n",
+ " @langchain/community puppeteer\n",
+ "\n",
+ "\n",
+ "```"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "## Instantiation\n",
+ "\n",
+ "Now we can instantiate our model object and load documents:"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 1,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "import { PuppeteerWebBaseLoader } from \"@langchain/community/document_loaders/web/puppeteer\"\n",
+ "\n",
+ "const loader = new PuppeteerWebBaseLoader(\"https://langchain.com\", {\n",
+ " // required params = ...\n",
+ " // optional params = ...\n",
+ "})"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "## Load"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 2,
+ "metadata": {},
+ "outputs": [
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "Document {\n",
+ " pageContent: '