utils.html

langchain-ai · Nov 20, 2023 · 3e3b161 · 3e3b161
1 parent 06c0570
commit 3e3b161
Show file tree

Hide file tree

Showing 5 changed files with 90 additions and 251 deletions.
diff --git a/libs/core/langchain_core/utils/html.py b/libs/core/langchain_core/utils/html.py
diff --git a/libs/core/tests/unit_tests/utils/test_html.py b/libs/core/tests/unit_tests/utils/test_html.py
diff --git a/libs/langchain/langchain/document_loaders/recursive_url_loader.py b/libs/langchain/langchain/document_loaders/recursive_url_loader.py
@@ -15,10 +15,10 @@
 )
 
 import requests
-from langchain_core.utils.html import extract_sub_links
 
 from langchain.docstore.document import Document
 from langchain.document_loaders.base import BaseLoader
+from langchain.utils.html import extract_sub_links
 
 if TYPE_CHECKING:
     import aiohttp

diff --git a/libs/langchain/langchain/utils/html.py b/libs/langchain/langchain/utils/html.py
@@ -1,3 +1,108 @@
-from langchain_core.utils.html import extract_sub_links, find_all_links
+import re
+from typing import List, Optional, Sequence, Union
+from urllib.parse import urljoin, urlparse
 
-__all__ = ["find_all_links", "extract_sub_links"]
+# href values with these prefixes are skipped by DEFAULT_LINK_REGEX.
+PREFIXES_TO_IGNORE = ("javascript:", "mailto:", "#")
+# href values ending in these suffixes are skipped by DEFAULT_LINK_REGEX.
+SUFFIXES_TO_IGNORE = (
+    ".css",
+    ".js",
+    ".ico",
+    ".png",
+    ".jpg",
+    ".jpeg",
+    ".gif",
+    ".svg",
+    ".csv",
+    ".bz2",
+    ".zip",
+    ".epub",
+)
+# Lookahead that fails where an ignored suffix directly precedes the end of
+# the href value (a closing quote or the start of a "#" fragment).
+SUFFIXES_TO_IGNORE_REGEX = (
+    "(?!" + "|".join([re.escape(s) + r"[\#'\"]" for s in SUFFIXES_TO_IGNORE]) + ")"
+)
+# Lookahead that fails when the href value starts with an ignored prefix.
+PREFIXES_TO_IGNORE_REGEX = (
+    "(?!" + "|".join([re.escape(s) for s in PREFIXES_TO_IGNORE]) + ")"
+)
+# Captures the href value up to, but excluding, the closing quote or "#".
+DEFAULT_LINK_REGEX = (
+    rf"href=[\"']{PREFIXES_TO_IGNORE_REGEX}((?:{SUFFIXES_TO_IGNORE_REGEX}.)*?)[\#'\"]"
+)
+
+
+def find_all_links(
+    raw_html: str, *, pattern: Union[str, re.Pattern, None] = None
+) -> List[str]:
+    """Extract all links from a raw html string.
+
+    Args:
+        raw_html: original html.
+        pattern: Regex to use for extracting links from raw html. Defaults to
+            DEFAULT_LINK_REGEX when omitted.
+
+    Returns:
+        List[str]: the distinct links found, in no guaranteed order.
+    """
+    pattern = pattern or DEFAULT_LINK_REGEX
+    return list(set(re.findall(pattern, raw_html)))
+
+
+def extract_sub_links(
+    raw_html: str,
+    url: str,
+    *,
+    base_url: Optional[str] = None,
+    pattern: Union[str, re.Pattern, None] = None,
+    prevent_outside: bool = True,
+    exclude_prefixes: Sequence[str] = (),
+) -> List[str]:
+    """Extract all links from a raw html string and convert into absolute paths.
+
+    Args:
+        raw_html: original html.
+        url: the url of the html.
+        base_url: the base url to check for outside links against.
+            Defaults to ``url``.
+        pattern: Regex to use for extracting links from raw html.
+        prevent_outside: If True, ignore external links which are not children
+            of the base url. A link counts as a child only if its host
+            matches the base url's host and it starts with the base url.
+        exclude_prefixes: Exclude any URLs that start with one of these prefixes.
+
+    Returns:
+        List[str]: sub links
+    """
+    base_url = base_url if base_url is not None else url
+    all_links = find_all_links(raw_html, pattern=pattern)
+    absolute_paths = set()
+    for link in all_links:
+        # Some may be absolute links like https://to/path. Require the full
+        # scheme so relative paths such as "httpbin/get" are still joined.
+        if link.startswith(("http://", "https://")):
+            absolute_paths.add(link)
+        # Some may have omitted the protocol like //to/path
+        elif link.startswith("//"):
+            absolute_paths.add(f"{urlparse(url).scheme}:{link}")
+        else:
+            absolute_paths.add(urljoin(url, link))
+    base_netloc = urlparse(base_url).netloc if prevent_outside else None
+    res = []
+    for path in absolute_paths:
+        if any(path.startswith(exclude) for exclude in exclude_prefixes):
+            continue
+        if prevent_outside:
+            # A prefix test alone would accept https://a.com.evil.org for the
+            # base https://a.com, so the host has to match exactly as well.
+            try:
+                path_netloc = urlparse(path).netloc
+            except ValueError:
+                # Unparseable URL: cannot be shown to be inside the base.
+                continue
+            if path_netloc != base_netloc or not path.startswith(base_url):
+                continue
+        res.append(path)
+    return res
diff --git a/libs/langchain/tests/unit_tests/utils/test_html.py b/libs/langchain/tests/unit_tests/utils/test_html.py
@@ -1,4 +1,4 @@
-from langchain_core.utils.html import (
+from langchain.utils.html import (
     PREFIXES_TO_IGNORE,
     SUFFIXES_TO_IGNORE,
     extract_sub_links,