Skip to content

Commit

Permalink
skip excluded sublinks before recursion (#11036)
Browse files · Browse the repository at this point in the history
  • Loading branch information
baskaryan authored Sep 26, 2023
1 parent 9c5eca9 commit a2f7246
Show file tree
Hide file tree
Showing 2 changed files with 14 additions and 6 deletions.
Original file line number Diff line number Diff line change
Expand Up @@ -99,6 +99,13 @@ def __init__(
else _metadata_extractor
)
self.exclude_dirs = exclude_dirs if exclude_dirs is not None else ()

if any(url.startswith(exclude_dir) for exclude_dir in self.exclude_dirs):
raise ValueError(
f"Base url is included in exclude_dirs. Received base_url: {url} and "
f"exclude_dirs: {self.exclude_dirs}"
)

self.timeout = timeout
self.prevent_outside = prevent_outside if prevent_outside is not None else True
self.link_regex = link_regex
Expand Down Expand Up @@ -149,6 +156,7 @@ def _get_child_links_recursive(
base_url=self.url,
pattern=self.link_regex,
prevent_outside=self.prevent_outside,
exclude_prefixes=self.exclude_dirs,
)
for link in sub_links:
# Check all unvisited links
Expand Down Expand Up @@ -182,10 +190,6 @@ async def _async_get_child_links_recursive(
if depth >= self.max_depth:
return []

# Exclude the root and parent from a list
# Exclude the links that start with any of the excluded directories
if any(url.startswith(exclude_dir) for exclude_dir in self.exclude_dirs):
return []
# Disable SSL verification because websites may have invalid SSL certificates,
# but won't cause any security issues for us.
close_session = session is None
Expand Down
8 changes: 6 additions & 2 deletions libs/langchain/langchain/utils/html.py
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
import re
from typing import List, Optional, Union
from typing import List, Optional, Sequence, Union
from urllib.parse import urljoin, urlparse

PREFIXES_TO_IGNORE = ("javascript:", "mailto:", "#")
Expand Down Expand Up @@ -42,6 +42,7 @@ def extract_sub_links(
base_url: Optional[str] = None,
pattern: Union[str, re.Pattern, None] = None,
prevent_outside: bool = True,
exclude_prefixes: Sequence[str] = (),
) -> List[str]:
"""Extract all links from a raw html string and convert into absolute paths.
Expand All @@ -52,6 +53,7 @@ def extract_sub_links(
pattern: Regex to use for extracting links from raw html.
prevent_outside: If True, ignore external links which are not children
of the base url.
exclude_prefixes: Exclude any URLs that start with one of these prefixes.
Returns:
List[str]: sub links
Expand All @@ -60,8 +62,10 @@ def extract_sub_links(
all_links = find_all_links(raw_html, pattern=pattern)
absolute_paths = set()
for link in all_links:
if any(link.startswith(exclude) for exclude in exclude_prefixes):
continue
# Some may be absolute links like https://to/path
if link.startswith("http"):
elif link.startswith("http"):
absolute_paths.add(link)
# Some may have omitted the protocol like //to/path
elif link.startswith("//"):
Expand Down

0 comments on commit a2f7246

Please sign in to comment.