-
Notifications
You must be signed in to change notification settings - Fork 15.9k
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
- Loading branch information
Showing
5 changed files
with
90 additions
and
251 deletions.
There are no files selected for viewing
This file was deleted.
Oops, something went wrong.
This file was deleted.
Oops, something went wrong.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -1,3 +1,89 @@ | ||
from langchain_core.utils.html import extract_sub_links, find_all_links | ||
import re | ||
from typing import List, Optional, Sequence, Union | ||
from urllib.parse import urljoin, urlparse | ||
|
||
__all__ = ["find_all_links", "extract_sub_links"] | ||
# href values starting with one of these prefixes are never captured:
# script pseudo-URLs, e-mail links and same-page fragments.
PREFIXES_TO_IGNORE = ("javascript:", "mailto:", "#")

# href values ending in one of these suffixes (static assets, archives,
# data files) are never captured.
SUFFIXES_TO_IGNORE = (
    ".css",
    ".js",
    ".ico",
    ".png",
    ".jpg",
    ".jpeg",
    ".gif",
    ".svg",
    ".csv",
    ".bz2",
    ".zip",
    ".epub",
)

# Negative lookahead: fails when the upcoming text is an ignored suffix that is
# immediately followed by the end of the href value (closing quote or '#').
SUFFIXES_TO_IGNORE_REGEX = (
    "(?!" + "|".join([re.escape(s) + r"[\#'\"]" for s in SUFFIXES_TO_IGNORE]) + ")"
)

# Negative lookahead: fails when the href value starts with an ignored prefix.
PREFIXES_TO_IGNORE_REGEX = (
    "(?!" + "|".join([re.escape(s) for s in PREFIXES_TO_IGNORE]) + ")"
)

# Matches href="..." / href='...' and captures the value up to (not including)
# the closing quote or the first '#'. The suffix lookahead is applied before
# every consumed character, so a value ending in an ignored suffix cannot match.
DEFAULT_LINK_REGEX = (
    rf"href=[\"']{PREFIXES_TO_IGNORE_REGEX}((?:{SUFFIXES_TO_IGNORE_REGEX}.)*?)[\#'\"]"
)
|
||
|
||
def find_all_links(
    raw_html: str, *, pattern: Union[str, re.Pattern, None] = None
) -> List[str]:
    """Extract all distinct links from a raw html string.

    Args:
        raw_html: original html.
        pattern: Regex (string or compiled) to use for extracting links from
            raw html. When omitted (or otherwise falsy), DEFAULT_LINK_REGEX
            is used.

    Returns:
        List[str]: all distinct links, in order of first appearance.
    """
    pattern = pattern or DEFAULT_LINK_REGEX
    # dict.fromkeys de-duplicates like a set but keeps first-seen order, so
    # the result is stable across runs instead of following hash order.
    return list(dict.fromkeys(re.findall(pattern, raw_html)))
|
||
|
||
def _netloc_or_none(candidate: str) -> Optional[str]:
    """Return the network location of ``candidate``, or None if it is unparsable."""
    try:
        return urlparse(candidate).netloc
    except ValueError:
        return None


def extract_sub_links(
    raw_html: str,
    url: str,
    *,
    base_url: Optional[str] = None,
    pattern: Union[str, re.Pattern, None] = None,
    prevent_outside: bool = True,
    exclude_prefixes: Sequence[str] = (),
) -> List[str]:
    """Extract all links from a raw html string and convert into absolute paths.

    Args:
        raw_html: original html.
        url: the url of the html; relative links are resolved against it.
        base_url: the base url to check for outside links against.
            Defaults to ``url``.
        pattern: Regex to use for extracting links from raw html.
        prevent_outside: If True, ignore external links which are not children
            of the base url. A link counts as a child only if it starts with
            the base url and has the same network location (host[:port]).
        exclude_prefixes: Exclude any URLs that start with one of these prefixes.

    Returns:
        List[str]: distinct absolute sub links.
    """
    base_url = base_url if base_url is not None else url
    # None means the base could not be parsed; then only the prefix test applies.
    base_netloc = _netloc_or_none(base_url) if prevent_outside else None
    all_links = find_all_links(raw_html, pattern=pattern)

    # A dict is used as an insertion-ordered set of absolute URLs.
    absolute_paths = {}
    for link in all_links:
        # Some may be absolute links like https://to/path. Requiring "://"
        # keeps relative links such as "httpd/index.html" out of this branch.
        if link.startswith(("http://", "https://")):
            absolute = link
        # Some may have omitted the protocol like //to/path
        elif link.startswith("//"):
            absolute = f"{urlparse(url).scheme}:{link}"
        else:
            absolute = urljoin(url, link)
        absolute_paths[absolute] = None

    res = []
    for path in absolute_paths:
        if any(path.startswith(exclude) for exclude in exclude_prefixes):
            continue
        if prevent_outside:
            if not path.startswith(base_url):
                continue
            # A bare prefix test would accept https://example.com.evil.org
            # for base https://example.com, so the host must match exactly.
            if base_netloc is not None and _netloc_or_none(path) != base_netloc:
                continue
        res.append(path)
    return res
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters