From 0771f8e37d32ea148eae69c559b142ecc277ffb0 Mon Sep 17 00:00:00 2001 From: Ahmed Tammaa Date: Mon, 28 Oct 2024 13:57:18 +0300 Subject: [PATCH 01/30] Update html.py used bs4 for larger html file processing --- .../langchain_text_splitters/html.py | 154 ++++++++++-------- 1 file changed, 90 insertions(+), 64 deletions(-) diff --git a/libs/text-splitters/langchain_text_splitters/html.py b/libs/text-splitters/langchain_text_splitters/html.py index cdbea7f724b53..4b9280934ae50 100644 --- a/libs/text-splitters/langchain_text_splitters/html.py +++ b/libs/text-splitters/langchain_text_splitters/html.py @@ -90,79 +90,105 @@ def split_text(self, text: str) -> List[Document]: """ return self.split_text_from_file(StringIO(text)) + def split_text_from_file(self, file: Any) -> List[Document]: - """Split HTML file + """Split HTML file using BeautifulSoup. Args: - file: HTML file + file: HTML file path or file-like object. + + Returns: + List of Document objects with page_content and metadata. """ - try: - from lxml import etree - except ImportError as e: - raise ImportError( - "Unable to import lxml, please install with `pip install lxml`." - ) from e - # use lxml library to parse html document and return xml ElementTree - # Explicitly encoding in utf-8 allows non-English - # html files to be processed without garbled characters - parser = etree.HTMLParser(encoding="utf-8") - tree = etree.parse(file, parser) - - # document transformation for "structure-aware" chunking is handled with xsl. - # see comments in html_chunks_with_headers.xslt for more detailed information. - xslt_path = pathlib.Path(__file__).parent / "xsl/html_chunks_with_headers.xslt" - xslt_tree = etree.parse(xslt_path) - transform = etree.XSLT(xslt_tree) - result = transform(tree) - result_dom = etree.fromstring(str(result)) + from bs4 import BeautifulSoup + from langchain.docstore.document import Document + import bs4 + + # Read the HTML content from the file or file-like object + if isinstance(file, str): + with open(file, 'r', encoding='utf-8') as f: + html_content = f.read() + else: + # Assuming file is a file-like object + html_content = file.read() - # create filter and mapping for header metadata - header_filter = [header[0] for header in self.headers_to_split_on] - header_mapping = dict(self.headers_to_split_on) + # Parse the HTML content using BeautifulSoup + soup = BeautifulSoup(html_content, 'html.parser') - # map xhtml namespace prefix - ns_map = {"h": "http://www.w3.org/1999/xhtml"} + # Extract the header tags and their corresponding metadata keys + headers_to_split_on = [tag[0] for tag in self.headers_to_split_on] + header_mapping = dict(self.headers_to_split_on) - # build list of elements from DOM - elements = [] - for element in result_dom.findall("*//*", ns_map): - if element.findall("*[@class='headers']") or element.findall( - "*[@class='chunk']" - ): - elements.append( - ElementType( - url=file, - xpath="".join( - [ - node.text or "" - for node in element.findall("*[@class='xpath']", ns_map) - ] - ), - content="".join( - [ - node.text or "" - for node in element.findall("*[@class='chunk']", ns_map) - ] - ), - metadata={ - # Add text of specified headers to metadata using header - # mapping. 
- header_mapping[node.tag]: node.text or "" - for node in filter( - lambda x: x.tag in header_filter, - element.findall("*[@class='headers']/*", ns_map), - ) - }, - ) - ) + documents = [] - if not self.return_each_element: - return self.aggregate_elements_to_chunks(elements) + # Find the body of the document + body = soup.body if soup.body else soup + + # Find all header tags in the order they appear + all_headers = body.find_all(headers_to_split_on) + + # If there's content before the first header, collect it + first_header = all_headers[0] if all_headers else None + if first_header: + pre_header_content = '' + for elem in first_header.find_all_previous(): + if isinstance(elem, bs4.Tag): + text = elem.get_text(separator=' ', strip=True) + if text: + pre_header_content = text + ' ' + pre_header_content + if pre_header_content.strip(): + documents.append(Document( + page_content=pre_header_content.strip(), + metadata={} # No metadata since there's no header + )) else: - return [ - Document(page_content=chunk["content"], metadata=chunk["metadata"]) - for chunk in elements - ] + # If no headers are found, return the whole content + full_text = body.get_text(separator=' ', strip=True) + if full_text.strip(): + documents.append(Document( + page_content=full_text.strip(), + metadata={} + )) + return documents + + # Process each header and its associated content + for header in all_headers: + current_metadata = {} + header_name = header.name + header_text = header.get_text(separator=' ', strip=True) + current_metadata[header_mapping[header_name]] = header_text + + # Collect all sibling elements until the next header of the same or higher level + content_elements = [] + for sibling in header.find_next_siblings(): + if sibling.name in headers_to_split_on: + # Stop at the next header + break + if isinstance(sibling, bs4.Tag): + content_elements.append(sibling) + + # Get the text content of the collected elements + current_content = '' + for elem in content_elements: + text = elem.get_text(separator=' ', strip=True) + if text: + current_content += text + ' ' + + # Create a Document if there is content + if current_content.strip(): + documents.append(Document( + page_content=current_content.strip(), + metadata=current_metadata.copy() + )) + else: + # If there's no content, but we have metadata, still create a Document + documents.append(Document( + page_content='', + metadata=current_metadata.copy() + )) + + return documents + class HTMLSectionSplitter: From d4efd97db21e071f72284ac67b54767a23266634 Mon Sep 17 00:00:00 2001 From: Ahmed Tammaa Date: Fri, 8 Nov 2024 21:34:37 +0200 Subject: [PATCH 02/30] Update html.py updated according to linter tests --- .../langchain_text_splitters/html.py | 175 +++++++++--------- 1 file changed, 87 insertions(+), 88 deletions(-) diff --git a/libs/text-splitters/langchain_text_splitters/html.py b/libs/text-splitters/langchain_text_splitters/html.py index 4b9280934ae50..6367de0633559 100644 --- a/libs/text-splitters/langchain_text_splitters/html.py +++ b/libs/text-splitters/langchain_text_splitters/html.py @@ -9,7 +9,9 @@ from langchain_core.documents import Document from langchain_text_splitters.character import RecursiveCharacterTextSplitter - +from bs4 import BeautifulSoup +from bs4.element import Tag +from langchain.docstore.document import Document class ElementType(TypedDict): """Element type as typed dict.""" @@ -91,104 +93,101 @@ def split_text(self, text: str) -> List[Document]: return self.split_text_from_file(StringIO(text)) - def split_text_from_file(self, 
file: Any) -> List[Document]: - """Split HTML file using BeautifulSoup. +def split_text_from_file(self, file: Any) -> List[Document]: + """Split HTML file using BeautifulSoup. - Args: - file: HTML file path or file-like object. + Args: + file: HTML file path or file-like object. - Returns: - List of Document objects with page_content and metadata. - """ - from bs4 import BeautifulSoup - from langchain.docstore.document import Document - import bs4 - - # Read the HTML content from the file or file-like object - if isinstance(file, str): - with open(file, 'r', encoding='utf-8') as f: - html_content = f.read() - else: - # Assuming file is a file-like object - html_content = file.read() + Returns: + List of Document objects with page_content and metadata. + """ - # Parse the HTML content using BeautifulSoup - soup = BeautifulSoup(html_content, 'html.parser') + # Read the HTML content from the file or file-like object + if isinstance(file, str): + with open(file, 'r', encoding='utf-8') as f: + html_content = f.read() + else: + # Assuming file is a file-like object + html_content = file.read() - # Extract the header tags and their corresponding metadata keys - headers_to_split_on = [tag[0] for tag in self.headers_to_split_on] - header_mapping = dict(self.headers_to_split_on) + # Parse the HTML content using BeautifulSoup + soup = BeautifulSoup(html_content, 'html.parser') - documents = [] + # Extract the header tags and their corresponding metadata keys + headers_to_split_on = [tag[0] for tag in self.headers_to_split_on] + header_mapping = dict(self.headers_to_split_on) - # Find the body of the document - body = soup.body if soup.body else soup - - # Find all header tags in the order they appear - all_headers = body.find_all(headers_to_split_on) - - # If there's content before the first header, collect it - first_header = all_headers[0] if all_headers else None - if first_header: - pre_header_content = '' - for elem in first_header.find_all_previous(): - if isinstance(elem, bs4.Tag): - text = elem.get_text(separator=' ', strip=True) - if text: - pre_header_content = text + ' ' + pre_header_content - if pre_header_content.strip(): - documents.append(Document( - page_content=pre_header_content.strip(), - metadata={} # No metadata since there's no header - )) - else: - # If no headers are found, return the whole content - full_text = body.get_text(separator=' ', strip=True) - if full_text.strip(): - documents.append(Document( - page_content=full_text.strip(), - metadata={} - )) - return documents - - # Process each header and its associated content - for header in all_headers: - current_metadata = {} - header_name = header.name - header_text = header.get_text(separator=' ', strip=True) - current_metadata[header_mapping[header_name]] = header_text - - # Collect all sibling elements until the next header of the same or higher level - content_elements = [] - for sibling in header.find_next_siblings(): - if sibling.name in headers_to_split_on: - # Stop at the next header - break - if isinstance(sibling, bs4.Tag): - content_elements.append(sibling) + documents = [] + + # Find the body of the document + body = soup.body if soup.body else soup - # Get the text content of the collected elements - current_content = '' - for elem in content_elements: + # Find all header tags in the order they appear + all_headers = body.find_all(headers_to_split_on) + + # If there's content before the first header, collect it + first_header = all_headers[0] if all_headers else None + if first_header: + pre_header_content = 
'' + for elem in first_header.find_all_previous(): + if isinstance(elem, Tag): text = elem.get_text(separator=' ', strip=True) if text: - current_content += text + ' ' - - # Create a Document if there is content - if current_content.strip(): - documents.append(Document( - page_content=current_content.strip(), - metadata=current_metadata.copy() - )) - else: - # If there's no content, but we have metadata, still create a Document - documents.append(Document( - page_content='', - metadata=current_metadata.copy() - )) - + pre_header_content = text + ' ' + pre_header_content + if pre_header_content.strip(): + documents.append(Document( + page_content=pre_header_content.strip(), + metadata={} # No metadata since there's no header + )) + else: + # If no headers are found, return the whole content + full_text = body.get_text(separator=' ', strip=True) + if full_text.strip(): + documents.append(Document( + page_content=full_text.strip(), + metadata={} + )) return documents + # Process each header and its associated content + for header in all_headers: + current_metadata = {} + header_name = header.name + header_text = header.get_text(separator=' ', strip=True) + current_metadata[header_mapping[header_name]] = header_text + + # Collect all sibling elements until the next header of the same or higher level + content_elements = [] + for sibling in header.find_next_siblings(): + if sibling.name in headers_to_split_on: + # Stop at the next header + break + if isinstance(sibling, Tag): + content_elements.append(sibling) + + # Get the text content of the collected elements + current_content = '' + for elem in content_elements: + text = elem.get_text(separator=' ', strip=True) + if text: + current_content += text + ' ' + + # Create a Document if there is content + if current_content.strip(): + documents.append(Document( + page_content=current_content.strip(), + metadata=current_metadata.copy() + )) + else: + # If there's no content, but we have metadata, still create a Document + documents.append(Document( + page_content='', + metadata=current_metadata.copy() + )) + + return documents + class HTMLSectionSplitter: From 73c001ceceed0d4958c22a5f6ff50a0bdc299f92 Mon Sep 17 00:00:00 2001 From: Ahmed Tammaa Date: Fri, 8 Nov 2024 21:38:38 +0200 Subject: [PATCH 03/30] Update html.py --- libs/text-splitters/langchain_text_splitters/html.py | 1 - 1 file changed, 1 deletion(-) diff --git a/libs/text-splitters/langchain_text_splitters/html.py b/libs/text-splitters/langchain_text_splitters/html.py index 6367de0633559..6af5890e2282e 100644 --- a/libs/text-splitters/langchain_text_splitters/html.py +++ b/libs/text-splitters/langchain_text_splitters/html.py @@ -11,7 +11,6 @@ from langchain_text_splitters.character import RecursiveCharacterTextSplitter from bs4 import BeautifulSoup from bs4.element import Tag -from langchain.docstore.document import Document class ElementType(TypedDict): """Element type as typed dict.""" From 9119fe91d95dad8b669068e8993c955b1be5a29c Mon Sep 17 00:00:00 2001 From: Ahmed Tammaa Date: Fri, 8 Nov 2024 21:44:38 +0200 Subject: [PATCH 04/30] Update html.py --- .../langchain_text_splitters/html.py | 189 +++++++++--------- 1 file changed, 95 insertions(+), 94 deletions(-) diff --git a/libs/text-splitters/langchain_text_splitters/html.py b/libs/text-splitters/langchain_text_splitters/html.py index 6af5890e2282e..e18f39f6eb4e8 100644 --- a/libs/text-splitters/langchain_text_splitters/html.py +++ b/libs/text-splitters/langchain_text_splitters/html.py @@ -6,11 +6,12 @@ from typing import Any, Dict, 
Iterable, List, Optional, Tuple, TypedDict, cast import requests -from langchain_core.documents import Document - -from langchain_text_splitters.character import RecursiveCharacterTextSplitter from bs4 import BeautifulSoup from bs4.element import Tag +from langchain.docstore.document import Document + +from langchain_text_splitters.character import RecursiveCharacterTextSplitter + class ElementType(TypedDict): """Element type as typed dict.""" @@ -92,101 +93,101 @@ def split_text(self, text: str) -> List[Document]: return self.split_text_from_file(StringIO(text)) -def split_text_from_file(self, file: Any) -> List[Document]: - """Split HTML file using BeautifulSoup. - - Args: - file: HTML file path or file-like object. - - Returns: - List of Document objects with page_content and metadata. - """ - - # Read the HTML content from the file or file-like object - if isinstance(file, str): - with open(file, 'r', encoding='utf-8') as f: - html_content = f.read() - else: - # Assuming file is a file-like object - html_content = file.read() - - # Parse the HTML content using BeautifulSoup - soup = BeautifulSoup(html_content, 'html.parser') - - # Extract the header tags and their corresponding metadata keys - headers_to_split_on = [tag[0] for tag in self.headers_to_split_on] - header_mapping = dict(self.headers_to_split_on) - - documents = [] - - # Find the body of the document - body = soup.body if soup.body else soup - - # Find all header tags in the order they appear - all_headers = body.find_all(headers_to_split_on) - - # If there's content before the first header, collect it - first_header = all_headers[0] if all_headers else None - if first_header: - pre_header_content = '' - for elem in first_header.find_all_previous(): - if isinstance(elem, Tag): + def split_text_from_file(self, file: Any) -> List[Document]: + """Split HTML file using BeautifulSoup. + + Args: + file: HTML file path or file-like object. + + Returns: + List of Document objects with page_content and metadata. 
+ """ + + # Read the HTML content from the file or file-like object + if isinstance(file, str): + with open(file, 'r', encoding='utf-8') as f: + html_content = f.read() + else: + # Assuming file is a file-like object + html_content = file.read() + + # Parse the HTML content using BeautifulSoup + soup = BeautifulSoup(html_content, 'html.parser') + + # Extract the header tags and their corresponding metadata keys + headers_to_split_on = [tag[0] for tag in self.headers_to_split_on] + header_mapping = dict(self.headers_to_split_on) + + documents = [] + + # Find the body of the document + body = soup.body if soup.body else soup + + # Find all header tags in the order they appear + all_headers = body.find_all(headers_to_split_on) + + # If there's content before the first header, collect it + first_header = all_headers[0] if all_headers else None + if first_header: + pre_header_content = '' + for elem in first_header.find_all_previous(): + if isinstance(elem, Tag): + text = elem.get_text(separator=' ', strip=True) + if text: + pre_header_content = text + ' ' + pre_header_content + if pre_header_content.strip(): + documents.append(Document( + page_content=pre_header_content.strip(), + metadata={} # No metadata since there's no header + )) + else: + # If no headers are found, return the whole content + full_text = body.get_text(separator=' ', strip=True) + if full_text.strip(): + documents.append(Document( + page_content=full_text.strip(), + metadata={} + )) + return documents + + # Process each header and its associated content + for header in all_headers: + current_metadata = {} + header_name = header.name + header_text = header.get_text(separator=' ', strip=True) + current_metadata[header_mapping[header_name]] = header_text + + # Collect all sibling elements until the next header of the same or higher level + content_elements = [] + for sibling in header.find_next_siblings(): + if sibling.name in headers_to_split_on: + # Stop at the next header + break + if isinstance(sibling, Tag): + content_elements.append(sibling) + + # Get the text content of the collected elements + current_content = '' + for elem in content_elements: text = elem.get_text(separator=' ', strip=True) if text: - pre_header_content = text + ' ' + pre_header_content - if pre_header_content.strip(): - documents.append(Document( - page_content=pre_header_content.strip(), - metadata={} # No metadata since there's no header - )) - else: - # If no headers are found, return the whole content - full_text = body.get_text(separator=' ', strip=True) - if full_text.strip(): - documents.append(Document( - page_content=full_text.strip(), - metadata={} - )) + current_content += text + ' ' + + # Create a Document if there is content + if current_content.strip(): + documents.append(Document( + page_content=current_content.strip(), + metadata=current_metadata.copy() + )) + else: + # If there's no content, but we have metadata, still create a Document + documents.append(Document( + page_content='', + metadata=current_metadata.copy() + )) + return documents - # Process each header and its associated content - for header in all_headers: - current_metadata = {} - header_name = header.name - header_text = header.get_text(separator=' ', strip=True) - current_metadata[header_mapping[header_name]] = header_text - - # Collect all sibling elements until the next header of the same or higher level - content_elements = [] - for sibling in header.find_next_siblings(): - if sibling.name in headers_to_split_on: - # Stop at the next header - break - if 
isinstance(sibling, Tag): - content_elements.append(sibling) - - # Get the text content of the collected elements - current_content = '' - for elem in content_elements: - text = elem.get_text(separator=' ', strip=True) - if text: - current_content += text + ' ' - - # Create a Document if there is content - if current_content.strip(): - documents.append(Document( - page_content=current_content.strip(), - metadata=current_metadata.copy() - )) - else: - # If there's no content, but we have metadata, still create a Document - documents.append(Document( - page_content='', - metadata=current_metadata.copy() - )) - - return documents - class HTMLSectionSplitter: From 7e0ce8ef476eae13f28f8a709e3422a49b0ccea7 Mon Sep 17 00:00:00 2001 From: Ahmed Tammaa Date: Fri, 8 Nov 2024 21:52:28 +0200 Subject: [PATCH 05/30] Update html.py --- .../langchain_text_splitters/html.py | 33 ++++++++----------- 1 file changed, 14 insertions(+), 19 deletions(-) diff --git a/libs/text-splitters/langchain_text_splitters/html.py b/libs/text-splitters/langchain_text_splitters/html.py index e18f39f6eb4e8..db6f7d58d5e4f 100644 --- a/libs/text-splitters/langchain_text_splitters/html.py +++ b/libs/text-splitters/langchain_text_splitters/html.py @@ -91,18 +91,14 @@ def split_text(self, text: str) -> List[Document]: text: HTML text """ return self.split_text_from_file(StringIO(text)) - - + def split_text_from_file(self, file: Any) -> List[Document]: """Split HTML file using BeautifulSoup. - Args: file: HTML file path or file-like object. - Returns: List of Document objects with page_content and metadata. """ - # Read the HTML content from the file or file-like object if isinstance(file, str): with open(file, 'r', encoding='utf-8') as f: @@ -110,28 +106,28 @@ def split_text_from_file(self, file: Any) -> List[Document]: else: # Assuming file is a file-like object html_content = file.read() - + # Parse the HTML content using BeautifulSoup soup = BeautifulSoup(html_content, 'html.parser') - + # Extract the header tags and their corresponding metadata keys headers_to_split_on = [tag[0] for tag in self.headers_to_split_on] header_mapping = dict(self.headers_to_split_on) - + documents = [] - + # Find the body of the document body = soup.body if soup.body else soup - + # Find all header tags in the order they appear all_headers = body.find_all(headers_to_split_on) - + # If there's content before the first header, collect it first_header = all_headers[0] if all_headers else None if first_header: pre_header_content = '' for elem in first_header.find_all_previous(): - if isinstance(elem, Tag): + if isinstance(elem, bs4.Tag): text = elem.get_text(separator=' ', strip=True) if text: pre_header_content = text + ' ' + pre_header_content @@ -149,30 +145,30 @@ def split_text_from_file(self, file: Any) -> List[Document]: metadata={} )) return documents - + # Process each header and its associated content for header in all_headers: current_metadata = {} header_name = header.name header_text = header.get_text(separator=' ', strip=True) current_metadata[header_mapping[header_name]] = header_text - + # Collect all sibling elements until the next header of the same or higher level content_elements = [] for sibling in header.find_next_siblings(): if sibling.name in headers_to_split_on: # Stop at the next header break - if isinstance(sibling, Tag): + if isinstance(sibling, bs4.Tag): content_elements.append(sibling) - + # Get the text content of the collected elements current_content = '' for elem in content_elements: text = elem.get_text(separator=' ', 
strip=True) if text: current_content += text + ' ' - + # Create a Document if there is content if current_content.strip(): documents.append(Document( @@ -185,9 +181,8 @@ def split_text_from_file(self, file: Any) -> List[Document]: page_content='', metadata=current_metadata.copy() )) - - return documents + return documents class HTMLSectionSplitter: From 6bfc1587de22ceb4e6d89c98975cb29e933ea299 Mon Sep 17 00:00:00 2001 From: Ahmed Tammaa Date: Tue, 17 Dec 2024 00:11:05 +0200 Subject: [PATCH 06/30] Update html.py Rewrote the HTMLHeaderSplitter with BS4 to support large files --- .../langchain_text_splitters/html.py | 584 ++++++++++++++---- 1 file changed, 462 insertions(+), 122 deletions(-) diff --git a/libs/text-splitters/langchain_text_splitters/html.py b/libs/text-splitters/langchain_text_splitters/html.py index f486decffaecd..0d33cfa08ad65 100644 --- a/libs/text-splitters/langchain_text_splitters/html.py +++ b/libs/text-splitters/langchain_text_splitters/html.py @@ -1,18 +1,76 @@ +''' +This module provides classes for splitting HTML content into structured +Document objects based on specified header tags. + + ElementType(TypedDict): A typed dictionary representing an element type. + Node: Represents a node in a hierarchical structure. + HTMLHeaderTextSplitter: A class to split HTML content into Document objects + based on specified header tags. + HTMLSectionSplitter: A class to split HTML files based on specified tag and + font sizes. + +Functions: + HTMLHeaderTextSplitter.__init__(self, headers_to_split_on: List[Tuple[str, + str]], return_each_element: bool = False) -> None: Initializes the + HTMLHeaderTextSplitter with headers to split on and an optional + parameter to return each HTML element as a separate Document. + HTMLHeaderTextSplitter._header_level(self, element) -> int: Determines the + heading level of an HTML element. + HTMLHeaderTextSplitter._dom_depth(self, element) -> int: Computes the DOM + depth of an HTML element. + HTMLHeaderTextSplitter._build_tree(self, elements) -> None: Builds a tree + structure from a list of HTML elements. + HTMLHeaderTextSplitter.split_text(self, text: str) -> List[Document]: + HTMLHeaderTextSplitter.split_text_from_url(self, url: str, timeout: int = + 10, **kwargs: Any) -> List[Document]: Fetches text content from a URL + and splits it into documents. + HTMLHeaderTextSplitter._finalize_chunk(self, current_chunk: List[str], + active_headers: Dict[str, Tuple[str, int, int]], documents: + List[Document], chunk_dom_depth: int) -> None: Finalizes the current + chunk of text and appends it to the list of documents. + HTMLHeaderTextSplitter._generate_documents(self, nodes: Dict[int, Node]) -> + List[Document]: Generates a list of Document objects from a node + structure. + HTMLHeaderTextSplitter.split_text_from_file(self, file: Any) -> + List[Document]: Splits HTML content from a file into a list of Document + HTMLHeaderTextSplitter._aggregate_documents(self, nodes: Dict[int, Node]) -> + List[Document]: Aggregates documents based on headers. + HTMLHeaderTextSplitter._generate_individual_documents(self, nodes: Dict[int, + Node]) -> List[Document]: Generates individual Document objects for + each element. + HTMLSectionSplitter.__init__(self, headers_to_split_on: List[Tuple[str, + str]], xslt_path: Optional[str] = None, **kwargs: Any) -> None: Creates + a new HTMLSectionSplitter. + HTMLSectionSplitter.split_documents(self, documents: Iterable[Document]) -> + List[Document]: Splits documents. 
+ HTMLSectionSplitter.split_text(self, text: str) -> List[Document]: Splits + HTML text string. + HTMLSectionSplitter.create_documents(self, texts: List[str], metadatas: + Optional[List[dict]] = None) -> List[Document]: Creates documents from + a list of texts. + HTMLSectionSplitter.split_html_by_headers(self, html_doc: str) -> + List[Dict[str, Optional[str]]]: Splits an HTML document into sections + based on specified header tags. + HTMLSectionSplitter.convert_possible_tags_to_header(self, html_content: + str) -> str: Converts specific HTML tags to headers using an XSLT + transformation. + HTMLSectionSplitter.split_text_from_file(self, file: Any) -> + List[Document]: Splits HTML file. +''' from __future__ import annotations import copy import pathlib +from dataclasses import dataclass, field from io import BytesIO, StringIO from typing import Any, Dict, Iterable, List, Optional, Tuple, TypedDict, cast import requests from bs4 import BeautifulSoup -from bs4.element import Tag from langchain.docstore.document import Document from langchain_text_splitters.character import RecursiveCharacterTextSplitter - class ElementType(TypedDict): """Element type as typed dict.""" @@ -22,167 +80,449 @@ class ElementType(TypedDict): metadata: Dict[str, str] -class HTMLHeaderTextSplitter: - """Splitting HTML files based on specified headers. - Requires lxml package. +@dataclass +class Node: + """ + Represents a node in a hierarchical structure. + + Attributes: + name (str): The name of the node. + tag_type (str): The type of the node. + content (str): The content of the node. + level (int): The level of the node in the hierarchy. + dom_depth (int): The depth of the node in the DOM structure. + parent (Optional[Node]): The parent node. Defaults to None. """ + name: str + tag_type: str + content: str + level: int + dom_depth: int + parent: Optional[Node] = field(default=None) + +class HTMLHeaderTextSplitter: + ''' + HTMLHeaderTextSplitter is a class designed to split HTML content into + structured Document objects based on specified header tags. + + Classes: + HTMLHeaderTextSplitter: A class to split HTML content into Document + objects. + + Methods: + __init__(self, headers_to_split_on: List[Tuple[str, str]], + return_each_element: bool = False) -> None: + Initializes the HTMLHeaderTextSplitter with headers to split on + and an optional parameter to return each HTML element as a + separate Document. + + _header_level(self, element) -> int: + Determines the heading level of an HTML element. + + _dom_depth(self, element) -> int: + Computes the DOM depth of an HTML element. + + _build_tree(self, elements) -> None: + Builds a tree structure from a list of HTML elements. + + split_text(self, text: str) -> List[Document]: + Splits the given HTML text into a list of Document objects. + + split_text_from_url(self, url: str, timeout: int = 10, + **kwargs: Any) -> List[Document]: + Fetches text content from a URL and splits it into documents. + + _finalize_chunk(self, current_chunk: List[str], + active_headers: Dict[str, Tuple[str, int, int]], + documents: List[Document], chunk_dom_depth: int) -> None: + Finalizes the current chunk of text and appends it to the list of + documents. + + _generate_documents(self, nodes: Dict[int, Node]) -> List[Document]: + Generates a list of Document objects from a node structure. + + split_text_from_file(self, file: Any) -> List[Document]: + Splits HTML content from a file into a list of Document objects. 
+ + _aggregate_documents(self, nodes: Dict[int, Node]) -> List[Document]: + Aggregates documents based on headers. + + _generate_individual_documents(self, nodes: Dict[int, Node]) -> + List[Document]: + Generates individual Document objects for each element. + ''' def __init__( self, headers_to_split_on: List[Tuple[str, str]], - return_each_element: bool = False, - ): - """Create a new HTMLHeaderTextSplitter. + return_each_element: bool = False # Added parameter + ) -> None: + """ + Initialize with headers to split on. Args: - headers_to_split_on: list of tuples of headers we want to track mapped to - (arbitrary) keys for metadata. Allowed header values: h1, h2, h3, h4, - h5, h6 e.g. [("h1", "Header 1"), ("h2", "Header 2)]. - return_each_element: Return each element w/ associated headers. + headers_to_split_on (List[Tuple[str, str]]): A list of tuples where + each tuple contains a header tag and its corresponding value. + return_each_element (bool, optional): Whether to return each HTML + element as a separate Document. Defaults to False. """ - # Output element-by-element or aggregated into chunks w/ common headers - self.return_each_element = return_each_element - self.headers_to_split_on = sorted(headers_to_split_on) - - def aggregate_elements_to_chunks( - self, elements: List[ElementType] - ) -> List[Document]: - """Combine elements with common metadata into chunks. + self.headers_to_split_on = sorted( + headers_to_split_on, key=lambda x: int(x[0][1]) + ) + self.header_mapping = dict(self.headers_to_split_on) + self.header_tags = [tag for tag, _ in self.headers_to_split_on] + self.elements_tree: Dict[int, Tuple[str, str, int, int]] = {} + self.return_each_element = return_each_element # Store the parameter + + def _header_level(self, element) -> int: + """ + Determine the heading level of an element. Args: - elements: HTML element content with associated identifying info and metadata + element: A BeautifulSoup element. + + Returns: + int: The heading level (1-6) if a heading, else a large number. """ - aggregated_chunks: List[ElementType] = [] - - for element in elements: - if ( - aggregated_chunks - and aggregated_chunks[-1]["metadata"] == element["metadata"] - ): - # If the last element in the aggregated list - # has the same metadata as the current element, - # append the current content to the last element's content - aggregated_chunks[-1]["content"] += " \n" + element["content"] - else: - # Otherwise, append the current element to the aggregated list - aggregated_chunks.append(element) + tag_name = element.name.lower() if hasattr(element, 'name') else '' + if tag_name in ['h1', 'h2', 'h3', 'h4', 'h5', 'h6']: + return int(tag_name[1]) + return 9999 - return [ - Document(page_content=chunk["content"], metadata=chunk["metadata"]) - for chunk in aggregated_chunks - ] + def _dom_depth(self, element) -> int: + """ + Compute the DOM depth of an element. + + Args: + element: A BeautifulSoup element. + + Returns: + int: The depth of the element in the DOM tree. + """ + depth = 0 + for _ in element.parents: + depth += 1 + return depth - def split_text_from_url(self, url: str, **kwargs: Any) -> List[Document]: - """Split HTML from web URL. + def _build_tree(self, elements) -> None: + """ + Build a tree structure from a list of HTML elements. Args: - url: web URL - **kwargs: Arbitrary additional keyword arguments. These are usually passed - to the fetch url content request. + elements: A list of BeautifulSoup elements. 
""" - r = requests.get(url, **kwargs) - return self.split_text_from_file(BytesIO(r.content)) + for idx, element in enumerate(elements): + text = ' '.join( + t for t in element.find_all(string=True, recursive=False) + if isinstance(t, str) + ).strip() + + if not text: + continue + + level = self._header_level(element) + dom_depth = self._dom_depth(element) + + self.elements_tree[idx] = ( + element.name, + text, + level, + dom_depth + ) def split_text(self, text: str) -> List[Document]: - """Split HTML text string. + """ + Split the given text into a list of Document objects. Args: - text: HTML text + text (str): The HTML text to split. + + Returns: + List[Document]: A list of split Document objects. """ return self.split_text_from_file(StringIO(text)) - + + def split_text_from_url( + self, + url: str, + timeout: int = 10, + **kwargs: Any + ) -> List[Document]: + """ + Fetch text content from a URL and split it into documents. + + Args: + url (str): The URL to fetch content from. + timeout (int, optional): Timeout for the request. Defaults to 10. + **kwargs: Additional keyword arguments for the request. + + Returns: + List[Document]: A list of split Document objects. + + Raises: + requests.RequestException: If the HTTP request fails. + """ + try: + kwargs.setdefault('timeout', timeout) + response = requests.get(url, **kwargs) # noqa: E501 + response.raise_for_status() + except requests.RequestException as e: + print(f"Error fetching URL {url}: {e}") + raise e + return self.split_text_from_file(BytesIO(response.content)) + + def _finalize_chunk( + self, + current_chunk: List[str], + active_headers: Dict[str, Tuple[str, int, int]], + documents: List[Document], + chunk_dom_depth: int) -> None: + + if current_chunk: + final_meta: Dict[str, str] = { + key: content for key, (content, level, dom_depth) in active_headers.items() + if chunk_dom_depth >= dom_depth + } + combined_text = " \n".join( + line for line in current_chunk if line.strip() + ) + documents.append( + Document(page_content=combined_text, metadata=final_meta) + ) + current_chunk.clear() + chunk_dom_depth = 0 + + + def _generate_documents(self, nodes: Dict[int, Node]) -> List[Document]: + """ + Generate a list of Document objects from a node structure. + + Args: + nodes (Dict[int, Node]): A dictionary of nodes indexed by their position. + + Returns: + List[Document]: A list of generated Document objects. + """ + documents: List[Document] = [] + active_headers: Dict[str, Tuple[str, int, int]] = {} + current_chunk: List[str] = [] + chunk_dom_depth = 0 + + + + def process_node(node: Node) -> None: + """ + Processes a given node and updates the current chunk, active headers, and + documents based on the node's type and content. + Args: + node (Node): The node to be processed. It should have attributes + 'tag_type', 'content', 'level', and 'dom_depth'. 
+ Returns: + None + """ + + nonlocal chunk_dom_depth + node_type = node.tag_type # type: ignore[attr-defined] + node_content = node.content # type: ignore[attr-defined] + node_level = node.level # type: ignore[attr-defined] + node_dom_depth = node.dom_depth # type: ignore[attr-defined] + + if node_type in self.header_tags: + self._finalize_chunk(current_chunk, active_headers, documents, chunk_dom_depth) + headers_to_remove = [ + key for key, (_, lvl, _) in active_headers.items() + if lvl >= node_level + ] + for key in headers_to_remove: + del active_headers[key] + header_key = self.header_mapping[node_type] # type: ignore[attr-defined] + active_headers[header_key] = ( + node_content, + node_level, + node_dom_depth + ) + header_meta: Dict[str, str] = { + key: content for key, (content, lvl, dd) in active_headers.items() + if node_dom_depth >= dd + } + documents.append( + Document( + page_content=node_content, + metadata=header_meta + ) + ) + else: + headers_to_remove = [ + key for key, (_, _, dd) in active_headers.items() + if node_dom_depth < dd + ] + for key in headers_to_remove: + del active_headers[key] + if node_content.strip(): + current_chunk.append(node_content) + chunk_dom_depth = max(chunk_dom_depth, node_dom_depth) + + sorted_nodes = sorted(nodes.items()) + for _, node in sorted_nodes: + process_node(node) + + self._finalize_chunk(current_chunk, active_headers, documents, chunk_dom_depth) + return documents + def split_text_from_file(self, file: Any) -> List[Document]: - """Split HTML file. + """ + Split HTML content from a file into a list of Document objects. Args: - file: HTML file path or file-like object. + file (Any): A file path or a file-like object containing HTML content. Returns: - List of Document objects with page_content and metadata. + List[Document]: A list of split Document objects. 
""" - # Read the HTML content from the file or file-like object if isinstance(file, str): with open(file, 'r', encoding='utf-8') as f: html_content = f.read() else: - # Assuming file is a file-like object html_content = file.read() - # Parse the HTML content using BeautifulSoup soup = BeautifulSoup(html_content, 'html.parser') + body = soup.body if soup.body else soup - # Extract the header tags and their corresponding metadata keys - headers_to_split_on = [tag[0] for tag in self.headers_to_split_on] - header_mapping = dict(self.headers_to_split_on) + elements = body.find_all() + self._build_tree(elements) + + if not self.elements_tree: + return [] + + min_level = min( + level for (_, _, level, _) in self.elements_tree.values() + ) + root = Node( + "root", + tag_type="root", + content="", + level=min_level - 1, + dom_depth=0 + ) + + nodes = { + idx: Node( + f"{tag}_{idx}", + tag_type=tag, + content=text, + level=level, + dom_depth=dom_depth + ) + for idx, (tag, text, level, dom_depth) in self.elements_tree.items() + } + + stack = [] + for idx in sorted(nodes): + node = nodes[idx] + while stack and ( + stack[-1].level >= node.level + or stack[-1].dom_depth >= node.dom_depth + ): + stack.pop() + if stack: + node.parent = stack[-1] + else: + node.parent = root + stack.append(node) - documents = [] + if not self.return_each_element: + return self._aggregate_documents(nodes) - # Find the body of the document - body = soup.body if soup.body else soup + return self._generate_individual_documents(nodes) - # Find all header tags in the order they appear - all_headers = body.find_all(headers_to_split_on) - - # If there's content before the first header, collect it - first_header = all_headers[0] if all_headers else None - if first_header: - pre_header_content = '' - for elem in first_header.find_all_previous(): - if isinstance(elem, bs4.Tag): - text = elem.get_text(separator=' ', strip=True) - if text: - pre_header_content = text + ' ' + pre_header_content - if pre_header_content.strip(): - documents.append(Document( - page_content=pre_header_content.strip(), - metadata={} # No metadata since there's no header - )) - else: - # If no headers are found, return the whole content - full_text = body.get_text(separator=' ', strip=True) - if full_text.strip(): - documents.append(Document( - page_content=full_text.strip(), - metadata={} - )) - return documents - - # Process each header and its associated content - for header in all_headers: - current_metadata = {} - header_name = header.name - header_text = header.get_text(separator=' ', strip=True) - current_metadata[header_mapping[header_name]] = header_text - - # Collect all sibling elements until the next header of the same or higher level - content_elements = [] - for sibling in header.find_next_siblings(): - if sibling.name in headers_to_split_on: - # Stop at the next header - break - if isinstance(sibling, bs4.Tag): - content_elements.append(sibling) - - # Get the text content of the collected elements - current_content = '' - for elem in content_elements: - text = elem.get_text(separator=' ', strip=True) - if text: - current_content += text + ' ' - - # Create a Document if there is content - if current_content.strip(): - documents.append(Document( - page_content=current_content.strip(), - metadata=current_metadata.copy() - )) + def _aggregate_documents(self, nodes: Dict[int, Node]) -> List[Document]: + """ + Aggregate documents based on headers. + + Args: + nodes (Dict[int, Node]): A dictionary of nodes indexed by their position. 
+ + Returns: + List[Document]: A list of aggregated Document objects. + """ + # Reuse the existing _generate_documents method for aggregation + return self._generate_documents(nodes) + + def _generate_individual_documents(self, nodes: Dict[int, Node]) -> List[Document]: + """ + Generate individual Document objects for each element. + + Args: + nodes (Dict[int, Node]): A dictionary of nodes indexed by their position. + + Returns: + List[Document]: A list of individual Document objects. + """ + documents: List[Document] = [] + active_headers: Dict[str, Tuple[str, int, int]] = {} + + sorted_nodes = sorted(nodes.items()) + + def process_node(node: Node) -> None: + """ + Process a single node to create Document objects based on header tags. + + Args: + node (Node): The node to process. + """ + node_type = node.type # type: ignore[attr-defined] + node_content = node.content # type: ignore[attr-defined] + node_level = node.level # type: ignore[attr-defined] + node_dom_depth = node.dom_depth # type: ignore[attr-defined] + + if node_type in self.header_tags: + # Remove headers of the same or lower level + headers_to_remove = [ + key for key, (_, lvl, _) in active_headers.items() + if lvl >= node_level + ] + for key in headers_to_remove: + del active_headers[key] + + # Update active headers with the current header + header_key = self.header_mapping[node_type] # type: ignore[attr-defined] + active_headers[header_key] = ( + node_content, + node_level, + node_dom_depth + ) + + # Create metadata based on active headers + header_meta: Dict[str, str] = { + key: content for key, (content, lvl, dd) in active_headers.items() + if node_dom_depth >= dd + } + + # Create a Document for the header element + documents.append( + Document( + page_content=node_content, + metadata=header_meta + ) + ) else: - # If there's no content, but we have metadata, still create a Document - documents.append(Document( - page_content='', - metadata=current_metadata.copy() - )) + # For non-header elements, associate with current headers + if node_content.strip(): + header_meta: Dict[str, str] = { + key: content for key, (content, lvl, dd) in active_headers.items() + if node_dom_depth >= dd + } + documents.append( + Document( + page_content=node_content, + metadata=header_meta + ) + ) + + # Process each node using the inner process_node function + for _, node in sorted_nodes: + process_node(node) return documents From b84f13cea88cb534ecfe96efb487adb73c611009 Mon Sep 17 00:00:00 2001 From: Ahmed Tammaa Date: Tue, 17 Dec 2024 00:28:31 +0200 Subject: [PATCH 07/30] Update test_text_splitters.py added extra tests for the new HTML class --- .../tests/unit_tests/test_text_splitters.py | 519 ++++++++++++++++-- 1 file changed, 484 insertions(+), 35 deletions(-) diff --git a/libs/text-splitters/tests/unit_tests/test_text_splitters.py b/libs/text-splitters/tests/unit_tests/test_text_splitters.py index 95f170d52b7c4..ecc4658d16662 100644 --- a/libs/text-splitters/tests/unit_tests/test_text_splitters.py +++ b/libs/text-splitters/tests/unit_tests/test_text_splitters.py @@ -1634,49 +1634,498 @@ def test_haskell_code_splitter() -> None: assert chunks == expected_chunks -@pytest.mark.requires("lxml") -def test_html_header_text_splitter(tmp_path: Path) -> None: - splitter = HTMLHeaderTextSplitter( - headers_to_split_on=[("h1", "Header 1"), ("h2", "Header 2")] - ) - - content = """ -

-<h1>Sample Document</h1>
-    <h2>Section</h2>
-        <p>Reference content.</p>

+@pytest.fixture +def html_header_splitter_splitter_factory() -> HTMLHeaderTextSplitter: + """ + Fixture to create an HTMLHeaderTextSplitter instance with given headers. + This factory allows dynamic creation of splitters with different headers. + """ + def _create_splitter(headers_to_split_on: List[Tuple[str, str]]) -> HTMLHeaderTextSplitter: + return HTMLHeaderTextSplitter(headers_to_split_on=headers_to_split_on) + return _create_splitter -

-    <h2>Lists</h2>
-        <ul>
-            <li>Item 1</li>
-            <li>Item 2</li>
-            <li>Item 3</li>
-        </ul>
-
-        <h3>A block</h3>
-            <div>
-                <p>Some text</p>
-                <p>Some more text</p>
-            </div>

-
+@pytest.mark.parametrize( + "headers_to_split_on, html_input, expected_documents, test_case", + [ + ( + # Test Case 1: Split on h1 and h2 + [("h1", "Header 1"), ("h2", "Header 2")], + """ + + +

+                    <h1>Introduction</h1>
+                    <p>This is the introduction.</p>
+                    <h2>Background</h2>
+                    <p>Background information.</p>
+                    <h1>Conclusion</h1>
+                    <p>Final thoughts.</p>

+ + + """, + [ + Document( + page_content="Introduction", + metadata={"Header 1": "Introduction"} + ), + Document( + page_content="This is the introduction.", + metadata={"Header 1": "Introduction"} + ), + Document( + page_content="Background", + metadata={ + "Header 1": "Introduction", + "Header 2": "Background" + } + ), + Document( + page_content="Background information.", + metadata={ + "Header 1": "Introduction", + "Header 2": "Background" + } + ), + Document( + page_content="Conclusion", + metadata={"Header 1": "Conclusion"} + ), + Document( + page_content="Final thoughts.", + metadata={"Header 1": "Conclusion"} + ) + ], + "Simple headers and paragraphs" + ), + ( + # Test Case 2: Nested headers with h1, h2, and h3 + [("h1", "Header 1"), ("h2", "Header 2"), ("h3", "Header 3")], + """ + + +
+

+                <div>
+                    <h1>Main Title</h1>
+                    <div>
+                        <h2>Subsection</h2>
+                        <p>Details of subsection.</p>
+                        <div>
+                            <h3>Sub-subsection</h3>
+                            <p>More details.</p>
+                        </div>
+                    </div>
+                </div>
+                <h1>Another Main Title</h1>
+                <p>Content under another main title.</p>

+ + + """, + [ + Document( + page_content="Main Title", + metadata={"Header 1": "Main Title"} + ), + Document( + page_content="Subsection", + metadata={ + "Header 1": "Main Title", + "Header 2": "Subsection" + } + ), + Document( + page_content="Details of subsection.", + metadata={ + "Header 1": "Main Title", + "Header 2": "Subsection" + } + ), + Document( + page_content="Sub-subsection", + metadata={ + "Header 1": "Main Title", + "Header 2": "Subsection", + "Header 3": "Sub-subsection" + } + ), + Document( + page_content="More details.", + metadata={ + "Header 1": "Main Title", + "Header 2": "Subsection", + "Header 3": "Sub-subsection" + } + ), + Document( + page_content="Another Main Title", + metadata={"Header 1": "Another Main Title"} + ), + Document( + page_content="Content under another main title.", + metadata={"Header 1": "Another Main Title"} + ) + ], + "Nested headers with h1, h2, and h3" + ), + ( + # Test Case 3: No headers + [("h1", "Header 1")], + """ + + +

+                <p>Paragraph one.</p>
+                <p>Paragraph two.</p>
+                <div>
+                    <p>Paragraph three.</p>
+                </div>

+
+ + + """, + [ + Document( + page_content="Paragraph one. \nParagraph two. \nParagraph three.", + metadata={} + ) + ], + "No headers present" + ), + ( + # Test Case 4: Multiple headers of the same level + [("h1", "Header 1")], + """ + + +

+                <h1>Chapter 1</h1>
+                <p>Content of chapter 1.</p>
+                <h1>Chapter 2</h1>
+                <p>Content of chapter 2.</p>
+                <h1>Chapter 3</h1>
+                <p>Content of chapter 3.</p>

+ + + """, + [ + Document( + page_content="Chapter 1", + metadata={"Header 1": "Chapter 1"} + ), + Document( + page_content="Content of chapter 1.", + metadata={"Header 1": "Chapter 1"} + ), + Document( + page_content="Chapter 2", + metadata={"Header 1": "Chapter 2"} + ), + Document( + page_content="Content of chapter 2.", + metadata={"Header 1": "Chapter 2"} + ), + Document( + page_content="Chapter 3", + metadata={"Header 1": "Chapter 3"} + ), + Document( + page_content="Content of chapter 3.", + metadata={"Header 1": "Chapter 3"} + ) + ], + "Multiple headers of the same level" + ), + ( + # Test Case 5: Headers with no content + [("h1", "Header 1"), ("h2", "Header 2")], + """ + + +

+                <h1>Header 1</h1>
+                <h2>Header 2</h2>
+                <h1>Header 3</h1>

+ + + """, + [ + Document( + page_content="Header 1", + metadata={"Header 1": "Header 1"} + ), + Document( + page_content="Header 2", + metadata={ + "Header 1": "Header 1", + "Header 2": "Header 2" + } + ), + Document( + page_content="Header 3", + metadata={"Header 1": "Header 3"} + ) + ], + "Headers with no associated content" + ), + ] +) +def test_html_header_text_splitter( + + html_header_splitter_splitter_factory: Any, + headers_to_split_on: List[Tuple[str, str]], + html_input: str, + expected_documents: List[Document], + test_case: str +): + """ + Test the HTML header text splitter. + + Args: + html_header_splitter_splitter_factory (Any): Factory function to create + the HTML header splitter. + headers_to_split_on (List[Tuple[str, str]]): List of headers to split on. + html_input (str): The HTML input string to be split. + expected_documents (List[Document]): List of expected Document objects. + test_case (str): Description of the test case. + + Raises: + AssertionError: If the number of documents or their content/metadata + does not match the expected values. """ - docs = splitter.split_text(content) - expected = [ - Document( - page_content="Reference content.", - metadata={"Header 1": "Sample Document", "Header 2": "Section"}, - ), - Document( - page_content="Item 1 Item 2 Item 3 \nSome text \nSome more text", - metadata={"Header 1": "Sample Document", "Header 2": "Lists"}, + splitter = html_header_splitter_splitter_factory(headers_to_split_on=headers_to_split_on) + docs = splitter.split_text(html_input) + + assert len(docs) == len(expected_documents), ( + f"Test Case '{test_case}' Failed: Number of documents mismatch. " + f"Expected {len(expected_documents)}, got {len(docs)}." + ) + for idx, (doc, expected) in enumerate(zip(docs, expected_documents), start=1): + assert doc.page_content == expected.page_content, ( + f"Test Case '{test_case}' Failed at Document {idx}: " + f"Content mismatch.\nExpected: {expected.page_content}\nGot: {doc.page_content}" + ) + assert doc.metadata == expected.metadata, ( + f"Test Case '{test_case}' Failed at Document {idx}: " + f"Metadata mismatch.\nExpected: {expected.metadata}\nGot: {doc.metadata}" + ) + + +@pytest.mark.parametrize( + "headers_to_split_on, html_content, expected_output, test_case", + [ + ( + # Test Case A: Split on h1 and h2 with h3 in content + [("h1", "Header 1"), ("h2", "Header 2"), ("h3", "Header 3")], + """ + + + +
+

+                <div>
+                    <h1>Foo</h1>
+                    <p>Some intro text about Foo.</p>
+                    <div>
+                        <h2>Bar main section</h2>
+                        <p>Some intro text about Bar.</p>
+                        <h3>Bar subsection 1</h3>
+                        <p>Some text about the first subtopic of Bar.</p>
+                        <h3>Bar subsection 2</h3>
+                        <p>Some text about the second subtopic of Bar.</p>
+                    </div>
+                    <div>
+                        <h2>Baz</h2>
+                        <p>Some text about Baz</p>
+                    </div>
+                    <p>Some concluding text about Foo</p>
+                </div>

+
+ + + """, + [ + Document( + metadata={'Header 1': 'Foo'}, + page_content='Foo' + ), + Document( + metadata={'Header 1': 'Foo'}, + page_content='Some intro text about Foo.' + ), + Document( + metadata={'Header 1': 'Foo', 'Header 2': 'Bar main section'}, + page_content='Bar main section' + ), + Document( + metadata={'Header 1': 'Foo', 'Header 2': 'Bar main section'}, + page_content='Some intro text about Bar.' + ), + Document( + metadata={ + 'Header 1': 'Foo', + 'Header 2': 'Bar main section', + 'Header 3': 'Bar subsection 1' + }, + page_content='Bar subsection 1' + ), + Document( + metadata={ + 'Header 1': 'Foo', + 'Header 2': 'Bar main section', + 'Header 3': 'Bar subsection 1' + }, + page_content='Some text about the first subtopic of Bar.' + ), + Document( + metadata={ + 'Header 1': 'Foo', + 'Header 2': 'Bar main section', + 'Header 3': 'Bar subsection 2' + }, + page_content='Bar subsection 2' + ), + Document( + metadata={ + 'Header 1': 'Foo', + 'Header 2': 'Bar main section', + 'Header 3': 'Bar subsection 2' + }, + page_content='Some text about the second subtopic of Bar.' + ), + Document( + metadata={'Header 1': 'Foo', 'Header 2': 'Baz'}, + page_content='Baz' + ), + Document( + metadata={'Header 1': 'Foo'}, + page_content='Some text about Baz \nSome concluding text about Foo' + ) + ], + "Test Case A: Split on h1, h2, and h3 with nested headers" ), + ( + # Test Case B: Split on h1 only without any headers + [("h1", "Header 1")], + """ + + +

+                <p>Paragraph one.</p>
+                <p>Paragraph two.</p>
+                <p>Paragraph three.</p>

+ + + """, + [ + Document( + metadata={}, + page_content='Paragraph one. \nParagraph two. \nParagraph three.' + ) + ], + "Test Case B: Split on h1 only without any headers" + ) + ] +) +def test_additional_html_header_text_splitter( + html_header_splitter_splitter_factory: Any, + headers_to_split_on: List[Tuple[str, str]], + html_content: str, + expected_output: List[Document], + test_case: str +): + """ + Test the HTML header text splitter. + + Args: + html_header_splitter_splitter_factory (Any): Factory function to create + the HTML header splitter. + headers_to_split_on (List[Tuple[str, str]]): List of headers to split on. + html_content (str): HTML content to be split. + expected_output (List[Document]): Expected list of Document objects. + test_case (str): Description of the test case. + + Raises: + AssertionError: If the number of documents or their content/metadata + does not match the expected output. + """ + splitter = html_header_splitter_splitter_factory(headers_to_split_on=headers_to_split_on) + docs = splitter.split_text(html_content) + + + + assert len(docs) == len(expected_output), ( + f"{test_case} Failed: Number of documents mismatch. " + f"Expected {len(expected_output)}, got {len(docs)}." + ) + for idx, (doc, expected) in enumerate(zip(docs, expected_output), start=1): + assert doc.page_content == expected.page_content, ( + f"{test_case} Failed at Document {idx}: " + f"Content mismatch.\nExpected: {expected.page_content}\nGot: {doc.page_content}" + ) + assert doc.metadata == expected.metadata, ( + f"{test_case} Failed at Document {idx}: " + f"Metadata mismatch.\nExpected: {expected.metadata}\nGot: {doc.metadata}" + ) + + +@pytest.mark.parametrize( + "headers_to_split_on, html_content, expected_output, test_case", + [ + ( + # Test Case C: Split on h1, h2, and h3 with no headers present + [("h1", "Header 1"), ("h2", "Header 2"), ("h3", "Header 3")], + """ + + +

+                <p>Just some random text without headers.</p>
+                <div>
+                    More text here.
+                </div>
+ + + """, + [ + Document( + page_content='Just some random text without headers. \nMore text here.', + metadata={} + ) + ], + "Test Case C: Split on h1, h2, and h3 without any headers" + ) ] - assert docs == expected +) +def test_no_headers_with_multiple_splitters( + html_header_splitter_splitter_factory: Any, + headers_to_split_on: List[Tuple[str, str]], + html_content: str, + expected_output: List[Document], + test_case: str +): + """ + Test HTML content splitting without headers using multiple splitters. + Args: + html_header_splitter_splitter_factory (Any): Factory to create the + HTML header splitter. + headers_to_split_on (List[Tuple[str, str]]): List of headers to split on. + html_content (str): HTML content to be split. + expected_output (List[Document]): Expected list of Document objects + after splitting. + test_case (str): Description of the test case. + Raises: + AssertionError: If the number of documents or their content/metadata + does not match the expected output. + """ + splitter = html_header_splitter_splitter_factory(headers_to_split_on=headers_to_split_on) + docs = splitter.split_text(html_content) - with open(tmp_path / "doc.html", "w") as tmp: - tmp.write(content) - docs_from_file = splitter.split_text_from_file(tmp_path / "doc.html") - assert docs_from_file == expected + + assert len(docs) == len(expected_output), ( + f"{test_case} Failed: Number of documents mismatch. " + f"Expected {len(expected_output)}, got {len(docs)}." + ) + for idx, (doc, expected) in enumerate(zip(docs, expected_output), start=1): + assert doc.page_content == expected.page_content, ( + f"{test_case} Failed at Document {idx}: " + f"Content mismatch.\nExpected: {expected.page_content}\nGot: {doc.page_content}" + ) + assert doc.metadata == expected.metadata, ( + f"{test_case} Failed at Document {idx}: " + f"Metadata mismatch.\nExpected: {expected.metadata}\nGot: {doc.metadata}" + ) + def test_split_text_on_tokens() -> None: From 17ae8b9cedfb0827ef6d96599d134eb2bb8919b0 Mon Sep 17 00:00:00 2001 From: Ahmed Tammaa Date: Tue, 17 Dec 2024 18:21:41 +0000 Subject: [PATCH 08/30] added import Tuple --- libs/text-splitters/tests/unit_tests/test_text_splitters.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/libs/text-splitters/tests/unit_tests/test_text_splitters.py b/libs/text-splitters/tests/unit_tests/test_text_splitters.py index ecc4658d16662..6e5371c5c9254 100644 --- a/libs/text-splitters/tests/unit_tests/test_text_splitters.py +++ b/libs/text-splitters/tests/unit_tests/test_text_splitters.py @@ -4,7 +4,7 @@ import re import string from pathlib import Path -from typing import Any, List +from typing import Any, List, Tuple import pytest from langchain_core.documents import Document From 03069519ebdbdc222629f2efd5ee710b13fb44f7 Mon Sep 17 00:00:00 2001 From: Ahmed Tammaa Date: Tue, 17 Dec 2024 23:42:44 +0000 Subject: [PATCH 09/30] added beautifulsoup4 to poetry depedencies --- libs/text-splitters/poetry.lock | 4 ++-- libs/text-splitters/pyproject.toml | 1 + 2 files changed, 3 insertions(+), 2 deletions(-) diff --git a/libs/text-splitters/poetry.lock b/libs/text-splitters/poetry.lock index 101d8e9205c9d..c79955eb0f440 100644 --- a/libs/text-splitters/poetry.lock +++ b/libs/text-splitters/poetry.lock @@ -1,4 +1,4 @@ -# This file is automatically @generated by Poetry 1.8.5 and should not be changed by hand. +# This file is automatically @generated by Poetry 1.8.4 and should not be changed by hand. 
[[package]] name = "annotated-types" @@ -4769,4 +4769,4 @@ type = ["pytest-mypy"] [metadata] lock-version = "2.0" python-versions = ">=3.9,<4.0" -content-hash = "b72c31150d78895613927b7d21e7111edbd9de04d99bc335f63c7825b49d8e4c" +content-hash = "83606476221176f70c74ab55c68e5c2080b3fa4759f652f39f79e7a8ecaa71ca" diff --git a/libs/text-splitters/pyproject.toml b/libs/text-splitters/pyproject.toml index 0d33d20587fb4..76ea1ced153f0 100644 --- a/libs/text-splitters/pyproject.toml +++ b/libs/text-splitters/pyproject.toml @@ -37,6 +37,7 @@ ignore_missing_imports = "True" [tool.poetry.dependencies] python = ">=3.9,<4.0" langchain-core = "^0.3.25" +beautifulsoup4 = "^4.12.3" [tool.ruff.lint] select = ["E", "F", "I", "T201", "D"] From ae50b32dd6eb31ad53ce0b874aaf2ed08f4ea38c Mon Sep 17 00:00:00 2001 From: Ahmed Tammaa Date: Wed, 18 Dec 2024 10:46:09 +0000 Subject: [PATCH 10/30] discarded bs4 dependency --- libs/text-splitters/poetry.lock | 4 ++-- libs/text-splitters/pyproject.toml | 1 - 2 files changed, 2 insertions(+), 3 deletions(-) diff --git a/libs/text-splitters/poetry.lock b/libs/text-splitters/poetry.lock index c79955eb0f440..101d8e9205c9d 100644 --- a/libs/text-splitters/poetry.lock +++ b/libs/text-splitters/poetry.lock @@ -1,4 +1,4 @@ -# This file is automatically @generated by Poetry 1.8.4 and should not be changed by hand. +# This file is automatically @generated by Poetry 1.8.5 and should not be changed by hand. [[package]] name = "annotated-types" @@ -4769,4 +4769,4 @@ type = ["pytest-mypy"] [metadata] lock-version = "2.0" python-versions = ">=3.9,<4.0" -content-hash = "83606476221176f70c74ab55c68e5c2080b3fa4759f652f39f79e7a8ecaa71ca" +content-hash = "b72c31150d78895613927b7d21e7111edbd9de04d99bc335f63c7825b49d8e4c" diff --git a/libs/text-splitters/pyproject.toml b/libs/text-splitters/pyproject.toml index 76ea1ced153f0..0d33d20587fb4 100644 --- a/libs/text-splitters/pyproject.toml +++ b/libs/text-splitters/pyproject.toml @@ -37,7 +37,6 @@ ignore_missing_imports = "True" [tool.poetry.dependencies] python = ">=3.9,<4.0" langchain-core = "^0.3.25" -beautifulsoup4 = "^4.12.3" [tool.ruff.lint] select = ["E", "F", "I", "T201", "D"] From f9a93d0cb6df3cc4311370b5f56d96fbe7347610 Mon Sep 17 00:00:00 2001 From: Ahmed Tammaa Date: Wed, 18 Dec 2024 11:06:43 +0000 Subject: [PATCH 11/30] Removed uncessary module docstring, updated docstring of HTMLHeaderTextSplitter to include whats essential only --- .../langchain_text_splitters/html.py | 59 ------------------- 1 file changed, 59 deletions(-) diff --git a/libs/text-splitters/langchain_text_splitters/html.py b/libs/text-splitters/langchain_text_splitters/html.py index 0d33cfa08ad65..2a6b1ef6d5b2e 100644 --- a/libs/text-splitters/langchain_text_splitters/html.py +++ b/libs/text-splitters/langchain_text_splitters/html.py @@ -1,62 +1,3 @@ -''' -This module provides classes for splitting HTML content into structured -Document objects based on specified header tags. - - ElementType(TypedDict): A typed dictionary representing an element type. - Node: Represents a node in a hierarchical structure. - HTMLHeaderTextSplitter: A class to split HTML content into Document objects - based on specified header tags. - HTMLSectionSplitter: A class to split HTML files based on specified tag and - font sizes. 
- -Functions: - HTMLHeaderTextSplitter.__init__(self, headers_to_split_on: List[Tuple[str, - str]], return_each_element: bool = False) -> None: Initializes the - HTMLHeaderTextSplitter with headers to split on and an optional - parameter to return each HTML element as a separate Document. - HTMLHeaderTextSplitter._header_level(self, element) -> int: Determines the - heading level of an HTML element. - HTMLHeaderTextSplitter._dom_depth(self, element) -> int: Computes the DOM - depth of an HTML element. - HTMLHeaderTextSplitter._build_tree(self, elements) -> None: Builds a tree - structure from a list of HTML elements. - HTMLHeaderTextSplitter.split_text(self, text: str) -> List[Document]: - HTMLHeaderTextSplitter.split_text_from_url(self, url: str, timeout: int = - 10, **kwargs: Any) -> List[Document]: Fetches text content from a URL - and splits it into documents. - HTMLHeaderTextSplitter._finalize_chunk(self, current_chunk: List[str], - active_headers: Dict[str, Tuple[str, int, int]], documents: - List[Document], chunk_dom_depth: int) -> None: Finalizes the current - chunk of text and appends it to the list of documents. - HTMLHeaderTextSplitter._generate_documents(self, nodes: Dict[int, Node]) -> - List[Document]: Generates a list of Document objects from a node - structure. - HTMLHeaderTextSplitter.split_text_from_file(self, file: Any) -> - List[Document]: Splits HTML content from a file into a list of Document - HTMLHeaderTextSplitter._aggregate_documents(self, nodes: Dict[int, Node]) -> - List[Document]: Aggregates documents based on headers. - HTMLHeaderTextSplitter._generate_individual_documents(self, nodes: Dict[int, - Node]) -> List[Document]: Generates individual Document objects for - each element. - HTMLSectionSplitter.__init__(self, headers_to_split_on: List[Tuple[str, - str]], xslt_path: Optional[str] = None, **kwargs: Any) -> None: Creates - a new HTMLSectionSplitter. - HTMLSectionSplitter.split_documents(self, documents: Iterable[Document]) -> - List[Document]: Splits documents. - HTMLSectionSplitter.split_text(self, text: str) -> List[Document]: Splits - HTML text string. - HTMLSectionSplitter.create_documents(self, texts: List[str], metadatas: - Optional[List[dict]] = None) -> List[Document]: Creates documents from - a list of texts. - HTMLSectionSplitter.split_html_by_headers(self, html_doc: str) -> - List[Dict[str, Optional[str]]]: Splits an HTML document into sections - based on specified header tags. - HTMLSectionSplitter.convert_possible_tags_to_header(self, html_content: - str) -> str: Converts specific HTML tags to headers using an XSLT - transformation. - HTMLSectionSplitter.split_text_from_file(self, file: Any) -> - List[Document]: Splits HTML file. 
-''' from __future__ import annotations import copy From 438aedd28df0e05cc315660f35324f05c29c73f9 Mon Sep 17 00:00:00 2001 From: Ahmed Tammaa Date: Wed, 18 Dec 2024 11:12:32 +0000 Subject: [PATCH 12/30] improved docstring for the class `HTMLHeaderTextSplitter` --- .../langchain_text_splitters/html.py | 63 +++++-------------- 1 file changed, 16 insertions(+), 47 deletions(-) diff --git a/libs/text-splitters/langchain_text_splitters/html.py b/libs/text-splitters/langchain_text_splitters/html.py index 2a6b1ef6d5b2e..6b9b67ec27e1c 100644 --- a/libs/text-splitters/langchain_text_splitters/html.py +++ b/libs/text-splitters/langchain_text_splitters/html.py @@ -43,56 +43,25 @@ class Node: parent: Optional[Node] = field(default=None) class HTMLHeaderTextSplitter: - ''' - HTMLHeaderTextSplitter is a class designed to split HTML content into - structured Document objects based on specified header tags. - - Classes: - HTMLHeaderTextSplitter: A class to split HTML content into Document - objects. - - Methods: - __init__(self, headers_to_split_on: List[Tuple[str, str]], - return_each_element: bool = False) -> None: - Initializes the HTMLHeaderTextSplitter with headers to split on - and an optional parameter to return each HTML element as a - separate Document. - - _header_level(self, element) -> int: - Determines the heading level of an HTML element. - - _dom_depth(self, element) -> int: - Computes the DOM depth of an HTML element. - - _build_tree(self, elements) -> None: - Builds a tree structure from a list of HTML elements. - - split_text(self, text: str) -> List[Document]: - Splits the given HTML text into a list of Document objects. - - split_text_from_url(self, url: str, timeout: int = 10, - **kwargs: Any) -> List[Document]: - Fetches text content from a URL and splits it into documents. - - _finalize_chunk(self, current_chunk: List[str], - active_headers: Dict[str, Tuple[str, int, int]], - documents: List[Document], chunk_dom_depth: int) -> None: - Finalizes the current chunk of text and appends it to the list of - documents. - - _generate_documents(self, nodes: Dict[int, Node]) -> List[Document]: - Generates a list of Document objects from a node structure. + """ + Splits HTML content into structured `Document` objects based on specified header + tags. - split_text_from_file(self, file: Any) -> List[Document]: - Splits HTML content from a file into a list of Document objects. + This splitter processes HTML by identifying header elements (e.g., `

`, + `

`) and segments the content accordingly. Each header and the text that + follows, up to the next header of the same or higher level, are grouped into a + `Document`. The metadata of each `Document` reflects the hierarchy of headers, + providing an organized content structure. - _aggregate_documents(self, nodes: Dict[int, Node]) -> List[Document]: - Aggregates documents based on headers. + If the content does not contain any of the specified headers, the splitter + returns a single `Document` with the aggregated content and no additional + metadata. - _generate_individual_documents(self, nodes: Dict[int, Node]) -> - List[Document]: - Generates individual Document objects for each element. - ''' + Attributes: + headers_to_split_on (List[Tuple[str, str]]): List of header tags to split + on, specified as tuples of (`tag_name`, `display_name`), e.g., + `("h1", "Header 1")`. + """ def __init__( self, From d573723d0d694ab5e7aedde8fd5e29ce4ebaa1b2 Mon Sep 17 00:00:00 2001 From: Ahmed Tammaa Date: Wed, 18 Dec 2024 11:23:26 +0000 Subject: [PATCH 13/30] removed typing from docstring when type is hinted. --- .../langchain_text_splitters/html.py | 61 ++++++++----------- 1 file changed, 27 insertions(+), 34 deletions(-) diff --git a/libs/text-splitters/langchain_text_splitters/html.py b/libs/text-splitters/langchain_text_splitters/html.py index 6b9b67ec27e1c..e5e0da62d3395 100644 --- a/libs/text-splitters/langchain_text_splitters/html.py +++ b/libs/text-splitters/langchain_text_splitters/html.py @@ -28,12 +28,12 @@ class Node: Represents a node in a hierarchical structure. Attributes: - name (str): The name of the node. - tag_type (str): The type of the node. - content (str): The content of the node. - level (int): The level of the node in the hierarchy. - dom_depth (int): The depth of the node in the DOM structure. - parent (Optional[Node]): The parent node. Defaults to None. + name: The name of the node. + tag_type: The type of the node. + content: The content of the node. + level: The level of the node in the hierarchy. + dom_depth: The depth of the node in the DOM structure. + parent: The parent node. Defaults to None. """ name: str tag_type: str @@ -56,25 +56,20 @@ class HTMLHeaderTextSplitter: If the content does not contain any of the specified headers, the splitter returns a single `Document` with the aggregated content and no additional metadata. - - Attributes: - headers_to_split_on (List[Tuple[str, str]]): List of header tags to split - on, specified as tuples of (`tag_name`, `display_name`), e.g., - `("h1", "Header 1")`. """ def __init__( self, headers_to_split_on: List[Tuple[str, str]], - return_each_element: bool = False # Added parameter + return_each_element: bool = False ) -> None: """ Initialize with headers to split on. Args: - headers_to_split_on (List[Tuple[str, str]]): A list of tuples where + headers_to_split_on: A list of tuples where each tuple contains a header tag and its corresponding value. - return_each_element (bool, optional): Whether to return each HTML + return_each_element: Whether to return each HTML element as a separate Document. Defaults to False. 
""" self.headers_to_split_on = sorted( @@ -83,7 +78,7 @@ def __init__( self.header_mapping = dict(self.headers_to_split_on) self.header_tags = [tag for tag, _ in self.headers_to_split_on] self.elements_tree: Dict[int, Tuple[str, str, int, int]] = {} - self.return_each_element = return_each_element # Store the parameter + self.return_each_element = return_each_element def _header_level(self, element) -> int: """ @@ -93,7 +88,7 @@ def _header_level(self, element) -> int: element: A BeautifulSoup element. Returns: - int: The heading level (1-6) if a heading, else a large number. + The heading level (1-6) if a heading, else a large number. """ tag_name = element.name.lower() if hasattr(element, 'name') else '' if tag_name in ['h1', 'h2', 'h3', 'h4', 'h5', 'h6']: @@ -108,7 +103,7 @@ def _dom_depth(self, element) -> int: element: A BeautifulSoup element. Returns: - int: The depth of the element in the DOM tree. + The depth of the element in the DOM tree. """ depth = 0 for _ in element.parents: @@ -146,10 +141,10 @@ def split_text(self, text: str) -> List[Document]: Split the given text into a list of Document objects. Args: - text (str): The HTML text to split. + text: The HTML text to split. Returns: - List[Document]: A list of split Document objects. + A list of split Document objects. """ return self.split_text_from_file(StringIO(text)) @@ -163,12 +158,12 @@ def split_text_from_url( Fetch text content from a URL and split it into documents. Args: - url (str): The URL to fetch content from. - timeout (int, optional): Timeout for the request. Defaults to 10. + url: The URL to fetch content from. + timeout: Timeout for the request. Defaults to 10. **kwargs: Additional keyword arguments for the request. Returns: - List[Document]: A list of split Document objects. + A list of split Document objects. Raises: requests.RequestException: If the HTTP request fails. @@ -209,10 +204,10 @@ def _generate_documents(self, nodes: Dict[int, Node]) -> List[Document]: Generate a list of Document objects from a node structure. Args: - nodes (Dict[int, Node]): A dictionary of nodes indexed by their position. + A dictionary of nodes indexed by their position. Returns: - List[Document]: A list of generated Document objects. + A list of generated Document objects. """ documents: List[Document] = [] active_headers: Dict[str, Tuple[str, int, int]] = {} @@ -226,10 +221,8 @@ def process_node(node: Node) -> None: Processes a given node and updates the current chunk, active headers, and documents based on the node's type and content. Args: - node (Node): The node to be processed. It should have attributes + node: The node to be processed. It should have attributes 'tag_type', 'content', 'level', and 'dom_depth'. - Returns: - None """ nonlocal chunk_dom_depth @@ -285,10 +278,10 @@ def split_text_from_file(self, file: Any) -> List[Document]: Split HTML content from a file into a list of Document objects. Args: - file (Any): A file path or a file-like object containing HTML content. + file: A file path or a file-like object containing HTML content. Returns: - List[Document]: A list of split Document objects. + A list of split Document objects. """ if isinstance(file, str): with open(file, 'r', encoding='utf-8') as f: @@ -351,10 +344,10 @@ def _aggregate_documents(self, nodes: Dict[int, Node]) -> List[Document]: Aggregate documents based on headers. Args: - nodes (Dict[int, Node]): A dictionary of nodes indexed by their position. + nodes: A dictionary of nodes indexed by their position. 
Returns: - List[Document]: A list of aggregated Document objects. + A list of aggregated Document objects. """ # Reuse the existing _generate_documents method for aggregation return self._generate_documents(nodes) @@ -364,10 +357,10 @@ def _generate_individual_documents(self, nodes: Dict[int, Node]) -> List[Documen Generate individual Document objects for each element. Args: - nodes (Dict[int, Node]): A dictionary of nodes indexed by their position. + nodes: A dictionary of nodes indexed by their position. Returns: - List[Document]: A list of individual Document objects. + A list of individual Document objects. """ documents: List[Document] = [] active_headers: Dict[str, Tuple[str, int, int]] = {} @@ -379,7 +372,7 @@ def process_node(node: Node) -> None: Process a single node to create Document objects based on header tags. Args: - node (Node): The node to process. + node: The node to process. """ node_type = node.type # type: ignore[attr-defined] node_content = node.content # type: ignore[attr-defined] From b82bfc92cee4fdb9920bb6f5ab5a091302d49f3a Mon Sep 17 00:00:00 2001 From: Ahmed Tammaa Date: Thu, 19 Dec 2024 22:08:48 +0000 Subject: [PATCH 14/30] added pytest mark require bs4 --- libs/text-splitters/tests/unit_tests/test_text_splitters.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/libs/text-splitters/tests/unit_tests/test_text_splitters.py b/libs/text-splitters/tests/unit_tests/test_text_splitters.py index 66e0c849e8d5a..0a8990bca501d 100644 --- a/libs/text-splitters/tests/unit_tests/test_text_splitters.py +++ b/libs/text-splitters/tests/unit_tests/test_text_splitters.py @@ -2049,7 +2049,6 @@ def _create_splitter(headers_to_split_on: List[Tuple[str, str]]) -> HTMLHeaderTe return HTMLHeaderTextSplitter(headers_to_split_on=headers_to_split_on) return _create_splitter - @pytest.mark.parametrize( "headers_to_split_on, html_input, expected_documents, test_case", [ @@ -2268,6 +2267,7 @@ def _create_splitter(headers_to_split_on: List[Tuple[str, str]]) -> HTMLHeaderTe ), ] ) +@pytest.mark.requires("bs4") def test_html_header_text_splitter( html_header_splitter_splitter_factory: Any, From 429778739e941043318727d261956188285f2546 Mon Sep 17 00:00:00 2001 From: Ahmed Tammaa Date: Thu, 19 Dec 2024 22:21:00 +0000 Subject: [PATCH 15/30] added requirement bs4 marker for the test cases --- libs/text-splitters/tests/unit_tests/test_text_splitters.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/libs/text-splitters/tests/unit_tests/test_text_splitters.py b/libs/text-splitters/tests/unit_tests/test_text_splitters.py index 0a8990bca501d..259218a1aefdc 100644 --- a/libs/text-splitters/tests/unit_tests/test_text_splitters.py +++ b/libs/text-splitters/tests/unit_tests/test_text_splitters.py @@ -2040,6 +2040,7 @@ def test_haskell_code_splitter() -> None: @pytest.fixture +@pytest.mark.requires("bs4") def html_header_splitter_splitter_factory() -> HTMLHeaderTextSplitter: """ Fixture to create an HTMLHeaderTextSplitter instance with given headers. @@ -3047,7 +3048,7 @@ def test_html_splitter_with_small_chunk_size() -> None: """Test HTML splitting with a very small chunk size to validate chunking.""" html_content = """

Section 1

-

This is some long text that should be split into multiple chunks due to the +

This is some long text that should be split into multiple chunks due to the small chunk size.

""" splitter = HTMLSemanticPreservingSplitter( From c2107b14e423247f2d4a38996d3e8b2859001755 Mon Sep 17 00:00:00 2001 From: Ahmed Tammaa Date: Thu, 19 Dec 2024 22:35:06 +0000 Subject: [PATCH 16/30] all test function involving HTMLHeaderTextSplitter has bs4 requirment mark. --- libs/text-splitters/tests/unit_tests/test_text_splitters.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/libs/text-splitters/tests/unit_tests/test_text_splitters.py b/libs/text-splitters/tests/unit_tests/test_text_splitters.py index 259218a1aefdc..1dd0992b5a7da 100644 --- a/libs/text-splitters/tests/unit_tests/test_text_splitters.py +++ b/libs/text-splitters/tests/unit_tests/test_text_splitters.py @@ -2424,6 +2424,7 @@ def test_html_header_text_splitter( ) ] ) +@pytest.mark.requires("bs4") def test_additional_html_header_text_splitter( html_header_splitter_splitter_factory: Any, headers_to_split_on: List[Tuple[str, str]], @@ -2492,7 +2493,8 @@ def test_additional_html_header_text_splitter( ) ] ) -def test_no_headers_with_multiple_splitters( +@pytest.mark.requires("bs4") +def test_html_no_headers_with_multiple_splitters( html_header_splitter_splitter_factory: Any, headers_to_split_on: List[Tuple[str, str]], html_content: str, From 42618856d1742852a014a51ab9b3eb23a7763733 Mon Sep 17 00:00:00 2001 From: Ahmed Tammaa Date: Thu, 19 Dec 2024 22:52:43 +0000 Subject: [PATCH 17/30] added bs4 import in the split_file_function and removed it from top level. The class can be imported without bs4 depedency and once the split function is called, the bs4 will be imported. This makes bs4 optional depedency. --- libs/text-splitters/langchain_text_splitters/html.py | 12 +++++++++--- 1 file changed, 9 insertions(+), 3 deletions(-) diff --git a/libs/text-splitters/langchain_text_splitters/html.py b/libs/text-splitters/langchain_text_splitters/html.py index 657c187b3c1d0..58d9ca590b0ca 100644 --- a/libs/text-splitters/langchain_text_splitters/html.py +++ b/libs/text-splitters/langchain_text_splitters/html.py @@ -19,10 +19,8 @@ ) import requests -from bs4 import BeautifulSoup -from langchain.docstore.document import Document as DocstoreDocument from langchain_core._api import beta -from langchain_core.documents import BaseDocumentTransformer, Document as CoreDocument +from langchain_core.documents import BaseDocumentTransformer, Document from langchain_text_splitters.character import RecursiveCharacterTextSplitter @@ -297,6 +295,14 @@ def split_text_from_file(self, file: Any) -> List[Document]: Returns: A list of split Document objects. """ + try: + from bs4 import BeautifulSoup # type: ignore[import-untyped] + except ImportError as e: + raise ImportError( + "Unable to import BeautifulSoup/PageElement, \ + please install with `pip install \ + bs4`." 
+ ) from e if isinstance(file, str): with open(file, 'r', encoding='utf-8') as f: html_content = f.read() From 567318a48fd29acc25093812eef19836a2b619b6 Mon Sep 17 00:00:00 2001 From: Ahmed Tammaa Date: Thu, 19 Dec 2024 23:16:15 +0000 Subject: [PATCH 18/30] fixing linting errors and improved documentation for HTMLHeaderTextSplitter --- .../langchain_text_splitters/html.py | 126 +++++++++++++----- 1 file changed, 95 insertions(+), 31 deletions(-) diff --git a/libs/text-splitters/langchain_text_splitters/html.py b/libs/text-splitters/langchain_text_splitters/html.py index 58d9ca590b0ca..adff4de4c8ebf 100644 --- a/libs/text-splitters/langchain_text_splitters/html.py +++ b/libs/text-splitters/langchain_text_splitters/html.py @@ -36,8 +36,7 @@ class ElementType(TypedDict): @dataclass class Node: - """ - Represents a node in a hierarchical structure. + """Represents a node in a hierarchical structure. Attributes: name: The name of the node. @@ -55,19 +54,81 @@ class Node: parent: Optional[Node] = field(default=None) class HTMLHeaderTextSplitter: - """ - Splits HTML content into structured `Document` objects based on specified header - tags. - - This splitter processes HTML by identifying header elements (e.g., `

`, - `

`) and segments the content accordingly. Each header and the text that - follows, up to the next header of the same or higher level, are grouped into a - `Document`. The metadata of each `Document` reflects the hierarchy of headers, - providing an organized content structure. - - If the content does not contain any of the specified headers, the splitter - returns a single `Document` with the aggregated content and no additional - metadata. + """Split HTML content into structured Documents based on specified headers. + + Splits HTML content by detecting specified header tags (e.g.,

,

) and + creating hierarchical Document objects that reflect the semantic structure + of the original content. For each identified section, the splitter associates + the extracted text with metadata corresponding to the encountered headers. + + If no specified headers are found, the entire content is returned as a single + Document. This allows for flexible handling of HTML input, ensuring that + information is organized according to its semantic headers. + + The splitter provides the option to return each HTML element as a separate + Document or aggregate them into semantically meaningful chunks. It also + gracefully handles multiple levels of nested headers, creating a rich, + hierarchical representation of the content. + + Args: + headers_to_split_on (List[Tuple[str, str]]): A list of (header_tag, + header_name) pairs representing the headers that define splitting + boundaries. For example, [("h1", "Header 1"), ("h2", "Header 2")] + will split content by

and

tags, assigning their textual + content to the Document metadata. + return_each_element (bool): If True, every HTML element encountered + (including headers, paragraphs, etc.) is returned as a separate + Document. If False, content under the same header hierarchy is + aggregated into fewer Documents. + + Returns: + List[Document]: A list of Document objects. Each Document contains + `page_content` holding the extracted text and `metadata` that maps + the header hierarchy to their corresponding titles. + + Example: + .. code-block:: python + + from langchain_text_splitters.html_header_text_splitter import ( + HTMLHeaderTextSplitter, + ) + + # Define headers for splitting on h1 and h2 tags. + headers_to_split_on = [("h1", "Main Topic"), ("h2", "Sub Topic")] + + splitter = HTMLHeaderTextSplitter( + headers_to_split_on=headers_to_split_on, + return_each_element=False + ) + + html_content = \"\"\" + + +

Introduction

+

Welcome to the introduction section.

+

Background

+

Some background details here.

+

Conclusion

+

Final thoughts.

+ + + \"\"\" + + documents = splitter.split_text(html_content) + + # 'documents' now contains Document objects reflecting the hierarchy: + # - Document with metadata={"Main Topic": "Introduction"} and + # content="Introduction" + # - Document with metadata={"Main Topic": "Introduction"} and + # content="Welcome to the introduction section." + # - Document with metadata={"Main Topic": "Introduction", + # "Sub Topic": "Background"} and content="Background" + # - Document with metadata={"Main Topic": "Introduction", + # "Sub Topic": "Background"} and content="Some background details here." + # - Document with metadata={"Main Topic": "Conclusion"} and + # content="Conclusion" + # - Document with metadata={"Main Topic": "Conclusion"} and + # content="Final thoughts." """ def __init__( @@ -185,8 +246,8 @@ def split_text_from_url( response = requests.get(url, **kwargs) # noqa: E501 response.raise_for_status() except requests.RequestException as e: - print(f"Error fetching URL {url}: {e}") - raise e + msg = f"Error fetching URL {url}: {e}" + raise requests.RequestException(msg) from e return self.split_text_from_file(BytesIO(response.content)) def _finalize_chunk( @@ -198,7 +259,8 @@ def _finalize_chunk( if current_chunk: final_meta: Dict[str, str] = { - key: content for key, (content, level, dom_depth) in active_headers.items() + key: content + for key, (content, level, dom_depth) in active_headers.items() if chunk_dom_depth >= dom_depth } combined_text = " \n".join( @@ -229,9 +291,11 @@ def _generate_documents(self, nodes: Dict[int, Node]) -> List[Document]: def process_node(node: Node) -> None: - """ - Processes a given node and updates the current chunk, active headers, and - documents based on the node's type and content. + """Process a node and update chunk, headers, and documents accordingly. + + Updates current chunk, active headers, and documents based on the node's type + and content. + Args: node: The node to be processed. It should have attributes 'tag_type', 'content', 'level', and 'dom_depth'. @@ -244,7 +308,11 @@ def process_node(node: Node) -> None: node_dom_depth = node.dom_depth # type: ignore[attr-defined] if node_type in self.header_tags: - self._finalize_chunk(current_chunk, active_headers, documents, chunk_dom_depth) + self._finalize_chunk( + current_chunk, + active_headers, + documents, + chunk_dom_depth) headers_to_remove = [ key for key, (_, lvl, _) in active_headers.items() if lvl >= node_level @@ -360,21 +428,18 @@ def split_text_from_file(self, file: Any) -> List[Document]: return self._generate_individual_documents(nodes) def _aggregate_documents(self, nodes: Dict[int, Node]) -> List[Document]: - """ - Aggregate documents based on headers. + """Generate documents from a list of nodes. Args: - nodes: A dictionary of nodes indexed by their position. + nodes: List of Node objects representing the HTML structure. Returns: - A list of aggregated Document objects. + List of CoreDocument objects containing the processed text chunks. """ - # Reuse the existing _generate_documents method for aggregation return self._generate_documents(nodes) def _generate_individual_documents(self, nodes: Dict[int, Node]) -> List[Document]: - """ - Generate individual Document objects for each element. + """Generate individual Document objects for each element. Args: nodes: A dictionary of nodes indexed by their position. 
@@ -388,8 +453,7 @@ def _generate_individual_documents(self, nodes: Dict[int, Node]) -> List[Documen sorted_nodes = sorted(nodes.items()) def process_node(node: Node) -> None: - """ - Process a single node to create Document objects based on header tags. + """Process a single node to create Document objects based on header tags. Args: node: The node to process. From 53685eb448e9e34fa5a907f249b3a176294ec573 Mon Sep 17 00:00:00 2001 From: Ahmed Tammaa Date: Thu, 19 Dec 2024 23:31:45 +0000 Subject: [PATCH 19/30] fixed docstring issue and sorted imports --- .../langchain_text_splitters/html.py | 45 +++++++++---------- 1 file changed, 20 insertions(+), 25 deletions(-) diff --git a/libs/text-splitters/langchain_text_splitters/html.py b/libs/text-splitters/langchain_text_splitters/html.py index adff4de4c8ebf..2a5677b1e84b5 100644 --- a/libs/text-splitters/langchain_text_splitters/html.py +++ b/libs/text-splitters/langchain_text_splitters/html.py @@ -21,7 +21,6 @@ import requests from langchain_core._api import beta from langchain_core.documents import BaseDocumentTransformer, Document - from langchain_text_splitters.character import RecursiveCharacterTextSplitter class ElementType(TypedDict): @@ -136,8 +135,7 @@ def __init__( headers_to_split_on: List[Tuple[str, str]], return_each_element: bool = False ) -> None: - """ - Initialize with headers to split on. + """Initialize with headers to split on. Args: headers_to_split_on: A list of tuples where @@ -154,8 +152,7 @@ def __init__( self.return_each_element = return_each_element def _header_level(self, element) -> int: - """ - Determine the heading level of an element. + """Determine the heading level of an element. Args: element: A BeautifulSoup element. @@ -169,8 +166,7 @@ def _header_level(self, element) -> int: return 9999 def _dom_depth(self, element) -> int: - """ - Compute the DOM depth of an element. + """Compute the DOM depth of an element. Args: element: A BeautifulSoup element. @@ -184,8 +180,7 @@ def _dom_depth(self, element) -> int: return depth def _build_tree(self, elements) -> None: - """ - Build a tree structure from a list of HTML elements. + """Build a tree structure from a list of HTML elements. Args: elements: A list of BeautifulSoup elements. @@ -210,8 +205,7 @@ def _build_tree(self, elements) -> None: ) def split_text(self, text: str) -> List[Document]: - """ - Split the given text into a list of Document objects. + """Split the given text into a list of Document objects. Args: text: The HTML text to split. @@ -227,8 +221,7 @@ def split_text_from_url( timeout: int = 10, **kwargs: Any ) -> List[Document]: - """ - Fetch text content from a URL and split it into documents. + """Fetch text content from a URL and split it into documents. Args: url: The URL to fetch content from. @@ -274,8 +267,7 @@ def _finalize_chunk( def _generate_documents(self, nodes: Dict[int, Node]) -> List[Document]: - """ - Generate a list of Document objects from a node structure. + """Generate a list of Document objects from a node structure. Args: A dictionary of nodes indexed by their position. @@ -293,14 +285,13 @@ def _generate_documents(self, nodes: Dict[int, Node]) -> List[Document]: def process_node(node: Node) -> None: """Process a node and update chunk, headers, and documents accordingly. - Updates current chunk, active headers, and documents based on the node's type - and content. + Updates current chunk, active headers, and documents based on the + node's type and content. Args: node: The node to be processed. 
It should have attributes 'tag_type', 'content', 'level', and 'dom_depth'. """ - nonlocal chunk_dom_depth node_type = node.tag_type # type: ignore[attr-defined] node_content = node.content # type: ignore[attr-defined] @@ -319,14 +310,15 @@ def process_node(node: Node) -> None: ] for key in headers_to_remove: del active_headers[key] - header_key = self.header_mapping[node_type] # type: ignore[attr-defined] + header_key = self.header_mapping[node_type] # type: ignore[attr-defined] active_headers[header_key] = ( node_content, node_level, node_dom_depth ) header_meta: Dict[str, str] = { - key: content for key, (content, lvl, dd) in active_headers.items() + key: content + for key, (content, _, dd) in active_headers.items() if node_dom_depth >= dd } documents.append( @@ -350,12 +342,14 @@ def process_node(node: Node) -> None: for _, node in sorted_nodes: process_node(node) - self._finalize_chunk(current_chunk, active_headers, documents, chunk_dom_depth) + self._finalize_chunk(current_chunk, + active_headers, + documents, + chunk_dom_depth) return documents def split_text_from_file(self, file: Any) -> List[Document]: - """ - Split HTML content from a file into a list of Document objects. + """Split HTML content from a file into a list of Document objects. Args: file: A file path or a file-like object containing HTML content. @@ -473,7 +467,7 @@ def process_node(node: Node) -> None: del active_headers[key] # Update active headers with the current header - header_key = self.header_mapping[node_type] # type: ignore[attr-defined] + header_key = self.header_mapping[node_type] # type: ignore[attr-defined] active_headers[header_key] = ( node_content, node_level, @@ -497,7 +491,8 @@ def process_node(node: Node) -> None: # For non-header elements, associate with current headers if node_content.strip(): header_meta: Dict[str, str] = { - key: content for key, (content, lvl, dd) in active_headers.items() + key: content + for key, (content, lvl, dd) in active_headers.items() if node_dom_depth >= dd } documents.append( From 9ff0bfa2279c9aa23ee6c0d4ec5ef7b1a3be925a Mon Sep 17 00:00:00 2001 From: Ahmed Tammaa Date: Thu, 19 Dec 2024 23:39:28 +0000 Subject: [PATCH 20/30] sorted imports and defined `nodes` in `_generate_documents` docstring --- .../langchain_text_splitters/html.py | 20 ++++++------------- 1 file changed, 6 insertions(+), 14 deletions(-) diff --git a/libs/text-splitters/langchain_text_splitters/html.py b/libs/text-splitters/langchain_text_splitters/html.py index 2a5677b1e84b5..a3bacdb8e9c06 100644 --- a/libs/text-splitters/langchain_text_splitters/html.py +++ b/libs/text-splitters/langchain_text_splitters/html.py @@ -5,24 +5,15 @@ import re from dataclasses import dataclass, field from io import BytesIO, StringIO -from typing import ( - Any, - Callable, - Dict, - Iterable, - List, - Optional, - Sequence, - Tuple, - TypedDict, - cast, -) +from typing import (Any, Callable, Dict, Iterable, List, Optional, Sequence, + Tuple, TypedDict, cast) import requests from langchain_core._api import beta from langchain_core.documents import BaseDocumentTransformer, Document from langchain_text_splitters.character import RecursiveCharacterTextSplitter + class ElementType(TypedDict): """Element type as typed dict.""" @@ -270,7 +261,7 @@ def _generate_documents(self, nodes: Dict[int, Node]) -> List[Document]: """Generate a list of Document objects from a node structure. Args: - A dictionary of nodes indexed by their position. + nodes: A dictionary of nodes indexed by their position. 
Returns: A list of generated Document objects. @@ -600,7 +591,8 @@ def split_html_by_headers(self, html_doc: str) -> List[Dict[str, Optional[str]]] - 'tag_name': The name of the header tag (e.g., "h1", "h2"). """ try: - from bs4 import BeautifulSoup, PageElement # type: ignore[import-untyped] + from bs4 import (BeautifulSoup, # type: ignore[import-untyped] + PageElement) except ImportError as e: raise ImportError( "Unable to import BeautifulSoup/PageElement, \ From aeae28c8391b8507c752c03ba6f2fa818df4c7af Mon Sep 17 00:00:00 2001 From: Ahmed Tammaa Date: Thu, 19 Dec 2024 23:47:01 +0000 Subject: [PATCH 21/30] updated import order --- .../langchain_text_splitters/html.py | 21 +++++++++++++------ 1 file changed, 15 insertions(+), 6 deletions(-) diff --git a/libs/text-splitters/langchain_text_splitters/html.py b/libs/text-splitters/langchain_text_splitters/html.py index a3bacdb8e9c06..70cb65fca7d6c 100644 --- a/libs/text-splitters/langchain_text_splitters/html.py +++ b/libs/text-splitters/langchain_text_splitters/html.py @@ -1,19 +1,28 @@ from __future__ import annotations import copy +from dataclasses import dataclass, field import pathlib import re -from dataclasses import dataclass, field from io import BytesIO, StringIO -from typing import (Any, Callable, Dict, Iterable, List, Optional, Sequence, - Tuple, TypedDict, cast) +from typing import ( + Any, + Callable, + Dict, + Iterable, + List, + Optional, + Sequence, + Tuple, + TypedDict, + cast, +) import requests from langchain_core._api import beta from langchain_core.documents import BaseDocumentTransformer, Document from langchain_text_splitters.character import RecursiveCharacterTextSplitter - class ElementType(TypedDict): """Element type as typed dict.""" @@ -591,8 +600,8 @@ def split_html_by_headers(self, html_doc: str) -> List[Dict[str, Optional[str]]] - 'tag_name': The name of the header tag (e.g., "h1", "h2"). """ try: - from bs4 import (BeautifulSoup, # type: ignore[import-untyped] - PageElement) + from bs4 import BeautifulSoup # type: ignore[import-untyped] + from bs4 import PageElement except ImportError as e: raise ImportError( "Unable to import BeautifulSoup/PageElement, \ From e67f6bdb33ac3170e8752e2acbd1b58a26d2783a Mon Sep 17 00:00:00 2001 From: Ahmed Tammaa Date: Fri, 20 Dec 2024 00:18:52 +0000 Subject: [PATCH 22/30] fixed all linting issues with Ruff --- .../langchain_text_splitters/html.py | 127 +++++++----------- 1 file changed, 47 insertions(+), 80 deletions(-) diff --git a/libs/text-splitters/langchain_text_splitters/html.py b/libs/text-splitters/langchain_text_splitters/html.py index 70cb65fca7d6c..8dd0823d948eb 100644 --- a/libs/text-splitters/langchain_text_splitters/html.py +++ b/libs/text-splitters/langchain_text_splitters/html.py @@ -1,9 +1,9 @@ from __future__ import annotations import copy -from dataclasses import dataclass, field import pathlib import re +from dataclasses import dataclass, field from io import BytesIO, StringIO from typing import ( Any, @@ -21,8 +21,10 @@ import requests from langchain_core._api import beta from langchain_core.documents import BaseDocumentTransformer, Document + from langchain_text_splitters.character import RecursiveCharacterTextSplitter + class ElementType(TypedDict): """Element type as typed dict.""" @@ -32,7 +34,6 @@ class ElementType(TypedDict): metadata: Dict[str, str] - @dataclass class Node: """Represents a node in a hierarchical structure. @@ -45,6 +46,7 @@ class Node: dom_depth: The depth of the node in the DOM structure. parent: The parent node. Defaults to None. 
""" + name: str tag_type: str content: str @@ -52,6 +54,7 @@ class Node: dom_depth: int parent: Optional[Node] = field(default=None) + class HTMLHeaderTextSplitter: """Split HTML content into structured Documents based on specified headers. @@ -133,7 +136,7 @@ class HTMLHeaderTextSplitter: def __init__( self, headers_to_split_on: List[Tuple[str, str]], - return_each_element: bool = False + return_each_element: bool = False, ) -> None: """Initialize with headers to split on. @@ -160,8 +163,8 @@ def _header_level(self, element) -> int: Returns: The heading level (1-6) if a heading, else a large number. """ - tag_name = element.name.lower() if hasattr(element, 'name') else '' - if tag_name in ['h1', 'h2', 'h3', 'h4', 'h5', 'h6']: + tag_name = element.name.lower() if hasattr(element, "name") else "" + if tag_name in ["h1", "h2", "h3", "h4", "h5", "h6"]: return int(tag_name[1]) return 9999 @@ -186,8 +189,9 @@ def _build_tree(self, elements) -> None: elements: A list of BeautifulSoup elements. """ for idx, element in enumerate(elements): - text = ' '.join( - t for t in element.find_all(string=True, recursive=False) + text = " ".join( + t + for t in element.find_all(string=True, recursive=False) if isinstance(t, str) ).strip() @@ -197,12 +201,7 @@ def _build_tree(self, elements) -> None: level = self._header_level(element) dom_depth = self._dom_depth(element) - self.elements_tree[idx] = ( - element.name, - text, - level, - dom_depth - ) + self.elements_tree[idx] = (element.name, text, level, dom_depth) def split_text(self, text: str) -> List[Document]: """Split the given text into a list of Document objects. @@ -216,10 +215,7 @@ def split_text(self, text: str) -> List[Document]: return self.split_text_from_file(StringIO(text)) def split_text_from_url( - self, - url: str, - timeout: int = 10, - **kwargs: Any + self, url: str, timeout: int = 10, **kwargs: Any ) -> List[Document]: """Fetch text content from a URL and split it into documents. @@ -235,7 +231,7 @@ def split_text_from_url( requests.RequestException: If the HTTP request fails. """ try: - kwargs.setdefault('timeout', timeout) + kwargs.setdefault("timeout", timeout) response = requests.get(url, **kwargs) # noqa: E501 response.raise_for_status() except requests.RequestException as e: @@ -248,7 +244,8 @@ def _finalize_chunk( current_chunk: List[str], active_headers: Dict[str, Tuple[str, int, int]], documents: List[Document], - chunk_dom_depth: int) -> None: + chunk_dom_depth: int, + ) -> None: if current_chunk: final_meta: Dict[str, str] = { @@ -256,16 +253,11 @@ def _finalize_chunk( for key, (content, level, dom_depth) in active_headers.items() if chunk_dom_depth >= dom_depth } - combined_text = " \n".join( - line for line in current_chunk if line.strip() - ) - documents.append( - Document(page_content=combined_text, metadata=final_meta) - ) + combined_text = " \n".join(line for line in current_chunk if line.strip()) + documents.append(Document(page_content=combined_text, metadata=final_meta)) current_chunk.clear() chunk_dom_depth = 0 - def _generate_documents(self, nodes: Dict[int, Node]) -> List[Document]: """Generate a list of Document objects from a node structure. @@ -280,8 +272,6 @@ def _generate_documents(self, nodes: Dict[int, Node]) -> List[Document]: current_chunk: List[str] = [] chunk_dom_depth = 0 - - def process_node(node: Node) -> None: """Process a node and update chunk, headers, and documents accordingly. 
@@ -300,36 +290,29 @@ def process_node(node: Node) -> None: if node_type in self.header_tags: self._finalize_chunk( - current_chunk, - active_headers, - documents, - chunk_dom_depth) + current_chunk, active_headers, documents, chunk_dom_depth + ) headers_to_remove = [ - key for key, (_, lvl, _) in active_headers.items() + key + for key, (_, lvl, _) in active_headers.items() if lvl >= node_level ] for key in headers_to_remove: del active_headers[key] - header_key = self.header_mapping[node_type] # type: ignore[attr-defined] - active_headers[header_key] = ( - node_content, - node_level, - node_dom_depth - ) + header_key = self.header_mapping[node_type] # type: ignore[attr-defined] + active_headers[header_key] = (node_content, node_level, node_dom_depth) header_meta: Dict[str, str] = { key: content for key, (content, _, dd) in active_headers.items() if node_dom_depth >= dd } documents.append( - Document( - page_content=node_content, - metadata=header_meta - ) + Document(page_content=node_content, metadata=header_meta) ) else: headers_to_remove = [ - key for key, (_, _, dd) in active_headers.items() + key + for key, (_, _, dd) in active_headers.items() if node_dom_depth < dd ] for key in headers_to_remove: @@ -342,10 +325,7 @@ def process_node(node: Node) -> None: for _, node in sorted_nodes: process_node(node) - self._finalize_chunk(current_chunk, - active_headers, - documents, - chunk_dom_depth) + self._finalize_chunk(current_chunk, active_headers, documents, chunk_dom_depth) return documents def split_text_from_file(self, file: Any) -> List[Document]: @@ -366,12 +346,12 @@ def split_text_from_file(self, file: Any) -> List[Document]: bs4`." ) from e if isinstance(file, str): - with open(file, 'r', encoding='utf-8') as f: + with open(file, "r", encoding="utf-8") as f: html_content = f.read() else: html_content = file.read() - soup = BeautifulSoup(html_content, 'html.parser') + soup = BeautifulSoup(html_content, "html.parser") body = soup.body if soup.body else soup elements = body.find_all() @@ -380,15 +360,9 @@ def split_text_from_file(self, file: Any) -> List[Document]: if not self.elements_tree: return [] - min_level = min( - level for (_, _, level, _) in self.elements_tree.values() - ) + min_level = min(level for (_, _, level, _) in self.elements_tree.values()) root = Node( - "root", - tag_type="root", - content="", - level=min_level - 1, - dom_depth=0 + "root", tag_type="root", content="", level=min_level - 1, dom_depth=0 ) nodes = { @@ -397,7 +371,7 @@ def split_text_from_file(self, file: Any) -> List[Document]: tag_type=tag, content=text, level=level, - dom_depth=dom_depth + dom_depth=dom_depth, ) for idx, (tag, text, level, dom_depth) in self.elements_tree.items() } @@ -406,9 +380,8 @@ def split_text_from_file(self, file: Any) -> List[Document]: for idx in sorted(nodes): node = nodes[idx] while stack and ( - stack[-1].level >= node.level - or stack[-1].dom_depth >= node.dom_depth - ): + stack[-1].level >= node.level or stack[-1].dom_depth >= node.dom_depth + ): stack.pop() if stack: node.parent = stack[-1] @@ -460,32 +433,27 @@ def process_node(node: Node) -> None: if node_type in self.header_tags: # Remove headers of the same or lower level headers_to_remove = [ - key for key, (_, lvl, _) in active_headers.items() + key + for key, (_, lvl, _) in active_headers.items() if lvl >= node_level ] for key in headers_to_remove: del active_headers[key] # Update active headers with the current header - header_key = self.header_mapping[node_type] # type: ignore[attr-defined] - 
active_headers[header_key] = ( - node_content, - node_level, - node_dom_depth - ) + header_key = self.header_mapping[node_type] # type: ignore[attr-defined] + active_headers[header_key] = (node_content, node_level, node_dom_depth) # Create metadata based on active headers header_meta: Dict[str, str] = { - key: content for key, (content, lvl, dd) in active_headers.items() + key: content + for key, (content, lvl, dd) in active_headers.items() if node_dom_depth >= dd } # Create a Document for the header element documents.append( - Document( - page_content=node_content, - metadata=header_meta - ) + Document(page_content=node_content, metadata=header_meta) ) else: # For non-header elements, associate with current headers @@ -496,10 +464,7 @@ def process_node(node: Node) -> None: if node_dom_depth >= dd } documents.append( - Document( - page_content=node_content, - metadata=header_meta - ) + Document(page_content=node_content, metadata=header_meta) ) # Process each node using the inner process_node function @@ -600,8 +565,10 @@ def split_html_by_headers(self, html_doc: str) -> List[Dict[str, Optional[str]]] - 'tag_name': The name of the header tag (e.g., "h1", "h2"). """ try: - from bs4 import BeautifulSoup # type: ignore[import-untyped] - from bs4 import PageElement + from bs4 import ( + BeautifulSoup, # type: ignore[import-untyped] + PageElement, + ) except ImportError as e: raise ImportError( "Unable to import BeautifulSoup/PageElement, \ From cdd62b78ee61121c4f3fedc9773d3c189be9d845 Mon Sep 17 00:00:00 2001 From: Ahmed Tammaa Date: Fri, 20 Dec 2024 00:39:41 +0000 Subject: [PATCH 23/30] removed extra blank space from `_finalize_chunk` --- libs/text-splitters/langchain_text_splitters/html.py | 1 - 1 file changed, 1 deletion(-) diff --git a/libs/text-splitters/langchain_text_splitters/html.py b/libs/text-splitters/langchain_text_splitters/html.py index 8dd0823d948eb..2c388d7947efa 100644 --- a/libs/text-splitters/langchain_text_splitters/html.py +++ b/libs/text-splitters/langchain_text_splitters/html.py @@ -246,7 +246,6 @@ def _finalize_chunk( documents: List[Document], chunk_dom_depth: int, ) -> None: - if current_chunk: final_meta: Dict[str, str] = { key: content From b4d4e575d9e2131f35b709cd03dc97900ddaf37d Mon Sep 17 00:00:00 2001 From: Ahmed Tammaa Date: Fri, 20 Dec 2024 00:54:53 +0000 Subject: [PATCH 24/30] added types for untyped function paramters. Typed `stack` variable as it was not explicitly typed. avoided header_meta redefinition --- .../langchain_text_splitters/html.py | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) diff --git a/libs/text-splitters/langchain_text_splitters/html.py b/libs/text-splitters/langchain_text_splitters/html.py index 2c388d7947efa..5c934a3125ab3 100644 --- a/libs/text-splitters/langchain_text_splitters/html.py +++ b/libs/text-splitters/langchain_text_splitters/html.py @@ -154,7 +154,7 @@ def __init__( self.elements_tree: Dict[int, Tuple[str, str, int, int]] = {} self.return_each_element = return_each_element - def _header_level(self, element) -> int: + def _header_level(self, element: Any) -> int: """Determine the heading level of an element. Args: @@ -168,7 +168,7 @@ def _header_level(self, element) -> int: return int(tag_name[1]) return 9999 - def _dom_depth(self, element) -> int: + def _dom_depth(self, element: Any) -> int: """Compute the DOM depth of an element. 
Args: @@ -182,7 +182,7 @@ def _dom_depth(self, element) -> int: depth += 1 return depth - def _build_tree(self, elements) -> None: + def _build_tree(self, elements: Any) -> None: """Build a tree structure from a list of HTML elements. Args: @@ -375,7 +375,7 @@ def split_text_from_file(self, file: Any) -> List[Document]: for idx, (tag, text, level, dom_depth) in self.elements_tree.items() } - stack = [] + stack: List[Node] = [] for idx in sorted(nodes): node = nodes[idx] while stack and ( @@ -428,7 +428,7 @@ def process_node(node: Node) -> None: node_content = node.content # type: ignore[attr-defined] node_level = node.level # type: ignore[attr-defined] node_dom_depth = node.dom_depth # type: ignore[attr-defined] - + header_meta: Dict[str, str] if node_type in self.header_tags: # Remove headers of the same or lower level headers_to_remove = [ @@ -444,7 +444,7 @@ def process_node(node: Node) -> None: active_headers[header_key] = (node_content, node_level, node_dom_depth) # Create metadata based on active headers - header_meta: Dict[str, str] = { + header_meta = { key: content for key, (content, lvl, dd) in active_headers.items() if node_dom_depth >= dd @@ -457,7 +457,7 @@ def process_node(node: Node) -> None: else: # For non-header elements, associate with current headers if node_content.strip(): - header_meta: Dict[str, str] = { + header_meta = { key: content for key, (content, lvl, dd) in active_headers.items() if node_dom_depth >= dd From d7ea998448715e7e0d98c228860534d19643240d Mon Sep 17 00:00:00 2001 From: Ahmed Tammaa Date: Fri, 20 Dec 2024 01:10:51 +0000 Subject: [PATCH 25/30] fixed "line too long" in test_text_splitters --- .../tests/unit_tests/test_text_splitters.py | 224 ++++++++---------- 1 file changed, 99 insertions(+), 125 deletions(-) diff --git a/libs/text-splitters/tests/unit_tests/test_text_splitters.py b/libs/text-splitters/tests/unit_tests/test_text_splitters.py index 1dd0992b5a7da..27d751c126503 100644 --- a/libs/text-splitters/tests/unit_tests/test_text_splitters.py +++ b/libs/text-splitters/tests/unit_tests/test_text_splitters.py @@ -2046,10 +2046,14 @@ def html_header_splitter_splitter_factory() -> HTMLHeaderTextSplitter: Fixture to create an HTMLHeaderTextSplitter instance with given headers. This factory allows dynamic creation of splitters with different headers. 
""" - def _create_splitter(headers_to_split_on: List[Tuple[str, str]]) -> HTMLHeaderTextSplitter: + def _create_splitter( + headers_to_split_on: List[Tuple[str, str]] + ) -> HTMLHeaderTextSplitter: return HTMLHeaderTextSplitter(headers_to_split_on=headers_to_split_on) + return _create_splitter + @pytest.mark.parametrize( "headers_to_split_on, html_input, expected_documents, test_case", [ @@ -2070,37 +2074,28 @@ def _create_splitter(headers_to_split_on: List[Tuple[str, str]]) -> HTMLHeaderTe """, [ Document( - page_content="Introduction", - metadata={"Header 1": "Introduction"} + page_content="Introduction", metadata={"Header 1": "Introduction"} ), Document( page_content="This is the introduction.", - metadata={"Header 1": "Introduction"} + metadata={"Header 1": "Introduction"}, ), Document( page_content="Background", - metadata={ - "Header 1": "Introduction", - "Header 2": "Background" - } + metadata={"Header 1": "Introduction", "Header 2": "Background"}, ), Document( page_content="Background information.", - metadata={ - "Header 1": "Introduction", - "Header 2": "Background" - } + metadata={"Header 1": "Introduction", "Header 2": "Background"}, ), Document( - page_content="Conclusion", - metadata={"Header 1": "Conclusion"} + page_content="Conclusion", metadata={"Header 1": "Conclusion"} ), Document( - page_content="Final thoughts.", - metadata={"Header 1": "Conclusion"} - ) + page_content="Final thoughts.", metadata={"Header 1": "Conclusion"} + ), ], - "Simple headers and paragraphs" + "Simple headers and paragraphs", ), ( # Test Case 2: Nested headers with h1, h2, and h3 @@ -2126,49 +2121,42 @@ def _create_splitter(headers_to_split_on: List[Tuple[str, str]]) -> HTMLHeaderTe """, [ Document( - page_content="Main Title", - metadata={"Header 1": "Main Title"} + page_content="Main Title", metadata={"Header 1": "Main Title"} ), Document( page_content="Subsection", - metadata={ - "Header 1": "Main Title", - "Header 2": "Subsection" - } + metadata={"Header 1": "Main Title", "Header 2": "Subsection"}, ), Document( page_content="Details of subsection.", - metadata={ - "Header 1": "Main Title", - "Header 2": "Subsection" - } + metadata={"Header 1": "Main Title", "Header 2": "Subsection"}, ), Document( page_content="Sub-subsection", metadata={ "Header 1": "Main Title", "Header 2": "Subsection", - "Header 3": "Sub-subsection" - } + "Header 3": "Sub-subsection", + }, ), Document( page_content="More details.", metadata={ "Header 1": "Main Title", "Header 2": "Subsection", - "Header 3": "Sub-subsection" - } + "Header 3": "Sub-subsection", + }, ), Document( page_content="Another Main Title", - metadata={"Header 1": "Another Main Title"} + metadata={"Header 1": "Another Main Title"}, ), Document( page_content="Content under another main title.", - metadata={"Header 1": "Another Main Title"} - ) + metadata={"Header 1": "Another Main Title"}, + ), ], - "Nested headers with h1, h2, and h3" + "Nested headers with h1, h2, and h3", ), ( # Test Case 3: No headers @@ -2187,10 +2175,10 @@ def _create_splitter(headers_to_split_on: List[Tuple[str, str]]) -> HTMLHeaderTe [ Document( page_content="Paragraph one. \nParagraph two. 
\nParagraph three.", - metadata={} + metadata={}, ) ], - "No headers present" + "No headers present", ), ( # Test Case 4: Multiple headers of the same level @@ -2208,32 +2196,23 @@ def _create_splitter(headers_to_split_on: List[Tuple[str, str]]) -> HTMLHeaderTe """, [ - Document( - page_content="Chapter 1", - metadata={"Header 1": "Chapter 1"} - ), + Document(page_content="Chapter 1", metadata={"Header 1": "Chapter 1"}), Document( page_content="Content of chapter 1.", - metadata={"Header 1": "Chapter 1"} - ), - Document( - page_content="Chapter 2", - metadata={"Header 1": "Chapter 2"} + metadata={"Header 1": "Chapter 1"}, ), + Document(page_content="Chapter 2", metadata={"Header 1": "Chapter 2"}), Document( page_content="Content of chapter 2.", - metadata={"Header 1": "Chapter 2"} - ), - Document( - page_content="Chapter 3", - metadata={"Header 1": "Chapter 3"} + metadata={"Header 1": "Chapter 2"}, ), + Document(page_content="Chapter 3", metadata={"Header 1": "Chapter 3"}), Document( page_content="Content of chapter 3.", - metadata={"Header 1": "Chapter 3"} - ) + metadata={"Header 1": "Chapter 3"}, + ), ], - "Multiple headers of the same level" + "Multiple headers of the same level", ), ( # Test Case 5: Headers with no content @@ -2248,34 +2227,24 @@ def _create_splitter(headers_to_split_on: List[Tuple[str, str]]) -> HTMLHeaderTe """, [ - Document( - page_content="Header 1", - metadata={"Header 1": "Header 1"} - ), + Document(page_content="Header 1", metadata={"Header 1": "Header 1"}), Document( page_content="Header 2", - metadata={ - "Header 1": "Header 1", - "Header 2": "Header 2" - } + metadata={"Header 1": "Header 1", "Header 2": "Header 2"}, ), - Document( - page_content="Header 3", - metadata={"Header 1": "Header 3"} - ) + Document(page_content="Header 3", metadata={"Header 1": "Header 3"}), ], - "Headers with no associated content" + "Headers with no associated content", ), - ] + ], ) @pytest.mark.requires("bs4") def test_html_header_text_splitter( - html_header_splitter_splitter_factory: Any, headers_to_split_on: List[Tuple[str, str]], html_input: str, expected_documents: List[Document], - test_case: str + test_case: str, ): """ Test the HTML header text splitter. @@ -2293,7 +2262,9 @@ def test_html_header_text_splitter( does not match the expected values. """ - splitter = html_header_splitter_splitter_factory(headers_to_split_on=headers_to_split_on) + splitter = html_header_splitter_splitter_factory( + headers_to_split_on=headers_to_split_on + ) docs = splitter.split_text(html_input) assert len(docs) == len(expected_documents), ( @@ -2303,7 +2274,8 @@ def test_html_header_text_splitter( for idx, (doc, expected) in enumerate(zip(docs, expected_documents), start=1): assert doc.page_content == expected.page_content, ( f"Test Case '{test_case}' Failed at Document {idx}: " - f"Content mismatch.\nExpected: {expected.page_content}\nGot: {doc.page_content}" + f"Content mismatch.\nExpected: {expected.page_content}" + "\nGot: {doc.page_content}" ) assert doc.metadata == expected.metadata, ( f"Test Case '{test_case}' Failed at Document {idx}: " @@ -2343,64 +2315,63 @@ def test_html_header_text_splitter( """, [ + Document(metadata={"Header 1": "Foo"}, page_content="Foo"), Document( - metadata={'Header 1': 'Foo'}, - page_content='Foo' - ), - Document( - metadata={'Header 1': 'Foo'}, - page_content='Some intro text about Foo.' 
+ metadata={"Header 1": "Foo"}, + page_content="Some intro text about Foo.", ), Document( - metadata={'Header 1': 'Foo', 'Header 2': 'Bar main section'}, - page_content='Bar main section' + metadata={"Header 1": "Foo", "Header 2": "Bar main section"}, + page_content="Bar main section", ), Document( - metadata={'Header 1': 'Foo', 'Header 2': 'Bar main section'}, - page_content='Some intro text about Bar.' + metadata={"Header 1": "Foo", "Header 2": "Bar main section"}, + page_content="Some intro text about Bar.", ), Document( metadata={ - 'Header 1': 'Foo', - 'Header 2': 'Bar main section', - 'Header 3': 'Bar subsection 1' + "Header 1": "Foo", + "Header 2": "Bar main section", + "Header 3": "Bar subsection 1", }, - page_content='Bar subsection 1' + page_content="Bar subsection 1", ), Document( metadata={ - 'Header 1': 'Foo', - 'Header 2': 'Bar main section', - 'Header 3': 'Bar subsection 1' + "Header 1": "Foo", + "Header 2": "Bar main section", + "Header 3": "Bar subsection 1", }, - page_content='Some text about the first subtopic of Bar.' + page_content="Some text about the first subtopic of Bar.", ), Document( metadata={ - 'Header 1': 'Foo', - 'Header 2': 'Bar main section', - 'Header 3': 'Bar subsection 2' + "Header 1": "Foo", + "Header 2": "Bar main section", + "Header 3": "Bar subsection 2", }, - page_content='Bar subsection 2' + page_content="Bar subsection 2", ), Document( metadata={ - 'Header 1': 'Foo', - 'Header 2': 'Bar main section', - 'Header 3': 'Bar subsection 2' + "Header 1": "Foo", + "Header 2": "Bar main section", + "Header 3": "Bar subsection 2", }, - page_content='Some text about the second subtopic of Bar.' + page_content="Some text about the second subtopic of Bar.", ), Document( - metadata={'Header 1': 'Foo', 'Header 2': 'Baz'}, - page_content='Baz' + metadata={"Header 1": "Foo", "Header 2": "Baz"}, page_content="Baz" ), Document( - metadata={'Header 1': 'Foo'}, - page_content='Some text about Baz \nSome concluding text about Foo' - ) + metadata={"Header 1": "Foo"}, + page_content=( + "Some text about Baz \n" + "Some concluding text about Foo" + ), + ), ], - "Test Case A: Split on h1, h2, and h3 with nested headers" + "Test Case A: Split on h1, h2, and h3 with nested headers", ), ( # Test Case B: Split on h1 only without any headers @@ -2417,12 +2388,12 @@ def test_html_header_text_splitter( [ Document( metadata={}, - page_content='Paragraph one. \nParagraph two. \nParagraph three.' + page_content="Paragraph one. \nParagraph two. \nParagraph three.", ) ], - "Test Case B: Split on h1 only without any headers" - ) - ] + "Test Case B: Split on h1 only without any headers", + ), + ], ) @pytest.mark.requires("bs4") def test_additional_html_header_text_splitter( @@ -2430,7 +2401,7 @@ def test_additional_html_header_text_splitter( headers_to_split_on: List[Tuple[str, str]], html_content: str, expected_output: List[Document], - test_case: str + test_case: str, ): """ Test the HTML header text splitter. @@ -2447,11 +2418,11 @@ def test_additional_html_header_text_splitter( AssertionError: If the number of documents or their content/metadata does not match the expected output. """ - splitter = html_header_splitter_splitter_factory(headers_to_split_on=headers_to_split_on) + splitter = html_header_splitter_splitter_factory( + headers_to_split_on=headers_to_split_on + ) docs = splitter.split_text(html_content) - - assert len(docs) == len(expected_output), ( f"{test_case} Failed: Number of documents mismatch. " f"Expected {len(expected_output)}, got {len(docs)}." 
@@ -2459,7 +2430,8 @@ def test_additional_html_header_text_splitter( for idx, (doc, expected) in enumerate(zip(docs, expected_output), start=1): assert doc.page_content == expected.page_content, ( f"{test_case} Failed at Document {idx}: " - f"Content mismatch.\nExpected: {expected.page_content}\nGot: {doc.page_content}" + f"Content mismatch.\nExpected: {expected.page_content}\n" + "Got: {doc.page_content}" ) assert doc.metadata == expected.metadata, ( f"{test_case} Failed at Document {idx}: " @@ -2485,13 +2457,14 @@ def test_additional_html_header_text_splitter( """, [ Document( - page_content='Just some random text without headers. \nMore text here.', - metadata={} + page_content="Just some random text without headers." + " \nMore text here.", + metadata={}, ) ], - "Test Case C: Split on h1, h2, and h3 without any headers" + "Test Case C: Split on h1, h2, and h3 without any headers", ) - ] + ], ) @pytest.mark.requires("bs4") def test_html_no_headers_with_multiple_splitters( @@ -2499,7 +2472,7 @@ def test_html_no_headers_with_multiple_splitters( headers_to_split_on: List[Tuple[str, str]], html_content: str, expected_output: List[Document], - test_case: str + test_case: str, ): """ Test HTML content splitting without headers using multiple splitters. @@ -2515,11 +2488,11 @@ def test_html_no_headers_with_multiple_splitters( AssertionError: If the number of documents or their content/metadata does not match the expected output. """ - splitter = html_header_splitter_splitter_factory(headers_to_split_on=headers_to_split_on) + splitter = html_header_splitter_splitter_factory( + headers_to_split_on=headers_to_split_on + ) docs = splitter.split_text(html_content) - - assert len(docs) == len(expected_output), ( f"{test_case} Failed: Number of documents mismatch. " f"Expected {len(expected_output)}, got {len(docs)}." 
@@ -2527,7 +2500,8 @@ def test_html_no_headers_with_multiple_splitters( for idx, (doc, expected) in enumerate(zip(docs, expected_output), start=1): assert doc.page_content == expected.page_content, ( f"{test_case} Failed at Document {idx}: " - f"Content mismatch.\nExpected: {expected.page_content}\nGot: {doc.page_content}" + f"Content mismatch.\nExpected: {expected.page_content}\n" + "Got: {doc.page_content}" ) assert doc.metadata == expected.metadata, ( f"{test_case} Failed at Document {idx}: " @@ -2535,7 +2509,6 @@ def test_html_no_headers_with_multiple_splitters( ) - def test_split_text_on_tokens() -> None: """Test splitting by tokens per chunk.""" text = "foo bar baz 123" @@ -2932,7 +2905,8 @@ def test_html_splitter_with_custom_extractor() -> None: expected = [ Document( - page_content="This is an iframe: [iframe:http://example.com](http://example.com)", + page_content="This is an iframe: " + "[iframe:http://example.com](http://example.com)", metadata={"Header 1": "Section 1"}, ), ] From 2bf37261e7f679aa1074c49197147616301c380e Mon Sep 17 00:00:00 2001 From: Ahmed Tammaa Date: Fri, 20 Dec 2024 01:33:00 +0000 Subject: [PATCH 26/30] fixed linter issues in test_text_splitter.py --- libs/text-splitters/tests/unit_tests/test_text_splitters.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/libs/text-splitters/tests/unit_tests/test_text_splitters.py b/libs/text-splitters/tests/unit_tests/test_text_splitters.py index 27d751c126503..70576a2b06947 100644 --- a/libs/text-splitters/tests/unit_tests/test_text_splitters.py +++ b/libs/text-splitters/tests/unit_tests/test_text_splitters.py @@ -2046,8 +2046,9 @@ def html_header_splitter_splitter_factory() -> HTMLHeaderTextSplitter: Fixture to create an HTMLHeaderTextSplitter instance with given headers. This factory allows dynamic creation of splitters with different headers. """ + def _create_splitter( - headers_to_split_on: List[Tuple[str, str]] + headers_to_split_on: List[Tuple[str, str]], ) -> HTMLHeaderTextSplitter: return HTMLHeaderTextSplitter(headers_to_split_on=headers_to_split_on) @@ -2366,8 +2367,7 @@ def test_html_header_text_splitter( Document( metadata={"Header 1": "Foo"}, page_content=( - "Some text about Baz \n" - "Some concluding text about Foo" + "Some text about Baz \nSome concluding text about Foo" ), ), ], From 7dd9f15e10e9129066a0882f23d9892412685ea4 Mon Sep 17 00:00:00 2001 From: Ahmed Tammaa Date: Fri, 20 Dec 2024 01:46:17 +0000 Subject: [PATCH 27/30] fixed mypy issues --- .../tests/unit_tests/test_text_splitters.py | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) diff --git a/libs/text-splitters/tests/unit_tests/test_text_splitters.py b/libs/text-splitters/tests/unit_tests/test_text_splitters.py index 70576a2b06947..2e039f1764223 100644 --- a/libs/text-splitters/tests/unit_tests/test_text_splitters.py +++ b/libs/text-splitters/tests/unit_tests/test_text_splitters.py @@ -4,7 +4,7 @@ import re import string from pathlib import Path -from typing import Any, List, Tuple +from typing import Any, Callable, List, Tuple import pytest from langchain_core.documents import Document @@ -2041,7 +2041,8 @@ def test_haskell_code_splitter() -> None: @pytest.fixture @pytest.mark.requires("bs4") -def html_header_splitter_splitter_factory() -> HTMLHeaderTextSplitter: +def html_header_splitter_splitter_factory() -> Callable[ + [List[Tuple[str, str]]], HTMLHeaderTextSplitter]: """ Fixture to create an HTMLHeaderTextSplitter instance with given headers. 
This factory allows dynamic creation of splitters with different headers. @@ -2402,7 +2403,7 @@ def test_additional_html_header_text_splitter( html_content: str, expected_output: List[Document], test_case: str, -): +) -> None: """ Test the HTML header text splitter. @@ -2473,7 +2474,7 @@ def test_html_no_headers_with_multiple_splitters( html_content: str, expected_output: List[Document], test_case: str, -): +) -> None: """ Test HTML content splitting without headers using multiple splitters. Args: From 456c36ac84261bbf2ac37bfc7e0b99f9a40e9a19 Mon Sep 17 00:00:00 2001 From: Ahmed Tammaa Date: Fri, 20 Dec 2024 01:59:41 +0000 Subject: [PATCH 28/30] fixed all formatting issues and checked with pre-commit --- .../text-splitters/tests/unit_tests/test_text_splitters.py | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/libs/text-splitters/tests/unit_tests/test_text_splitters.py b/libs/text-splitters/tests/unit_tests/test_text_splitters.py index 2e039f1764223..4e5c0bca4c106 100644 --- a/libs/text-splitters/tests/unit_tests/test_text_splitters.py +++ b/libs/text-splitters/tests/unit_tests/test_text_splitters.py @@ -2041,8 +2041,9 @@ def test_haskell_code_splitter() -> None: @pytest.fixture @pytest.mark.requires("bs4") -def html_header_splitter_splitter_factory() -> Callable[ - [List[Tuple[str, str]]], HTMLHeaderTextSplitter]: +def html_header_splitter_splitter_factory() -> ( + Callable[[List[Tuple[str, str]]], HTMLHeaderTextSplitter] +): """ Fixture to create an HTMLHeaderTextSplitter instance with given headers. This factory allows dynamic creation of splitters with different headers. @@ -2247,7 +2248,7 @@ def test_html_header_text_splitter( html_input: str, expected_documents: List[Document], test_case: str, -): +) -> None: """ Test the HTML header text splitter. From bbe5616ae8ae3f026a30a7b04709d98953394a77 Mon Sep 17 00:00:00 2001 From: Ahmed Tammaa Date: Sat, 21 Dec 2024 00:41:40 +0000 Subject: [PATCH 29/30] simplified HTMLHeaderSplitter Logic --- .../langchain_text_splitters/html.py | 385 ++++++------------ 1 file changed, 117 insertions(+), 268 deletions(-) diff --git a/libs/text-splitters/langchain_text_splitters/html.py b/libs/text-splitters/langchain_text_splitters/html.py index 5c934a3125ab3..074a3825e35e6 100644 --- a/libs/text-splitters/langchain_text_splitters/html.py +++ b/libs/text-splitters/langchain_text_splitters/html.py @@ -3,8 +3,7 @@ import copy import pathlib import re -from dataclasses import dataclass, field -from io import BytesIO, StringIO +from io import StringIO from typing import ( Any, Callable, @@ -34,27 +33,6 @@ class ElementType(TypedDict): metadata: Dict[str, str] -@dataclass -class Node: - """Represents a node in a hierarchical structure. - - Attributes: - name: The name of the node. - tag_type: The type of the node. - content: The content of the node. - level: The level of the node in the hierarchy. - dom_depth: The depth of the node in the DOM structure. - parent: The parent node. Defaults to None. - """ - - name: str - tag_type: str - content: str - level: int - dom_depth: int - parent: Optional[Node] = field(default=None) - - class HTMLHeaderTextSplitter: """Split HTML content into structured Documents based on specified headers. 
@@ -151,58 +129,8 @@ def __init__( ) self.header_mapping = dict(self.headers_to_split_on) self.header_tags = [tag for tag, _ in self.headers_to_split_on] - self.elements_tree: Dict[int, Tuple[str, str, int, int]] = {} self.return_each_element = return_each_element - def _header_level(self, element: Any) -> int: - """Determine the heading level of an element. - - Args: - element: A BeautifulSoup element. - - Returns: - The heading level (1-6) if a heading, else a large number. - """ - tag_name = element.name.lower() if hasattr(element, "name") else "" - if tag_name in ["h1", "h2", "h3", "h4", "h5", "h6"]: - return int(tag_name[1]) - return 9999 - - def _dom_depth(self, element: Any) -> int: - """Compute the DOM depth of an element. - - Args: - element: A BeautifulSoup element. - - Returns: - The depth of the element in the DOM tree. - """ - depth = 0 - for _ in element.parents: - depth += 1 - return depth - - def _build_tree(self, elements: Any) -> None: - """Build a tree structure from a list of HTML elements. - - Args: - elements: A list of BeautifulSoup elements. - """ - for idx, element in enumerate(elements): - text = " ".join( - t - for t in element.find_all(string=True, recursive=False) - if isinstance(t, str) - ).strip() - - if not text: - continue - - level = self._header_level(element) - dom_depth = self._dom_depth(element) - - self.elements_tree[idx] = (element.name, text, level, dom_depth) - def split_text(self, text: str) -> List[Document]: """Split the given text into a list of Document objects. @@ -230,111 +158,40 @@ def split_text_from_url( Raises: requests.RequestException: If the HTTP request fails. """ - try: - kwargs.setdefault("timeout", timeout) - response = requests.get(url, **kwargs) # noqa: E501 - response.raise_for_status() - except requests.RequestException as e: - msg = f"Error fetching URL {url}: {e}" - raise requests.RequestException(msg) from e - return self.split_text_from_file(BytesIO(response.content)) - - def _finalize_chunk( - self, - current_chunk: List[str], - active_headers: Dict[str, Tuple[str, int, int]], - documents: List[Document], - chunk_dom_depth: int, - ) -> None: - if current_chunk: - final_meta: Dict[str, str] = { - key: content - for key, (content, level, dom_depth) in active_headers.items() - if chunk_dom_depth >= dom_depth - } - combined_text = " \n".join(line for line in current_chunk if line.strip()) - documents.append(Document(page_content=combined_text, metadata=final_meta)) - current_chunk.clear() - chunk_dom_depth = 0 - - def _generate_documents(self, nodes: Dict[int, Node]) -> List[Document]: - """Generate a list of Document objects from a node structure. - - Args: - nodes: A dictionary of nodes indexed by their position. - - Returns: - A list of generated Document objects. - """ - documents: List[Document] = [] - active_headers: Dict[str, Tuple[str, int, int]] = {} - current_chunk: List[str] = [] - chunk_dom_depth = 0 - - def process_node(node: Node) -> None: - """Process a node and update chunk, headers, and documents accordingly. - - Updates current chunk, active headers, and documents based on the - node's type and content. - - Args: - node: The node to be processed. It should have attributes - 'tag_type', 'content', 'level', and 'dom_depth'. 
- """ - nonlocal chunk_dom_depth - node_type = node.tag_type # type: ignore[attr-defined] - node_content = node.content # type: ignore[attr-defined] - node_level = node.level # type: ignore[attr-defined] - node_dom_depth = node.dom_depth # type: ignore[attr-defined] - - if node_type in self.header_tags: - self._finalize_chunk( - current_chunk, active_headers, documents, chunk_dom_depth - ) - headers_to_remove = [ - key - for key, (_, lvl, _) in active_headers.items() - if lvl >= node_level - ] - for key in headers_to_remove: - del active_headers[key] - header_key = self.header_mapping[node_type] # type: ignore[attr-defined] - active_headers[header_key] = (node_content, node_level, node_dom_depth) - header_meta: Dict[str, str] = { - key: content - for key, (content, _, dd) in active_headers.items() - if node_dom_depth >= dd - } - documents.append( - Document(page_content=node_content, metadata=header_meta) - ) - else: - headers_to_remove = [ - key - for key, (_, _, dd) in active_headers.items() - if node_dom_depth < dd - ] - for key in headers_to_remove: - del active_headers[key] - if node_content.strip(): - current_chunk.append(node_content) - chunk_dom_depth = max(chunk_dom_depth, node_dom_depth) + kwargs.setdefault("timeout", timeout) + response = requests.get(url, **kwargs) + response.raise_for_status() + return self.split_text(response.text) + + def _header_level(self, tag_name: str) -> int: + """Determine the heading level of a tag.""" + if tag_name.lower() in ["h1", "h2", "h3", "h4", "h5", "h6"]: + return int(tag_name[1]) + # Returns high level if it isn't a header + return 9999 - sorted_nodes = sorted(nodes.items()) - for _, node in sorted_nodes: - process_node(node) + def _dom_depth(self, element: Any) -> int: + """Determine the DOM depth of an element by counting its parents.""" + depth = 0 + for _ in element.parents: + depth += 1 + return depth - self._finalize_chunk(current_chunk, active_headers, documents, chunk_dom_depth) - return documents + def _get_elements(self, html_content: str) -> List[Any]: + """Parse HTML content and return a list of BeautifulSoup elements. - def split_text_from_file(self, file: Any) -> List[Document]: - """Split HTML content from a file into a list of Document objects. + This helper function takes HTML content as input, parses it using BeautifulSoup4, + and returns all HTML elements found in the document body. If no body tag exists, + it returns all elements in the full document. Args: - file: A file path or a file-like object containing HTML content. + html_content (str): Raw HTML content to be parsed. Returns: - A list of split Document objects. + List[Any]: A list of BeautifulSoup elements found in the HTML document. + + Raises: + ImportError: If the BeautifulSoup4 package is not installed. """ try: from bs4 import BeautifulSoup # type: ignore[import-untyped] @@ -344,131 +201,120 @@ def split_text_from_file(self, file: Any) -> List[Document]: please install with `pip install \ bs4`." 
) from e - if isinstance(file, str): - with open(file, "r", encoding="utf-8") as f: - html_content = f.read() - else: - html_content = file.read() - soup = BeautifulSoup(html_content, "html.parser") body = soup.body if soup.body else soup + return body.find_all() - elements = body.find_all() - self._build_tree(elements) - - if not self.elements_tree: - return [] - - min_level = min(level for (_, _, level, _) in self.elements_tree.values()) - root = Node( - "root", tag_type="root", content="", level=min_level - 1, dom_depth=0 - ) - - nodes = { - idx: Node( - f"{tag}_{idx}", - tag_type=tag, - content=text, - level=level, - dom_depth=dom_depth, - ) - for idx, (tag, text, level, dom_depth) in self.elements_tree.items() - } - - stack: List[Node] = [] - for idx in sorted(nodes): - node = nodes[idx] - while stack and ( - stack[-1].level >= node.level or stack[-1].dom_depth >= node.dom_depth - ): - stack.pop() - if stack: - node.parent = stack[-1] - else: - node.parent = root - stack.append(node) - - if not self.return_each_element: - return self._aggregate_documents(nodes) - - return self._generate_individual_documents(nodes) - - def _aggregate_documents(self, nodes: Dict[int, Node]) -> List[Document]: - """Generate documents from a list of nodes. - - Args: - nodes: List of Node objects representing the HTML structure. - - Returns: - List of CoreDocument objects containing the processed text chunks. - """ - return self._generate_documents(nodes) - - def _generate_individual_documents(self, nodes: Dict[int, Node]) -> List[Document]: - """Generate individual Document objects for each element. + def split_text_from_file(self, file: Any) -> List[Document]: + """Split HTML content from a file into a list of Document objects. Args: - nodes: A dictionary of nodes indexed by their position. + file: A file path or a file-like object containing HTML content. Returns: - A list of individual Document objects. + A list of split Document objects. """ + if isinstance(file, str): + with open(file, "r", encoding="utf-8") as f: + html_content = f.read() + else: + html_content = file.read() + elements = self._get_elements(html_content) documents: List[Document] = [] active_headers: Dict[str, Tuple[str, int, int]] = {} + current_chunk: List[str] = [] + chunk_dom_depth = 0 - sorted_nodes = sorted(nodes.items()) + def finalize_chunk(): + if current_chunk: + final_meta = { + key: content + for key, (content, level, dom_depth) in active_headers.items() + if chunk_dom_depth >= dom_depth + } + combined_text = " \n".join( + line for line in current_chunk if line.strip() + ) + if combined_text.strip(): + documents.append( + Document(page_content=combined_text, metadata=final_meta) + ) + current_chunk.clear() - def process_node(node: Node) -> None: - """Process a single node to create Document objects based on header tags. + for element in elements: + tag = element.name + if not tag: + continue + text = " ".join( + t + for t in element.find_all(string=True, recursive=False) + if isinstance(t, str) + ).strip() + if not text: + continue - Args: - node: The node to process. 
- """ - node_type = node.type # type: ignore[attr-defined] - node_content = node.content # type: ignore[attr-defined] - node_level = node.level # type: ignore[attr-defined] - node_dom_depth = node.dom_depth # type: ignore[attr-defined] - header_meta: Dict[str, str] - if node_type in self.header_tags: - # Remove headers of the same or lower level + level = self._header_level(tag) + dom_depth = self._dom_depth(element) + + if tag in self.header_tags: + if not self.return_each_element: + finalize_chunk() + + # Remove headers at same or deeper level headers_to_remove = [ - key - for key, (_, lvl, _) in active_headers.items() - if lvl >= node_level + key for key, (_, lvl, _) in active_headers.items() if lvl >= level ] for key in headers_to_remove: del active_headers[key] - # Update active headers with the current header - header_key = self.header_mapping[node_type] # type: ignore[attr-defined] - active_headers[header_key] = (node_content, node_level, node_dom_depth) + header_key = self.header_mapping[tag] + active_headers[header_key] = (text, level, dom_depth) - # Create metadata based on active headers + # Produce a document for the header itself header_meta = { key: content for key, (content, lvl, dd) in active_headers.items() - if node_dom_depth >= dd + if dom_depth >= dd } - - # Create a Document for the header element - documents.append( - Document(page_content=node_content, metadata=header_meta) - ) + documents.append(Document(page_content=text, metadata=header_meta)) + # After encountering a header, no immediate content goes to current_chunk + # (if return_each_element is False, we wait for next content) + # (if return_each_element is True, we create docs per element anyway) else: - # For non-header elements, associate with current headers - if node_content.strip(): - header_meta = { + # Non-header element logic + # Remove headers that don't apply if dom_depth < their dom_depth + headers_to_remove = [ + key for key, (_, _, dd) in active_headers.items() if dom_depth < dd + ] + for key in headers_to_remove: + del active_headers[key] + + if self.return_each_element: + # Produce a doc for this element immediately + element_meta = { key: content for key, (content, lvl, dd) in active_headers.items() - if node_dom_depth >= dd + if dom_depth >= dd } - documents.append( - Document(page_content=node_content, metadata=header_meta) - ) + if text.strip(): + documents.append( + Document(page_content=text, metadata=element_meta) + ) + else: + # Accumulate content in current_chunk + if text.strip(): + current_chunk.append(text) + chunk_dom_depth = max(chunk_dom_depth, dom_depth) - # Process each node using the inner process_node function - for _, node in sorted_nodes: - process_node(node) + if not self.return_each_element: + # finalize any remaining chunk + finalize_chunk() + + # If no headers were found at all and return_each_element=False, behavior is: + # The entire content should be in one document. + # The logic above naturally handles it: + # If no recognized headers, we never split; we ended up just accumulating text + # in current_chunk and finalizing once at the end. 
return documents @@ -1142,3 +988,6 @@ def _reinsert_preserved_elements( for placeholder, preserved_content in preserved_elements.items(): content = content.replace(placeholder, preserved_content.strip()) return content + + +# %% From 5637dc7f4b0699ee36e53132668ad696570d394a Mon Sep 17 00:00:00 2001 From: Ahmed Tammaa Date: Sat, 21 Dec 2024 00:49:39 +0000 Subject: [PATCH 30/30] improved documentation and formatting --- .../langchain_text_splitters/html.py | 19 ++++++++++++------- 1 file changed, 12 insertions(+), 7 deletions(-) diff --git a/libs/text-splitters/langchain_text_splitters/html.py b/libs/text-splitters/langchain_text_splitters/html.py index 074a3825e35e6..99767fda860ef 100644 --- a/libs/text-splitters/langchain_text_splitters/html.py +++ b/libs/text-splitters/langchain_text_splitters/html.py @@ -180,12 +180,13 @@ def _dom_depth(self, element: Any) -> int: def _get_elements(self, html_content: str) -> List[Any]: """Parse HTML content and return a list of BeautifulSoup elements. - This helper function takes HTML content as input, parses it using BeautifulSoup4, - and returns all HTML elements found in the document body. If no body tag exists, + This helper function takes HTML content as input, + parses it using BeautifulSoup4, and returns all HTML elements + found in the document body. If no body tag exists, it returns all elements in the full document. Args: - html_content (str): Raw HTML content to be parsed. + html_content: Raw HTML content to be parsed. Returns: List[Any]: A list of BeautifulSoup elements found in the HTML document. @@ -225,7 +226,7 @@ def split_text_from_file(self, file: Any) -> List[Document]: current_chunk: List[str] = [] chunk_dom_depth = 0 - def finalize_chunk(): + def finalize_chunk() -> None: if current_chunk: final_meta = { key: content @@ -277,7 +278,8 @@ def finalize_chunk(): if dom_depth >= dd } documents.append(Document(page_content=text, metadata=header_meta)) - # After encountering a header, no immediate content goes to current_chunk + # After encountering a header, + # no immediate content goes to current_chunk # (if return_each_element is False, we wait for next content) # (if return_each_element is True, we create docs per element anyway) else: @@ -487,10 +489,13 @@ def convert_possible_tags_to_header(self, html_content: str) -> str: return str(result) def split_text_from_file(self, file: Any) -> List[Document]: - """Split HTML file. + """Split HTML content from a file into a list of Document objects. Args: - file: HTML file + file: A file path or a file-like object containing HTML content. + + Returns: + A list of split Document objects. """ file_content = file.getvalue() file_content = self.convert_possible_tags_to_header(file_content)
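
A minimal usage sketch of the splitter as refactored in the last two patches, assuming the patched langchain-text-splitters checkout (plus beautifulsoup4) is importable. The sample HTML and the expected grouping simply mirror the "Simple headers and paragraphs" test case above; the variable names are illustrative only, not part of the change.

    from langchain_text_splitters.html import HTMLHeaderTextSplitter

    # (tag, metadata key) pairs, as used throughout the test fixtures above.
    headers = [("h1", "Header 1"), ("h2", "Header 2")]

    sample_html = (
        "<html><body>"
        "<h1>Introduction</h1><p>This is the introduction.</p>"
        "<h2>Background</h2><p>Background information.</p>"
        "<h1>Conclusion</h1><p>Final thoughts.</p>"
        "</body></html>"
    )

    # The default mode aggregates sibling text under the currently active headers;
    # passing return_each_element=True instead emits one Document per element.
    splitter = HTMLHeaderTextSplitter(headers_to_split_on=headers)
    for doc in splitter.split_text(sample_html):
        print(doc.metadata, "->", doc.page_content)

    # Expected shape, per the "Simple headers and paragraphs" test case:
    # {'Header 1': 'Introduction'} -> Introduction
    # {'Header 1': 'Introduction'} -> This is the introduction.
    # {'Header 1': 'Introduction', 'Header 2': 'Background'} -> Background
    # {'Header 1': 'Introduction', 'Header 2': 'Background'} -> Background information.
    # {'Header 1': 'Conclusion'} -> Conclusion
    # {'Header 1': 'Conclusion'} -> Final thoughts.

Each header is emitted as its own Document carrying the active header metadata, and the text that follows it is aggregated into a separate Document, which is the shape the parametrized expectations above assert against.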
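One detail of the new traversal that is easy to miss in the diff: active header metadata is pruned by DOM depth as well as by heading level, so text that sits shallower in the tree than a previously seen header no longer inherits it. A small standalone sketch of the two helpers using only bs4 (the function names here are local to the sketch, not part of the package):

    from bs4 import BeautifulSoup
    from bs4.element import Tag

    HEADER_TAGS = {"h1", "h2", "h3", "h4", "h5", "h6"}

    def header_level(tag_name: str) -> int:
        # h1..h6 map to 1..6; anything else is treated as "not a header".
        return int(tag_name[1]) if tag_name.lower() in HEADER_TAGS else 9999

    def dom_depth(element: Tag) -> int:
        # Depth is the number of ancestors, counted the same way as _dom_depth.
        return sum(1 for _ in element.parents)

    soup = BeautifulSoup(
        "<html><body><section><h2>Inside</h2><p>nested text</p></section>"
        "<p>back at body level</p></body></html>",
        "html.parser",
    )
    for el in soup.body.find_all():
        print(el.name, header_level(el.name), dom_depth(el))

    # section 9999 3
    # h2 2 4
    # p 9999 4  (depth >= the h2's depth, so it keeps {"Header 2": "Inside"},
    #            assuming ("h2", "Header 2") is among headers_to_split_on)
    # p 9999 3  (shallower than the h2, so that header is dropped for it)

This is the same depth comparison the refactored split_text_from_file performs when deciding which entries to evict from active_headers before attaching metadata to a chunk or element.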