From 5637dc7f4b0699ee36e53132668ad696570d394a Mon Sep 17 00:00:00 2001 From: Ahmed Tammaa Date: Sat, 21 Dec 2024 00:49:39 +0000 Subject: [PATCH] improved documentation and formatting --- .../langchain_text_splitters/html.py | 19 ++++++++++++------- 1 file changed, 12 insertions(+), 7 deletions(-) diff --git a/libs/text-splitters/langchain_text_splitters/html.py b/libs/text-splitters/langchain_text_splitters/html.py index 074a3825e35e6..99767fda860ef 100644 --- a/libs/text-splitters/langchain_text_splitters/html.py +++ b/libs/text-splitters/langchain_text_splitters/html.py @@ -180,12 +180,13 @@ def _dom_depth(self, element: Any) -> int: def _get_elements(self, html_content: str) -> List[Any]: """Parse HTML content and return a list of BeautifulSoup elements. - This helper function takes HTML content as input, parses it using BeautifulSoup4, - and returns all HTML elements found in the document body. If no body tag exists, + This helper function takes HTML content as input, + parses it using BeautifulSoup4, and returns all HTML elements + found in the document body. If no body tag exists, it returns all elements in the full document. Args: - html_content (str): Raw HTML content to be parsed. + html_content: Raw HTML content to be parsed. Returns: List[Any]: A list of BeautifulSoup elements found in the HTML document. 
@@ -225,7 +226,7 @@ def split_text_from_file(self, file: Any) -> List[Document]: current_chunk: List[str] = [] chunk_dom_depth = 0 - def finalize_chunk(): + def finalize_chunk() -> None: if current_chunk: final_meta = { key: content @@ -277,7 +278,8 @@ def finalize_chunk(): if dom_depth >= dd } documents.append(Document(page_content=text, metadata=header_meta)) - # After encountering a header, no immediate content goes to current_chunk + # After encountering a header, + # no immediate content goes to current_chunk # (if return_each_element is False, we wait for next content) # (if return_each_element is True, we create docs per element anyway) else: @@ -487,10 +489,13 @@ def convert_possible_tags_to_header(self, html_content: str) -> str: return str(result) def split_text_from_file(self, file: Any) -> List[Document]: - """Split HTML file. + """Split HTML content from a file into a list of Document objects. Args: - file: HTML file + file: A file-like object (e.g. io.StringIO) containing HTML content. + + Returns: + A list of split Document objects. """ file_content = file.getvalue() file_content = self.convert_possible_tags_to_header(file_content)