Skip to content

Commit

Permalink
improved documentation and formatting
Browse files Browse the repository at this point in the history
  • Loading branch information
AhmedTammaa committed Dec 21, 2024
1 parent bbe5616 commit 5637dc7
Showing 1 changed file with 12 additions and 7 deletions.
19 changes: 12 additions & 7 deletions libs/text-splitters/langchain_text_splitters/html.py
Original file line number Diff line number Diff line change
Expand Up @@ -180,12 +180,13 @@ def _dom_depth(self, element: Any) -> int:
def _get_elements(self, html_content: str) -> List[Any]:
"""Parse HTML content and return a list of BeautifulSoup elements.
This helper function takes HTML content as input, parses it using BeautifulSoup4,
and returns all HTML elements found in the document body. If no body tag exists,
This helper function takes HTML content as input,
parses it using BeautifulSoup4, and returns all HTML elements
found in the document body. If no body tag exists,
it returns all elements in the full document.
Args:
html_content (str): Raw HTML content to be parsed.
html_content: Raw HTML content to be parsed.
Returns:
List[Any]: A list of BeautifulSoup elements found in the HTML document.
Expand Down Expand Up @@ -225,7 +226,7 @@ def split_text_from_file(self, file: Any) -> List[Document]:
current_chunk: List[str] = []
chunk_dom_depth = 0

def finalize_chunk():
def finalize_chunk() -> None:
if current_chunk:
final_meta = {
key: content
Expand Down Expand Up @@ -277,7 +278,8 @@ def finalize_chunk():
if dom_depth >= dd
}
documents.append(Document(page_content=text, metadata=header_meta))
# After encountering a header, no immediate content goes to current_chunk
# After encountering a header,
# no immediate content goes to current_chunk
# (if return_each_element is False, we wait for next content)
# (if return_each_element is True, we create docs per element anyway)
else:
Expand Down Expand Up @@ -487,10 +489,13 @@ def convert_possible_tags_to_header(self, html_content: str) -> str:
return str(result)

def split_text_from_file(self, file: Any) -> List[Document]:
"""Split HTML file.
"""Split HTML content from a file into a list of Document objects.
Args:
file: HTML file
file: A file-like object exposing ``getvalue()`` (for example ``io.StringIO``) containing HTML content.
Returns:
A list of split Document objects.
"""
file_content = file.getvalue()
file_content = self.convert_possible_tags_to_header(file_content)
Expand Down

0 comments on commit 5637dc7

Please sign in to comment.