Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

community: fallback on core async atransform_documents method for MarkdownifyTransformer #27866

Merged
merged 14 commits into from
Dec 13, 2024
Merged
Show file tree
Hide file tree
Changes from 12 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Original file line number Diff line number Diff line change
@@ -1,3 +1,4 @@
import asyncio
import re
from typing import Any, List, Optional, Sequence, Union

Expand Down Expand Up @@ -75,9 +76,52 @@ def transform_documents(

return converted_documents

async def _atransform_document(self, document: Document, **kwargs: Any) -> Document:
    """Convert the HTML content of a single document to Markdown.

    This is the per-document worker used by the async transformation path.
    It mirrors the conversion performed in ``transform_documents``; a shared
    single-document helper (run via ``asyncio.to_thread`` in async mode, or a
    list comprehension in sync mode) would remove that duplication.

    Note that the body contains no ``await``: the markdownify call runs
    synchronously inside this coroutine.

    Args:
        document: Document whose ``page_content`` holds the HTML to convert.
        **kwargs: Accepted for interface compatibility; not used here.

    Returns:
        A new ``Document`` holding the cleaned Markdown text and the
        ``metadata`` object of the input document.

    Raises:
        ImportError: If the ``markdownify`` package is not installed.
    """
    try:
        from markdownify import markdownify
    except ImportError:
        raise ImportError(
            "markdownify package not found, please\n"
            "install it with `pip install markdownify`"
        )

    raw_markdown = markdownify(
        html=document.page_content,
        strip=self.strip,
        convert=self.convert,
        autolinks=self.autolinks,
        heading_style=self.heading_style,
        **self.additional_options,
    )
    # Swap non-breaking spaces for plain spaces, then trim both ends.
    normalized = raw_markdown.replace("\xa0", " ").strip()
    # Collapse any run of blank / whitespace-only lines into one blank line.
    collapsed = re.sub(r"\n\s*\n", "\n\n", normalized)
    return Document(collapsed, metadata=document.metadata)

async def atransform_documents(
    self,
    documents: Sequence[Document],
    **kwargs: Any,
) -> Sequence[Document]:
    """Transform a sequence of documents asynchronously.

    One ``_atransform_document`` coroutine is scheduled per input document
    and all of them are awaited together with ``asyncio.gather``, which
    returns results in the same order as ``documents``.

    Args:
        documents: The documents to convert.
        **kwargs: Forwarded unchanged to ``_atransform_document``.

    Returns:
        The converted documents, in input order.
    """
    # A stray `raise NotImplementedError` used to sit above the docstring,
    # which made everything below unreachable; it has been removed so the
    # method actually performs the conversion.
    #
    # NOTE: consider implementing progress tracking using tqdm.asyncio,
    # see an example here:
    # langchain_community/document_loaders/async_html.py:_lazy_fetch_all()
    # Direct link: https://github.com/langchain-ai/langchain/blob/33d445550e649b5de25bb2600b9b86c4b3de1b76/libs/community/langchain_community/document_loaders/async_html.py#L173
    tasks = [
        asyncio.create_task(self._atransform_document(doc, **kwargs))
        for doc in documents
    ]
    return await asyncio.gather(*tasks)
Original file line number Diff line number Diff line change
Expand Up @@ -50,7 +50,8 @@ def test_extract_html() -> None:
documents = [Document(page_content=basic_html)]
docs_transformed = markdownify.transform_documents(documents)
assert docs_transformed[0].page_content == (
"Simple Test Page # Test Header\n\n "
"Simple Test Page "
"# Test Header\n\n "
"First paragraph.\n\n "
"Second paragraph.\n\n "
"[Example Link](https://example.com)"
Expand Down Expand Up @@ -105,7 +106,8 @@ def test_convert_tags() -> None:
assert docs_transformed[0].page_content == (
"Header "
"**1st paragraph.** "
"2nd paragraph. Here is [link](http://example.com) "
"2nd paragraph. "
"Here is [link](http://example.com) "
"Ignore at end"
)

Expand Down Expand Up @@ -137,3 +139,139 @@ def test_strip_convert_conflict_error() -> None:
)
documents = [Document(page_content=paragraphs_html)]
markdownify.transform_documents(documents)


# Async variants: exact duplicates of the above functions, using atransform_documents()
@pytest.mark.requires("markdownify")
async def test_empty_html_async() -> None:
    """An HTML shell with no content converts to an empty string (async path)."""
    transformer = MarkdownifyTransformer()
    source_docs = [Document(page_content="<html></html>")]

    converted = await transformer.atransform_documents(source_docs)

    assert converted[0].page_content == ""


@pytest.mark.requires("markdownify")
async def test_extract_paragraphs_async() -> None:
    """Headers and paragraphs are rendered as Markdown blocks (async path)."""
    html_input = (
        "<html><h1>Header</h1><p>First paragraph.</p>"
        "<p>Second paragraph.</p><h1>Ignore at end</h1></html>"
    )
    expected_markdown = (
        "# Header\n\n"
        "First paragraph.\n\n"
        "Second paragraph.\n\n"
        "# Ignore at end"
    )
    transformer = MarkdownifyTransformer()

    converted = await transformer.atransform_documents(
        [Document(page_content=html_input)]
    )

    assert converted[0].page_content == expected_markdown


@pytest.mark.requires("markdownify")
async def test_extract_html_async() -> None:
    """A complete HTML page is converted to the expected Markdown (async path).

    NOTE(review): the transformer is built with ``skip="title"``, yet the
    expected output below still begins with the title text; how that keyword
    is consumed is not visible from this test -- confirm against the class.
    """
    html_parts = [
        "<!DOCTYPE html>",
        '<html lang="en">',
        "<head>",
        ' <meta charset="UTF-8">',
        " <title>Simple Test Page</title>",
        "</head>",
        "<body>",
        " <h1>Test Header</h1>",
        " <p>First paragraph.</p>",
        " <p>Second paragraph.</p>",
        ' <a href="https://example.com">Example Link</a>',
        "</body>",
        "</html>",
    ]
    expected_markdown = (
        "Simple Test Page "
        "# Test Header\n\n "
        "First paragraph.\n\n "
        "Second paragraph.\n\n "
        "[Example Link](https://example.com)"
    )
    transformer = MarkdownifyTransformer(skip="title")

    converted = await transformer.atransform_documents(
        [Document(page_content="".join(html_parts))]
    )

    assert converted[0].page_content == expected_markdown


@pytest.mark.requires("markdownify")
async def test_strip_tags_async() -> None:
    """The ``strip`` option accepts a single tag or a list of tags (async path)."""
    html_input = (
        "<html>"
        "<h1>Header</h1>"
        " <p><strong>1st paragraph.</strong></p>"
        ' <p>2nd paragraph. Here is <a href="http://example.com">link</a></p>'
        ' <img src="image.jpg" alt="Sample Image">'
        "<h1>Ignore at end</h1></html>"
    )

    # Case 1: strip a single tag given as a plain string.
    single_tag_expected = (
        "# Header\n\n "
        "1st paragraph.\n\n "
        "2nd paragraph. Here is [link](http://example.com)\n\n "
        "![Sample Image](image.jpg)"
        "# Ignore at end"
    )
    transformer = MarkdownifyTransformer(strip="strong")
    converted = await transformer.atransform_documents(
        [Document(page_content=html_input)]
    )
    assert converted[0].page_content == single_tag_expected

    # Case 2: strip several tags given as a list.
    multi_tag_expected = (
        "# Header\n\n "
        "1st paragraph.\n\n "
        "2nd paragraph. Here is link\n\n "
        "# Ignore at end"
    )
    transformer = MarkdownifyTransformer(strip=["strong", "a", "img"])
    converted = await transformer.atransform_documents(
        [Document(page_content=html_input)]
    )
    assert converted[0].page_content == multi_tag_expected


@pytest.mark.requires("markdownify")
async def test_convert_tags_async() -> None:
    """The ``convert`` option accepts a list of tags or a single tag (async path)."""
    html_input = (
        "<html>"
        "<h1>Header</h1>"
        " <p><strong>1st paragraph.</strong></p>"
        ' <p>2nd paragraph. Here is <a href="http://example.com">link</a></p>'
        ' <img src="image.jpg" alt="Sample Image">'
        "<h1>Ignore at end</h1></html>"
    )

    # Case 1: convert only <strong> and <a>, given as a list.
    list_expected = (
        "Header "
        "**1st paragraph.** "
        "2nd paragraph. "
        "Here is [link](http://example.com) "
        "Ignore at end"
    )
    transformer = MarkdownifyTransformer(convert=["strong", "a"])
    converted = await transformer.atransform_documents(
        [Document(page_content=html_input)]
    )
    assert converted[0].page_content == list_expected

    # Case 2: convert only <p>, given as a plain string.
    single_expected = (
        "Header "
        "1st paragraph.\n\n "
        "2nd paragraph. Here is link\n\n "
        "Ignore at end"
    )
    transformer = MarkdownifyTransformer(convert="p")
    converted = await transformer.atransform_documents(
        [Document(page_content=html_input)]
    )
    assert converted[0].page_content == single_expected


@pytest.mark.requires("markdownify")
async def test_strip_convert_conflict_error_async() -> None:
    """Supplying both ``strip`` and ``convert`` raises ``ValueError`` (async path)."""
    html_input = (
        "<html>"
        "<h1>Header</h1>"
        " <p><strong>1st paragraph.</strong></p>"
        ' <p>2nd paragraph. Here is <a href="http://example.com">link</a></p>'
        ' <img src="image.jpg" alt="Sample Image">'
        "<h1>Ignore at end</h1></html>"
    )
    expected_message = (
        "You may specify either tags to strip or tags to convert, but not both."
    )

    # Construction and the transform call both stay inside the context
    # manager, so the test passes whichever of them raises the error.
    with pytest.raises(ValueError, match=expected_message):
        transformer = MarkdownifyTransformer(strip="h1", convert=["strong", "a"])
        await transformer.atransform_documents([Document(page_content=html_input)])
Loading