Skip to content

Commit

Permalink
community: fallback on core async atransform_documents method for `Ma…
Browse files Browse the repository at this point in the history
…rkdownifyTransformer` (#27866)

# Description
Implements the `atransform_documents` method for
`MarkdownifyTransformer` using the `asyncio` built-in library for
concurrency.

Note that this is mainly for API completeness when working with async
frameworks rather than for performance, since the `markdownify` function
is not I/O bound because it works with `Document` objects already in
memory.

# Issue
Fixes #27865

# Dependencies
No new dependencies added, but
[`markdownify`](https://github.com/matthewwithanm/python-markdownify) is
required since this PR updates the `markdownify` integration.

# Tests and docs
- Tests added
- I did not modify the docstrings since they already described the basic
functionality, and [the API docs also already included a
description](https://python.langchain.com/api_reference/community/document_transformers/langchain_community.document_transformers.markdownify.MarkdownifyTransformer.html#langchain_community.document_transformers.markdownify.MarkdownifyTransformer.atransform_documents).
If it would be helpful, I would be happy to update the docstrings and/or
the API docs.

# Lint and test
- [x] format
- [x] lint
- [x] test

I ran formatting with `make format`, linting with `make lint`, and
confirmed that tests pass using `make test`. Note that some unit tests
pass in CI but may fail when running `make test`. Those unit tests are:
- `test_extract_html` (and `test_extract_html_async`)
- `test_strip_tags` (and `test_strip_tags_async`)
- `test_convert_tags` (and `test_convert_tags_async`)

The reason for the difference is that the converted output contains
trailing spaces when the tests are run in the CI checks, but not when
they are run locally with `make test`. I ensured that the tests pass in
CI, but they may fail with `make test` because of this difference in
trailing spaces.

---------

Co-authored-by: Erick Friis <[email protected]>
  • Loading branch information
rparkr and efriis authored Dec 13, 2024
1 parent af2e0a7 commit 12111cb
Show file tree
Hide file tree
Showing 2 changed files with 140 additions and 9 deletions.
Original file line number Diff line number Diff line change
Expand Up @@ -74,10 +74,3 @@ def transform_documents(
)

return converted_documents

async def atransform_documents(
self,
documents: Sequence[Document],
**kwargs: Any,
) -> Sequence[Document]:
raise NotImplementedError
Original file line number Diff line number Diff line change
Expand Up @@ -50,7 +50,8 @@ def test_extract_html() -> None:
documents = [Document(page_content=basic_html)]
docs_transformed = markdownify.transform_documents(documents)
assert docs_transformed[0].page_content == (
"Simple Test Page # Test Header\n\n "
"Simple Test Page "
"# Test Header\n\n "
"First paragraph.\n\n "
"Second paragraph.\n\n "
"[Example Link](https://example.com)"
Expand Down Expand Up @@ -105,7 +106,8 @@ def test_convert_tags() -> None:
assert docs_transformed[0].page_content == (
"Header "
"**1st paragraph.** "
"2nd paragraph. Here is [link](http://example.com) "
"2nd paragraph. "
"Here is [link](http://example.com) "
"Ignore at end"
)

Expand Down Expand Up @@ -137,3 +139,139 @@ def test_strip_convert_conflict_error() -> None:
)
documents = [Document(page_content=paragraphs_html)]
markdownify.transform_documents(documents)


# Async variants: exact duplicates of the above functions, using atransform_documents()
@pytest.mark.requires("markdownify")
async def test_empty_html_async() -> None:
    """Check that an empty ``<html>`` document yields empty content (async API)."""
    transformer = MarkdownifyTransformer()
    source_docs = [Document(page_content="<html></html>")]

    converted = await transformer.atransform_documents(source_docs)

    assert converted[0].page_content == ""


@pytest.mark.requires("markdownify")
async def test_extract_paragraphs_async() -> None:
    """Check async conversion of headers and paragraphs with default settings."""
    transformer = MarkdownifyTransformer()
    html = (
        "<html><h1>Header</h1><p>First paragraph.</p>"
        "<p>Second paragraph.</p><h1>Ignore at end</h1></html>"
    )
    expected = (
        "# Header\n\n" "First paragraph.\n\n" "Second paragraph.\n\n" "# Ignore at end"
    )

    converted = await transformer.atransform_documents([Document(page_content=html)])

    assert converted[0].page_content == expected


@pytest.mark.requires("markdownify")
async def test_extract_html_async() -> None:
    """Check async conversion of a complete HTML page built with ``skip="title"``."""
    transformer = MarkdownifyTransformer(skip="title")
    page_html = (
        "<!DOCTYPE html>"
        '<html lang="en">'
        "<head>"
        ' <meta charset="UTF-8">'
        " <title>Simple Test Page</title>"
        "</head>"
        "<body>"
        " <h1>Test Header</h1>"
        " <p>First paragraph.</p>"
        " <p>Second paragraph.</p>"
        ' <a href="https://example.com">Example Link</a>'
        "</body>"
        "</html>"
    )
    # Whitespace inside the expected text is significant; keep it verbatim.
    expected = (
        "Simple Test Page "
        "# Test Header\n\n "
        "First paragraph.\n\n "
        "Second paragraph.\n\n "
        "[Example Link](https://example.com)"
    )

    converted = await transformer.atransform_documents(
        [Document(page_content=page_html)]
    )

    assert converted[0].page_content == expected


@pytest.mark.requires("markdownify")
async def test_strip_tags_async() -> None:
    """Check async conversion when ``strip`` is given as a string and as a list."""
    html = (
        "<html>"
        "<h1>Header</h1>"
        " <p><strong>1st paragraph.</strong></p>"
        ' <p>2nd paragraph. Here is <a href="http://example.com">link</a></p>'
        ' <img src="image.jpg" alt="Sample Image">'
        "<h1>Ignore at end</h1></html>"
    )

    # Case 1: a single tag name passed as a plain string.
    single_strip = MarkdownifyTransformer(strip="strong")
    converted = await single_strip.atransform_documents([Document(page_content=html)])
    expected_single = (
        "# Header\n\n "
        "1st paragraph.\n\n "
        "2nd paragraph. Here is [link](http://example.com)\n\n "
        "![Sample Image](image.jpg)"
        "# Ignore at end"
    )
    assert converted[0].page_content == expected_single

    # Case 2: several tag names passed as a list.
    multi_strip = MarkdownifyTransformer(strip=["strong", "a", "img"])
    converted = await multi_strip.atransform_documents([Document(page_content=html)])
    expected_multi = (
        "# Header\n\n "
        "1st paragraph.\n\n "
        "2nd paragraph. Here is link\n\n "
        "# Ignore at end"
    )
    assert converted[0].page_content == expected_multi


@pytest.mark.requires("markdownify")
async def test_convert_tags_async() -> None:
    """Check async conversion when ``convert`` is given as a list and as a string."""
    html = (
        "<html>"
        "<h1>Header</h1>"
        " <p><strong>1st paragraph.</strong></p>"
        ' <p>2nd paragraph. Here is <a href="http://example.com">link</a></p>'
        ' <img src="image.jpg" alt="Sample Image">'
        "<h1>Ignore at end</h1></html>"
    )

    # Case 1: a list of tag names.
    list_convert = MarkdownifyTransformer(convert=["strong", "a"])
    converted = await list_convert.atransform_documents([Document(page_content=html)])
    expected_list = (
        "Header "
        "**1st paragraph.** "
        "2nd paragraph. "
        "Here is [link](http://example.com) "
        "Ignore at end"
    )
    assert converted[0].page_content == expected_list

    # Case 2: a single tag name passed as a plain string.
    single_convert = MarkdownifyTransformer(convert="p")
    converted = await single_convert.atransform_documents(
        [Document(page_content=html)]
    )
    expected_single = (
        "Header "
        "1st paragraph.\n\n "
        "2nd paragraph. Here is link\n\n "
        "Ignore at end"
    )
    assert converted[0].page_content == expected_single


@pytest.mark.requires("markdownify")
async def test_strip_convert_conflict_error_async() -> None:
    """Check that combining ``strip`` and ``convert`` raises ``ValueError`` (async)."""
    html = (
        "<html>"
        "<h1>Header</h1>"
        " <p><strong>1st paragraph.</strong></p>"
        ' <p>2nd paragraph. Here is <a href="http://example.com">link</a></p>'
        ' <img src="image.jpg" alt="Sample Image">'
        "<h1>Ignore at end</h1></html>"
    )
    expected_message = (
        "You may specify either tags to strip or tags to convert, but not both."
    )

    # Both construction and the awaited call stay inside the context manager,
    # so the test passes regardless of which of them raises.
    with pytest.raises(ValueError, match=expected_message):
        transformer = MarkdownifyTransformer(strip="h1", convert=["strong", "a"])
        await transformer.atransform_documents([Document(page_content=html)])

0 comments on commit 12111cb

Please sign in to comment.