langchain-ai · efriis · Dec 13, 2024 · Oct 30, 2024 · Nov 1, 2024 · Nov 1, 2024
diff --git a/libs/community/langchain_community/document_transformers/markdownify.py b/libs/community/langchain_community/document_transformers/markdownify.py
@@ -74,10 +74,3 @@ def transform_documents(
             )
 
         return converted_documents
-
-    async def atransform_documents(
-        self,
-        documents: Sequence[Document],
-        **kwargs: Any,
-    ) -> Sequence[Document]:
-        raise NotImplementedError
diff --git a/libs/community/tests/unit_tests/document_transformers/test_markdownify.py b/libs/community/tests/unit_tests/document_transformers/test_markdownify.py
@@ -50,7 +50,8 @@ def test_extract_html() -> None:
     documents = [Document(page_content=basic_html)]
     docs_transformed = markdownify.transform_documents(documents)
     assert docs_transformed[0].page_content == (
-        "Simple Test Page # Test Header\n\n "
+        "Simple Test Page "
+        "# Test Header\n\n "
         "First paragraph.\n\n "
         "Second paragraph.\n\n "
         "[Example Link](https://example.com)"
@@ -105,7 +106,8 @@ def test_convert_tags() -> None:
     assert docs_transformed[0].page_content == (
         "Header "
         "**1st paragraph.** "
-        "2nd paragraph. Here is [link](http://example.com) "
+        "2nd paragraph. "
+        "Here is [link](http://example.com) "
         "Ignore at end"
     )
 
@@ -137,3 +139,139 @@ def test_strip_convert_conflict_error() -> None:
         )
         documents = [Document(page_content=paragraphs_html)]
         markdownify.transform_documents(documents)
+
+
+# Async variants: mirror the sync tests above, but await atransform_documents().
+@pytest.mark.requires("markdownify")
+async def test_empty_html_async() -> None:
+    """An empty HTML document converts to an empty string via the async API."""
+    converter = MarkdownifyTransformer()
+    source_docs = [Document(page_content="<html></html>")]
+    output = await converter.atransform_documents(source_docs)
+    assert output[0].page_content == ""
+
+
+@pytest.mark.requires("markdownify")
+async def test_extract_paragraphs_async() -> None:
+    """Headers and paragraphs convert to Markdown with default options (async)."""
+    converter = MarkdownifyTransformer()
+    markup = (
+        "<html><h1>Header</h1><p>First paragraph.</p>"
+        "<p>Second paragraph.</p><h1>Ignore at end</h1></html>"
+    )
+    source_docs = [Document(page_content=markup)]
+    output = await converter.atransform_documents(source_docs)
+    expected = "# Header\n\nFirst paragraph.\n\nSecond paragraph.\n\n# Ignore at end"
+    assert output[0].page_content == expected
+
+
+@pytest.mark.requires("markdownify")
+async def test_extract_html_async() -> None:
+    """A full HTML page converts to the expected Markdown through the async API."""
+    converter = MarkdownifyTransformer(skip="title")
+    page_html = (
+        "<!DOCTYPE html>"
+        '<html lang="en">'
+        "<head>"
+        '    <meta charset="UTF-8">'
+        "    <title>Simple Test Page</title>"
+        "</head>"
+        "<body>"
+        "    <h1>Test Header</h1>"
+        "    <p>First paragraph.</p>"
+        "    <p>Second paragraph.</p>"
+        '    <a href="https://example.com">Example Link</a>'
+        "</body>"
+        "</html>"
+    )
+    output = await converter.atransform_documents([Document(page_content=page_html)])
+    assert output[0].page_content == (
+        "Simple Test Page "
+        "# Test Header\n\n "
+        "First paragraph.\n\n "
+        "Second paragraph.\n\n "
+        "[Example Link](https://example.com)"
+    )
+
+
+@pytest.mark.requires("markdownify")
+async def test_strip_tags_async() -> None:
+    """Tags named in ``strip`` are not rendered as Markdown markup (async API)."""
+    markup = (
+        "<html>"
+        "<h1>Header</h1>"
+        "   <p><strong>1st paragraph.</strong></p>"
+        '   <p>2nd paragraph. Here is <a href="http://example.com">link</a></p>'
+        '   <img src="image.jpg" alt="Sample Image">'
+        "<h1>Ignore at end</h1></html>"
+    )
+    converter = MarkdownifyTransformer(strip="strong")
+    output = await converter.atransform_documents([Document(page_content=markup)])
+    assert output[0].page_content == (
+        "# Header\n\n "
+        "1st paragraph.\n\n "
+        "2nd paragraph. Here is [link](http://example.com)\n\n "
+        "![Sample Image](image.jpg)"
+        "# Ignore at end"
+    )
+
+    # ``strip`` also accepts a list of tag names.
+    converter = MarkdownifyTransformer(strip=["strong", "a", "img"])
+    output = await converter.atransform_documents([Document(page_content=markup)])
+    assert output[0].page_content == (
+        "# Header\n\n "
+        "1st paragraph.\n\n "
+        "2nd paragraph. Here is link\n\n "
+        "# Ignore at end"
+    )
+
+
+@pytest.mark.requires("markdownify")
+async def test_convert_tags_async() -> None:
+    """Only tags named in ``convert`` are rendered as Markdown markup (async API)."""
+    markup = (
+        "<html>"
+        "<h1>Header</h1>"
+        "   <p><strong>1st paragraph.</strong></p>"
+        '   <p>2nd paragraph. Here is <a href="http://example.com">link</a></p>'
+        '   <img src="image.jpg" alt="Sample Image">'
+        "<h1>Ignore at end</h1></html>"
+    )
+    converter = MarkdownifyTransformer(convert=["strong", "a"])
+    output = await converter.atransform_documents([Document(page_content=markup)])
+    assert output[0].page_content == (
+        "Header "
+        "**1st paragraph.** "
+        "2nd paragraph. "
+        "Here is [link](http://example.com) "
+        "Ignore at end"
+    )
+
+    # ``convert`` also accepts a single tag name instead of a list.
+    converter = MarkdownifyTransformer(convert="p")
+    output = await converter.atransform_documents([Document(page_content=markup)])
+    assert output[0].page_content == (
+        "Header "
+        "1st paragraph.\n\n "
+        "2nd paragraph. Here is link\n\n "
+        "Ignore at end"
+    )
+
+
+@pytest.mark.requires("markdownify")
+async def test_strip_convert_conflict_error_async() -> None:
+    """Supplying both ``strip`` and ``convert`` raises ``ValueError`` (async API)."""
+    with pytest.raises(
+        ValueError,
+        match="You may specify either tags to strip or tags to convert, but not both.",
+    ):
+        converter = MarkdownifyTransformer(strip="h1", convert=["strong", "a"])
+        markup = (
+            "<html>"
+            "<h1>Header</h1>"
+            "   <p><strong>1st paragraph.</strong></p>"
+            '   <p>2nd paragraph. Here is <a href="http://example.com">link</a></p>'
+            '   <img src="image.jpg" alt="Sample Image">'
+            "<h1>Ignore at end</h1></html>"
+        )
+        await converter.atransform_documents([Document(page_content=markup)])