From c29b2acaf9be1a7da7031aa96ecb470e9ac3730b Mon Sep 17 00:00:00 2001 From: Chester Curme Date: Thu, 19 Sep 2024 11:16:30 -0400 Subject: [PATCH 1/4] add assertions to test_loader_partitions_locally --- .../test_document_loaders.py | 53 +++++++++++++++++-- 1 file changed, 48 insertions(+), 5 deletions(-) diff --git a/libs/partners/unstructured/tests/integration_tests/test_document_loaders.py b/libs/partners/unstructured/tests/integration_tests/test_document_loaders.py index f4fa4e7f9aab8..3e0efd5307466 100644 --- a/libs/partners/unstructured/tests/integration_tests/test_document_loaders.py +++ b/libs/partners/unstructured/tests/integration_tests/test_document_loaders.py @@ -1,8 +1,9 @@ import os from pathlib import Path -from typing import Callable +from typing import Callable, List import pytest +from langchain_core.documents import Document from langchain_unstructured import UnstructuredLoader @@ -13,6 +14,51 @@ UNSTRUCTURED_API_KEY = os.getenv("UNSTRUCTURED_API_KEY") +def _check_docs_content(docs: List[Document]) -> None: + assert all( + doc.metadata.get("filename") == "layout-parser-paper.pdf" for doc in docs + ) + assert ( + sum(doc.metadata.get("category") == "PageBreak" for doc in docs) == 16 + ) # 16 page doc + + expected_metadata_keys = [ + "source", + "languages", + "page_number", + "category", + "coordinates", + "element_id", + ] + for doc in docs: + if doc.page_content: + for key in expected_metadata_keys: + assert key in doc.metadata + else: + assert doc.metadata.get("category") == "PageBreak" + + page_numbers = [] + for doc in docs: + if page_number := doc.metadata.get("page_number"): + page_numbers.append(page_number) + + assert set(page_numbers) == set(range(1, 17)) + assert len(docs) >= 32 # (16 pages * (>=1 element per page) + 16 page breaks) + + page_1_content = "" + for doc in docs: + if doc.metadata.get("page_number") == 1: + page_1_content += f" {doc.page_content}" + assert ( + "LayoutParser: A Uniﬁed Toolkit for Deep Learning " + "Based 
Document Image Analysis" + ) in page_1_content + + categories = set(doc.metadata.get("category") for doc in docs) + assert "NarrativeText" in categories + assert "Title" in categories + + # -- Local partition -- @@ -27,10 +73,7 @@ def test_loader_partitions_locally() -> None: include_page_breaks=True, ).load() - assert all( - doc.metadata.get("filename") == "layout-parser-paper.pdf" for doc in docs - ) - assert any(doc.metadata.get("category") == "PageBreak" for doc in docs) + _check_docs_content(docs) @pytest.mark.local From e09837eb1bc725d8afd407c1c5d648bd8e3a0f15 Mon Sep 17 00:00:00 2001 From: Chester Curme Date: Thu, 19 Sep 2024 11:17:26 -0400 Subject: [PATCH 2/4] add assertions to test_loader_partitions_via_api --- .../tests/integration_tests/test_document_loaders.py | 8 ++------ 1 file changed, 2 insertions(+), 6 deletions(-) diff --git a/libs/partners/unstructured/tests/integration_tests/test_document_loaders.py b/libs/partners/unstructured/tests/integration_tests/test_document_loaders.py index 3e0efd5307466..e91c0998761b8 100644 --- a/libs/partners/unstructured/tests/integration_tests/test_document_loaders.py +++ b/libs/partners/unstructured/tests/integration_tests/test_document_loaders.py @@ -122,16 +122,12 @@ def test_loader_partitions_via_api() -> None: # Unstructured kwargs strategy="fast", include_page_breaks=True, + coordinates=True, ) docs = loader.load() - assert len(docs) > 1 - assert any(doc.metadata.get("category") == "PageBreak" for doc in docs) - assert all( - doc.metadata.get("filename") == "layout-parser-paper.pdf" for doc in docs - ) - assert docs[0].metadata.get("element_id") is not None + _check_docs_content(docs) def test_loader_partitions_multiple_via_api() -> None: From 8ee0e9c8d2e6d6fbfaddad58ec08da897066d1fe Mon Sep 17 00:00:00 2001 From: Chester Curme Date: Thu, 19 Sep 2024 11:21:22 -0400 Subject: [PATCH 3/4] add hi-res test --- .../integration_tests/test_document_loaders.py | 16 ++++++++++++++++ 1 file changed, 16 insertions(+) diff 
--git a/libs/partners/unstructured/tests/integration_tests/test_document_loaders.py b/libs/partners/unstructured/tests/integration_tests/test_document_loaders.py index e91c0998761b8..2fb973c82434a 100644 --- a/libs/partners/unstructured/tests/integration_tests/test_document_loaders.py +++ b/libs/partners/unstructured/tests/integration_tests/test_document_loaders.py @@ -163,6 +163,22 @@ def test_loader_partition_via_api_raises_TypeError_with_invalid_arg() -> None: loader.load() +def test_loader_partitions_via_api_hi_res() -> None: + file_path = os.path.join(EXAMPLE_DOCS_DIRECTORY, "layout-parser-paper.pdf") + loader = UnstructuredLoader( + file_path=file_path, + partition_via_api=True, + # Unstructured kwargs + strategy="hi_res", + ) + + docs = loader.load() + + categories = set(doc.metadata.get("category") for doc in docs) + assert "Table" in categories + assert "Image" in categories + + # -- fixtures --- From c9971fc90be58b8a12a05a4f9eb1fb98e8622a39 Mon Sep 17 00:00:00 2001 From: Chester Curme Date: Thu, 19 Sep 2024 11:23:38 -0400 Subject: [PATCH 4/4] add tests for async and lazy loading --- .../test_document_loaders.py | 35 +++++++++++++++++++ 1 file changed, 35 insertions(+) diff --git a/libs/partners/unstructured/tests/integration_tests/test_document_loaders.py b/libs/partners/unstructured/tests/integration_tests/test_document_loaders.py index 2fb973c82434a..f27ddf718670e 100644 --- a/libs/partners/unstructured/tests/integration_tests/test_document_loaders.py +++ b/libs/partners/unstructured/tests/integration_tests/test_document_loaders.py @@ -76,6 +76,23 @@ def test_loader_partitions_locally() -> None: _check_docs_content(docs) +@pytest.mark.local +async def test_loader_partitions_locally_async_lazy() -> None: + file_path = os.path.join(EXAMPLE_DOCS_DIRECTORY, "layout-parser-paper.pdf") + + loader = UnstructuredLoader( + file_path=file_path, + # Unstructured kwargs + strategy="fast", + include_page_breaks=True, + ) + docs = [] + async for doc in 
loader.alazy_load(): + docs.append(doc) + + _check_docs_content(docs) + + @pytest.mark.local def test_loader_partition_ignores_invalid_arg() -> None: file_path = os.path.join(EXAMPLE_DOCS_DIRECTORY, "layout-parser-paper.pdf") @@ -130,6 +147,24 @@ def test_loader_partitions_via_api() -> None: _check_docs_content(docs) +async def test_loader_partitions_via_api_async_lazy() -> None: + file_path = os.path.join(EXAMPLE_DOCS_DIRECTORY, "layout-parser-paper.pdf") + loader = UnstructuredLoader( + file_path=file_path, + partition_via_api=True, + # Unstructured kwargs + strategy="fast", + include_page_breaks=True, + coordinates=True, + ) + + docs = [] + async for doc in loader.alazy_load(): + docs.append(doc) + + _check_docs_content(docs) + + def test_loader_partitions_multiple_via_api() -> None: file_paths = [ os.path.join(EXAMPLE_DOCS_DIRECTORY, "layout-parser-paper.pdf"),