From 337fed80a5f1ee879262efe03bba764278199871 Mon Sep 17 00:00:00 2001 From: Vimpas Date: Sat, 14 Dec 2024 07:30:29 +0800 Subject: [PATCH] =?UTF-8?q?community:=20=20=F0=9F=90=9B=20PDF=20Filter=20T?= =?UTF-8?q?ype=20Error=20(#27154)?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Thank you for contributing to LangChain! **PR title**: "community: fix PDF Filter Type Error" - **Description:** fix the PDF Filter type error raised when an image stream's `Filter` entry is a list of filters rather than a single filter name. - **Issue:** fixes #27153. - **Dependencies:** no - **Twitter handle:** if your PR gets announced, and you'd like a mention, we'll gladly shout you out! - [x] **Lint and test**: Run `make format`, `make lint` and `make test` from the root of the package(s) you've modified. See contribution guidelines for more: https://python.langchain.com/docs/contributing/ Additional guidelines: - Make sure optional dependencies are imported within a function. - Please do not add dependencies to pyproject.toml files (even optional ones) unless they are required for unit tests. - Most PRs should not touch more than one package. - Changes should be backwards compatible. - If you are adding something to community, do not re-import it in langchain. If no one reviews your PR within a few days, please @-mention one of baskaryan, efriis, eyurtsev, ccurme, vbarda, hwchase17. 
--------- Co-authored-by: Erick Friis --- .../document_loaders/parsers/pdf.py | 32 ++++++++++++++++--- 1 file changed, 27 insertions(+), 5 deletions(-) diff --git a/libs/community/langchain_community/document_loaders/parsers/pdf.py b/libs/community/langchain_community/document_loaders/parsers/pdf.py index ef9ca7561abd6..c603dde71eb32 100644 --- a/libs/community/langchain_community/document_loaders/parsers/pdf.py +++ b/libs/community/langchain_community/document_loaders/parsers/pdf.py @@ -236,17 +236,39 @@ def get_image(layout_object: Any) -> Any: images = [] - for img in list(filter(bool, map(get_image, page))): - if img.stream["Filter"].name in _PDF_FILTER_WITHOUT_LOSS: + for img in filter(bool, map(get_image, page)): + img_filter = img.stream["Filter"] + if isinstance(img_filter, list): + filter_names = [f.name for f in img_filter] + else: + filter_names = [img_filter.name] + + without_loss = any( + name in _PDF_FILTER_WITHOUT_LOSS for name in filter_names + ) + with_loss = any(name in _PDF_FILTER_WITH_LOSS for name in filter_names) + non_matching = {name for name in filter_names} - { + *_PDF_FILTER_WITHOUT_LOSS, + *_PDF_FILTER_WITH_LOSS, + } + + if without_loss and with_loss: + warnings.warn( + "Image has both lossy and lossless filters. Defaulting to lossless" + ) + + if non_matching: + warnings.warn(f"Unknown PDF Filter(s): {non_matching}") + + if without_loss: images.append( np.frombuffer(img.stream.get_data(), dtype=np.uint8).reshape( img.stream["Height"], img.stream["Width"], -1 ) ) - elif img.stream["Filter"].name in _PDF_FILTER_WITH_LOSS: + elif with_loss: images.append(img.stream.get_data()) - else: - warnings.warn("Unknown PDF Filter!") + return extract_from_images_with_rapidocr(images)