diff --git a/libs/community/langchain_community/document_loaders/parsers/pdf.py b/libs/community/langchain_community/document_loaders/parsers/pdf.py index c603dde71eb32..702d5998dd7c2 100644 --- a/libs/community/langchain_community/document_loaders/parsers/pdf.py +++ b/libs/community/langchain_community/document_loaders/parsers/pdf.py @@ -427,6 +427,13 @@ def __init__( text_kwargs: Keyword arguments to pass to ``pdfplumber.Page.extract_text()`` dedupe: Avoiding the error of duplicate characters if `dedupe=True`. """ + try: + import PIL # noqa:F401 + except ImportError: + raise ImportError( + "pillow package not found, please install it with" + " `pip install pillow`" + ) self.text_kwargs = text_kwargs or {} self.dedupe = dedupe self.extract_images = extract_images @@ -468,17 +475,30 @@ def _process_page_content(self, page: pdfplumber.page.Page) -> str: def _extract_images_from_page(self, page: pdfplumber.page.Page) -> str: """Extract images from page and get the text with RapidOCR.""" + from PIL import Image + if not self.extract_images: return "" images = [] for img in page.images: if img["stream"]["Filter"].name in _PDF_FILTER_WITHOUT_LOSS: - images.append( - np.frombuffer(img["stream"].get_data(), dtype=np.uint8).reshape( - img["stream"]["Height"], img["stream"]["Width"], -1 + if img["stream"]["BitsPerComponent"] == 1: + images.append( + np.array( + Image.frombytes( + "1", + (img["stream"]["Width"], img["stream"]["Height"]), + img["stream"].get_data(), + ).convert("L") + ) + ) + else: + images.append( + np.frombuffer(img["stream"].get_data(), dtype=np.uint8).reshape( + img["stream"]["Height"], img["stream"]["Width"], -1 + ) ) - ) elif img["stream"]["Filter"].name in _PDF_FILTER_WITH_LOSS: images.append(img["stream"].get_data()) else: