[fix]: Decode lossless PDF image streams with PIL instead of a raw NumPy reshape

langchain-ai · Dec 3, 2024 · a3a72c8 · a3a72c8
1 parent a220ee5
commit a3a72c8
Showing 1 changed file with 7 additions and 2 deletions.
diff --git a/libs/community/langchain_community/document_loaders/parsers/pdf.py b/libs/community/langchain_community/document_loaders/parsers/pdf.py
@@ -18,6 +18,7 @@
 
 import numpy as np
 from langchain_core.documents import Document
+from PIL import Image
 
 from langchain_community.document_loaders.base import BaseBlobParser
 from langchain_community.document_loaders.blob_loaders import Blob
@@ -453,8 +454,12 @@ def _extract_images_from_page(self, page: pdfplumber.page.Page) -> str:
         for img in page.images:
             if img["stream"]["Filter"].name in _PDF_FILTER_WITHOUT_LOSS:
                 images.append(
-                    np.frombuffer(img["stream"].get_data(), dtype=np.uint8).reshape(
-                        img["stream"]["Height"], img["stream"]["Width"], -1
+                    np.array(
+                        Image.frombytes(
+                            "1",
+                            (img["stream"]["Width"], img["stream"]["Height"]),
+                            img["stream"].get_data(),
+                        ).convert("L")
                     )
                 )
             elif img["stream"]["Filter"].name in _PDF_FILTER_WITH_LOSS: