Skip to content

Commit

Permalink
[fix]: Image Extraction Fixed
Browse files: browse the repository at this point in the history
  • Loading branch information
keenborder786 committed Dec 3, 2024
1 parent a220ee5 commit a3a72c8
Showing 1 changed file with 7 additions and 2 deletions.
Original file line number | Diff line number | Diff line change
Expand Up @@ -18,6 +18,7 @@

import numpy as np
from langchain_core.documents import Document
+ from PIL import Image

from langchain_community.document_loaders.base import BaseBlobParser
from langchain_community.document_loaders.blob_loaders import Blob
Expand Down Expand Up @@ -453,8 +454,12 @@ def _extract_images_from_page(self, page: pdfplumber.page.Page) -> str:
for img in page.images:
if img["stream"]["Filter"].name in _PDF_FILTER_WITHOUT_LOSS:
images.append(
- np.frombuffer(img["stream"].get_data(), dtype=np.uint8).reshape(
- img["stream"]["Height"], img["stream"]["Width"], -1
+ np.array(
+ Image.frombytes(
+ "1",
+ (img["stream"]["Width"], img["stream"]["Height"]),
+ img["stream"].get_data(),
+ ).convert("L")
)
)
elif img["stream"]["Filter"].name in _PDF_FILTER_WITH_LOSS:
Expand Down

0 comments on commit a3a72c8

Please sign in to comment.