diff --git a/haystack/components/converters/pypdf.py b/haystack/components/converters/pypdf.py
index a17abd34d9..555cf1df58 100644
--- a/haystack/components/converters/pypdf.py
+++ b/haystack/components/converters/pypdf.py
@@ -4,6 +4,7 @@
 
 import io
 import warnings
+from enum import Enum
 from pathlib import Path
 from typing import Any, Dict, List, Optional, Protocol, Union
 
@@ -39,6 +40,33 @@ def from_dict(cls, data):  # noqa: D102
         ...
 
 
+class PyPDFExtractionMode(Enum):
+    """
+    The mode to use for extracting text from a PDF.
+    """
+
+    PLAIN = "plain"
+    LAYOUT = "layout"
+
+    def __str__(self) -> str:
+        """
+        Convert a PyPDFExtractionMode enum to a string.
+        """
+        return self.value
+
+    @staticmethod
+    def from_str(string: str) -> "PyPDFExtractionMode":
+        """
+        Convert a string to a PyPDFExtractionMode enum.
+        """
+        enum_map = {e.value: e for e in PyPDFExtractionMode}
+        mode = enum_map.get(string)
+        if mode is None:
+            msg = f"Unknown extraction mode '{string}'. Supported modes are: {list(enum_map.keys())}"
+            raise ValueError(msg)
+        return mode
+
+
 @component
 class PyPDFToDocument:
     """
@@ -60,13 +88,49 @@ class PyPDFToDocument:
     ```
     """
 
-    def __init__(self, converter: Optional[PyPDFConverter] = None):
+    def __init__(
+        self,
+        converter: Optional[PyPDFConverter] = None,
+        *,
+        extraction_mode: Union[str, PyPDFExtractionMode] = PyPDFExtractionMode.PLAIN,
+        plain_mode_orientations: tuple = (0, 90, 180, 270),
+        plain_mode_space_width: float = 200.0,
+        layout_mode_space_vertically: bool = True,
+        layout_mode_scale_weight: float = 1.25,
+        layout_mode_strip_rotated: bool = True,
+        layout_mode_font_height_weight: float = 1.0,
+    ):
         """
         Create an PyPDFToDocument component.
 
         :param converter:
             An instance of a PyPDFConverter compatible class. This is deprecated and will be removed in Haystack 2.9.0.
             For in-depth customization of the conversion process, consider implementing a custom component.
+
+        All the following parameters are applied only if `converter` is None.
+
+        :param extraction_mode:
+            The mode to use for extracting text from a PDF.
+            Layout mode is an experimental mode that adheres to the rendered layout of the PDF.
+        :param plain_mode_orientations:
+            Tuple of orientations to look for when extracting text from a PDF in plain mode.
+            Ignored if `extraction_mode` is `PyPDFExtractionMode.LAYOUT`.
+        :param plain_mode_space_width:
+            Forces default space width if not extracted from font.
+            Ignored if `extraction_mode` is `PyPDFExtractionMode.LAYOUT`.
+        :param layout_mode_space_vertically:
+            Whether to include blank lines inferred from y distance + font height.
+            Ignored if `extraction_mode` is `PyPDFExtractionMode.PLAIN`.
+        :param layout_mode_scale_weight:
+            Multiplier for string length when calculating weighted average character width.
+            Ignored if `extraction_mode` is `PyPDFExtractionMode.PLAIN`.
+        :param layout_mode_strip_rotated:
+            Layout mode does not support rotated text. Set to `False` to include rotated text anyway.
+            If rotated text is discovered, layout will be degraded and a warning will be logged.
+            Ignored if `extraction_mode` is `PyPDFExtractionMode.PLAIN`.
+        :param layout_mode_font_height_weight:
+            Multiplier for font height when calculating blank line height.
+            Ignored if `extraction_mode` is `PyPDFExtractionMode.PLAIN`.
         """
         pypdf_import.check()
 
@@ -79,6 +143,16 @@ def __init__(self, converter: Optional[PyPDFConverter] = None):
 
         self.converter = converter
 
+        if isinstance(extraction_mode, str):
+            extraction_mode = PyPDFExtractionMode.from_str(extraction_mode)
+        self.extraction_mode = extraction_mode
+        self.plain_mode_orientations = plain_mode_orientations
+        self.plain_mode_space_width = plain_mode_space_width
+        self.layout_mode_space_vertically = layout_mode_space_vertically
+        self.layout_mode_scale_weight = layout_mode_scale_weight
+        self.layout_mode_strip_rotated = layout_mode_strip_rotated
+        self.layout_mode_font_height_weight = layout_mode_font_height_weight
+
     def to_dict(self):
         """
         Serializes the component to a dictionary.
@@ -87,7 +161,16 @@ def to_dict(self):
             Dictionary with serialized data.
         """
         return default_to_dict(
-            self, converter=(serialize_class_instance(self.converter) if self.converter is not None else None)
+            self,
+            # Compare against None explicitly so a converter that evaluates as falsy is still serialized.
+            converter=(serialize_class_instance(self.converter) if self.converter is not None else None),
+            extraction_mode=str(self.extraction_mode),
+            plain_mode_orientations=self.plain_mode_orientations,
+            plain_mode_space_width=self.plain_mode_space_width,
+            layout_mode_space_vertically=self.layout_mode_space_vertically,
+            layout_mode_scale_weight=self.layout_mode_scale_weight,
+            layout_mode_strip_rotated=self.layout_mode_strip_rotated,
+            layout_mode_font_height_weight=self.layout_mode_font_height_weight,
         )
 
     @classmethod
@@ -108,7 +191,20 @@ def from_dict(cls, data):
         return default_from_dict(cls, data)
 
     def _default_convert(self, reader: "PdfReader") -> Document:
-        text = "\f".join(page.extract_text() for page in reader.pages)
+        texts = []
+        for page in reader.pages:
+            texts.append(
+                page.extract_text(
+                    orientations=self.plain_mode_orientations,
+                    extraction_mode=self.extraction_mode.value,
+                    space_width=self.plain_mode_space_width,
+                    layout_mode_space_vertically=self.layout_mode_space_vertically,
+                    layout_mode_scale_weight=self.layout_mode_scale_weight,
+                    layout_mode_strip_rotated=self.layout_mode_strip_rotated,
+                    layout_mode_font_height_weight=self.layout_mode_font_height_weight,
+                )
+            )
+        text = "\f".join(texts)
         return Document(content=text)
 
     @component.output_types(documents=List[Document])
diff --git a/releasenotes/notes/pypdf-add-customization-params-3da578deff7f83a5.yaml b/releasenotes/notes/pypdf-add-customization-params-3da578deff7f83a5.yaml
new file mode 100644
index 0000000000..4e3f5176ac
--- /dev/null
+++ b/releasenotes/notes/pypdf-add-customization-params-3da578deff7f83a5.yaml
@@ -0,0 +1,5 @@
+---
+enhancements:
+  - |
+    Added new initialization parameters to the `PyPDFToDocument` component to customize the text extraction process
+    from PDF files.
diff --git a/test/components/converters/test_pypdf_to_document.py b/test/components/converters/test_pypdf_to_document.py
index 234b545eac..d9aee0020f 100644
--- a/test/components/converters/test_pypdf_to_document.py
+++ b/test/components/converters/test_pypdf_to_document.py
@@ -2,17 +2,17 @@
 #
 # SPDX-License-Identifier: Apache-2.0
 import logging
-from unittest.mock import patch
+from unittest.mock import patch, Mock
 
 import pytest
 
 from haystack import Document, default_from_dict, default_to_dict
-from haystack.components.converters.pypdf import PyPDFToDocument
+from haystack.components.converters.pypdf import PyPDFToDocument, PyPDFExtractionMode
 from haystack.dataclasses import ByteStream
 
 
 @pytest.fixture
-def pypdf_converter():
+def pypdf_component():
     return PyPDFToDocument()
 
 
@@ -30,38 +30,104 @@ def from_dict(cls, data):
 
 
 class TestPyPDFToDocument:
-    def test_init(self, pypdf_converter):
-        assert pypdf_converter.converter is None
+    def test_init(self, pypdf_component):
+        assert pypdf_component.converter is None
+        assert pypdf_component.extraction_mode == PyPDFExtractionMode.PLAIN
+        assert pypdf_component.plain_mode_orientations == (0, 90, 180, 270)
+        assert pypdf_component.plain_mode_space_width == 200.0
+        assert pypdf_component.layout_mode_space_vertically is True
+        assert pypdf_component.layout_mode_scale_weight == 1.25
+        assert pypdf_component.layout_mode_strip_rotated is True
+        assert pypdf_component.layout_mode_font_height_weight == 1.0
 
-    def test_init_params(self):
-        pypdf_converter = PyPDFToDocument(converter=CustomConverter())
-        assert isinstance(pypdf_converter.converter, CustomConverter)
+    def test_init_converter(self):
+        pypdf_component = PyPDFToDocument(converter=CustomConverter())
+        assert isinstance(pypdf_component.converter, CustomConverter)
 
-    def test_to_dict(self, pypdf_converter):
-        data = pypdf_converter.to_dict()
+    def test_init_custom_params(self):
+        pypdf_component = PyPDFToDocument(
+            extraction_mode="layout",
+            plain_mode_orientations=(0, 90),
+            plain_mode_space_width=150.0,
+            layout_mode_space_vertically=False,
+            layout_mode_scale_weight=2.0,
+            layout_mode_strip_rotated=False,
+            layout_mode_font_height_weight=0.5,
+        )
+
+        assert pypdf_component.extraction_mode == PyPDFExtractionMode.LAYOUT
+        assert pypdf_component.plain_mode_orientations == (0, 90)
+        assert pypdf_component.plain_mode_space_width == 150.0
+        assert pypdf_component.layout_mode_space_vertically is False
+        assert pypdf_component.layout_mode_scale_weight == 2.0
+        assert pypdf_component.layout_mode_strip_rotated is False
+        assert pypdf_component.layout_mode_font_height_weight == 0.5
+
+    def test_init_invalid_extraction_mode(self):
+        with pytest.raises(ValueError):
+            PyPDFToDocument(extraction_mode="invalid")
+
+    def test_to_dict(self, pypdf_component):
+        data = pypdf_component.to_dict()
         assert data == {
             "type": "haystack.components.converters.pypdf.PyPDFToDocument",
-            "init_parameters": {"converter": None},
+            "init_parameters": {
+                "converter": None,
+                "extraction_mode": "plain",
+                "plain_mode_orientations": (0, 90, 180, 270),
+                "plain_mode_space_width": 200.0,
+                "layout_mode_space_vertically": True,
+                "layout_mode_scale_weight": 1.25,
+                "layout_mode_strip_rotated": True,
+                "layout_mode_font_height_weight": 1.0,
+            },
         }
 
     def test_to_dict_custom_converter(self):
-        pypdf_converter = PyPDFToDocument(converter=CustomConverter())
-        data = pypdf_converter.to_dict()
+        pypdf_component = PyPDFToDocument(converter=CustomConverter())
+        data = pypdf_component.to_dict()
         assert data == {
             "type": "haystack.components.converters.pypdf.PyPDFToDocument",
             "init_parameters": {
                 "converter": {
                     "data": {"key": "value", "more": False},
                     "type": "converters.test_pypdf_to_document.CustomConverter",
-                }
+                },
+                "extraction_mode": "plain",
+                "plain_mode_orientations": (0, 90, 180, 270),
+                "plain_mode_space_width": 200.0,
+                "layout_mode_space_vertically": True,
+                "layout_mode_scale_weight": 1.25,
+                "layout_mode_strip_rotated": True,
+                "layout_mode_font_height_weight": 1.0,
             },
         }
 
     def test_from_dict(self):
-        data = {"type": "haystack.components.converters.pypdf.PyPDFToDocument", "init_parameters": {"converter": None}}
+        data = {
+            "type": "haystack.components.converters.pypdf.PyPDFToDocument",
+            "init_parameters": {
+                "converter": None,
+                "extraction_mode": "plain",
+                "plain_mode_orientations": (0, 90, 180, 270),
+                "plain_mode_space_width": 200.0,
+                "layout_mode_space_vertically": True,
+                "layout_mode_scale_weight": 1.25,
+                "layout_mode_strip_rotated": True,
+                "layout_mode_font_height_weight": 1.0,
+            },
+        }
+
         instance = PyPDFToDocument.from_dict(data)
         assert isinstance(instance, PyPDFToDocument)
         assert instance.converter is None
+        assert instance.extraction_mode == PyPDFExtractionMode.PLAIN
+        assert instance.plain_mode_orientations == (0, 90, 180, 270)
+        assert instance.plain_mode_space_width == 200.0
+        assert instance.layout_mode_space_vertically is True
+        assert instance.layout_mode_scale_weight == 1.25
+        assert instance.layout_mode_strip_rotated is True
+        assert instance.layout_mode_font_height_weight == 1.0
 
     def test_from_dict_defaults(self):
         data = {"type": "haystack.components.converters.pypdf.PyPDFToDocument", "init_parameters": {}}
@@ -83,30 +149,63 @@ def test_from_dict_custom_converter(self):
         assert isinstance(instance, PyPDFToDocument)
         assert isinstance(instance.converter, CustomConverter)
 
+    def test_default_convert(self):
+        mock_page1 = Mock()
+        mock_page2 = Mock()
+        mock_page1.extract_text.return_value = "Page 1 content"
+        mock_page2.extract_text.return_value = "Page 2 content"
+        mock_reader = Mock()
+        mock_reader.pages = [mock_page1, mock_page2]
+
+        converter = PyPDFToDocument(
+            extraction_mode="layout",
+            plain_mode_orientations=(0, 90),
+            plain_mode_space_width=150.0,
+            layout_mode_space_vertically=False,
+            layout_mode_scale_weight=2.0,
+            layout_mode_strip_rotated=False,
+            layout_mode_font_height_weight=1.5,
+        )
+
+        doc = converter._default_convert(mock_reader)
+        assert doc.content == "Page 1 content\fPage 2 content"
+
+        expected_params = {
+            "extraction_mode": "layout",
+            "orientations": (0, 90),
+            "space_width": 150.0,
+            "layout_mode_space_vertically": False,
+            "layout_mode_scale_weight": 2.0,
+            "layout_mode_strip_rotated": False,
+            "layout_mode_font_height_weight": 1.5,
+        }
+        for mock_page in mock_reader.pages:
+            mock_page.extract_text.assert_called_once_with(**expected_params)
+
     @pytest.mark.integration
-    def test_run(self, test_files_path, pypdf_converter):
+    def test_run(self, test_files_path, pypdf_component):
         """
         Test if the component runs correctly.
         """
         paths = [test_files_path / "pdf" / "sample_pdf_1.pdf"]
-        output = pypdf_converter.run(sources=paths)
+        output = pypdf_component.run(sources=paths)
         docs = output["documents"]
         assert len(docs) == 1
         assert "History" in docs[0].content
 
     @pytest.mark.integration
-    def test_page_breaks_added(self, test_files_path, pypdf_converter):
+    def test_page_breaks_added(self, test_files_path, pypdf_component):
         paths = [test_files_path / "pdf" / "sample_pdf_1.pdf"]
-        output = pypdf_converter.run(sources=paths)
+        output = pypdf_component.run(sources=paths)
         docs = output["documents"]
         assert len(docs) == 1
         assert docs[0].content.count("\f") == 3
 
-    def test_run_with_meta(self, test_files_path, pypdf_converter):
+    def test_run_with_meta(self, test_files_path, pypdf_component):
         bytestream = ByteStream(data=b"test", meta={"author": "test_author", "language": "en"})
 
         with patch("haystack.components.converters.pypdf.PdfReader"):
-            output = pypdf_converter.run(
+            output = pypdf_component.run(
                 sources=[bytestream, test_files_path / "pdf" / "sample_pdf_1.pdf"], meta={"language": "it"}
             )
 
@@ -115,17 +214,17 @@ def test_run_with_meta(self, test_files_path, pypdf_converter):
         assert output["documents"][0].meta["language"] == "it"
         assert output["documents"][1].meta["language"] == "it"
 
-    def test_run_error_handling(self, test_files_path, pypdf_converter, caplog):
+    def test_run_error_handling(self, test_files_path, pypdf_component, caplog):
         """
         Test if the component correctly handles errors.
         """
         paths = ["non_existing_file.pdf"]
         with caplog.at_level(logging.WARNING):
-            pypdf_converter.run(sources=paths)
+            pypdf_component.run(sources=paths)
             assert "Could not read non_existing_file.pdf" in caplog.text
 
     @pytest.mark.integration
-    def test_mixed_sources_run(self, test_files_path, pypdf_converter):
+    def test_mixed_sources_run(self, test_files_path, pypdf_component):
         """
         Test if the component runs correctly when mixed sources are provided.
         """
@@ -133,7 +232,7 @@ def test_mixed_sources_run(self, test_files_path, pypdf_converter):
         with open(test_files_path / "pdf" / "sample_pdf_1.pdf", "rb") as f:
             paths.append(ByteStream(f.read()))
 
-        output = pypdf_converter.run(sources=paths)
+        output = pypdf_component.run(sources=paths)
         docs = output["documents"]
         assert len(docs) == 2
         assert "History and standardization" in docs[0].content