Skip to content

Commit

Permalink
feat: PyPDFToDocument - add new customization parameters (#8574)
Browse files Browse the repository at this point in the history
* deprecate converter in pypdf

* fix linting of MetaFieldGroupingRanker

* linting

* pypdftodocument: add customization params

* fix mypy

* incorporate feedback
  • Loading branch information
anakin87 authored Nov 26, 2024
1 parent 2440a5e commit fb42c03
Show file tree
Hide file tree
Showing 3 changed files with 227 additions and 28 deletions.
101 changes: 98 additions & 3 deletions haystack/components/converters/pypdf.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,6 +4,7 @@

import io
import warnings
from enum import Enum
from pathlib import Path
from typing import Any, Dict, List, Optional, Protocol, Union

Expand Down Expand Up @@ -39,6 +40,33 @@ def from_dict(cls, data): # noqa: D102
...


class PyPDFExtractionMode(Enum):
    """
    The mode to use for extracting text from a PDF.

    Each member's value is the string form of the mode (`"plain"` or `"layout"`).
    """

    PLAIN = "plain"
    LAYOUT = "layout"

    def __str__(self) -> str:
        """
        Convert a PyPDFExtractionMode enum to a string.

        :returns:
            The member's value, for example `"plain"`.
        """
        return self.value

    @staticmethod
    def from_str(string: str) -> "PyPDFExtractionMode":
        """
        Convert a string to a PyPDFExtractionMode enum.

        :param string:
            The value of one of the enum members. Matching is exact and case-sensitive.
        :returns:
            The member whose value equals `string`.
        :raises ValueError:
            If `string` is not the value of any member.
        """
        try:
            # Enum's by-value lookup replaces a hand-built value -> member mapping.
            return PyPDFExtractionMode(string)
        except ValueError:
            supported = [mode.value for mode in PyPDFExtractionMode]
            msg = f"Unknown extraction mode '{string}'. Supported modes are: {supported}"
            # Suppress the generic Enum error so callers only see the descriptive one.
            raise ValueError(msg) from None


@component
class PyPDFToDocument:
"""
Expand All @@ -60,13 +88,49 @@ class PyPDFToDocument:
```
"""

def __init__(self, converter: Optional[PyPDFConverter] = None):
def __init__(
self,
converter: Optional[PyPDFConverter] = None,
*,
extraction_mode: Union[str, PyPDFExtractionMode] = PyPDFExtractionMode.PLAIN,
plain_mode_orientations: tuple = (0, 90, 180, 270),
plain_mode_space_width: float = 200.0,
layout_mode_space_vertically: bool = True,
layout_mode_scale_weight: float = 1.25,
layout_mode_strip_rotated: bool = True,
layout_mode_font_height_weight: float = 1.0,
):
"""
Create a PyPDFToDocument component.
:param converter:
An instance of a PyPDFConverter compatible class. This is deprecated and will be removed in Haystack 2.9.0.
For in-depth customization of the conversion process, consider implementing a custom component.
All the following parameters are applied only if `converter` is None.
:param extraction_mode:
The mode to use for extracting text from a PDF.
Layout mode is an experimental mode that adheres to the rendered layout of the PDF.
:param plain_mode_orientations:
Tuple of orientations to look for when extracting text from a PDF in plain mode.
Ignored if `extraction_mode` is `PyPDFExtractionMode.LAYOUT`.
:param plain_mode_space_width:
Forces default space width if not extracted from font.
Ignored if `extraction_mode` is `PyPDFExtractionMode.LAYOUT`.
:param layout_mode_space_vertically:
Whether to include blank lines inferred from y distance + font height.
Ignored if `extraction_mode` is `PyPDFExtractionMode.PLAIN`.
:param layout_mode_scale_weight:
Multiplier for string length when calculating weighted average character width.
Ignored if `extraction_mode` is `PyPDFExtractionMode.PLAIN`.
:param layout_mode_strip_rotated:
Layout mode does not support rotated text. Set to `False` to include rotated text anyway.
If rotated text is discovered, layout will be degraded and a warning will be logged.
Ignored if `extraction_mode` is `PyPDFExtractionMode.PLAIN`.
:param layout_mode_font_height_weight:
Multiplier for font height when calculating blank line height.
Ignored if `extraction_mode` is `PyPDFExtractionMode.PLAIN`.
"""
pypdf_import.check()

Expand All @@ -79,6 +143,16 @@ def __init__(self, converter: Optional[PyPDFConverter] = None):

self.converter = converter

if isinstance(extraction_mode, str):
extraction_mode = PyPDFExtractionMode.from_str(extraction_mode)
self.extraction_mode = extraction_mode
self.plain_mode_orientations = plain_mode_orientations
self.plain_mode_space_width = plain_mode_space_width
self.layout_mode_space_vertically = layout_mode_space_vertically
self.layout_mode_scale_weight = layout_mode_scale_weight
self.layout_mode_strip_rotated = layout_mode_strip_rotated
self.layout_mode_font_height_weight = layout_mode_font_height_weight

def to_dict(self):
"""
Serializes the component to a dictionary.
Expand All @@ -87,7 +161,15 @@ def to_dict(self):
Dictionary with serialized data.
"""
return default_to_dict(
self, converter=(serialize_class_instance(self.converter) if self.converter is not None else None)
self,
converter=(serialize_class_instance(self.converter) if self.converter else None),
extraction_mode=str(self.extraction_mode),
plain_mode_orientations=self.plain_mode_orientations,
plain_mode_space_width=self.plain_mode_space_width,
layout_mode_space_vertically=self.layout_mode_space_vertically,
layout_mode_scale_weight=self.layout_mode_scale_weight,
layout_mode_strip_rotated=self.layout_mode_strip_rotated,
layout_mode_font_height_weight=self.layout_mode_font_height_weight,
)

@classmethod
Expand All @@ -108,7 +190,20 @@ def from_dict(cls, data):
return default_from_dict(cls, data)

def _default_convert(self, reader: "PdfReader") -> Document:
    """
    Build a single Document from the text of every page in `reader`.

    Each page is extracted exactly once, using the extraction options stored on
    this component. The page texts are joined with a form-feed character so that
    page boundaries remain visible in the resulting content.

    :param reader:
        The reader whose `pages` are converted.
    :returns:
        A Document whose content is the joined text of all pages.
    """
    # All options are forwarded regardless of the active mode; the constructor
    # documentation states which ones are ignored in plain or layout mode.
    extract_kwargs = {
        "orientations": self.plain_mode_orientations,
        "extraction_mode": self.extraction_mode.value,
        "space_width": self.plain_mode_space_width,
        "layout_mode_space_vertically": self.layout_mode_space_vertically,
        "layout_mode_scale_weight": self.layout_mode_scale_weight,
        "layout_mode_strip_rotated": self.layout_mode_strip_rotated,
        "layout_mode_font_height_weight": self.layout_mode_font_height_weight,
    }
    text = "\f".join(page.extract_text(**extract_kwargs) for page in reader.pages)
    return Document(content=text)

@component.output_types(documents=List[Document])
Expand Down
Original file line number Diff line number Diff line change
@@ -0,0 +1,5 @@
---
enhancements:
- |
Added new initialization parameters to the `PyPDFToDocument` component to customize the text extraction process
from PDF files.
149 changes: 124 additions & 25 deletions test/components/converters/test_pypdf_to_document.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,17 +2,17 @@
#
# SPDX-License-Identifier: Apache-2.0
import logging
from unittest.mock import patch
from unittest.mock import patch, Mock

import pytest

from haystack import Document, default_from_dict, default_to_dict
from haystack.components.converters.pypdf import PyPDFToDocument
from haystack.components.converters.pypdf import PyPDFToDocument, PyPDFExtractionMode
from haystack.dataclasses import ByteStream


@pytest.fixture
def pypdf_converter():
def pypdf_component():
return PyPDFToDocument()


Expand All @@ -30,38 +30,104 @@ def from_dict(cls, data):


class TestPyPDFToDocument:
def test_init(self, pypdf_converter):
assert pypdf_converter.converter is None
def test_init(self, pypdf_component):
assert pypdf_component.converter is None
assert pypdf_component.extraction_mode == PyPDFExtractionMode.PLAIN
assert pypdf_component.plain_mode_orientations == (0, 90, 180, 270)
assert pypdf_component.plain_mode_space_width == 200.0
assert pypdf_component.layout_mode_space_vertically is True
assert pypdf_component.layout_mode_scale_weight == 1.25
assert pypdf_component.layout_mode_strip_rotated is True
assert pypdf_component.layout_mode_font_height_weight == 1.0

def test_init_params(self):
pypdf_converter = PyPDFToDocument(converter=CustomConverter())
assert isinstance(pypdf_converter.converter, CustomConverter)
def test_init_converter(self):
pypdf_component = PyPDFToDocument(converter=CustomConverter())
assert isinstance(pypdf_component.converter, CustomConverter)

def test_to_dict(self, pypdf_converter):
data = pypdf_converter.to_dict()
def test_init_custom_params(self):
    """
    Every customization parameter passed to the constructor is stored on the component.
    """
    custom_params = {
        "extraction_mode": "layout",
        "plain_mode_orientations": (0, 90),
        "plain_mode_space_width": 150.0,
        "layout_mode_space_vertically": False,
        "layout_mode_scale_weight": 2.0,
        "layout_mode_strip_rotated": False,
        "layout_mode_font_height_weight": 0.5,
    }
    component = PyPDFToDocument(**custom_params)

    # The mode was given as a string and must be stored as the enum member.
    assert component.extraction_mode == PyPDFExtractionMode.LAYOUT
    assert component.plain_mode_orientations == (0, 90)
    assert component.plain_mode_space_width == 150.0
    assert component.layout_mode_space_vertically is False
    assert component.layout_mode_scale_weight == 2.0
    assert component.layout_mode_strip_rotated is False
    assert component.layout_mode_font_height_weight == 0.5

def test_init_invalid_extraction_mode(self):
    """
    An unrecognized extraction mode string is rejected when the component is created.
    """
    unsupported_mode = "invalid"
    with pytest.raises(ValueError):
        PyPDFToDocument(extraction_mode=unsupported_mode)

def test_to_dict(self, pypdf_component):
data = pypdf_component.to_dict()
assert data == {
"type": "haystack.components.converters.pypdf.PyPDFToDocument",
"init_parameters": {"converter": None},
"init_parameters": {
"converter": None,
"extraction_mode": "plain",
"plain_mode_orientations": (0, 90, 180, 270),
"plain_mode_space_width": 200.0,
"layout_mode_space_vertically": True,
"layout_mode_scale_weight": 1.25,
"layout_mode_strip_rotated": True,
"layout_mode_font_height_weight": 1.0,
},
}

def test_to_dict_custom_converter(self):
pypdf_converter = PyPDFToDocument(converter=CustomConverter())
data = pypdf_converter.to_dict()
pypdf_component = PyPDFToDocument(converter=CustomConverter())
data = pypdf_component.to_dict()
assert data == {
"type": "haystack.components.converters.pypdf.PyPDFToDocument",
"init_parameters": {
"converter": {
"data": {"key": "value", "more": False},
"type": "converters.test_pypdf_to_document.CustomConverter",
}
},
"extraction_mode": "plain",
"plain_mode_orientations": (0, 90, 180, 270),
"plain_mode_space_width": 200.0,
"layout_mode_space_vertically": True,
"layout_mode_scale_weight": 1.25,
"layout_mode_strip_rotated": True,
"layout_mode_font_height_weight": 1.0,
},
}

def test_from_dict(self):
data = {"type": "haystack.components.converters.pypdf.PyPDFToDocument", "init_parameters": {"converter": None}}
data = {
"type": "haystack.components.converters.pypdf.PyPDFToDocument",
"init_parameters": {
"converter": None,
"extraction_mode": "plain",
"plain_mode_orientations": (0, 90, 180, 270),
"plain_mode_space_width": 200.0,
"layout_mode_space_vertically": True,
"layout_mode_scale_weight": 1.25,
"layout_mode_strip_rotated": True,
"layout_mode_font_height_weight": 1.0,
},
}

instance = PyPDFToDocument.from_dict(data)
assert isinstance(instance, PyPDFToDocument)
assert instance.converter is None
assert instance.extraction_mode == PyPDFExtractionMode.PLAIN
assert instance.plain_mode_orientations == (0, 90, 180, 270)
assert instance.plain_mode_space_width == 200.0
assert instance.layout_mode_space_vertically is True
assert instance.layout_mode_scale_weight == 1.25
assert instance.layout_mode_strip_rotated is True
assert instance.layout_mode_font_height_weight == 1.0

def test_from_dict_defaults(self):
data = {"type": "haystack.components.converters.pypdf.PyPDFToDocument", "init_parameters": {}}
Expand All @@ -83,30 +149,63 @@ def test_from_dict_custom_converter(self):
assert isinstance(instance, PyPDFToDocument)
assert isinstance(instance.converter, CustomConverter)

def test_default_convert(self):
    """
    `_default_convert` extracts each page once with the configured options and
    joins the page texts with a form-feed character.
    """
    page_texts = ["Page 1 content", "Page 2 content"]
    mock_reader = Mock()
    mock_reader.pages = [Mock(**{"extract_text.return_value": page_text}) for page_text in page_texts]

    component = PyPDFToDocument(
        extraction_mode="layout",
        plain_mode_orientations=(0, 90),
        plain_mode_space_width=150.0,
        layout_mode_space_vertically=False,
        layout_mode_scale_weight=2.0,
        layout_mode_strip_rotated=False,
        layout_mode_font_height_weight=1.5,
    )

    document = component._default_convert(mock_reader)
    assert document.content == "Page 1 content\fPage 2 content"

    # Each page must receive the component's options under pypdf's argument names.
    for page in mock_reader.pages:
        page.extract_text.assert_called_once_with(
            extraction_mode="layout",
            orientations=(0, 90),
            space_width=150.0,
            layout_mode_space_vertically=False,
            layout_mode_scale_weight=2.0,
            layout_mode_strip_rotated=False,
            layout_mode_font_height_weight=1.5,
        )

@pytest.mark.integration
def test_run(self, test_files_path, pypdf_converter):
def test_run(self, test_files_path, pypdf_component):
"""
Test if the component runs correctly.
"""
paths = [test_files_path / "pdf" / "sample_pdf_1.pdf"]
output = pypdf_converter.run(sources=paths)
output = pypdf_component.run(sources=paths)
docs = output["documents"]
assert len(docs) == 1
assert "History" in docs[0].content

@pytest.mark.integration
def test_page_breaks_added(self, test_files_path, pypdf_converter):
def test_page_breaks_added(self, test_files_path, pypdf_component):
paths = [test_files_path / "pdf" / "sample_pdf_1.pdf"]
output = pypdf_converter.run(sources=paths)
output = pypdf_component.run(sources=paths)
docs = output["documents"]
assert len(docs) == 1
assert docs[0].content.count("\f") == 3

def test_run_with_meta(self, test_files_path, pypdf_converter):
def test_run_with_meta(self, test_files_path, pypdf_component):
bytestream = ByteStream(data=b"test", meta={"author": "test_author", "language": "en"})

with patch("haystack.components.converters.pypdf.PdfReader"):
output = pypdf_converter.run(
output = pypdf_component.run(
sources=[bytestream, test_files_path / "pdf" / "sample_pdf_1.pdf"], meta={"language": "it"}
)

Expand All @@ -115,25 +214,25 @@ def test_run_with_meta(self, test_files_path, pypdf_converter):
assert output["documents"][0].meta["language"] == "it"
assert output["documents"][1].meta["language"] == "it"

def test_run_error_handling(self, test_files_path, pypdf_converter, caplog):
def test_run_error_handling(self, test_files_path, pypdf_component, caplog):
"""
Test if the component correctly handles errors.
"""
paths = ["non_existing_file.pdf"]
with caplog.at_level(logging.WARNING):
pypdf_converter.run(sources=paths)
pypdf_component.run(sources=paths)
assert "Could not read non_existing_file.pdf" in caplog.text

@pytest.mark.integration
def test_mixed_sources_run(self, test_files_path, pypdf_converter):
def test_mixed_sources_run(self, test_files_path, pypdf_component):
"""
Test if the component runs correctly when mixed sources are provided.
"""
paths = [test_files_path / "pdf" / "sample_pdf_1.pdf"]
with open(test_files_path / "pdf" / "sample_pdf_1.pdf", "rb") as f:
paths.append(ByteStream(f.read()))

output = pypdf_converter.run(sources=paths)
output = pypdf_component.run(sources=paths)
docs = output["documents"]
assert len(docs) == 2
assert "History and standardization" in docs[0].content
Expand Down

0 comments on commit fb42c03

Please sign in to comment.