Skip to content

Commit

Permalink
feat: Add store_full_path to converters (3/3) (#8585)
Browse files Browse the repository at this point in the history
* Add store_full_path params
  • Loading branch information
Amnah199 authored Dec 3, 2024
1 parent de7099e commit 4c8eb54
Show file tree
Hide file tree
Showing 5 changed files with 82 additions and 2 deletions.
19 changes: 18 additions & 1 deletion haystack/components/converters/azure.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,6 +4,8 @@

import copy
import hashlib
import os
import warnings
from collections import defaultdict
from pathlib import Path
from typing import Any, Dict, List, Literal, Optional, Union
Expand Down Expand Up @@ -49,7 +51,7 @@ class AzureOCRDocumentConverter:
```
"""

def __init__(
def __init__( # pylint: disable=too-many-positional-arguments
self,
endpoint: str,
api_key: Secret = Secret.from_env_var("AZURE_AI_API_KEY"),
Expand All @@ -59,6 +61,7 @@ def __init__(
merge_multiple_column_headers: bool = True,
page_layout: Literal["natural", "single_column"] = "natural",
threshold_y: Optional[float] = 0.05,
store_full_path: bool = True,
):
"""
Creates an AzureOCRDocumentConverter component.
Expand All @@ -83,6 +86,9 @@ def __init__(
The threshold, in inches, to determine if two recognized PDF elements are grouped into a
single line. This is crucial for section headers or numbers which may be spatially separated
from the remaining text on the horizontal axis.
:param store_full_path:
If True, the full path of the file is stored in the metadata of the document.
If False, only the file name is stored.
"""
azure_import.check()

Expand All @@ -97,6 +103,7 @@ def __init__(
self.merge_multiple_column_headers = merge_multiple_column_headers
self.page_layout = page_layout
self.threshold_y = threshold_y
self.store_full_path = store_full_path
if self.page_layout == "single_column" and self.threshold_y is None:
self.threshold_y = 0.05

Expand Down Expand Up @@ -136,6 +143,15 @@ def run(self, sources: List[Union[str, Path, ByteStream]], meta: Optional[List[D
azure_output.append(result.to_dict())

merged_metadata = {**bytestream.meta, **metadata}
warnings.warn(
"The `store_full_path` parameter defaults to True, storing full file paths in metadata. "
"In the 2.9.0 release, the default value for `store_full_path` will change to False, "
"storing only file names to improve privacy.",
DeprecationWarning,
)

if not self.store_full_path and (file_path := bytestream.meta.get("file_path")):
merged_metadata["file_path"] = os.path.basename(file_path)
docs = self._convert_tables_and_text(result=result, meta=merged_metadata)
documents.extend(docs)

Expand All @@ -158,6 +174,7 @@ def to_dict(self) -> Dict[str, Any]:
merge_multiple_column_headers=self.merge_multiple_column_headers,
page_layout=self.page_layout,
threshold_y=self.threshold_y,
store_full_path=self.store_full_path,
)

@classmethod
Expand Down
15 changes: 15 additions & 0 deletions haystack/components/converters/pypdf.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,7 @@
# SPDX-License-Identifier: Apache-2.0

import io
import os
import warnings
from enum import Enum
from pathlib import Path
Expand Down Expand Up @@ -99,6 +100,7 @@ def __init__(
layout_mode_scale_weight: float = 1.25,
layout_mode_strip_rotated: bool = True,
layout_mode_font_height_weight: float = 1.0,
store_full_path: bool = True,
):
"""
Create a PyPDFToDocument component.
Expand Down Expand Up @@ -131,6 +133,9 @@ def __init__(
:param layout_mode_font_height_weight:
Multiplier for font height when calculating blank line height.
Ignored if `extraction_mode` is `PyPDFExtractionMode.PLAIN`.
:param store_full_path:
If True, the full path of the file is stored in the metadata of the document.
If False, only the file name is stored.
"""
pypdf_import.check()

Expand All @@ -142,6 +147,7 @@ def __init__(
warnings.warn(msg, DeprecationWarning)

self.converter = converter
self.store_full_path = store_full_path

if isinstance(extraction_mode, str):
extraction_mode = PyPDFExtractionMode.from_str(extraction_mode)
Expand Down Expand Up @@ -170,6 +176,7 @@ def to_dict(self):
layout_mode_scale_weight=self.layout_mode_scale_weight,
layout_mode_strip_rotated=self.layout_mode_strip_rotated,
layout_mode_font_height_weight=self.layout_mode_font_height_weight,
store_full_path=self.store_full_path,
)

@classmethod
Expand Down Expand Up @@ -255,6 +262,14 @@ def run(
)

merged_metadata = {**bytestream.meta, **metadata}
warnings.warn(
"The `store_full_path` parameter defaults to True, storing full file paths in metadata. "
"In the 2.9.0 release, the default value for `store_full_path` will change to False, "
"storing only file names to improve privacy.",
DeprecationWarning,
)
if not self.store_full_path and (file_path := bytestream.meta.get("file_path")):
merged_metadata["file_path"] = os.path.basename(file_path)
document.meta = merged_metadata
documents.append(document)

Expand Down
Original file line number Diff line number Diff line change
@@ -0,0 +1,8 @@
---
features:
- |
Added a new `store_full_path` parameter to the `__init__` methods of `PyPDFToDocument` and `AzureOCRDocumentConverter`. The default value is `True`, which stores the full file path in the metadata of the output documents. When set to `False`, only the file name is stored.
deprecations:
- |
The default value of the `store_full_path` parameter in `PyPDFToDocument` and `AzureOCRDocumentConverter` will change to `False` in Haystack 2.9.0 to enhance privacy.
19 changes: 19 additions & 0 deletions test/components/converters/test_azure_ocr_doc_converter.py
Original file line number Diff line number Diff line change
Expand Up @@ -105,6 +105,7 @@ def test_to_dict(self, mock_resolve_value):
"page_layout": "natural",
"preceding_context_len": 3,
"threshold_y": 0.05,
"store_full_path": True,
},
}

Expand Down Expand Up @@ -278,6 +279,9 @@ def test_with_image_file(self, test_files_path):
@pytest.mark.skipif(not os.environ.get("CORE_AZURE_CS_ENDPOINT", None), reason="Azure endpoint not available")
@pytest.mark.skipif(not os.environ.get("CORE_AZURE_CS_API_KEY", None), reason="Azure credentials not available")
def test_run_with_docx_file(self, test_files_path):
"""
Test if the component runs correctly with a DOCX file.
"""
component = AzureOCRDocumentConverter(
endpoint=os.environ["CORE_AZURE_CS_ENDPOINT"], api_key=Secret.from_env_var("CORE_AZURE_CS_API_KEY")
)
Expand All @@ -288,6 +292,21 @@ def test_run_with_docx_file(self, test_files_path):
assert "Now we are in Page 2" in documents[0].content
assert "Page 3 was empty this is page 4" in documents[0].content

@pytest.mark.integration
@pytest.mark.skipif(not os.environ.get("CORE_AZURE_CS_ENDPOINT", None), reason="Azure endpoint not available")
@pytest.mark.skipif(not os.environ.get("CORE_AZURE_CS_API_KEY", None), reason="Azure credentials not available")
def test_run_with_store_full_path_false(self, test_files_path):
    """
    Check that `store_full_path=False` keeps only the file name in `meta["file_path"]`.

    Integration test: skipped unless both the Azure endpoint and API key
    environment variables are set.
    """
    docx_source = test_files_path / "docx" / "sample_docx.docx"
    converter = AzureOCRDocumentConverter(
        endpoint=os.environ["CORE_AZURE_CS_ENDPOINT"],
        api_key=Secret.from_env_var("CORE_AZURE_CS_API_KEY"),
        store_full_path=False,
    )

    result = converter.run(sources=[docx_source])
    docs = result["documents"]

    assert len(docs) == 1
    converted = docs[0]
    assert "Sample Docx File" in converted.content
    # Only the base name is expected, not the directory part of the source path.
    assert converted.meta["file_path"] == "sample_docx.docx"

@patch("haystack.utils.auth.EnvVarSecret.resolve_value")
def test_hashing_dataframe(self, mock_resolve_value):
mock_resolve_value.return_value = "test_api_key"
Expand Down
23 changes: 22 additions & 1 deletion test/components/converters/test_pypdf_to_document.py
Original file line number Diff line number Diff line change
Expand Up @@ -80,11 +80,12 @@ def test_to_dict(self, pypdf_component):
"layout_mode_scale_weight": 1.25,
"layout_mode_strip_rotated": True,
"layout_mode_font_height_weight": 1.0,
"store_full_path": True,
},
}

def test_to_dict_custom_converter(self):
pypdf_component = PyPDFToDocument(converter=CustomConverter())
pypdf_component = PyPDFToDocument(converter=CustomConverter(), store_full_path=False)
data = pypdf_component.to_dict()
assert data == {
"type": "haystack.components.converters.pypdf.PyPDFToDocument",
Expand All @@ -100,6 +101,7 @@ def test_to_dict_custom_converter(self):
"layout_mode_scale_weight": 1.25,
"layout_mode_strip_rotated": True,
"layout_mode_font_height_weight": 1.0,
"store_full_path": False,
},
}

Expand Down Expand Up @@ -214,6 +216,25 @@ def test_run_with_meta(self, test_files_path, pypdf_component):
assert output["documents"][0].meta["language"] == "it"
assert output["documents"][1].meta["language"] == "it"

def test_run_with_store_full_path_false(self, test_files_path):
    """
    Check how `store_full_path` controls the `file_path` entry of the document metadata.

    With `store_full_path=True` the full source path is expected; with
    `store_full_path=False` only the file name is expected.
    """
    pdf_source = test_files_path / "pdf" / "sample_pdf_1.pdf"

    # (store_full_path value, expected meta["file_path"]) - checked in the same order as before.
    expectations = (
        (True, str(pdf_source)),
        (False, "sample_pdf_1.pdf"),
    )

    for keep_full_path, expected_file_path in expectations:
        component = PyPDFToDocument(store_full_path=keep_full_path)
        output = component.run(sources=[pdf_source])
        produced = output["documents"]

        assert len(produced) == 1
        assert produced[0].meta["file_path"] == expected_file_path

def test_run_error_handling(self, test_files_path, pypdf_component, caplog):
"""
Test if the component correctly handles errors.
Expand Down

0 comments on commit 4c8eb54

Please sign in to comment.