From 4c8eb540494224791651f57f430e38fa190c0707 Mon Sep 17 00:00:00 2001 From: Amna Mubashar Date: Tue, 3 Dec 2024 13:48:56 +0500 Subject: [PATCH] feat: Add store_full_path to converters (3/3) (#8585) * Add store_full_path params --- haystack/components/converters/azure.py | 19 ++++++++++++++- haystack/components/converters/pypdf.py | 15 ++++++++++++ ...-param-to-converters-5bd32a7561abfe78.yaml | 8 +++++++ .../test_azure_ocr_doc_converter.py | 19 +++++++++++++++ .../converters/test_pypdf_to_document.py | 23 ++++++++++++++++++- 5 files changed, 82 insertions(+), 2 deletions(-) create mode 100644 releasenotes/notes/add-store-full-path-param-to-converters-5bd32a7561abfe78.yaml diff --git a/haystack/components/converters/azure.py b/haystack/components/converters/azure.py index 94ca9714f3..edcc348d84 100644 --- a/haystack/components/converters/azure.py +++ b/haystack/components/converters/azure.py @@ -4,6 +4,8 @@ import copy import hashlib +import os +import warnings from collections import defaultdict from pathlib import Path from typing import Any, Dict, List, Literal, Optional, Union @@ -49,7 +51,7 @@ class AzureOCRDocumentConverter: ``` """ - def __init__( + def __init__( # pylint: disable=too-many-positional-arguments self, endpoint: str, api_key: Secret = Secret.from_env_var("AZURE_AI_API_KEY"), @@ -59,6 +61,7 @@ def __init__( merge_multiple_column_headers: bool = True, page_layout: Literal["natural", "single_column"] = "natural", threshold_y: Optional[float] = 0.05, + store_full_path: bool = True, ): """ Creates an AzureOCRDocumentConverter component. @@ -83,6 +86,9 @@ def __init__( The threshold, in inches, to determine if two recognized PDF elements are grouped into a single line. This is crucial for section headers or numbers which may be spatially separated from the remaining text on the horizontal axis. + :param store_full_path: + If True, the full path of the file is stored in the metadata of the document. + If False, only the file name is stored. """ azure_import.check() @@ -97,6 +103,7 @@ def __init__( self.merge_multiple_column_headers = merge_multiple_column_headers self.page_layout = page_layout self.threshold_y = threshold_y + self.store_full_path = store_full_path if self.page_layout == "single_column" and self.threshold_y is None: self.threshold_y = 0.05 @@ -136,6 +143,15 @@ def run(self, sources: List[Union[str, Path, ByteStream]], meta: Optional[List[D azure_output.append(result.to_dict()) merged_metadata = {**bytestream.meta, **metadata} + warnings.warn( + "The `store_full_path` parameter defaults to True, storing full file paths in metadata. " + "In the 2.9.0 release, the default value for `store_full_path` will change to False, " + "storing only file names to improve privacy.", + DeprecationWarning, + ) + + if not self.store_full_path and (file_path := bytestream.meta.get("file_path")): + merged_metadata["file_path"] = os.path.basename(file_path) docs = self._convert_tables_and_text(result=result, meta=merged_metadata) documents.extend(docs) @@ -158,6 +174,7 @@ def to_dict(self) -> Dict[str, Any]: merge_multiple_column_headers=self.merge_multiple_column_headers, page_layout=self.page_layout, threshold_y=self.threshold_y, + store_full_path=self.store_full_path, ) @classmethod diff --git a/haystack/components/converters/pypdf.py b/haystack/components/converters/pypdf.py index 555cf1df58..de55f68471 100644 --- a/haystack/components/converters/pypdf.py +++ b/haystack/components/converters/pypdf.py @@ -3,6 +3,7 @@ # SPDX-License-Identifier: Apache-2.0 import io +import os import warnings from enum import Enum from pathlib import Path @@ -99,6 +100,7 @@ def __init__( layout_mode_scale_weight: float = 1.25, layout_mode_strip_rotated: bool = True, layout_mode_font_height_weight: float = 1.0, + store_full_path: bool = True, ): """ Create an PyPDFToDocument component. @@ -131,6 +133,9 @@ def __init__( :param layout_mode_font_height_weight: Multiplier for font height when calculating blank line height. Ignored if `extraction_mode` is `PyPDFExtractionMode.PLAIN`. + :param store_full_path: + If True, the full path of the file is stored in the metadata of the document. + If False, only the file name is stored. """ pypdf_import.check() @@ -142,6 +147,7 @@ def __init__( warnings.warn(msg, DeprecationWarning) self.converter = converter + self.store_full_path = store_full_path if isinstance(extraction_mode, str): extraction_mode = PyPDFExtractionMode.from_str(extraction_mode) @@ -170,6 +176,7 @@ def to_dict(self): layout_mode_scale_weight=self.layout_mode_scale_weight, layout_mode_strip_rotated=self.layout_mode_strip_rotated, layout_mode_font_height_weight=self.layout_mode_font_height_weight, + store_full_path=self.store_full_path, ) @classmethod @@ -255,6 +262,14 @@ def run( ) merged_metadata = {**bytestream.meta, **metadata} + warnings.warn( + "The `store_full_path` parameter defaults to True, storing full file paths in metadata. " + "In the 2.9.0 release, the default value for `store_full_path` will change to False, " + "storing only file names to improve privacy.", + DeprecationWarning, + ) + if not self.store_full_path and (file_path := bytestream.meta.get("file_path")): + merged_metadata["file_path"] = os.path.basename(file_path) document.meta = merged_metadata documents.append(document) diff --git a/releasenotes/notes/add-store-full-path-param-to-converters-5bd32a7561abfe78.yaml b/releasenotes/notes/add-store-full-path-param-to-converters-5bd32a7561abfe78.yaml new file mode 100644 index 0000000000..a45d6e44f3 --- /dev/null +++ b/releasenotes/notes/add-store-full-path-param-to-converters-5bd32a7561abfe78.yaml @@ -0,0 +1,8 @@ +--- +features: + - | + Added a new `store_full_path` parameter to the `__init__` methods of `PyPDFToDocument` and `AzureOCRDocumentConverter`. The default value is `True`, which stores full file path in the metadata of the output documents. When set to `False`, only the file name is stored. + +deprecations: + - | + The default value of the `store_full_path` parameter in `PyPDFToDocument` and `AzureOCRDocumentConverter` will change to `False` in Haysatck 2.9.0 to enhance privacy. diff --git a/test/components/converters/test_azure_ocr_doc_converter.py b/test/components/converters/test_azure_ocr_doc_converter.py index 1ba7d33f5a..f8db0d734c 100644 --- a/test/components/converters/test_azure_ocr_doc_converter.py +++ b/test/components/converters/test_azure_ocr_doc_converter.py @@ -105,6 +105,7 @@ def test_to_dict(self, mock_resolve_value): "page_layout": "natural", "preceding_context_len": 3, "threshold_y": 0.05, + "store_full_path": True, }, } @@ -278,6 +279,9 @@ def test_with_image_file(self, test_files_path): @pytest.mark.skipif(not os.environ.get("CORE_AZURE_CS_ENDPOINT", None), reason="Azure endpoint not available") @pytest.mark.skipif(not os.environ.get("CORE_AZURE_CS_API_KEY", None), reason="Azure credentials not available") def test_run_with_docx_file(self, test_files_path): + """ + Test if the component runs correctly with store_full_path=False + """ component = AzureOCRDocumentConverter( endpoint=os.environ["CORE_AZURE_CS_ENDPOINT"], api_key=Secret.from_env_var("CORE_AZURE_CS_API_KEY") ) @@ -288,6 +292,21 @@ def test_run_with_docx_file(self, test_files_path): assert "Now we are in Page 2" in documents[0].content assert "Page 3 was empty this is page 4" in documents[0].content + @pytest.mark.integration + @pytest.mark.skipif(not os.environ.get("CORE_AZURE_CS_ENDPOINT", None), reason="Azure endpoint not available") + @pytest.mark.skipif(not os.environ.get("CORE_AZURE_CS_API_KEY", None), reason="Azure credentials not available") + def test_run_with_store_full_path_false(self, test_files_path): + component = AzureOCRDocumentConverter( + endpoint=os.environ["CORE_AZURE_CS_ENDPOINT"], + api_key=Secret.from_env_var("CORE_AZURE_CS_API_KEY"), + store_full_path=False, + ) + output = component.run(sources=[test_files_path / "docx" / "sample_docx.docx"]) + documents = output["documents"] + assert len(documents) == 1 + assert "Sample Docx File" in documents[0].content + assert documents[0].meta["file_path"] == "sample_docx.docx" + @patch("haystack.utils.auth.EnvVarSecret.resolve_value") def test_hashing_dataframe(self, mock_resolve_value): mock_resolve_value.return_value = "test_api_key" diff --git a/test/components/converters/test_pypdf_to_document.py b/test/components/converters/test_pypdf_to_document.py index d9aee0020f..e82b8029d4 100644 --- a/test/components/converters/test_pypdf_to_document.py +++ b/test/components/converters/test_pypdf_to_document.py @@ -80,11 +80,12 @@ def test_to_dict(self, pypdf_component): "layout_mode_scale_weight": 1.25, "layout_mode_strip_rotated": True, "layout_mode_font_height_weight": 1.0, + "store_full_path": True, }, } def test_to_dict_custom_converter(self): - pypdf_component = PyPDFToDocument(converter=CustomConverter()) + pypdf_component = PyPDFToDocument(converter=CustomConverter(), store_full_path=False) data = pypdf_component.to_dict() assert data == { "type": "haystack.components.converters.pypdf.PyPDFToDocument", @@ -100,6 +101,7 @@ def test_to_dict_custom_converter(self): "layout_mode_scale_weight": 1.25, "layout_mode_strip_rotated": True, "layout_mode_font_height_weight": 1.0, + "store_full_path": False, }, } @@ -214,6 +216,25 @@ def test_run_with_meta(self, test_files_path, pypdf_component): assert output["documents"][0].meta["language"] == "it" assert output["documents"][1].meta["language"] == "it" + def test_run_with_store_full_path_false(self, test_files_path): + """ + Test if the component runs correctly with store_full_path=False + """ + sources = [test_files_path / "pdf" / "sample_pdf_1.pdf"] + converter = PyPDFToDocument(store_full_path=True) + results = converter.run(sources=sources) + docs = results["documents"] + + assert len(docs) == 1 + assert docs[0].meta["file_path"] == str(sources[0]) + + converter = PyPDFToDocument(store_full_path=False) + results = converter.run(sources=sources) + docs = results["documents"] + + assert len(docs) == 1 + assert docs[0].meta["file_path"] == "sample_pdf_1.pdf" + def test_run_error_handling(self, test_files_path, pypdf_component, caplog): """ Test if the component correctly handles errors.