feat: Add store_full_path to converters (3/3) (#8585)

* Add store_full_path params
deepset-ai · Dec 3, 2024 · 4c8eb54 · 4c8eb54
1 parent de7099e
commit 4c8eb54
Show file tree

Hide file tree

Showing 5 changed files with 82 additions and 2 deletions.
diff --git a/haystack/components/converters/azure.py b/haystack/components/converters/azure.py
@@ -4,6 +4,8 @@
 
 import copy
 import hashlib
+import os
+import warnings
 from collections import defaultdict
 from pathlib import Path
 from typing import Any, Dict, List, Literal, Optional, Union
@@ -49,7 +51,7 @@ class AzureOCRDocumentConverter:
     ```
     """
 
-    def __init__(
+    def __init__(  # pylint: disable=too-many-positional-arguments
         self,
         endpoint: str,
         api_key: Secret = Secret.from_env_var("AZURE_AI_API_KEY"),
@@ -59,6 +61,7 @@ def __init__(
         merge_multiple_column_headers: bool = True,
         page_layout: Literal["natural", "single_column"] = "natural",
         threshold_y: Optional[float] = 0.05,
+        store_full_path: bool = True,
     ):
         """
         Creates an AzureOCRDocumentConverter component.
@@ -83,6 +86,9 @@ def __init__(
             The threshold, in inches, to determine if two recognized PDF elements are grouped into a
             single line. This is crucial for section headers or numbers which may be spatially separated
             from the remaining text on the horizontal axis.
+        :param store_full_path:
+            If True, the full path of the file is stored in the metadata of the document.
+            If False, only the file name is stored.
         """
         azure_import.check()
 
@@ -97,6 +103,7 @@ def __init__(
         self.merge_multiple_column_headers = merge_multiple_column_headers
         self.page_layout = page_layout
         self.threshold_y = threshold_y
+        self.store_full_path = store_full_path
         if self.page_layout == "single_column" and self.threshold_y is None:
             self.threshold_y = 0.05
 
@@ -136,6 +143,15 @@ def run(self, sources: List[Union[str, Path, ByteStream]], meta: Optional[List[D
             azure_output.append(result.to_dict())
 
             merged_metadata = {**bytestream.meta, **metadata}
+            warnings.warn(
+                "The `store_full_path` parameter defaults to True, storing full file paths in metadata. "
+                "In the 2.9.0 release, the default value for `store_full_path` will change to False, "
+                "storing only file names to improve privacy.",
+                DeprecationWarning,
+            )
+
+            if not self.store_full_path and (file_path := bytestream.meta.get("file_path")):
+                merged_metadata["file_path"] = os.path.basename(file_path)
             docs = self._convert_tables_and_text(result=result, meta=merged_metadata)
             documents.extend(docs)
 
@@ -158,6 +174,7 @@ def to_dict(self) -> Dict[str, Any]:
             merge_multiple_column_headers=self.merge_multiple_column_headers,
             page_layout=self.page_layout,
             threshold_y=self.threshold_y,
+            store_full_path=self.store_full_path,
         )
 
     @classmethod

diff --git a/haystack/components/converters/pypdf.py b/haystack/components/converters/pypdf.py
@@ -3,6 +3,7 @@
 # SPDX-License-Identifier: Apache-2.0
 
 import io
+import os
 import warnings
 from enum import Enum
 from pathlib import Path
@@ -99,6 +100,7 @@ def __init__(
         layout_mode_scale_weight: float = 1.25,
         layout_mode_strip_rotated: bool = True,
         layout_mode_font_height_weight: float = 1.0,
+        store_full_path: bool = True,
     ):
         """
         Create an PyPDFToDocument component.
@@ -131,6 +133,9 @@ def __init__(
         :param layout_mode_font_height_weight:
             Multiplier for font height when calculating blank line height.
             Ignored if `extraction_mode` is `PyPDFExtractionMode.PLAIN`.
+        :param store_full_path:
+            If True, the full path of the file is stored in the metadata of the document.
+            If False, only the file name is stored.
         """
         pypdf_import.check()
 
@@ -142,6 +147,7 @@ def __init__(
             warnings.warn(msg, DeprecationWarning)
 
         self.converter = converter
+        self.store_full_path = store_full_path
 
         if isinstance(extraction_mode, str):
             extraction_mode = PyPDFExtractionMode.from_str(extraction_mode)
@@ -170,6 +176,7 @@ def to_dict(self):
             layout_mode_scale_weight=self.layout_mode_scale_weight,
             layout_mode_strip_rotated=self.layout_mode_strip_rotated,
             layout_mode_font_height_weight=self.layout_mode_font_height_weight,
+            store_full_path=self.store_full_path,
         )
 
     @classmethod
@@ -255,6 +262,14 @@ def run(
                 )
 
             merged_metadata = {**bytestream.meta, **metadata}
+            warnings.warn(
+                "The `store_full_path` parameter defaults to True, storing full file paths in metadata. "
+                "In the 2.9.0 release, the default value for `store_full_path` will change to False, "
+                "storing only file names to improve privacy.",
+                DeprecationWarning,
+            )
+            if not self.store_full_path and (file_path := bytestream.meta.get("file_path")):
+                merged_metadata["file_path"] = os.path.basename(file_path)
             document.meta = merged_metadata
             documents.append(document)
 

diff --git a/releasenotes/notes/add-store-full-path-param-to-converters-5bd32a7561abfe78.yaml b/releasenotes/notes/add-store-full-path-param-to-converters-5bd32a7561abfe78.yaml
@@ -0,0 +1,8 @@
+---
+features:
+  - |
+    Added a new `store_full_path` parameter to the `__init__` methods of `PyPDFToDocument` and `AzureOCRDocumentConverter`. The default value is `True`, which stores full file path in the metadata of the output documents. When set to `False`, only the file name is stored.
+
+deprecations:
+  - |
+    The default value of the `store_full_path` parameter in `PyPDFToDocument` and `AzureOCRDocumentConverter` will change to `False` in Haysatck 2.9.0 to enhance privacy.
diff --git a/test/components/converters/test_azure_ocr_doc_converter.py b/test/components/converters/test_azure_ocr_doc_converter.py
@@ -105,6 +105,7 @@ def test_to_dict(self, mock_resolve_value):
                 "page_layout": "natural",
                 "preceding_context_len": 3,
                 "threshold_y": 0.05,
+                "store_full_path": True,
             },
         }
 
@@ -278,6 +279,9 @@ def test_with_image_file(self, test_files_path):
     @pytest.mark.skipif(not os.environ.get("CORE_AZURE_CS_ENDPOINT", None), reason="Azure endpoint not available")
     @pytest.mark.skipif(not os.environ.get("CORE_AZURE_CS_API_KEY", None), reason="Azure credentials not available")
     def test_run_with_docx_file(self, test_files_path):
+        """
+        Test if the component runs correctly with store_full_path=False
+        """
         component = AzureOCRDocumentConverter(
             endpoint=os.environ["CORE_AZURE_CS_ENDPOINT"], api_key=Secret.from_env_var("CORE_AZURE_CS_API_KEY")
         )
@@ -288,6 +292,21 @@ def test_run_with_docx_file(self, test_files_path):
         assert "Now we are in Page 2" in documents[0].content
         assert "Page 3 was empty this is page 4" in documents[0].content
 
+    @pytest.mark.integration
+    @pytest.mark.skipif(not os.environ.get("CORE_AZURE_CS_ENDPOINT", None), reason="Azure endpoint not available")
+    @pytest.mark.skipif(not os.environ.get("CORE_AZURE_CS_API_KEY", None), reason="Azure credentials not available")
+    def test_run_with_store_full_path_false(self, test_files_path):
+        component = AzureOCRDocumentConverter(
+            endpoint=os.environ["CORE_AZURE_CS_ENDPOINT"],
+            api_key=Secret.from_env_var("CORE_AZURE_CS_API_KEY"),
+            store_full_path=False,
+        )
+        output = component.run(sources=[test_files_path / "docx" / "sample_docx.docx"])
+        documents = output["documents"]
+        assert len(documents) == 1
+        assert "Sample Docx File" in documents[0].content
+        assert documents[0].meta["file_path"] == "sample_docx.docx"
+
     @patch("haystack.utils.auth.EnvVarSecret.resolve_value")
     def test_hashing_dataframe(self, mock_resolve_value):
         mock_resolve_value.return_value = "test_api_key"

diff --git a/test/components/converters/test_pypdf_to_document.py b/test/components/converters/test_pypdf_to_document.py
@@ -80,11 +80,12 @@ def test_to_dict(self, pypdf_component):
                 "layout_mode_scale_weight": 1.25,
                 "layout_mode_strip_rotated": True,
                 "layout_mode_font_height_weight": 1.0,
+                "store_full_path": True,
             },
         }
 
     def test_to_dict_custom_converter(self):
-        pypdf_component = PyPDFToDocument(converter=CustomConverter())
+        pypdf_component = PyPDFToDocument(converter=CustomConverter(), store_full_path=False)
         data = pypdf_component.to_dict()
         assert data == {
             "type": "haystack.components.converters.pypdf.PyPDFToDocument",
@@ -100,6 +101,7 @@ def test_to_dict_custom_converter(self):
                 "layout_mode_scale_weight": 1.25,
                 "layout_mode_strip_rotated": True,
                 "layout_mode_font_height_weight": 1.0,
+                "store_full_path": False,
             },
         }
 
@@ -214,6 +216,25 @@ def test_run_with_meta(self, test_files_path, pypdf_component):
         assert output["documents"][0].meta["language"] == "it"
         assert output["documents"][1].meta["language"] == "it"
 
+    def test_run_with_store_full_path_false(self, test_files_path):
+        """
+        Test if the component runs correctly with store_full_path=False
+        """
+        sources = [test_files_path / "pdf" / "sample_pdf_1.pdf"]
+        converter = PyPDFToDocument(store_full_path=True)
+        results = converter.run(sources=sources)
+        docs = results["documents"]
+
+        assert len(docs) == 1
+        assert docs[0].meta["file_path"] == str(sources[0])
+
+        converter = PyPDFToDocument(store_full_path=False)
+        results = converter.run(sources=sources)
+        docs = results["documents"]
+
+        assert len(docs) == 1
+        assert docs[0].meta["file_path"] == "sample_pdf_1.pdf"
+
     def test_run_error_handling(self, test_files_path, pypdf_component, caplog):
         """
         Test if the component correctly handles errors.