update default value of 'store_full_path' to False in converters (#8619)

deepset-ai · Dec 10, 2024 · 21d53d0 · 21d53d0
1 parent c78eb9b
commit 21d53d0
Show file tree

Hide file tree

Showing 20 changed files with 88 additions and 57 deletions.
diff --git a/haystack/components/converters/azure.py b/haystack/components/converters/azure.py
@@ -61,7 +61,7 @@ def __init__(  # pylint: disable=too-many-positional-arguments
         merge_multiple_column_headers: bool = True,
         page_layout: Literal["natural", "single_column"] = "natural",
         threshold_y: Optional[float] = 0.05,
-        store_full_path: bool = True,
+        store_full_path: bool = False,
     ):
         """
         Creates an AzureOCRDocumentConverter component.

diff --git a/haystack/components/converters/csv.py b/haystack/components/converters/csv.py
@@ -36,7 +36,7 @@ class CSVToDocument:
     ```
     """
 
-    def __init__(self, encoding: str = "utf-8", store_full_path: bool = True):
+    def __init__(self, encoding: str = "utf-8", store_full_path: bool = False):
         """
         Creates a CSVToDocument component.
 

diff --git a/haystack/components/converters/docx.py b/haystack/components/converters/docx.py
@@ -109,7 +109,7 @@ class DOCXToDocument:
     ```
     """
 
-    def __init__(self, table_format: Union[str, DOCXTableFormat] = DOCXTableFormat.CSV, store_full_path: bool = True):
+    def __init__(self, table_format: Union[str, DOCXTableFormat] = DOCXTableFormat.CSV, store_full_path: bool = False):
         """
         Create a DOCXToDocument component.
 

diff --git a/haystack/components/converters/html.py b/haystack/components/converters/html.py
@@ -35,7 +35,7 @@ class HTMLToDocument:
     ```
     """
 
-    def __init__(self, extraction_kwargs: Optional[Dict[str, Any]] = None, store_full_path: bool = True):
+    def __init__(self, extraction_kwargs: Optional[Dict[str, Any]] = None, store_full_path: bool = False):
         """
         Create an HTMLToDocument component.
 

diff --git a/haystack/components/converters/json.py b/haystack/components/converters/json.py
@@ -95,7 +95,7 @@ def __init__(
         jq_schema: Optional[str] = None,
         content_key: Optional[str] = None,
         extra_meta_fields: Optional[Union[Set[str], Literal["*"]]] = None,
-        store_full_path: bool = True,
+        store_full_path: bool = False,
     ):
         """
         Creates a JSONConverter component.

diff --git a/haystack/components/converters/markdown.py b/haystack/components/converters/markdown.py
@@ -40,7 +40,7 @@ class MarkdownToDocument:
     ```
     """
 
-    def __init__(self, table_to_single_line: bool = False, progress_bar: bool = True, store_full_path: bool = True):
+    def __init__(self, table_to_single_line: bool = False, progress_bar: bool = True, store_full_path: bool = False):
         """
         Create a MarkdownToDocument component.
 

diff --git a/haystack/components/converters/pdfminer.py b/haystack/components/converters/pdfminer.py
@@ -48,7 +48,7 @@ def __init__(  # pylint: disable=too-many-positional-arguments
         boxes_flow: Optional[float] = 0.5,
         detect_vertical: bool = True,
         all_texts: bool = False,
-        store_full_path: bool = True,
+        store_full_path: bool = False,
     ) -> None:
         """
         Create a PDFMinerToDocument component.

diff --git a/haystack/components/converters/pptx.py b/haystack/components/converters/pptx.py
@@ -37,7 +37,7 @@ class PPTXToDocument:
     ```
     """
 
-    def __init__(self, store_full_path: bool = True):
+    def __init__(self, store_full_path: bool = False):
         """
         Create a PPTXToDocument component.
 

diff --git a/haystack/components/converters/pypdf.py b/haystack/components/converters/pypdf.py
@@ -79,7 +79,7 @@ def __init__(
         layout_mode_scale_weight: float = 1.25,
         layout_mode_strip_rotated: bool = True,
         layout_mode_font_height_weight: float = 1.0,
-        store_full_path: bool = True,
+        store_full_path: bool = False,
     ):
         """
         Create a PyPDFToDocument component.

diff --git a/haystack/components/converters/tika.py b/haystack/components/converters/tika.py
@@ -75,7 +75,7 @@ class TikaDocumentConverter:
     ```
     """
 
-    def __init__(self, tika_url: str = "http://localhost:9998/tika", store_full_path: bool = True):
+    def __init__(self, tika_url: str = "http://localhost:9998/tika", store_full_path: bool = False):
         """
         Create a TikaDocumentConverter component.
 

diff --git a/haystack/components/converters/txt.py b/haystack/components/converters/txt.py
@@ -36,7 +36,7 @@ class TextFileToDocument:
     ```
     """
 
-    def __init__(self, encoding: str = "utf-8", store_full_path: bool = True):
+    def __init__(self, encoding: str = "utf-8", store_full_path: bool = False):
         """
         Creates a TextFileToDocument component.
 

diff --git a/releasenotes/notes/update-store-full-path-default-value-129f701ba07b944b.yaml b/releasenotes/notes/update-store-full-path-default-value-129f701ba07b944b.yaml
@@ -0,0 +1,4 @@
+---
+upgrade:
+  - |
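+    The default value of `store_full_path` in converters is now `False`: the `file_path` metadata of a Document holds only the file name instead of the full path. Pass `store_full_path=True` to keep the previous behavior.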
diff --git a/test/components/converters/test_azure_ocr_doc_converter.py b/test/components/converters/test_azure_ocr_doc_converter.py
@@ -105,7 +105,7 @@ def test_to_dict(self, mock_resolve_value):
                 "page_layout": "natural",
                 "preceding_context_len": 3,
                 "threshold_y": 0.05,
-                "store_full_path": True,
+                "store_full_path": False,
             },
         }
 

diff --git a/test/components/converters/test_csv_to_document.py b/test/components/converters/test_csv_to_document.py
@@ -5,6 +5,7 @@
 from unittest.mock import patch
 import pandas as pd
 from pathlib import Path
+import os
 
 import pytest
 
@@ -35,9 +36,9 @@ def test_run(self, test_files_path):
         assert len(docs) == 3
         assert "Name,Age\r\nJohn Doe,27\r\nJane Smith,37\r\nMike Johnson,47\r\n" == docs[0].content
         assert isinstance(docs[0].content, str)
-        assert docs[0].meta == bytestream.meta
-        assert docs[1].meta["file_path"] == str(files[1])
-        assert docs[2].meta["file_path"] == str(files[2])
+        assert docs[0].meta == {"file_path": os.path.basename(bytestream.meta["file_path"]), "key": "value"}
+        assert docs[1].meta["file_path"] == os.path.basename(files[1])
+        assert docs[2].meta["file_path"] == os.path.basename(files[2])
 
     def test_run_with_store_full_path_false(self, test_files_path):
         """
@@ -73,7 +74,7 @@ def test_run_error_handling(self, test_files_path, caplog):
             assert "non_existing_file.csv" in caplog.text
         docs = output["documents"]
         assert len(docs) == 2
-        assert docs[0].meta["file_path"] == str(paths[0])
+        assert docs[0].meta["file_path"] == os.path.basename(paths[0])
 
     def test_encoding_override(self, test_files_path, caplog):
         """

diff --git a/test/components/converters/test_docx_file_to_document.py b/test/components/converters/test_docx_file_to_document.py
@@ -1,4 +1,5 @@
 import json
+import os
 import logging
 import pytest
 import csv
@@ -32,36 +33,36 @@ def test_to_dict(self):
         data = converter.to_dict()
         assert data == {
             "type": "haystack.components.converters.docx.DOCXToDocument",
-            "init_parameters": {"store_full_path": True, "table_format": "csv"},
+            "init_parameters": {"store_full_path": False, "table_format": "csv"},
         }
 
     def test_to_dict_custom_parameters(self):
         converter = DOCXToDocument(table_format="markdown")
         data = converter.to_dict()
         assert data == {
             "type": "haystack.components.converters.docx.DOCXToDocument",
-            "init_parameters": {"store_full_path": True, "table_format": "markdown"},
+            "init_parameters": {"store_full_path": False, "table_format": "markdown"},
         }
 
         converter = DOCXToDocument(table_format="csv")
         data = converter.to_dict()
         assert data == {
             "type": "haystack.components.converters.docx.DOCXToDocument",
-            "init_parameters": {"store_full_path": True, "table_format": "csv"},
+            "init_parameters": {"store_full_path": False, "table_format": "csv"},
         }
 
         converter = DOCXToDocument(table_format=DOCXTableFormat.MARKDOWN)
         data = converter.to_dict()
         assert data == {
             "type": "haystack.components.converters.docx.DOCXToDocument",
-            "init_parameters": {"store_full_path": True, "table_format": "markdown"},
+            "init_parameters": {"store_full_path": False, "table_format": "markdown"},
         }
 
         converter = DOCXToDocument(table_format=DOCXTableFormat.CSV)
         data = converter.to_dict()
         assert data == {
             "type": "haystack.components.converters.docx.DOCXToDocument",
-            "init_parameters": {"store_full_path": True, "table_format": "csv"},
+            "init_parameters": {"store_full_path": False, "table_format": "csv"},
         }
 
     def test_from_dict(self):
@@ -119,7 +120,7 @@ def test_run(self, test_files_path, docx_converter):
         assert "History" in docs[0].content
         assert docs[0].meta.keys() == {"file_path", "docx"}
         assert docs[0].meta == {
-            "file_path": str(paths[0]),
+            "file_path": os.path.basename(paths[0]),
             "docx": DOCXMetadata(
                 author="Microsoft Office User",
                 category="",
@@ -151,7 +152,7 @@ def test_run_with_table(self, test_files_path):
         assert "Donald Trump" in docs[0].content  ## :-)
         assert docs[0].meta.keys() == {"file_path", "docx"}
         assert docs[0].meta == {
-            "file_path": str(paths[0]),
+            "file_path": os.path.basename(paths[0]),
             "docx": DOCXMetadata(
                 author="Saha, Anirban",
                 category="",
@@ -283,7 +284,7 @@ def test_run_with_additional_meta(self, test_files_path, docx_converter):
         output = docx_converter.run(sources=paths, meta={"language": "it", "author": "test_author"})
         doc = output["documents"][0]
         assert doc.meta == {
-            "file_path": str(paths[0]),
+            "file_path": os.path.basename(paths[0]),
             "docx": DOCXMetadata(
                 author="Microsoft Office User",
                 category="",

diff --git a/test/components/converters/test_html_to_document.py b/test/components/converters/test_html_to_document.py
@@ -42,7 +42,7 @@ def test_run_with_store_full_path(self, test_files_path):
         """
         Test if the component runs correctly when metadata is supplied by the user.
         """
-        converter = HTMLToDocument()
+        converter = HTMLToDocument(store_full_path=True)
         sources = [test_files_path / "html" / "what_is_haystack.html"]
 
         results = converter.run(sources=sources)  # store_full_path=True was passed explicitly; the default is now False

diff --git a/test/components/converters/test_json.py b/test/components/converters/test_json.py
@@ -3,6 +3,7 @@
 # SPDX-License-Identifier: Apache-2.0
 
 import json
+import os
 from unittest.mock import patch
 from pathlib import Path
 import logging
@@ -104,7 +105,7 @@ def test_to_dict():
             "content_key": "motivation",
             "jq_schema": ".laureates[]",
             "extra_meta_fields": {"firstname", "surname"},
-            "store_full_path": True,
+            "store_full_path": False,
         },
     }
 
@@ -145,11 +146,11 @@ def test_run(tmpdir):
         == "Dario Fokin who emulates the jesters of the Middle Ages in scourging authority and "
         "upholding the dignity of the downtrodden"
     )
-    assert result["documents"][0].meta == {"file_path": str(first_test_file)}
+    assert result["documents"][0].meta == {"file_path": os.path.basename(first_test_file)}
     assert result["documents"][1].content == "Stanley Cohen for their discoveries of growth factors"
-    assert result["documents"][1].meta == {"file_path": str(second_test_file)}
+    assert result["documents"][1].meta == {"file_path": os.path.basename(second_test_file)}
     assert result["documents"][2].content == "Rita Levi-Montalcini for their discoveries of growth factors"
-    assert result["documents"][2].meta == {"file_path": str(second_test_file)}
+    assert result["documents"][2].meta == {"file_path": os.path.basename(second_test_file)}
     assert (
         result["documents"][3].content == "Enrico Fermi for his demonstrations of the existence of new "
         "radioactive elements produced by neutron irradiation, and for his related discovery of nuclear "
@@ -254,11 +255,20 @@ def test_run_with_single_meta(tmpdir):
         == "Dario Fokin who emulates the jesters of the Middle Ages in scourging authority and "
         "upholding the dignity of the downtrodden"
     )
-    assert result["documents"][0].meta == {"file_path": str(first_test_file), "creation_date": "1945-05-25T00:00:00"}
+    assert result["documents"][0].meta == {
+        "file_path": os.path.basename(first_test_file),
+        "creation_date": "1945-05-25T00:00:00",
+    }
     assert result["documents"][1].content == "Stanley Cohen for their discoveries of growth factors"
-    assert result["documents"][1].meta == {"file_path": str(second_test_file), "creation_date": "1945-05-25T00:00:00"}
+    assert result["documents"][1].meta == {
+        "file_path": os.path.basename(second_test_file),
+        "creation_date": "1945-05-25T00:00:00",
+    }
     assert result["documents"][2].content == "Rita Levi-Montalcini for their discoveries of growth factors"
-    assert result["documents"][2].meta == {"file_path": str(second_test_file), "creation_date": "1945-05-25T00:00:00"}
+    assert result["documents"][2].meta == {
+        "file_path": os.path.basename(second_test_file),
+        "creation_date": "1945-05-25T00:00:00",
+    }
     assert (
         result["documents"][3].content == "Enrico Fermi for his demonstrations of the existence of new "
         "radioactive elements produced by neutron irradiation, and for his related discovery of nuclear "
@@ -290,11 +300,20 @@ def test_run_with_meta_list(tmpdir):
         == "Dario Fokin who emulates the jesters of the Middle Ages in scourging authority and "
         "upholding the dignity of the downtrodden"
     )
-    assert result["documents"][0].meta == {"file_path": str(first_test_file), "creation_date": "1945-05-25T00:00:00"}
+    assert result["documents"][0].meta == {
+        "file_path": os.path.basename(first_test_file),
+        "creation_date": "1945-05-25T00:00:00",
+    }
     assert result["documents"][1].content == "Stanley Cohen for their discoveries of growth factors"
-    assert result["documents"][1].meta == {"file_path": str(second_test_file), "creation_date": "1943-09-03T00:00:00"}
+    assert result["documents"][1].meta == {
+        "file_path": os.path.basename(second_test_file),
+        "creation_date": "1943-09-03T00:00:00",
+    }
     assert result["documents"][2].content == "Rita Levi-Montalcini for their discoveries of growth factors"
-    assert result["documents"][2].meta == {"file_path": str(second_test_file), "creation_date": "1943-09-03T00:00:00"}
+    assert result["documents"][2].meta == {
+        "file_path": os.path.basename(second_test_file),
+        "creation_date": "1943-09-03T00:00:00",
+    }
     assert (
         result["documents"][3].content == "Enrico Fermi for his demonstrations of the existence of new "
         "radioactive elements produced by neutron irradiation, and for his related discovery of nuclear "
@@ -329,11 +348,11 @@ def test_run_with_jq_schema_and_content_key(tmpdir):
         result["documents"][0].content == "who emulates the jesters of the Middle Ages in scourging authority and "
         "upholding the dignity of the downtrodden"
     )
-    assert result["documents"][0].meta == {"file_path": str(first_test_file)}
+    assert result["documents"][0].meta == {"file_path": os.path.basename(first_test_file)}
     assert result["documents"][1].content == "for their discoveries of growth factors"
-    assert result["documents"][1].meta == {"file_path": str(second_test_file)}
+    assert result["documents"][1].meta == {"file_path": os.path.basename(second_test_file)}
     assert result["documents"][2].content == "for their discoveries of growth factors"
-    assert result["documents"][2].meta == {"file_path": str(second_test_file)}
+    assert result["documents"][2].meta == {"file_path": os.path.basename(second_test_file)}
     assert (
         result["documents"][3].content == "for his demonstrations of the existence of new "
         "radioactive elements produced by neutron irradiation, and for his related discovery of nuclear "
@@ -361,16 +380,20 @@ def test_run_with_jq_schema_content_key_and_extra_meta_fields(tmpdir):
         result["documents"][0].content == "who emulates the jesters of the Middle Ages in scourging authority and "
         "upholding the dignity of the downtrodden"
     )
-    assert result["documents"][0].meta == {"file_path": str(first_test_file), "firstname": "Dario", "surname": "Fokin"}
+    assert result["documents"][0].meta == {
+        "file_path": os.path.basename(first_test_file),
+        "firstname": "Dario",
+        "surname": "Fokin",
+    }
     assert result["documents"][1].content == "for their discoveries of growth factors"
     assert result["documents"][1].meta == {
-        "file_path": str(second_test_file),
+        "file_path": os.path.basename(second_test_file),
         "firstname": "Stanley",
         "surname": "Cohen",
     }
     assert result["documents"][2].content == "for their discoveries of growth factors"
     assert result["documents"][2].meta == {
-        "file_path": str(second_test_file),
+        "file_path": os.path.basename(second_test_file),
         "firstname": "Rita",
         "surname": "Levi-Montalcini",
     }
@@ -396,9 +419,9 @@ def test_run_with_content_key(tmpdir):
     assert len(result) == 1
     assert len(result["documents"]) == 3
     assert result["documents"][0].content == "literature"
-    assert result["documents"][0].meta == {"file_path": str(first_test_file)}
+    assert result["documents"][0].meta == {"file_path": os.path.basename(first_test_file)}
     assert result["documents"][1].content == "medicine"
-    assert result["documents"][1].meta == {"file_path": str(second_test_file)}
+    assert result["documents"][1].meta == {"file_path": os.path.basename(second_test_file)}
     assert result["documents"][2].content == "physics"
     assert result["documents"][2].meta == {}
 
@@ -417,9 +440,9 @@ def test_run_with_content_key_and_extra_meta_fields(tmpdir):
     assert len(result) == 1
     assert len(result["documents"]) == 3
     assert result["documents"][0].content == "literature"
-    assert result["documents"][0].meta == {"file_path": str(first_test_file), "year": "1997"}
+    assert result["documents"][0].meta == {"file_path": os.path.basename(first_test_file), "year": "1997"}
     assert result["documents"][1].content == "medicine"
-    assert result["documents"][1].meta == {"file_path": str(second_test_file), "year": "1986"}
+    assert result["documents"][1].meta == {"file_path": os.path.basename(second_test_file), "year": "1986"}
     assert result["documents"][2].content == "physics"
     assert result["documents"][2].meta == {"year": "1938"}
 
@@ -442,23 +465,23 @@ def test_run_with_jq_schema_content_key_and_extra_meta_fields_literal(tmpdir):
         == "who emulates the jesters of the Middle Ages in scourging authority and upholding the dignity of the downtrodden"
     )
     assert result["documents"][0].meta == {
-        "file_path": str(first_test_file),
+        "file_path": os.path.basename(first_test_file),
         "id": "674",
         "firstname": "Dario",
         "surname": "Fokin",
         "share": "1",
     }
     assert result["documents"][1].content == "for their discoveries of growth factors"
     assert result["documents"][1].meta == {
-        "file_path": str(second_test_file),
+        "file_path": os.path.basename(second_test_file),
         "id": "434",
         "firstname": "Stanley",
         "surname": "Cohen",
         "share": "2",
     }
     assert result["documents"][2].content == "for their discoveries of growth factors"
     assert result["documents"][2].meta == {
-        "file_path": str(second_test_file),
+        "file_path": os.path.basename(second_test_file),
         "id": "435",
         "firstname": "Rita",
         "surname": "Levi-Montalcini",
-Original file line number
+Diff line change
@@ Expand Up / @@ -36,7 +36,7 @@ class CSVToDocument: @@
         ```
         """
-        def __init__(self, encoding: str = "utf-8", store_full_path: bool = True):
+        def __init__(self, encoding: str = "utf-8", store_full_path: bool = False):
             """
             Creates a CSVToDocument component.
@@ Expand Down @@