diff --git a/haystack/components/converters/azure.py b/haystack/components/converters/azure.py index edcc348d84..0c1172e6dc 100644 --- a/haystack/components/converters/azure.py +++ b/haystack/components/converters/azure.py @@ -61,7 +61,7 @@ def __init__( # pylint: disable=too-many-positional-arguments merge_multiple_column_headers: bool = True, page_layout: Literal["natural", "single_column"] = "natural", threshold_y: Optional[float] = 0.05, - store_full_path: bool = True, + store_full_path: bool = False, ): """ Creates an AzureOCRDocumentConverter component. diff --git a/haystack/components/converters/csv.py b/haystack/components/converters/csv.py index 2b36d4cc67..1a007dc2ea 100644 --- a/haystack/components/converters/csv.py +++ b/haystack/components/converters/csv.py @@ -36,7 +36,7 @@ class CSVToDocument: ``` """ - def __init__(self, encoding: str = "utf-8", store_full_path: bool = True): + def __init__(self, encoding: str = "utf-8", store_full_path: bool = False): """ Creates a CSVToDocument component. diff --git a/haystack/components/converters/docx.py b/haystack/components/converters/docx.py index 604b1a5173..fcd7cbf33f 100644 --- a/haystack/components/converters/docx.py +++ b/haystack/components/converters/docx.py @@ -109,7 +109,7 @@ class DOCXToDocument: ``` """ - def __init__(self, table_format: Union[str, DOCXTableFormat] = DOCXTableFormat.CSV, store_full_path: bool = True): + def __init__(self, table_format: Union[str, DOCXTableFormat] = DOCXTableFormat.CSV, store_full_path: bool = False): """ Create a DOCXToDocument component. diff --git a/haystack/components/converters/html.py b/haystack/components/converters/html.py index 4c226ab99f..79baecac4f 100644 --- a/haystack/components/converters/html.py +++ b/haystack/components/converters/html.py @@ -35,7 +35,7 @@ class HTMLToDocument: ``` """ - def __init__(self, extraction_kwargs: Optional[Dict[str, Any]] = None, store_full_path: bool = True): + def __init__(self, extraction_kwargs: Optional[Dict[str, Any]] = None, store_full_path: bool = False): """ Create an HTMLToDocument component. diff --git a/haystack/components/converters/json.py b/haystack/components/converters/json.py index e6fc56460a..8a39237035 100644 --- a/haystack/components/converters/json.py +++ b/haystack/components/converters/json.py @@ -95,7 +95,7 @@ def __init__( jq_schema: Optional[str] = None, content_key: Optional[str] = None, extra_meta_fields: Optional[Union[Set[str], Literal["*"]]] = None, - store_full_path: bool = True, + store_full_path: bool = False, ): """ Creates a JSONConverter component. diff --git a/haystack/components/converters/markdown.py b/haystack/components/converters/markdown.py index 0bd722c704..cf57af557c 100644 --- a/haystack/components/converters/markdown.py +++ b/haystack/components/converters/markdown.py @@ -40,7 +40,7 @@ class MarkdownToDocument: ``` """ - def __init__(self, table_to_single_line: bool = False, progress_bar: bool = True, store_full_path: bool = True): + def __init__(self, table_to_single_line: bool = False, progress_bar: bool = True, store_full_path: bool = False): """ Create a MarkdownToDocument component. diff --git a/haystack/components/converters/pdfminer.py b/haystack/components/converters/pdfminer.py index a66fde927e..fe9a28cad7 100644 --- a/haystack/components/converters/pdfminer.py +++ b/haystack/components/converters/pdfminer.py @@ -48,7 +48,7 @@ def __init__( # pylint: disable=too-many-positional-arguments boxes_flow: Optional[float] = 0.5, detect_vertical: bool = True, all_texts: bool = False, - store_full_path: bool = True, + store_full_path: bool = False, ) -> None: """ Create a PDFMinerToDocument component. diff --git a/haystack/components/converters/pptx.py b/haystack/components/converters/pptx.py index f671acabf4..468d843bd3 100644 --- a/haystack/components/converters/pptx.py +++ b/haystack/components/converters/pptx.py @@ -37,7 +37,7 @@ class PPTXToDocument: ``` """ - def __init__(self, store_full_path: bool = True): + def __init__(self, store_full_path: bool = False): """ Create an PPTXToDocument component. diff --git a/haystack/components/converters/pypdf.py b/haystack/components/converters/pypdf.py index 2086873c4f..df8be1ad79 100644 --- a/haystack/components/converters/pypdf.py +++ b/haystack/components/converters/pypdf.py @@ -79,7 +79,7 @@ def __init__( layout_mode_scale_weight: float = 1.25, layout_mode_strip_rotated: bool = True, layout_mode_font_height_weight: float = 1.0, - store_full_path: bool = True, + store_full_path: bool = False, ): """ Create an PyPDFToDocument component. diff --git a/haystack/components/converters/tika.py b/haystack/components/converters/tika.py index 7a8fdb36b6..a6a27f584d 100644 --- a/haystack/components/converters/tika.py +++ b/haystack/components/converters/tika.py @@ -75,7 +75,7 @@ class TikaDocumentConverter: ``` """ - def __init__(self, tika_url: str = "http://localhost:9998/tika", store_full_path: bool = True): + def __init__(self, tika_url: str = "http://localhost:9998/tika", store_full_path: bool = False): """ Create a TikaDocumentConverter component. diff --git a/haystack/components/converters/txt.py b/haystack/components/converters/txt.py index 8d7a12a4f5..ea29e3f078 100644 --- a/haystack/components/converters/txt.py +++ b/haystack/components/converters/txt.py @@ -36,7 +36,7 @@ class TextFileToDocument: ``` """ - def __init__(self, encoding: str = "utf-8", store_full_path: bool = True): + def __init__(self, encoding: str = "utf-8", store_full_path: bool = False): """ Creates a TextFileToDocument component. diff --git a/releasenotes/notes/update-store-full-path-default-value-129f701ba07b944b.yaml b/releasenotes/notes/update-store-full-path-default-value-129f701ba07b944b.yaml new file mode 100644 index 0000000000..61a0110958 --- /dev/null +++ b/releasenotes/notes/update-store-full-path-default-value-129f701ba07b944b.yaml @@ -0,0 +1,4 @@ +--- +upgrade: + - | + Update default value of `store_full_path` to `False` in converters diff --git a/test/components/converters/test_azure_ocr_doc_converter.py b/test/components/converters/test_azure_ocr_doc_converter.py index f8db0d734c..eef16ab8d9 100644 --- a/test/components/converters/test_azure_ocr_doc_converter.py +++ b/test/components/converters/test_azure_ocr_doc_converter.py @@ -105,7 +105,7 @@ def test_to_dict(self, mock_resolve_value): "page_layout": "natural", "preceding_context_len": 3, "threshold_y": 0.05, - "store_full_path": True, + "store_full_path": False, }, } diff --git a/test/components/converters/test_csv_to_document.py b/test/components/converters/test_csv_to_document.py index 3271c09c17..cfb955041b 100644 --- a/test/components/converters/test_csv_to_document.py +++ b/test/components/converters/test_csv_to_document.py @@ -5,6 +5,7 @@ from unittest.mock import patch import pandas as pd from pathlib import Path +import os import pytest @@ -35,9 +36,9 @@ def test_run(self, test_files_path): assert len(docs) == 3 assert "Name,Age\r\nJohn Doe,27\r\nJane Smith,37\r\nMike Johnson,47\r\n" == docs[0].content assert isinstance(docs[0].content, str) - assert docs[0].meta == bytestream.meta - assert docs[1].meta["file_path"] == str(files[1]) - assert docs[2].meta["file_path"] == str(files[2]) + assert docs[0].meta == {"file_path": os.path.basename(bytestream.meta["file_path"]), "key": "value"} + assert docs[1].meta["file_path"] == os.path.basename(files[1]) + assert docs[2].meta["file_path"] == os.path.basename(files[2]) def test_run_with_store_full_path_false(self, test_files_path): """ @@ -73,7 +74,7 @@ def test_run_error_handling(self, test_files_path, caplog): assert "non_existing_file.csv" in caplog.text docs = output["documents"] assert len(docs) == 2 - assert docs[0].meta["file_path"] == str(paths[0]) + assert docs[0].meta["file_path"] == os.path.basename(paths[0]) def test_encoding_override(self, test_files_path, caplog): """ diff --git a/test/components/converters/test_docx_file_to_document.py b/test/components/converters/test_docx_file_to_document.py index c0ce370f3e..9b4ee3fe60 100644 --- a/test/components/converters/test_docx_file_to_document.py +++ b/test/components/converters/test_docx_file_to_document.py @@ -1,4 +1,5 @@ import json +import os import logging import pytest import csv @@ -32,7 +33,7 @@ def test_to_dict(self): data = converter.to_dict() assert data == { "type": "haystack.components.converters.docx.DOCXToDocument", - "init_parameters": {"store_full_path": True, "table_format": "csv"}, + "init_parameters": {"store_full_path": False, "table_format": "csv"}, } def test_to_dict_custom_parameters(self): @@ -40,28 +41,28 @@ def test_to_dict_custom_parameters(self): data = converter.to_dict() assert data == { "type": "haystack.components.converters.docx.DOCXToDocument", - "init_parameters": {"store_full_path": True, "table_format": "markdown"}, + "init_parameters": {"store_full_path": False, "table_format": "markdown"}, } converter = DOCXToDocument(table_format="csv") data = converter.to_dict() assert data == { "type": "haystack.components.converters.docx.DOCXToDocument", - "init_parameters": {"store_full_path": True, "table_format": "csv"}, + "init_parameters": {"store_full_path": False, "table_format": "csv"}, } converter = DOCXToDocument(table_format=DOCXTableFormat.MARKDOWN) data = converter.to_dict() assert data == { "type": "haystack.components.converters.docx.DOCXToDocument", - "init_parameters": {"store_full_path": True, "table_format": "markdown"}, + "init_parameters": {"store_full_path": False, "table_format": "markdown"}, } converter = DOCXToDocument(table_format=DOCXTableFormat.CSV) data = converter.to_dict() assert data == { "type": "haystack.components.converters.docx.DOCXToDocument", - "init_parameters": {"store_full_path": True, "table_format": "csv"}, + "init_parameters": {"store_full_path": False, "table_format": "csv"}, } def test_from_dict(self): @@ -119,7 +120,7 @@ def test_run(self, test_files_path, docx_converter): assert "History" in docs[0].content assert docs[0].meta.keys() == {"file_path", "docx"} assert docs[0].meta == { - "file_path": str(paths[0]), + "file_path": os.path.basename(paths[0]), "docx": DOCXMetadata( author="Microsoft Office User", category="", @@ -151,7 +152,7 @@ def test_run_with_table(self, test_files_path): assert "Donald Trump" in docs[0].content ## :-) assert docs[0].meta.keys() == {"file_path", "docx"} assert docs[0].meta == { - "file_path": str(paths[0]), + "file_path": os.path.basename(paths[0]), "docx": DOCXMetadata( author="Saha, Anirban", category="", @@ -283,7 +284,7 @@ def test_run_with_additional_meta(self, test_files_path, docx_converter): output = docx_converter.run(sources=paths, meta={"language": "it", "author": "test_author"}) doc = output["documents"][0] assert doc.meta == { - "file_path": str(paths[0]), + "file_path": os.path.basename(paths[0]), "docx": DOCXMetadata( author="Microsoft Office User", category="", diff --git a/test/components/converters/test_html_to_document.py b/test/components/converters/test_html_to_document.py index df76c8e892..890c927434 100644 --- a/test/components/converters/test_html_to_document.py +++ b/test/components/converters/test_html_to_document.py @@ -42,7 +42,7 @@ def test_run_with_store_full_path(self, test_files_path): """ Test if the component runs correctly when metadata is supplied by the user. """ - converter = HTMLToDocument() + converter = HTMLToDocument(store_full_path=True) sources = [test_files_path / "html" / "what_is_haystack.html"] results = converter.run(sources=sources) # store_full_path is True by default diff --git a/test/components/converters/test_json.py b/test/components/converters/test_json.py index d85a0e187a..f9dcf2fa0c 100644 --- a/test/components/converters/test_json.py +++ b/test/components/converters/test_json.py @@ -3,6 +3,7 @@ # SPDX-License-Identifier: Apache-2.0 import json +import os from unittest.mock import patch from pathlib import Path import logging @@ -104,7 +105,7 @@ def test_to_dict(): "content_key": "motivation", "jq_schema": ".laureates[]", "extra_meta_fields": {"firstname", "surname"}, - "store_full_path": True, + "store_full_path": False, }, } @@ -145,11 +146,11 @@ def test_run(tmpdir): == "Dario Fokin who emulates the jesters of the Middle Ages in scourging authority and " "upholding the dignity of the downtrodden" ) - assert result["documents"][0].meta == {"file_path": str(first_test_file)} + assert result["documents"][0].meta == {"file_path": os.path.basename(first_test_file)} assert result["documents"][1].content == "Stanley Cohen for their discoveries of growth factors" - assert result["documents"][1].meta == {"file_path": str(second_test_file)} + assert result["documents"][1].meta == {"file_path": os.path.basename(second_test_file)} assert result["documents"][2].content == "Rita Levi-Montalcini for their discoveries of growth factors" - assert result["documents"][2].meta == {"file_path": str(second_test_file)} + assert result["documents"][2].meta == {"file_path": os.path.basename(second_test_file)} assert ( result["documents"][3].content == "Enrico Fermi for his demonstrations of the existence of new " "radioactive elements produced by neutron irradiation, and for his related discovery of nuclear " @@ -254,11 +255,20 @@ def test_run_with_single_meta(tmpdir): == "Dario Fokin who emulates the jesters of the Middle Ages in scourging authority and " "upholding the dignity of the downtrodden" ) - assert result["documents"][0].meta == {"file_path": str(first_test_file), "creation_date": "1945-05-25T00:00:00"} + assert result["documents"][0].meta == { + "file_path": os.path.basename(first_test_file), + "creation_date": "1945-05-25T00:00:00", + } assert result["documents"][1].content == "Stanley Cohen for their discoveries of growth factors" - assert result["documents"][1].meta == {"file_path": str(second_test_file), "creation_date": "1945-05-25T00:00:00"} + assert result["documents"][1].meta == { + "file_path": os.path.basename(second_test_file), + "creation_date": "1945-05-25T00:00:00", + } assert result["documents"][2].content == "Rita Levi-Montalcini for their discoveries of growth factors" - assert result["documents"][2].meta == {"file_path": str(second_test_file), "creation_date": "1945-05-25T00:00:00"} + assert result["documents"][2].meta == { + "file_path": os.path.basename(second_test_file), + "creation_date": "1945-05-25T00:00:00", + } assert ( result["documents"][3].content == "Enrico Fermi for his demonstrations of the existence of new " "radioactive elements produced by neutron irradiation, and for his related discovery of nuclear " @@ -290,11 +300,20 @@ def test_run_with_meta_list(tmpdir): == "Dario Fokin who emulates the jesters of the Middle Ages in scourging authority and " "upholding the dignity of the downtrodden" ) - assert result["documents"][0].meta == {"file_path": str(first_test_file), "creation_date": "1945-05-25T00:00:00"} + assert result["documents"][0].meta == { + "file_path": os.path.basename(first_test_file), + "creation_date": "1945-05-25T00:00:00", + } assert result["documents"][1].content == "Stanley Cohen for their discoveries of growth factors" - assert result["documents"][1].meta == {"file_path": str(second_test_file), "creation_date": "1943-09-03T00:00:00"} + assert result["documents"][1].meta == { + "file_path": os.path.basename(second_test_file), + "creation_date": "1943-09-03T00:00:00", + } assert result["documents"][2].content == "Rita Levi-Montalcini for their discoveries of growth factors" - assert result["documents"][2].meta == {"file_path": str(second_test_file), "creation_date": "1943-09-03T00:00:00"} + assert result["documents"][2].meta == { + "file_path": os.path.basename(second_test_file), + "creation_date": "1943-09-03T00:00:00", + } assert ( result["documents"][3].content == "Enrico Fermi for his demonstrations of the existence of new " "radioactive elements produced by neutron irradiation, and for his related discovery of nuclear " @@ -329,11 +348,11 @@ def test_run_with_jq_schema_and_content_key(tmpdir): result["documents"][0].content == "who emulates the jesters of the Middle Ages in scourging authority and " "upholding the dignity of the downtrodden" ) - assert result["documents"][0].meta == {"file_path": str(first_test_file)} + assert result["documents"][0].meta == {"file_path": os.path.basename(first_test_file)} assert result["documents"][1].content == "for their discoveries of growth factors" - assert result["documents"][1].meta == {"file_path": str(second_test_file)} + assert result["documents"][1].meta == {"file_path": os.path.basename(second_test_file)} assert result["documents"][2].content == "for their discoveries of growth factors" - assert result["documents"][2].meta == {"file_path": str(second_test_file)} + assert result["documents"][2].meta == {"file_path": os.path.basename(second_test_file)} assert ( result["documents"][3].content == "for his demonstrations of the existence of new " "radioactive elements produced by neutron irradiation, and for his related discovery of nuclear " @@ -361,16 +380,20 @@ def test_run_with_jq_schema_content_key_and_extra_meta_fields(tmpdir): result["documents"][0].content == "who emulates the jesters of the Middle Ages in scourging authority and " "upholding the dignity of the downtrodden" ) - assert result["documents"][0].meta == {"file_path": str(first_test_file), "firstname": "Dario", "surname": "Fokin"} + assert result["documents"][0].meta == { + "file_path": os.path.basename(first_test_file), + "firstname": "Dario", + "surname": "Fokin", + } assert result["documents"][1].content == "for their discoveries of growth factors" assert result["documents"][1].meta == { - "file_path": str(second_test_file), + "file_path": os.path.basename(second_test_file), "firstname": "Stanley", "surname": "Cohen", } assert result["documents"][2].content == "for their discoveries of growth factors" assert result["documents"][2].meta == { - "file_path": str(second_test_file), + "file_path": os.path.basename(second_test_file), "firstname": "Rita", "surname": "Levi-Montalcini", } @@ -396,9 +419,9 @@ def test_run_with_content_key(tmpdir): assert len(result) == 1 assert len(result["documents"]) == 3 assert result["documents"][0].content == "literature" - assert result["documents"][0].meta == {"file_path": str(first_test_file)} + assert result["documents"][0].meta == {"file_path": os.path.basename(first_test_file)} assert result["documents"][1].content == "medicine" - assert result["documents"][1].meta == {"file_path": str(second_test_file)} + assert result["documents"][1].meta == {"file_path": os.path.basename(second_test_file)} assert result["documents"][2].content == "physics" assert result["documents"][2].meta == {} @@ -417,9 +440,9 @@ def test_run_with_content_key_and_extra_meta_fields(tmpdir): assert len(result) == 1 assert len(result["documents"]) == 3 assert result["documents"][0].content == "literature" - assert result["documents"][0].meta == {"file_path": str(first_test_file), "year": "1997"} + assert result["documents"][0].meta == {"file_path": os.path.basename(first_test_file), "year": "1997"} assert result["documents"][1].content == "medicine" - assert result["documents"][1].meta == {"file_path": str(second_test_file), "year": "1986"} + assert result["documents"][1].meta == {"file_path": os.path.basename(second_test_file), "year": "1986"} assert result["documents"][2].content == "physics" assert result["documents"][2].meta == {"year": "1938"} @@ -442,7 +465,7 @@ def test_run_with_jq_schema_content_key_and_extra_meta_fields_literal(tmpdir): == "who emulates the jesters of the Middle Ages in scourging authority and upholding the dignity of the downtrodden" ) assert result["documents"][0].meta == { - "file_path": str(first_test_file), + "file_path": os.path.basename(first_test_file), "id": "674", "firstname": "Dario", "surname": "Fokin", @@ -450,7 +473,7 @@ def test_run_with_jq_schema_content_key_and_extra_meta_fields_literal(tmpdir): } assert result["documents"][1].content == "for their discoveries of growth factors" assert result["documents"][1].meta == { - "file_path": str(second_test_file), + "file_path": os.path.basename(second_test_file), "id": "434", "firstname": "Stanley", "surname": "Cohen", @@ -458,7 +481,7 @@ def test_run_with_jq_schema_content_key_and_extra_meta_fields_literal(tmpdir): } assert result["documents"][2].content == "for their discoveries of growth factors" assert result["documents"][2].meta == { - "file_path": str(second_test_file), + "file_path": os.path.basename(second_test_file), "id": "435", "firstname": "Rita", "surname": "Levi-Montalcini", diff --git a/test/components/converters/test_pptx_to_document.py b/test/components/converters/test_pptx_to_document.py index 135b1b08a8..595e1a1234 100644 --- a/test/components/converters/test_pptx_to_document.py +++ b/test/components/converters/test_pptx_to_document.py @@ -2,6 +2,7 @@ # # SPDX-License-Identifier: Apache-2.0 import logging +import os from haystack.dataclasses import ByteStream from haystack.components.converters.pptx import PPTXToDocument @@ -29,8 +30,8 @@ def test_run(self, test_files_path): "Sample Title Slide\nJane Doe\fTitle of First Slide\nThis is a bullet point\nThis is another bullet point" in docs[0].content ) - assert docs[0].meta["file_path"] == str(files[0]) - assert docs[1].meta == bytestream.meta + assert docs[0].meta["file_path"] == os.path.basename(files[0]) + assert docs[1].meta == {"file_path": os.path.basename(bytestream.meta["file_path"]), "key": "value"} def test_run_error_non_existent_file(self, caplog): sources = ["non_existing_file.pptx"] @@ -58,7 +59,7 @@ def test_run_with_meta(self, test_files_path): document = output["documents"][0] assert document.meta == { - "file_path": str(test_files_path / "pptx" / "sample_pptx.pptx"), + "file_path": os.path.basename(test_files_path / "pptx" / "sample_pptx.pptx"), "key": "value", "language": "it", } diff --git a/test/components/converters/test_pypdf_to_document.py b/test/components/converters/test_pypdf_to_document.py index e5a9964434..fa8f295db7 100644 --- a/test/components/converters/test_pypdf_to_document.py +++ b/test/components/converters/test_pypdf_to_document.py @@ -61,7 +61,7 @@ def test_to_dict(self, pypdf_component): "layout_mode_scale_weight": 1.25, "layout_mode_strip_rotated": True, "layout_mode_font_height_weight": 1.0, - "store_full_path": True, + "store_full_path": False, }, } diff --git a/test/components/converters/test_textfile_to_document.py b/test/components/converters/test_textfile_to_document.py index 419b042134..5698501a80 100644 --- a/test/components/converters/test_textfile_to_document.py +++ b/test/components/converters/test_textfile_to_document.py @@ -2,6 +2,7 @@ # # SPDX-License-Identifier: Apache-2.0 import logging +import os from unittest.mock import patch from pathlib import Path @@ -27,9 +28,9 @@ def test_run(self, test_files_path): assert "Some text for testing." in docs[0].content assert "This is a test line." in docs[1].content assert "That's yet another file!" in docs[2].content - assert docs[0].meta["file_path"] == str(files[0]) - assert docs[1].meta["file_path"] == str(files[1]) - assert docs[2].meta == bytestream.meta + assert docs[0].meta["file_path"] == os.path.basename(files[0]) + assert docs[1].meta["file_path"] == os.path.basename(files[1]) + assert docs[2].meta == {"file_path": os.path.basename(bytestream.meta["file_path"]), "key": "value"} def test_run_with_store_full_path(self, test_files_path): """ @@ -59,8 +60,8 @@ def test_run_error_handling(self, test_files_path, caplog): assert "non_existing_file.txt" in caplog.text docs = output["documents"] assert len(docs) == 2 - assert docs[0].meta["file_path"] == str(paths[0]) - assert docs[1].meta["file_path"] == str(paths[2]) + assert docs[0].meta["file_path"] == os.path.basename(paths[0]) + assert docs[1].meta["file_path"] == os.path.basename(paths[2]) def test_encoding_override(self, test_files_path): """