Skip to content

Commit

Permalink
update default value of 'store_full_path' to False in converters (#8619)
Browse files Browse the repository at this point in the history
  • Loading branch information
mpangrazzi authored Dec 10, 2024
1 parent c78eb9b commit 21d53d0
Show file tree
Hide file tree
Showing 20 changed files with 88 additions and 57 deletions.
2 changes: 1 addition & 1 deletion haystack/components/converters/azure.py
Original file line number Diff line number Diff line change
Expand Up @@ -61,7 +61,7 @@ def __init__( # pylint: disable=too-many-positional-arguments
merge_multiple_column_headers: bool = True,
page_layout: Literal["natural", "single_column"] = "natural",
threshold_y: Optional[float] = 0.05,
store_full_path: bool = True,
store_full_path: bool = False,
):
"""
Creates an AzureOCRDocumentConverter component.
Expand Down
2 changes: 1 addition & 1 deletion haystack/components/converters/csv.py
Original file line number Diff line number Diff line change
Expand Up @@ -36,7 +36,7 @@ class CSVToDocument:
```
"""

def __init__(self, encoding: str = "utf-8", store_full_path: bool = True):
def __init__(self, encoding: str = "utf-8", store_full_path: bool = False):
"""
Creates a CSVToDocument component.
Expand Down
2 changes: 1 addition & 1 deletion haystack/components/converters/docx.py
Original file line number Diff line number Diff line change
Expand Up @@ -109,7 +109,7 @@ class DOCXToDocument:
```
"""

def __init__(self, table_format: Union[str, DOCXTableFormat] = DOCXTableFormat.CSV, store_full_path: bool = True):
def __init__(self, table_format: Union[str, DOCXTableFormat] = DOCXTableFormat.CSV, store_full_path: bool = False):
"""
Create a DOCXToDocument component.
Expand Down
2 changes: 1 addition & 1 deletion haystack/components/converters/html.py
Original file line number Diff line number Diff line change
Expand Up @@ -35,7 +35,7 @@ class HTMLToDocument:
```
"""

def __init__(self, extraction_kwargs: Optional[Dict[str, Any]] = None, store_full_path: bool = True):
def __init__(self, extraction_kwargs: Optional[Dict[str, Any]] = None, store_full_path: bool = False):
"""
Create an HTMLToDocument component.
Expand Down
2 changes: 1 addition & 1 deletion haystack/components/converters/json.py
Original file line number Diff line number Diff line change
Expand Up @@ -95,7 +95,7 @@ def __init__(
jq_schema: Optional[str] = None,
content_key: Optional[str] = None,
extra_meta_fields: Optional[Union[Set[str], Literal["*"]]] = None,
store_full_path: bool = True,
store_full_path: bool = False,
):
"""
Creates a JSONConverter component.
Expand Down
2 changes: 1 addition & 1 deletion haystack/components/converters/markdown.py
Original file line number Diff line number Diff line change
Expand Up @@ -40,7 +40,7 @@ class MarkdownToDocument:
```
"""

def __init__(self, table_to_single_line: bool = False, progress_bar: bool = True, store_full_path: bool = True):
def __init__(self, table_to_single_line: bool = False, progress_bar: bool = True, store_full_path: bool = False):
"""
Create a MarkdownToDocument component.
Expand Down
2 changes: 1 addition & 1 deletion haystack/components/converters/pdfminer.py
Original file line number Diff line number Diff line change
Expand Up @@ -48,7 +48,7 @@ def __init__( # pylint: disable=too-many-positional-arguments
boxes_flow: Optional[float] = 0.5,
detect_vertical: bool = True,
all_texts: bool = False,
store_full_path: bool = True,
store_full_path: bool = False,
) -> None:
"""
Create a PDFMinerToDocument component.
Expand Down
2 changes: 1 addition & 1 deletion haystack/components/converters/pptx.py
Original file line number Diff line number Diff line change
Expand Up @@ -37,7 +37,7 @@ class PPTXToDocument:
```
"""

def __init__(self, store_full_path: bool = True):
def __init__(self, store_full_path: bool = False):
"""
Create an PPTXToDocument component.
Expand Down
2 changes: 1 addition & 1 deletion haystack/components/converters/pypdf.py
Original file line number Diff line number Diff line change
Expand Up @@ -79,7 +79,7 @@ def __init__(
layout_mode_scale_weight: float = 1.25,
layout_mode_strip_rotated: bool = True,
layout_mode_font_height_weight: float = 1.0,
store_full_path: bool = True,
store_full_path: bool = False,
):
"""
Create an PyPDFToDocument component.
Expand Down
2 changes: 1 addition & 1 deletion haystack/components/converters/tika.py
Original file line number Diff line number Diff line change
Expand Up @@ -75,7 +75,7 @@ class TikaDocumentConverter:
```
"""

def __init__(self, tika_url: str = "http://localhost:9998/tika", store_full_path: bool = True):
def __init__(self, tika_url: str = "http://localhost:9998/tika", store_full_path: bool = False):
"""
Create a TikaDocumentConverter component.
Expand Down
2 changes: 1 addition & 1 deletion haystack/components/converters/txt.py
Original file line number Diff line number Diff line change
Expand Up @@ -36,7 +36,7 @@ class TextFileToDocument:
```
"""

def __init__(self, encoding: str = "utf-8", store_full_path: bool = True):
def __init__(self, encoding: str = "utf-8", store_full_path: bool = False):
"""
Creates a TextFileToDocument component.
Expand Down
Original file line number Diff line number Diff line change
@@ -0,0 +1,4 @@
---
upgrade:
- |
Update default value of `store_full_path` to `False` in converters
2 changes: 1 addition & 1 deletion test/components/converters/test_azure_ocr_doc_converter.py
Original file line number Diff line number Diff line change
Expand Up @@ -105,7 +105,7 @@ def test_to_dict(self, mock_resolve_value):
"page_layout": "natural",
"preceding_context_len": 3,
"threshold_y": 0.05,
"store_full_path": True,
"store_full_path": False,
},
}

Expand Down
9 changes: 5 additions & 4 deletions test/components/converters/test_csv_to_document.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,6 +5,7 @@
from unittest.mock import patch
import pandas as pd
from pathlib import Path
import os

import pytest

Expand Down Expand Up @@ -35,9 +36,9 @@ def test_run(self, test_files_path):
assert len(docs) == 3
assert "Name,Age\r\nJohn Doe,27\r\nJane Smith,37\r\nMike Johnson,47\r\n" == docs[0].content
assert isinstance(docs[0].content, str)
assert docs[0].meta == bytestream.meta
assert docs[1].meta["file_path"] == str(files[1])
assert docs[2].meta["file_path"] == str(files[2])
assert docs[0].meta == {"file_path": os.path.basename(bytestream.meta["file_path"]), "key": "value"}
assert docs[1].meta["file_path"] == os.path.basename(files[1])
assert docs[2].meta["file_path"] == os.path.basename(files[2])

def test_run_with_store_full_path_false(self, test_files_path):
"""
Expand Down Expand Up @@ -73,7 +74,7 @@ def test_run_error_handling(self, test_files_path, caplog):
assert "non_existing_file.csv" in caplog.text
docs = output["documents"]
assert len(docs) == 2
assert docs[0].meta["file_path"] == str(paths[0])
assert docs[0].meta["file_path"] == os.path.basename(paths[0])

def test_encoding_override(self, test_files_path, caplog):
"""
Expand Down
17 changes: 9 additions & 8 deletions test/components/converters/test_docx_file_to_document.py
Original file line number Diff line number Diff line change
@@ -1,4 +1,5 @@
import json
import os
import logging
import pytest
import csv
Expand Down Expand Up @@ -32,36 +33,36 @@ def test_to_dict(self):
data = converter.to_dict()
assert data == {
"type": "haystack.components.converters.docx.DOCXToDocument",
"init_parameters": {"store_full_path": True, "table_format": "csv"},
"init_parameters": {"store_full_path": False, "table_format": "csv"},
}

def test_to_dict_custom_parameters(self):
converter = DOCXToDocument(table_format="markdown")
data = converter.to_dict()
assert data == {
"type": "haystack.components.converters.docx.DOCXToDocument",
"init_parameters": {"store_full_path": True, "table_format": "markdown"},
"init_parameters": {"store_full_path": False, "table_format": "markdown"},
}

converter = DOCXToDocument(table_format="csv")
data = converter.to_dict()
assert data == {
"type": "haystack.components.converters.docx.DOCXToDocument",
"init_parameters": {"store_full_path": True, "table_format": "csv"},
"init_parameters": {"store_full_path": False, "table_format": "csv"},
}

converter = DOCXToDocument(table_format=DOCXTableFormat.MARKDOWN)
data = converter.to_dict()
assert data == {
"type": "haystack.components.converters.docx.DOCXToDocument",
"init_parameters": {"store_full_path": True, "table_format": "markdown"},
"init_parameters": {"store_full_path": False, "table_format": "markdown"},
}

converter = DOCXToDocument(table_format=DOCXTableFormat.CSV)
data = converter.to_dict()
assert data == {
"type": "haystack.components.converters.docx.DOCXToDocument",
"init_parameters": {"store_full_path": True, "table_format": "csv"},
"init_parameters": {"store_full_path": False, "table_format": "csv"},
}

def test_from_dict(self):
Expand Down Expand Up @@ -119,7 +120,7 @@ def test_run(self, test_files_path, docx_converter):
assert "History" in docs[0].content
assert docs[0].meta.keys() == {"file_path", "docx"}
assert docs[0].meta == {
"file_path": str(paths[0]),
"file_path": os.path.basename(paths[0]),
"docx": DOCXMetadata(
author="Microsoft Office User",
category="",
Expand Down Expand Up @@ -151,7 +152,7 @@ def test_run_with_table(self, test_files_path):
assert "Donald Trump" in docs[0].content ## :-)
assert docs[0].meta.keys() == {"file_path", "docx"}
assert docs[0].meta == {
"file_path": str(paths[0]),
"file_path": os.path.basename(paths[0]),
"docx": DOCXMetadata(
author="Saha, Anirban",
category="",
Expand Down Expand Up @@ -283,7 +284,7 @@ def test_run_with_additional_meta(self, test_files_path, docx_converter):
output = docx_converter.run(sources=paths, meta={"language": "it", "author": "test_author"})
doc = output["documents"][0]
assert doc.meta == {
"file_path": str(paths[0]),
"file_path": os.path.basename(paths[0]),
"docx": DOCXMetadata(
author="Microsoft Office User",
category="",
Expand Down
2 changes: 1 addition & 1 deletion test/components/converters/test_html_to_document.py
Original file line number Diff line number Diff line change
Expand Up @@ -42,7 +42,7 @@ def test_run_with_store_full_path(self, test_files_path):
"""
Test if the component runs correctly when metadata is supplied by the user.
"""
converter = HTMLToDocument()
converter = HTMLToDocument(store_full_path=True)
sources = [test_files_path / "html" / "what_is_haystack.html"]

results = converter.run(sources=sources) # store_full_path is True by default
Expand Down
69 changes: 46 additions & 23 deletions test/components/converters/test_json.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,7 @@
# SPDX-License-Identifier: Apache-2.0

import json
import os
from unittest.mock import patch
from pathlib import Path
import logging
Expand Down Expand Up @@ -104,7 +105,7 @@ def test_to_dict():
"content_key": "motivation",
"jq_schema": ".laureates[]",
"extra_meta_fields": {"firstname", "surname"},
"store_full_path": True,
"store_full_path": False,
},
}

Expand Down Expand Up @@ -145,11 +146,11 @@ def test_run(tmpdir):
== "Dario Fokin who emulates the jesters of the Middle Ages in scourging authority and "
"upholding the dignity of the downtrodden"
)
assert result["documents"][0].meta == {"file_path": str(first_test_file)}
assert result["documents"][0].meta == {"file_path": os.path.basename(first_test_file)}
assert result["documents"][1].content == "Stanley Cohen for their discoveries of growth factors"
assert result["documents"][1].meta == {"file_path": str(second_test_file)}
assert result["documents"][1].meta == {"file_path": os.path.basename(second_test_file)}
assert result["documents"][2].content == "Rita Levi-Montalcini for their discoveries of growth factors"
assert result["documents"][2].meta == {"file_path": str(second_test_file)}
assert result["documents"][2].meta == {"file_path": os.path.basename(second_test_file)}
assert (
result["documents"][3].content == "Enrico Fermi for his demonstrations of the existence of new "
"radioactive elements produced by neutron irradiation, and for his related discovery of nuclear "
Expand Down Expand Up @@ -254,11 +255,20 @@ def test_run_with_single_meta(tmpdir):
== "Dario Fokin who emulates the jesters of the Middle Ages in scourging authority and "
"upholding the dignity of the downtrodden"
)
assert result["documents"][0].meta == {"file_path": str(first_test_file), "creation_date": "1945-05-25T00:00:00"}
assert result["documents"][0].meta == {
"file_path": os.path.basename(first_test_file),
"creation_date": "1945-05-25T00:00:00",
}
assert result["documents"][1].content == "Stanley Cohen for their discoveries of growth factors"
assert result["documents"][1].meta == {"file_path": str(second_test_file), "creation_date": "1945-05-25T00:00:00"}
assert result["documents"][1].meta == {
"file_path": os.path.basename(second_test_file),
"creation_date": "1945-05-25T00:00:00",
}
assert result["documents"][2].content == "Rita Levi-Montalcini for their discoveries of growth factors"
assert result["documents"][2].meta == {"file_path": str(second_test_file), "creation_date": "1945-05-25T00:00:00"}
assert result["documents"][2].meta == {
"file_path": os.path.basename(second_test_file),
"creation_date": "1945-05-25T00:00:00",
}
assert (
result["documents"][3].content == "Enrico Fermi for his demonstrations of the existence of new "
"radioactive elements produced by neutron irradiation, and for his related discovery of nuclear "
Expand Down Expand Up @@ -290,11 +300,20 @@ def test_run_with_meta_list(tmpdir):
== "Dario Fokin who emulates the jesters of the Middle Ages in scourging authority and "
"upholding the dignity of the downtrodden"
)
assert result["documents"][0].meta == {"file_path": str(first_test_file), "creation_date": "1945-05-25T00:00:00"}
assert result["documents"][0].meta == {
"file_path": os.path.basename(first_test_file),
"creation_date": "1945-05-25T00:00:00",
}
assert result["documents"][1].content == "Stanley Cohen for their discoveries of growth factors"
assert result["documents"][1].meta == {"file_path": str(second_test_file), "creation_date": "1943-09-03T00:00:00"}
assert result["documents"][1].meta == {
"file_path": os.path.basename(second_test_file),
"creation_date": "1943-09-03T00:00:00",
}
assert result["documents"][2].content == "Rita Levi-Montalcini for their discoveries of growth factors"
assert result["documents"][2].meta == {"file_path": str(second_test_file), "creation_date": "1943-09-03T00:00:00"}
assert result["documents"][2].meta == {
"file_path": os.path.basename(second_test_file),
"creation_date": "1943-09-03T00:00:00",
}
assert (
result["documents"][3].content == "Enrico Fermi for his demonstrations of the existence of new "
"radioactive elements produced by neutron irradiation, and for his related discovery of nuclear "
Expand Down Expand Up @@ -329,11 +348,11 @@ def test_run_with_jq_schema_and_content_key(tmpdir):
result["documents"][0].content == "who emulates the jesters of the Middle Ages in scourging authority and "
"upholding the dignity of the downtrodden"
)
assert result["documents"][0].meta == {"file_path": str(first_test_file)}
assert result["documents"][0].meta == {"file_path": os.path.basename(first_test_file)}
assert result["documents"][1].content == "for their discoveries of growth factors"
assert result["documents"][1].meta == {"file_path": str(second_test_file)}
assert result["documents"][1].meta == {"file_path": os.path.basename(second_test_file)}
assert result["documents"][2].content == "for their discoveries of growth factors"
assert result["documents"][2].meta == {"file_path": str(second_test_file)}
assert result["documents"][2].meta == {"file_path": os.path.basename(second_test_file)}
assert (
result["documents"][3].content == "for his demonstrations of the existence of new "
"radioactive elements produced by neutron irradiation, and for his related discovery of nuclear "
Expand Down Expand Up @@ -361,16 +380,20 @@ def test_run_with_jq_schema_content_key_and_extra_meta_fields(tmpdir):
result["documents"][0].content == "who emulates the jesters of the Middle Ages in scourging authority and "
"upholding the dignity of the downtrodden"
)
assert result["documents"][0].meta == {"file_path": str(first_test_file), "firstname": "Dario", "surname": "Fokin"}
assert result["documents"][0].meta == {
"file_path": os.path.basename(first_test_file),
"firstname": "Dario",
"surname": "Fokin",
}
assert result["documents"][1].content == "for their discoveries of growth factors"
assert result["documents"][1].meta == {
"file_path": str(second_test_file),
"file_path": os.path.basename(second_test_file),
"firstname": "Stanley",
"surname": "Cohen",
}
assert result["documents"][2].content == "for their discoveries of growth factors"
assert result["documents"][2].meta == {
"file_path": str(second_test_file),
"file_path": os.path.basename(second_test_file),
"firstname": "Rita",
"surname": "Levi-Montalcini",
}
Expand All @@ -396,9 +419,9 @@ def test_run_with_content_key(tmpdir):
assert len(result) == 1
assert len(result["documents"]) == 3
assert result["documents"][0].content == "literature"
assert result["documents"][0].meta == {"file_path": str(first_test_file)}
assert result["documents"][0].meta == {"file_path": os.path.basename(first_test_file)}
assert result["documents"][1].content == "medicine"
assert result["documents"][1].meta == {"file_path": str(second_test_file)}
assert result["documents"][1].meta == {"file_path": os.path.basename(second_test_file)}
assert result["documents"][2].content == "physics"
assert result["documents"][2].meta == {}

Expand All @@ -417,9 +440,9 @@ def test_run_with_content_key_and_extra_meta_fields(tmpdir):
assert len(result) == 1
assert len(result["documents"]) == 3
assert result["documents"][0].content == "literature"
assert result["documents"][0].meta == {"file_path": str(first_test_file), "year": "1997"}
assert result["documents"][0].meta == {"file_path": os.path.basename(first_test_file), "year": "1997"}
assert result["documents"][1].content == "medicine"
assert result["documents"][1].meta == {"file_path": str(second_test_file), "year": "1986"}
assert result["documents"][1].meta == {"file_path": os.path.basename(second_test_file), "year": "1986"}
assert result["documents"][2].content == "physics"
assert result["documents"][2].meta == {"year": "1938"}

Expand All @@ -442,23 +465,23 @@ def test_run_with_jq_schema_content_key_and_extra_meta_fields_literal(tmpdir):
== "who emulates the jesters of the Middle Ages in scourging authority and upholding the dignity of the downtrodden"
)
assert result["documents"][0].meta == {
"file_path": str(first_test_file),
"file_path": os.path.basename(first_test_file),
"id": "674",
"firstname": "Dario",
"surname": "Fokin",
"share": "1",
}
assert result["documents"][1].content == "for their discoveries of growth factors"
assert result["documents"][1].meta == {
"file_path": str(second_test_file),
"file_path": os.path.basename(second_test_file),
"id": "434",
"firstname": "Stanley",
"surname": "Cohen",
"share": "2",
}
assert result["documents"][2].content == "for their discoveries of growth factors"
assert result["documents"][2].meta == {
"file_path": str(second_test_file),
"file_path": os.path.basename(second_test_file),
"id": "435",
"firstname": "Rita",
"surname": "Levi-Montalcini",
Expand Down
Loading

0 comments on commit 21d53d0

Please sign in to comment.