From 96f6ade703c7dd47b6eb9622485a7a285a98b8b0 Mon Sep 17 00:00:00 2001 From: Corentin Date: Mon, 5 Feb 2024 10:41:57 +0100 Subject: [PATCH] unstructured: fix metadata order mixed up (#336) * Optional meta field for UnstructuredFileConverter with proper tests * black lint * Adding multiple files and meta list test case * Black formatting test * Fixing metadata page number bug. Deep copy of dict * Folder of files test * Update integrations/unstructured/src/haystack_integrations/components/converters/unstructured/converter.py Co-authored-by: Stefano Fiorucci * Update integrations/unstructured/src/haystack_integrations/components/converters/unstructured/converter.py Co-authored-by: Stefano Fiorucci * Update integrations/unstructured/src/haystack_integrations/components/converters/unstructured/converter.py Co-authored-by: Stefano Fiorucci * Renaming "name" meta to "file_path" and deepcopy fix * Fix Ruff Complaining * Removing unique file logic using set that does not preserve file orders. Raise error if glob and metadata list because unsafe * Better test to make sure metadata order are preserved. * Make a failing test if metadata list and directory * filepaths as lists * Update integrations/unstructured/src/haystack_integrations/components/converters/unstructured/converter.py Co-authored-by: Stefano Fiorucci * update meta docstrings --------- Co-authored-by: Stefano Fiorucci --- .../converters/unstructured/converter.py | 26 +++++++++++-------- .../unstructured/tests/test_converter.py | 19 ++++++++++++-- 2 files changed, 32 insertions(+), 13 deletions(-) diff --git a/integrations/unstructured/src/haystack_integrations/components/converters/unstructured/converter.py b/integrations/unstructured/src/haystack_integrations/components/converters/unstructured/converter.py index bee1d9a7b..188dd9e6e 100644 --- a/integrations/unstructured/src/haystack_integrations/components/converters/unstructured/converter.py +++ b/integrations/unstructured/src/haystack_integrations/components/converters/unstructured/converter.py @@ -105,19 +105,23 @@ def run( This value can be either a list of dictionaries or a single dictionary. If it's a single dictionary, its content is added to the metadata of all produced Documents. If it's a list, the length of the list must match the number of paths, because the two lists will be zipped. - Please note that if the paths contain directories, the length of the meta list must match - the actual number of files contained. + Please note that if the paths contain directories, meta can only be a single dictionary + (same metadata for all files). Defaults to `None`. """ - - unique_paths = {Path(path) for path in paths} - filepaths = {path for path in unique_paths if path.is_file()} - filepaths_in_directories = { - filepath for path in unique_paths if path.is_dir() for filepath in path.glob("*.*") if filepath.is_file() - } - - all_filepaths = filepaths.union(filepaths_in_directories) - + paths_obj = [Path(path) for path in paths] + filepaths = [path for path in paths_obj if path.is_file()] + filepaths_in_directories = [ + filepath for path in paths_obj if path.is_dir() for filepath in path.glob("*.*") if filepath.is_file() + ] + if filepaths_in_directories and isinstance(meta, list): + error = """"If providing directories in the `paths` parameter, + `meta` can only be a dictionary (metadata applied to every file), + and not a list. To specify different metadata for each file, + provide an explicit list of direct paths instead.""" + raise ValueError(error) + + all_filepaths = filepaths + filepaths_in_directories # currently, the files are converted sequentially to gently handle API failures documents = [] meta_list = normalize_metadata(meta, sources_count=len(all_filepaths)) diff --git a/integrations/unstructured/tests/test_converter.py b/integrations/unstructured/tests/test_converter.py index d5266ac62..ca590ab2f 100644 --- a/integrations/unstructured/tests/test_converter.py +++ b/integrations/unstructured/tests/test_converter.py @@ -154,7 +154,10 @@ def test_run_one_doc_per_element_with_meta(self, samples_path): @pytest.mark.integration def test_run_one_doc_per_element_with_meta_list_two_files(self, samples_path): pdf_path = [samples_path / "sample_pdf.pdf", samples_path / "sample_pdf2.pdf"] - meta = [{"custom_meta": "foobar", "common_meta": "common"}, {"other_meta": "barfoo", "common_meta": "common"}] + meta = [ + {"custom_meta": "sample_pdf.pdf", "common_meta": "common"}, + {"custom_meta": "sample_pdf2.pdf", "common_meta": "common"}, + ] local_converter = UnstructuredFileConverter( api_url="http://localhost:8000/general/v0/general", document_creation_mode="one-doc-per-element" ) @@ -163,6 +166,7 @@ def test_run_one_doc_per_element_with_meta_list_two_files(self, samples_path): assert len(documents) > 4 for doc in documents: + assert doc.meta["custom_meta"] == doc.meta["filename"] assert "file_path" in doc.meta assert "page_number" in doc.meta # elements have a category attribute that is saved in the document meta @@ -171,9 +175,20 @@ def test_run_one_doc_per_element_with_meta_list_two_files(self, samples_path): assert doc.meta["common_meta"] == "common" @pytest.mark.integration - def test_run_one_doc_per_element_with_meta_list_folder(self, samples_path): + def test_run_one_doc_per_element_with_meta_list_folder_fail(self, samples_path): pdf_path = [samples_path] meta = [{"custom_meta": "foobar", "common_meta": "common"}, {"other_meta": "barfoo", "common_meta": "common"}] + local_converter = UnstructuredFileConverter( + api_url="http://localhost:8000/general/v0/general", document_creation_mode="one-doc-per-element" + ) + with pytest.raises(ValueError): + local_converter.run(paths=pdf_path, meta=meta)["documents"] + + @pytest.mark.integration + def test_run_one_doc_per_element_with_meta_list_folder(self, samples_path): + pdf_path = [samples_path] + meta = {"common_meta": "common"} + local_converter = UnstructuredFileConverter( api_url="http://localhost:8000/general/v0/general", document_creation_mode="one-doc-per-element" )