diff --git a/haystack/nodes/file_converter/base.py b/haystack/nodes/file_converter/base.py index d5fbb3fc58..934cca4689 100644 --- a/haystack/nodes/file_converter/base.py +++ b/haystack/nodes/file_converter/base.py @@ -158,6 +158,7 @@ def run( # type: ignore valid_languages: Optional[List[str]] = None, encoding: Optional[str] = "UTF-8", id_hash_keys: Optional[List[str]] = None, + raise_on_failure: bool = True, ): """ Extract text from a file. @@ -188,6 +189,7 @@ def run( # type: ignore attributes. If you want to ensure you don't have duplicate documents in your DocumentStore but texts are not unique, you can modify the metadata and pass e.g. `"meta"` to this field (e.g. [`"content"`, `"meta"`]). In this case the id will be generated by using the content and the defined metadata. + :param raise_on_failure: If True, raises an exception as soon as the conversion of any single file fails. If False, skips the failing file, continues with the remaining files, and logs a warning listing the skipped file paths. """ if known_ligatures is None: known_ligatures = KNOWN_LIGATURES @@ -199,17 +201,24 @@ def run( # type: ignore meta = [meta] * len(file_paths) documents: list = [] + failed_paths: list = [] for file_path, file_meta in tqdm( zip(file_paths, meta), total=len(file_paths), disable=not self.progress_bar, desc="Converting files" ): - documents += self.convert( - file_path=file_path, - meta=file_meta, - remove_numeric_tables=remove_numeric_tables, - valid_languages=valid_languages, - encoding=encoding, - id_hash_keys=id_hash_keys, - ) + try: + documents += self.convert( + file_path=file_path, + meta=file_meta, + remove_numeric_tables=remove_numeric_tables, + valid_languages=valid_languages, + encoding=encoding, + id_hash_keys=id_hash_keys, + ) + except Exception as e: + if raise_on_failure: + raise e + failed_paths.append(str(file_path)) + continue # Cleanup ligatures for document in documents: @@ -217,6 +226,9 @@ def run( # type: ignore if document.content is not None: document.content = document.content.replace(ligature, letters) + if failed_paths: + 
logger.warning("Conversion of the following file paths failed: %s", ",".join(failed_paths)) + result = {"documents": documents} return result, "output_1" diff --git a/releasenotes/notes/add-raise-on-failure-to-base-converter-8c5e9b3dd51c0e0c.yaml b/releasenotes/notes/add-raise-on-failure-to-base-converter-8c5e9b3dd51c0e0c.yaml new file mode 100644 index 0000000000..05e4a959d7 --- /dev/null +++ b/releasenotes/notes/add-raise-on-failure-to-base-converter-8c5e9b3dd51c0e0c.yaml @@ -0,0 +1,4 @@ +--- +enhancements: + - | + Add a `raise_on_failure` flag to the `run` method of the BaseConverter class so that large conversion jobs can optionally skip files that fail to convert, logging their paths in a warning, instead of aborting on the first exception. diff --git a/test/nodes/test_file_converter.py b/test/nodes/test_file_converter.py index 9daee4b587..f3832d8f9a 100644 --- a/test/nodes/test_file_converter.py +++ b/test/nodes/test_file_converter.py @@ -422,6 +422,34 @@ def test_csv_to_document_with_wrong_qa_headers(tmp_path): node.run(file_paths=csv_path) +@pytest.mark.unit +def test_csv_to_document_with_wrong_qa_headers_raise_on_failure_true(tmp_path): + node = CsvTextConverter() + csv_path = tmp_path / "csv_qa_with_wrong_headers.csv" + rows = [ + ["wrong", "headers"], + ["What is Haystack ?", "Haystack is an NLP Framework to use transformers in your Applications."], + ] + write_as_csv(rows, csv_path) + + with pytest.raises(ValueError): + node.run(file_paths=csv_path, raise_on_failure=True) + + +@pytest.mark.unit +def test_csv_to_document_with_wrong_qa_headers_raise_on_failure_false(tmp_path): + node = CsvTextConverter() + csv_path = tmp_path / "csv_qa_with_wrong_headers.csv" + rows = [ + ["wrong", "headers"], + ["What is Haystack ?", "Haystack is an NLP Framework to use transformers in your Applications."], + ] + write_as_csv(rows, csv_path) + + result, _ = node.run(file_paths=csv_path, raise_on_failure=False) + assert len(result["documents"]) == 0 + + @pytest.mark.unit def test_csv_to_document_with_one_wrong_qa_headers(tmp_path): node = CsvTextConverter()