Skip to content

Commit

Permalink
Fix DocumentCleaner not preserving Document fields (#8578)
Browse files Browse the repository at this point in the history
  • Loading branch information
silvanocerza authored Nov 25, 2024
1 parent 9302d3d commit ab84035
Show file tree
Hide file tree
Showing 3 changed files with 48 additions and 2 deletions.
14 changes: 12 additions & 2 deletions haystack/components/preprocessors/document_cleaner.py
Original file line number Diff line number Diff line change
Expand Up @@ -38,7 +38,7 @@ class DocumentCleaner:
```
"""

def __init__(
def __init__( # pylint: disable=too-many-positional-arguments
self,
remove_empty_lines: bool = True,
remove_extra_whitespaces: bool = True,
Expand Down Expand Up @@ -131,7 +131,17 @@ def run(self, documents: List[Document]):
if self.remove_repeated_substrings:
text = self._remove_repeated_substrings(text)

cleaned_docs.append(Document(content=text, meta=deepcopy(doc.meta), id=doc.id if self.keep_id else ""))
clean_doc = Document(
id=doc.id if self.keep_id else "",
content=text,
dataframe=doc.dataframe,
blob=doc.blob,
meta=deepcopy(doc.meta),
score=doc.score,
embedding=doc.embedding,
sparse_embedding=doc.sparse_embedding,
)
cleaned_docs.append(clean_doc)

return {"documents": cleaned_docs}

Expand Down
4 changes: 4 additions & 0 deletions releasenotes/notes/fix-document-cleaner-4e18a63fd7dc2bd9.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,4 @@
---
fixes:
- |
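Fix `DocumentCleaner` dropping the `dataframe`, `blob`, `score`, `embedding` and `sparse_embedding` fields of each `Document` when run; all `Document` fields are now preserved.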
32 changes: 32 additions & 0 deletions test/components/preprocessors/test_document_cleaner.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,7 +5,9 @@

import pytest

from pandas import DataFrame
from haystack import Document
from haystack.dataclasses import ByteStream, SparseEmbedding
from haystack.components.preprocessors import DocumentCleaner


Expand Down Expand Up @@ -204,3 +206,33 @@ def test_ascii_only(self):
cleaner = DocumentCleaner(ascii_only=True, remove_extra_whitespaces=False, remove_empty_lines=False)
result = cleaner.run(documents=[Document(content=text)])
assert result["documents"][0].content == expected_text

def test_other_document_fields_are_not_lost(self):
    """Check that ``DocumentCleaner.run`` copies every ``Document`` field onto the cleaned document.

    A document with all optional fields populated (dataframe, blob, meta, score,
    embedding, sparse embedding) is cleaned with ``keep_id=True``. Only ``content``
    is expected to change; every other field, including ``id``, must compare equal
    to the input document's.
    """
    cleaner = DocumentCleaner(keep_id=True)
    document = Document(
        # Trailing spaces before each newline give the cleaner whitespace runs to collapse.
        content=(
            "This is a text with some words. \n"
            "There is a second sentence. \n"
            "And there is a third sentence.\n"
        ),
        dataframe=DataFrame({"col1": [1], "col2": [2]}),
        blob=ByteStream.from_string("some_data"),
        meta={"data": 1},
        score=0.1,
        embedding=[0.1, 0.2, 0.3],
        sparse_embedding=SparseEmbedding([0, 2], [0.1, 0.3]),
    )
    res = cleaner.run(documents=[document])

    # The result is expected to hold a single entry, the "documents" list.
    assert len(res) == 1
    # One input document must produce exactly one cleaned document; a bare
    # truthiness check would also accept duplicated or extra outputs.
    assert len(res["documents"]) == 1

    cleaned = res["documents"][0]
    assert cleaned.id == document.id
    assert cleaned.content == (
        "This is a text with some words. There is a second sentence. And there is a third sentence."
    )
    # DataFrame `==` is element-wise, so `.equals` is used for a single boolean result.
    assert cleaned.dataframe.equals(document.dataframe)
    assert cleaned.blob == document.blob
    assert cleaned.meta == document.meta
    assert cleaned.score == document.score
    assert cleaned.embedding == document.embedding
    assert cleaned.sparse_embedding == document.sparse_embedding

0 comments on commit ab84035

Please sign in to comment.