Skip to content

Commit

Permalink
Weaviate - skip writing _split_overlap meta field (#1173)
Browse files Browse the repository at this point in the history
  • Loading branch information
anakin87 authored Nov 11, 2024
1 parent 34ae0bd commit 1bcb9a8
Show file tree
Hide file tree
Showing 2 changed files with 32 additions and 0 deletions.
Original file line number Diff line number Diff line change
Expand Up @@ -286,6 +286,14 @@ def _to_data_object(self, document: Document) -> Dict[str, Any]:
# The embedding vector is stored separately from the rest of the data
del data["embedding"]

# _split_overlap meta field is unsupported because of a bug
# https://github.com/deepset-ai/haystack-core-integrations/issues/1172
if "_split_overlap" in data:
data.pop("_split_overlap")
logger.warning(
"Document %s has the unsupported `_split_overlap` meta field. It will be ignored.", data["_original_id"]
)

if "sparse_embedding" in data:
sparse_embedding = data.pop("sparse_embedding", None)
if sparse_embedding:
Expand Down
24 changes: 24 additions & 0 deletions integrations/weaviate/tests/test_document_store.py
Original file line number Diff line number Diff line change
Expand Up @@ -508,6 +508,30 @@ def test_comparison_less_than_equal_with_iso_date(self, document_store, filterab
def test_comparison_not_equal_with_dataframe(self, document_store, filterable_docs):
# Pure pass-through: runs the parent class's test of the same name with the same two arguments.
return super().test_comparison_not_equal_with_dataframe(document_store, filterable_docs)

def test_meta_split_overlap_is_skipped(self, document_store):
    """
    Write a Document whose meta contains `_split_overlap` and check that, when read back,
    the content and the other meta fields are preserved while `_split_overlap` is absent.
    """
    expected_content = "The moonlight shimmered "
    expected_source_id = "62049ba1d1e1d5ebb1f6230b0b00c5356b8706c56e0b9c36b1dfc86084cd75f0"
    overlap_entry = {
        "doc_id": "68ed48ba830048c5d7815874ed2de794722e6d10866b6c55349a914fd9a0df65",
        "range": (0, 20),
    }
    input_meta = {
        "source_id": expected_source_id,
        "page_number": 1,
        "split_id": 0,
        "split_idx_start": 0,
        "_split_overlap": [overlap_entry],
    }
    document_store.write_documents([Document(content=expected_content, meta=input_meta)])

    # No filters are passed, so the first returned document is the one under inspection.
    stored = document_store.filter_documents()[0]

    assert stored.content == expected_content

    stored_meta = stored.meta
    assert stored_meta["source_id"] == expected_source_id
    # The numeric meta values are compared against floats, as in the original assertions.
    assert stored_meta["page_number"] == 1.0
    assert stored_meta["split_id"] == 0.0
    assert stored_meta["split_idx_start"] == 0.0
    assert "_split_overlap" not in stored_meta

def test_bm25_retrieval(self, document_store):
document_store.write_documents(
[
Expand Down

0 comments on commit 1bcb9a8

Please sign in to comment.