Skip to content

Commit

Permalink
fix: adapt our implementation to breaking changes in Chroma 0.5.17 (d…
Browse files Browse the repository at this point in the history
…eepset-ai#1165)

* fix chroma breaking changes

* improve warning

* better warning
  • Loading branch information
anakin87 authored and AnesBenmerzoug committed Nov 14, 2024
1 parent 4648f73 commit 05f0408
Show file tree
Hide file tree
Showing 4 changed files with 74 additions and 8 deletions.
2 changes: 1 addition & 1 deletion integrations/chroma/pyproject.toml
Original file line number Diff line number Diff line change
Expand Up @@ -22,7 +22,7 @@ classifiers = [
"Programming Language :: Python :: Implementation :: CPython",
"Programming Language :: Python :: Implementation :: PyPy",
]
dependencies = ["haystack-ai", "chromadb>=0.5.0", "typing_extensions>=4.8.0"]
dependencies = ["haystack-ai", "chromadb>=0.5.17", "typing_extensions>=4.8.0"]

[project.urls]
Documentation = "https://github.com/deepset-ai/haystack-core-integrations/tree/main/integrations/chroma#readme"
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -248,9 +248,12 @@ def write_documents(self, documents: List[Document], policy: DuplicatePolicy = D

if doc.content is None:
logger.warning(
"ChromaDocumentStore can only store the text field of Documents: "
"'array', 'dataframe' and 'blob' will be dropped."
"ChromaDocumentStore cannot store documents with `content=None`. "
"`array`, `dataframe` and `blob` are not supported. "
"Document with id %s will be skipped.",
doc.id,
)
continue
data = {"ids": [doc.id], "documents": [doc.content]}

if doc.meta:
Expand Down
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
from collections import defaultdict
from dataclasses import dataclass
from typing import Any, Dict, List
from typing import Any, Dict, List, Optional

from chromadb.api.types import validate_where, validate_where_document

Expand Down Expand Up @@ -34,8 +34,8 @@ class ChromaFilter:
"""

ids: List[str]
where: Dict[str, Any]
where_document: Dict[str, Any]
where: Optional[Dict[str, Any]]
where_document: Optional[Dict[str, Any]]


def _convert_filters(filters: Dict[str, Any]) -> ChromaFilter:
Expand Down Expand Up @@ -80,7 +80,7 @@ def _convert_filters(filters: Dict[str, Any]) -> ChromaFilter:
msg = f"Invalid '{test_clause}' : {e}"
raise ChromaDocumentStoreFilterError(msg) from e

return ChromaFilter(ids=ids, where=where, where_document=where_document)
return ChromaFilter(ids=ids, where=where or None, where_document=where_document or None)


def _convert_filter_clause(filters: Dict[str, Any]) -> Dict[str, Any]:
Expand Down
65 changes: 64 additions & 1 deletion integrations/chroma/tests/test_document_store.py
Original file line number Diff line number Diff line change
Expand Up @@ -13,9 +13,12 @@
from chromadb.api.types import Documents, EmbeddingFunction, Embeddings
from haystack import Document
from haystack.testing.document_store import (
TEST_EMBEDDING_1,
TEST_EMBEDDING_2,
CountDocumentsTest,
DeleteDocumentsTest,
FilterDocumentsTest,
_random_embeddings,
)

from haystack_integrations.document_stores.chroma import ChromaDocumentStore
Expand Down Expand Up @@ -51,6 +54,67 @@ def document_store(self) -> ChromaDocumentStore:
get_func.return_value = _TestEmbeddingFunction()
return ChromaDocumentStore(embedding_function="test_function", collection_name=str(uuid.uuid1()))

@pytest.fixture
def filterable_docs(self) -> List[Document]:
    """
    Build the documents used by the filter tests.

    This is a copy of the fixture in haystack/testing/document_store.py, modified to
    leave out the documents without textual content, since Chroma does not support
    writing them.

    For each of three indices it yields, in order: a "Foo", a "Bar" and a "Foobar"
    document with random embeddings, a document without an embedding, and two
    documents carrying the fixed test embeddings.
    """
    embedding_size = 768
    docs: List[Document] = []
    for i in range(3):
        # The same `name` value is shared by the first four documents of each round.
        name = f"name_{i}"
        docs.extend(
            [
                Document(
                    content=f"A Foo Document {i}",
                    meta={
                        "name": name,
                        "page": "100",
                        "chapter": "intro",
                        "number": 2,
                        "date": "1969-07-21T20:17:40",
                    },
                    embedding=_random_embeddings(embedding_size),
                ),
                Document(
                    content=f"A Bar Document {i}",
                    meta={
                        "name": name,
                        "page": "123",
                        "chapter": "abstract",
                        "number": -2,
                        "date": "1972-12-11T19:54:58",
                    },
                    embedding=_random_embeddings(embedding_size),
                ),
                Document(
                    content=f"A Foobar Document {i}",
                    meta={
                        "name": name,
                        "page": "90",
                        "chapter": "conclusion",
                        "number": -10,
                        "date": "1989-11-09T17:53:00",
                    },
                    embedding=_random_embeddings(embedding_size),
                ),
                # No embedding on purpose: exercises documents stored without one.
                Document(
                    content=f"Document {i} without embedding",
                    meta={"name": name, "no_embedding": True, "chapter": "conclusion"},
                ),
                Document(content=f"Doc {i} with zeros emb", meta={"name": "zeros_doc"}, embedding=TEST_EMBEDDING_1),
                Document(content=f"Doc {i} with ones emb", meta={"name": "ones_doc"}, embedding=TEST_EMBEDDING_2),
            ]
        )
    return docs

def assert_documents_are_equal(self, received: List[Document], expected: List[Document]):
"""
Assert that two lists of Documents are equal.
Expand Down Expand Up @@ -283,7 +347,6 @@ def test_contains(self, document_store: ChromaDocumentStore, filterable_docs: Li
)

def test_multiple_contains(self, document_store: ChromaDocumentStore, filterable_docs: List[Document]):
filterable_docs = [doc for doc in filterable_docs if doc.content] # remove documents without content
document_store.write_documents(filterable_docs)
filters = {
"operator": "OR",
Expand Down

0 comments on commit 05f0408

Please sign in to comment.