fix: adapt our implementation to breaking changes in Chroma 0.5.17 (deepset-ai#1165)

* fix chroma breaking changes
* improve warning
* better warning
AnesBenmerzoug · Nov 14, 2024 · 05f0408
1 parent 4648f73
commit 05f0408
Show file tree

Hide file tree

Showing 4 changed files with 74 additions and 8 deletions.
diff --git a/integrations/chroma/pyproject.toml b/integrations/chroma/pyproject.toml
@@ -22,7 +22,7 @@ classifiers = [
   "Programming Language :: Python :: Implementation :: CPython",
   "Programming Language :: Python :: Implementation :: PyPy",
 ]
-dependencies = ["haystack-ai", "chromadb>=0.5.0", "typing_extensions>=4.8.0"]
+dependencies = ["haystack-ai", "chromadb>=0.5.17", "typing_extensions>=4.8.0"]
 
 [project.urls]
 Documentation = "https://github.com/deepset-ai/haystack-core-integrations/tree/main/integrations/chroma#readme"

diff --git a/integrations/chroma/src/haystack_integrations/document_stores/chroma/document_store.py b/integrations/chroma/src/haystack_integrations/document_stores/chroma/document_store.py
@@ -248,9 +248,12 @@ def write_documents(self, documents: List[Document], policy: DuplicatePolicy = D
 
             if doc.content is None:
                 logger.warning(
-                    "ChromaDocumentStore can only store the text field of Documents: "
-                    "'array', 'dataframe' and 'blob' will be dropped."
+                    "ChromaDocumentStore cannot store documents with `content=None`. "
+                    "`array`, `dataframe` and `blob` are not supported. "
+                    "Document with id %s will be skipped.",
+                    doc.id,
                 )
+                continue
             data = {"ids": [doc.id], "documents": [doc.content]}
 
             if doc.meta:

diff --git a/integrations/chroma/src/haystack_integrations/document_stores/chroma/filters.py b/integrations/chroma/src/haystack_integrations/document_stores/chroma/filters.py
@@ -1,6 +1,6 @@
 from collections import defaultdict
 from dataclasses import dataclass
-from typing import Any, Dict, List
+from typing import Any, Dict, List, Optional
 
 from chromadb.api.types import validate_where, validate_where_document
 
@@ -34,8 +34,8 @@ class ChromaFilter:
     """
 
     ids: List[str]
-    where: Dict[str, Any]
-    where_document: Dict[str, Any]
+    where: Optional[Dict[str, Any]]
+    where_document: Optional[Dict[str, Any]]
 
 
 def _convert_filters(filters: Dict[str, Any]) -> ChromaFilter:
@@ -80,7 +80,7 @@ def _convert_filters(filters: Dict[str, Any]) -> ChromaFilter:
         msg = f"Invalid '{test_clause}' : {e}"
         raise ChromaDocumentStoreFilterError(msg) from e
 
-    return ChromaFilter(ids=ids, where=where, where_document=where_document)
+    return ChromaFilter(ids=ids, where=where or None, where_document=where_document or None)
 
 
 def _convert_filter_clause(filters: Dict[str, Any]) -> Dict[str, Any]:

diff --git a/integrations/chroma/tests/test_document_store.py b/integrations/chroma/tests/test_document_store.py
@@ -13,9 +13,12 @@
 from chromadb.api.types import Documents, EmbeddingFunction, Embeddings
 from haystack import Document
 from haystack.testing.document_store import (
+    TEST_EMBEDDING_1,
+    TEST_EMBEDDING_2,
     CountDocumentsTest,
     DeleteDocumentsTest,
     FilterDocumentsTest,
+    _random_embeddings,
 )
 
 from haystack_integrations.document_stores.chroma import ChromaDocumentStore
@@ -51,6 +54,67 @@ def document_store(self) -> ChromaDocumentStore:
             get_func.return_value = _TestEmbeddingFunction()
             return ChromaDocumentStore(embedding_function="test_function", collection_name=str(uuid.uuid1()))
 
+    @pytest.fixture
+    def filterable_docs(self) -> List[Document]:
+        """
+        This fixture has been copied from haystack/testing/document_store.py and modified to
+        remove the documents that don't have textual content, as Chroma does not support writing them.
+        """
+        documents = []
+        for i in range(3):
+            documents.append(
+                Document(
+                    content=f"A Foo Document {i}",
+                    meta={
+                        "name": f"name_{i}",
+                        "page": "100",
+                        "chapter": "intro",
+                        "number": 2,
+                        "date": "1969-07-21T20:17:40",
+                    },
+                    embedding=_random_embeddings(768),
+                )
+            )
+            documents.append(
+                Document(
+                    content=f"A Bar Document {i}",
+                    meta={
+                        "name": f"name_{i}",
+                        "page": "123",
+                        "chapter": "abstract",
+                        "number": -2,
+                        "date": "1972-12-11T19:54:58",
+                    },
+                    embedding=_random_embeddings(768),
+                )
+            )
+            documents.append(
+                Document(
+                    content=f"A Foobar Document {i}",
+                    meta={
+                        "name": f"name_{i}",
+                        "page": "90",
+                        "chapter": "conclusion",
+                        "number": -10,
+                        "date": "1989-11-09T17:53:00",
+                    },
+                    embedding=_random_embeddings(768),
+                )
+            )
+            documents.append(
+                Document(
+                    content=f"Document {i} without embedding",
+                    meta={"name": f"name_{i}", "no_embedding": True, "chapter": "conclusion"},
+                )
+            )
+            documents.append(
+                Document(content=f"Doc {i} with zeros emb", meta={"name": "zeros_doc"}, embedding=TEST_EMBEDDING_1)
+            )
+            documents.append(
+                Document(content=f"Doc {i} with ones emb", meta={"name": "ones_doc"}, embedding=TEST_EMBEDDING_2)
+            )
+        return documents
+
     def assert_documents_are_equal(self, received: List[Document], expected: List[Document]):
         """
         Assert that two lists of Documents are equal.
@@ -283,7 +347,6 @@ def test_contains(self, document_store: ChromaDocumentStore, filterable_docs: Li
         )
 
     def test_multiple_contains(self, document_store: ChromaDocumentStore, filterable_docs: List[Document]):
-        filterable_docs = [doc for doc in filterable_docs if doc.content]  # remove documents without content
         document_store.write_documents(filterable_docs)
         filters = {
             "operator": "OR",