Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Patch chroma filters tests #67

Merged
merged 5 commits into from
Nov 29, 2023
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Original file line number Diff line number Diff line change
Expand Up @@ -155,7 +155,7 @@ def write_documents(self, documents: List[Document], policy: DuplicatePolicy = D
raise ValueError(msg)

if doc.content is None:
logger.warn(
logger.warning(
"ChromaDocumentStore can only store the text field of Documents: "
"'array', 'dataframe' and 'blob' will be dropped."
)
Expand Down
2 changes: 1 addition & 1 deletion integrations/chroma/src/chroma_haystack/errors.py
Original file line number Diff line number Diff line change
Expand Up @@ -9,7 +9,7 @@ class ChromaDocumentStoreError(DocumentStoreError):
pass


class ChromaDocumentStoreFilterError(FilterError):
class ChromaDocumentStoreFilterError(FilterError, ValueError):
pass


Expand Down
105 changes: 63 additions & 42 deletions integrations/chroma/tests/test_document_store.py
Original file line number Diff line number Diff line change
Expand Up @@ -9,12 +9,16 @@
import pytest
from chromadb.api.types import Documents, EmbeddingFunction, Embeddings
from haystack import Document
from haystack.testing.document_store import DocumentStoreBaseTests
from haystack.testing.document_store import (
CountDocumentsTest,
DeleteDocumentsTest,
LegacyFilterDocumentsTest,
)

from chroma_haystack.document_store import ChromaDocumentStore


class TestEmbeddingFunction(EmbeddingFunction):
class _TestEmbeddingFunction(EmbeddingFunction):
"""
Chroma lets you provide custom functions to compute embeddings,
we use this feature to provide a fake algorithm returning random
Expand All @@ -26,49 +30,64 @@ def __call__(self, input: Documents) -> Embeddings: # noqa - chroma will inspec
return [np.random.default_rng().uniform(-1, 1, 768).tolist()]


class TestDocumentStore(DocumentStoreBaseTests):
class TestDocumentStore(CountDocumentsTest, DeleteDocumentsTest, LegacyFilterDocumentsTest):
"""
Common test cases are provided by the `CountDocumentsTest`, `DeleteDocumentsTest`
and `LegacyFilterDocumentsTest` mixins, but you can add more to this class.
"""

@pytest.fixture
def docstore(self) -> ChromaDocumentStore:
def document_store(self) -> ChromaDocumentStore:
"""
This is the most basic requirement for the child class: provide
an instance of this document store so the base class can use it.
"""
with mock.patch("chroma_haystack.document_store.get_embedding_function") as get_func:
get_func.return_value = TestEmbeddingFunction()
get_func.return_value = _TestEmbeddingFunction()
return ChromaDocumentStore(embedding_function="test_function", collection_name=str(uuid.uuid1()))

def assert_documents_are_equal(self, received: List[Document], expected: List[Document]):
"""
Assert that two lists of Documents are equal.
This is used in every test, if a Document Store implementation has a different behaviour
it should override this method.

This can happen for example when the Document Store sets a score to returned Documents.
Since we can't know what the score will be, we can't compare the Documents reliably.
"""
for doc_received, doc_expected in zip(received, expected):
assert doc_received.content == doc_expected.content
assert doc_received.meta == doc_expected.meta

@pytest.mark.unit
def test_ne_filter(self, docstore: ChromaDocumentStore, filterable_docs: List[Document]):
def test_ne_filter(self, document_store: ChromaDocumentStore, filterable_docs: List[Document]):
"""
We customize this test because Chroma considers "not equal" to be true when
a field is missing.
"""
docstore.write_documents(filterable_docs)
result = docstore.filter_documents(filters={"page": {"$ne": "100"}})
assert self.contains_same_docs(result, [doc for doc in filterable_docs if doc.meta.get("page", "100") != "100"])
document_store.write_documents(filterable_docs)
result = document_store.filter_documents(filters={"page": {"$ne": "100"}})
self.assert_documents_are_equal(
result, [doc for doc in filterable_docs if doc.meta.get("page", "100") != "100"]
)

@pytest.mark.unit
def test_delete_empty(self, docstore: ChromaDocumentStore):
def test_delete_empty(self, document_store: ChromaDocumentStore):
"""
Deleting a non-existing document should not raise with Chroma
"""
docstore.delete_documents(["test"])
document_store.delete_documents(["test"])

@pytest.mark.unit
def test_delete_not_empty_nonexisting(self, docstore: ChromaDocumentStore):
def test_delete_not_empty_nonexisting(self, document_store: ChromaDocumentStore):
"""
Deleting a non-existing document should not raise with Chroma
"""
doc = Document(content="test doc")
docstore.write_documents([doc])
docstore.delete_documents(["non_existing"])
document_store.write_documents([doc])
document_store.delete_documents(["non_existing"])

assert docstore.filter_documents(filters={"id": doc.id}) == [doc]
assert document_store.filter_documents(filters={"id": doc.id}) == [doc]

@pytest.mark.integration
def test_to_json(self, request):
Expand All @@ -95,141 +114,143 @@ def test_from_json(self):

@pytest.mark.skip(reason="Filter on array contents is not supported.")
@pytest.mark.unit
def test_filter_document_array(self, docstore: ChromaDocumentStore, filterable_docs: List[Document]):
def test_filter_document_array(self, document_store: ChromaDocumentStore, filterable_docs: List[Document]):
pass

@pytest.mark.skip(reason="Filter on dataframe contents is not supported.")
@pytest.mark.unit
def test_filter_document_dataframe(self, docstore: ChromaDocumentStore, filterable_docs: List[Document]):
def test_filter_document_dataframe(self, document_store: ChromaDocumentStore, filterable_docs: List[Document]):
pass

@pytest.mark.skip(reason="Filter on table contents is not supported.")
@pytest.mark.unit
def test_eq_filter_table(self, docstore: ChromaDocumentStore, filterable_docs: List[Document]):
def test_eq_filter_table(self, document_store: ChromaDocumentStore, filterable_docs: List[Document]):
pass

@pytest.mark.skip(reason="Filter on embedding value is not supported.")
@pytest.mark.unit
def test_eq_filter_embedding(self, docstore: ChromaDocumentStore, filterable_docs: List[Document]):
def test_eq_filter_embedding(self, document_store: ChromaDocumentStore, filterable_docs: List[Document]):
pass

@pytest.mark.skip(reason="$in operator is not supported.")
@pytest.mark.unit
def test_in_filter_explicit(self, docstore: ChromaDocumentStore, filterable_docs: List[Document]):
def test_in_filter_explicit(self, document_store: ChromaDocumentStore, filterable_docs: List[Document]):
pass

@pytest.mark.skip(reason="$in operator is not supported. Filter on table contents is not supported.")
@pytest.mark.unit
def test_in_filter_table(self, docstore: ChromaDocumentStore, filterable_docs: List[Document]):
def test_in_filter_table(self, document_store: ChromaDocumentStore, filterable_docs: List[Document]):
pass

@pytest.mark.skip(reason="$in operator is not supported.")
@pytest.mark.unit
def test_in_filter_embedding(self, docstore: ChromaDocumentStore, filterable_docs: List[Document]):
def test_in_filter_embedding(self, document_store: ChromaDocumentStore, filterable_docs: List[Document]):
pass

@pytest.mark.skip(reason="Filter on table contents is not supported.")
@pytest.mark.unit
def test_ne_filter_table(self, docstore: ChromaDocumentStore, filterable_docs: List[Document]):
def test_ne_filter_table(self, document_store: ChromaDocumentStore, filterable_docs: List[Document]):
pass

@pytest.mark.skip(reason="Filter on embedding value is not supported.")
@pytest.mark.unit
def test_ne_filter_embedding(self, docstore: ChromaDocumentStore, filterable_docs: List[Document]):
def test_ne_filter_embedding(self, document_store: ChromaDocumentStore, filterable_docs: List[Document]):
pass

@pytest.mark.skip(reason="$nin operator is not supported. Filter on table contents is not supported.")
@pytest.mark.unit
def test_nin_filter_table(self, docstore: ChromaDocumentStore, filterable_docs: List[Document]):
def test_nin_filter_table(self, document_store: ChromaDocumentStore, filterable_docs: List[Document]):
pass

@pytest.mark.skip(reason="$nin operator is not supported. Filter on embedding value is not supported.")
@pytest.mark.unit
def test_nin_filter_embedding(self, docstore: ChromaDocumentStore, filterable_docs: List[Document]):
def test_nin_filter_embedding(self, document_store: ChromaDocumentStore, filterable_docs: List[Document]):
pass

@pytest.mark.skip(reason="$nin operator is not supported.")
@pytest.mark.unit
def test_nin_filter(self, docstore: ChromaDocumentStore, filterable_docs: List[Document]):
def test_nin_filter(self, document_store: ChromaDocumentStore, filterable_docs: List[Document]):
pass

@pytest.mark.skip(reason="Filter syntax not supported.")
@pytest.mark.unit
def test_filter_simple_implicit_and_with_multi_key_dict(
self, docstore: ChromaDocumentStore, filterable_docs: List[Document]
self, document_store: ChromaDocumentStore, filterable_docs: List[Document]
):
pass

@pytest.mark.skip(reason="Filter syntax not supported.")
@pytest.mark.unit
def test_filter_simple_explicit_and_with_multikey_dict(
self, docstore: ChromaDocumentStore, filterable_docs: List[Document]
self, document_store: ChromaDocumentStore, filterable_docs: List[Document]
):
pass

@pytest.mark.skip(reason="Filter syntax not supported.")
@pytest.mark.unit
def test_filter_simple_explicit_and_with_list(self, docstore: ChromaDocumentStore, filterable_docs: List[Document]):
def test_filter_simple_explicit_and_with_list(
self, document_store: ChromaDocumentStore, filterable_docs: List[Document]
):
pass

@pytest.mark.skip(reason="Filter syntax not supported.")
@pytest.mark.unit
def test_filter_simple_implicit_and(self, docstore: ChromaDocumentStore, filterable_docs: List[Document]):
def test_filter_simple_implicit_and(self, document_store: ChromaDocumentStore, filterable_docs: List[Document]):
pass

@pytest.mark.skip(reason="Filter syntax not supported.")
@pytest.mark.unit
def test_filter_nested_explicit_and(self, docstore: ChromaDocumentStore, filterable_docs: List[Document]):
def test_filter_nested_explicit_and(self, document_store: ChromaDocumentStore, filterable_docs: List[Document]):
pass

@pytest.mark.skip(reason="Filter syntax not supported.")
@pytest.mark.unit
def test_filter_nested_implicit_and(self, docstore: ChromaDocumentStore, filterable_docs: List[Document]):
def test_filter_nested_implicit_and(self, document_store: ChromaDocumentStore, filterable_docs: List[Document]):
pass

@pytest.mark.skip(reason="Filter syntax not supported.")
@pytest.mark.unit
def test_filter_simple_or(self, docstore: ChromaDocumentStore, filterable_docs: List[Document]):
def test_filter_simple_or(self, document_store: ChromaDocumentStore, filterable_docs: List[Document]):
pass

@pytest.mark.skip(reason="Filter syntax not supported.")
@pytest.mark.unit
def test_filter_nested_or(self, docstore: ChromaDocumentStore, filterable_docs: List[Document]):
def test_filter_nested_or(self, document_store: ChromaDocumentStore, filterable_docs: List[Document]):
pass

@pytest.mark.skip(reason="Filter on table contents is not supported.")
@pytest.mark.unit
def test_filter_nested_and_or_explicit(self, docstore: ChromaDocumentStore, filterable_docs: List[Document]):
def test_filter_nested_and_or_explicit(self, document_store: ChromaDocumentStore, filterable_docs: List[Document]):
pass

@pytest.mark.skip(reason="Filter syntax not supported.")
@pytest.mark.unit
def test_filter_nested_and_or_implicit(self, docstore: ChromaDocumentStore, filterable_docs: List[Document]):
def test_filter_nested_and_or_implicit(self, document_store: ChromaDocumentStore, filterable_docs: List[Document]):
pass

@pytest.mark.skip(reason="Filter syntax not supported.")
@pytest.mark.unit
def test_filter_nested_or_and(self, docstore: ChromaDocumentStore, filterable_docs: List[Document]):
def test_filter_nested_or_and(self, document_store: ChromaDocumentStore, filterable_docs: List[Document]):
pass

@pytest.mark.skip(reason="Filter syntax not supported.")
@pytest.mark.unit
def test_filter_nested_multiple_identical_operators_same_level(
self, docstore: ChromaDocumentStore, filterable_docs: List[Document]
self, document_store: ChromaDocumentStore, filterable_docs: List[Document]
):
pass

@pytest.mark.skip(reason="Duplicate policy not supported.")
@pytest.mark.unit
def test_write_duplicate_fail(self, docstore: ChromaDocumentStore):
def test_write_duplicate_fail(self, document_store: ChromaDocumentStore):
pass

@pytest.mark.skip(reason="Duplicate policy not supported.")
@pytest.mark.unit
def test_write_duplicate_skip(self, docstore: ChromaDocumentStore):
def test_write_duplicate_skip(self, document_store: ChromaDocumentStore):
pass

@pytest.mark.skip(reason="Duplicate policy not supported.")
@pytest.mark.unit
def test_write_duplicate_overwrite(self, docstore: ChromaDocumentStore):
def test_write_duplicate_overwrite(self, document_store: ChromaDocumentStore):
pass
4 changes: 2 additions & 2 deletions integrations/chroma/tests/test_retriever.py
Original file line number Diff line number Diff line change
Expand Up @@ -11,7 +11,7 @@ def test_retriever_to_json(request):
)
retriever = ChromaQueryRetriever(ds, filters={"foo": "bar"}, top_k=99)
assert retriever.to_dict() == {
"type": "ChromaQueryRetriever",
"type": "chroma_haystack.retriever.ChromaQueryRetriever",
"init_parameters": {
"filters": {"foo": "bar"},
"top_k": 99,
Expand All @@ -27,7 +27,7 @@ def test_retriever_to_json(request):
@pytest.mark.integration
def test_retriever_from_json(request):
data = {
"type": "ChromaQueryRetriever",
"type": "chroma_haystack.retriever.ChromaQueryRetriever",
"init_parameters": {
"filters": {"bar": "baz"},
"top_k": 42,
Expand Down