From d09cb481e15bc2532d1ea8e66907ecd153968ff5 Mon Sep 17 00:00:00 2001 From: Massimiliano Pippi Date: Wed, 29 Nov 2023 21:18:16 +0100 Subject: [PATCH 1/5] fix test class --- integrations/chroma/tests/test_document_store.py | 11 ++++++++--- 1 file changed, 8 insertions(+), 3 deletions(-) diff --git a/integrations/chroma/tests/test_document_store.py b/integrations/chroma/tests/test_document_store.py index ece91b252..9ad5f96e9 100644 --- a/integrations/chroma/tests/test_document_store.py +++ b/integrations/chroma/tests/test_document_store.py @@ -9,7 +9,12 @@ import pytest from chromadb.api.types import Documents, EmbeddingFunction, Embeddings from haystack import Document -from haystack.testing.document_store import DocumentStoreBaseTests +from haystack.testing.document_store import ( + CountDocumentsTest, + WriteDocumentsTest, + DeleteDocumentsTest, + LegacyFilterDocumentsTest, +) from chroma_haystack.document_store import ChromaDocumentStore @@ -26,14 +31,14 @@ def __call__(self, input: Documents) -> Embeddings: # noqa - chroma will inspec return [np.random.default_rng().uniform(-1, 1, 768).tolist()] -class TestDocumentStore(DocumentStoreBaseTests): +class TestDocumentStore(CountDocumentsTest, WriteDocumentsTest, DeleteDocumentsTest, LegacyFilterDocumentsTest): """ Common test cases will be provided by `DocumentStoreBaseTests` but you can add more to this class. """ @pytest.fixture - def docstore(self) -> ChromaDocumentStore: + def document_store(self) -> ChromaDocumentStore: """ This is the most basic requirement for the child class: provide an instance of this document store so the base class can use it. From b956666757e62b9b25331087bd4d6416b4f9c7ff Mon Sep 17 00:00:00 2001 From: Massimiliano Pippi Date: Wed, 29 Nov 2023 21:19:27 +0100 Subject: [PATCH 2/5] remove deprecated method --- integrations/chroma/src/chroma_haystack/document_store.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/integrations/chroma/src/chroma_haystack/document_store.py b/integrations/chroma/src/chroma_haystack/document_store.py index 16e6f5e9c..b6840b2a7 100644 --- a/integrations/chroma/src/chroma_haystack/document_store.py +++ b/integrations/chroma/src/chroma_haystack/document_store.py @@ -155,7 +155,7 @@ def write_documents(self, documents: List[Document], policy: DuplicatePolicy = D raise ValueError(msg) if doc.content is None: - logger.warn( + logger.warning( "ChromaDocumentStore can only store the text field of Documents: " "'array', 'dataframe' and 'blob' will be dropped." ) From 131cf94023ae6a3175b1741f9e855e27a57bc812 Mon Sep 17 00:00:00 2001 From: Massimiliano Pippi Date: Wed, 29 Nov 2023 21:21:20 +0100 Subject: [PATCH 3/5] make the class not discoverable by pytest --- integrations/chroma/tests/test_document_store.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/integrations/chroma/tests/test_document_store.py b/integrations/chroma/tests/test_document_store.py index 9ad5f96e9..eb23293b6 100644 --- a/integrations/chroma/tests/test_document_store.py +++ b/integrations/chroma/tests/test_document_store.py @@ -19,7 +19,7 @@ from chroma_haystack.document_store import ChromaDocumentStore -class TestEmbeddingFunction(EmbeddingFunction): +class _TestEmbeddingFunction(EmbeddingFunction): """ Chroma lets you provide custom functions to compute embeddings, we use this feature to provide a fake algorithm returning random @@ -44,7 +44,7 @@ def document_store(self) -> ChromaDocumentStore: an instance of this document store so the base class can use it. """ with mock.patch("chroma_haystack.document_store.get_embedding_function") as get_func: - get_func.return_value = TestEmbeddingFunction() + get_func.return_value = _TestEmbeddingFunction() return ChromaDocumentStore(embedding_function="test_function", collection_name=str(uuid.uuid1())) @pytest.mark.unit From cd201e56ede50c7eeaeb65654e8a28e7f3dd291f Mon Sep 17 00:00:00 2001 From: Massimiliano Pippi Date: Wed, 29 Nov 2023 21:34:29 +0100 Subject: [PATCH 4/5] patch it with glue and sticks --- .../src/chroma_haystack/document_store.py | 1 + .../chroma/src/chroma_haystack/errors.py | 2 +- .../chroma/tests/test_document_store.py | 94 +++++++++++-------- integrations/chroma/tests/test_retriever.py | 4 +- 4 files changed, 60 insertions(+), 41 deletions(-) diff --git a/integrations/chroma/src/chroma_haystack/document_store.py b/integrations/chroma/src/chroma_haystack/document_store.py index b6840b2a7..580656f76 100644 --- a/integrations/chroma/src/chroma_haystack/document_store.py +++ b/integrations/chroma/src/chroma_haystack/document_store.py @@ -216,6 +216,7 @@ def _normalize_filters(self, filters: Dict[str, Any]) -> Tuple[List[str], Dict[s passed to `ids`, `where` and `where_document` respectively. """ if not isinstance(filters, dict): + print(filters) msg = "'filters' parameter must be a dictionary" raise ChromaDocumentStoreFilterError(msg) diff --git a/integrations/chroma/src/chroma_haystack/errors.py b/integrations/chroma/src/chroma_haystack/errors.py index 474938be4..aeb0230cd 100644 --- a/integrations/chroma/src/chroma_haystack/errors.py +++ b/integrations/chroma/src/chroma_haystack/errors.py @@ -9,7 +9,7 @@ class ChromaDocumentStoreError(DocumentStoreError): pass -class ChromaDocumentStoreFilterError(FilterError): +class ChromaDocumentStoreFilterError(FilterError, ValueError): pass diff --git a/integrations/chroma/tests/test_document_store.py b/integrations/chroma/tests/test_document_store.py index eb23293b6..a02905ef1 100644 --- a/integrations/chroma/tests/test_document_store.py +++ b/integrations/chroma/tests/test_document_store.py @@ -9,6 +9,7 @@ import pytest from chromadb.api.types import Documents, EmbeddingFunction, Embeddings from haystack import Document +from haystack.document_stores.protocols import DocumentStore from haystack.testing.document_store import ( CountDocumentsTest, WriteDocumentsTest, @@ -31,7 +32,7 @@ def __call__(self, input: Documents) -> Embeddings: # noqa - chroma will inspec return [np.random.default_rng().uniform(-1, 1, 768).tolist()] -class TestDocumentStore(CountDocumentsTest, WriteDocumentsTest, DeleteDocumentsTest, LegacyFilterDocumentsTest): +class TestDocumentStore(CountDocumentsTest, DeleteDocumentsTest, LegacyFilterDocumentsTest): """ Common test cases will be provided by `DocumentStoreBaseTests` but you can add more to this class. @@ -47,33 +48,48 @@ def document_store(self) -> ChromaDocumentStore: get_func.return_value = _TestEmbeddingFunction() return ChromaDocumentStore(embedding_function="test_function", collection_name=str(uuid.uuid1())) + def assert_documents_are_equal(self, received: List[Document], expected: List[Document]): + """ + Assert that two lists of Documents are equal. + This is used in every test, if a Document Store implementation has a different behaviour + it should override this method. + + This can happen for example when the Document Store sets a score to returned Documents. + Since we can't know what the score will be, we can't compare the Documents reliably. + """ + for doc_received, doc_expected in zip(received, expected): + assert doc_received.content == doc_expected.content + assert doc_received.meta == doc_expected.meta + @pytest.mark.unit - def test_ne_filter(self, docstore: ChromaDocumentStore, filterable_docs: List[Document]): + def test_ne_filter(self, document_store: ChromaDocumentStore, filterable_docs: List[Document]): """ We customize this test because Chroma consider "not equal" true when a field is missing """ - docstore.write_documents(filterable_docs) - result = docstore.filter_documents(filters={"page": {"$ne": "100"}}) - assert self.contains_same_docs(result, [doc for doc in filterable_docs if doc.meta.get("page", "100") != "100"]) + document_store.write_documents(filterable_docs) + result = document_store.filter_documents(filters={"page": {"$ne": "100"}}) + self.assert_documents_are_equal( + result, [doc for doc in filterable_docs if doc.meta.get("page", "100") != "100"] + ) @pytest.mark.unit - def test_delete_empty(self, docstore: ChromaDocumentStore): + def test_delete_empty(self, document_store: ChromaDocumentStore): """ Deleting a non-existing document should not raise with Chroma """ - docstore.delete_documents(["test"]) + document_store.delete_documents(["test"]) @pytest.mark.unit - def test_delete_not_empty_nonexisting(self, docstore: ChromaDocumentStore): + def test_delete_not_empty_nonexisting(self, document_store: ChromaDocumentStore): """ Deleting a non-existing document should not raise with Chroma """ doc = Document(content="test doc") - docstore.write_documents([doc]) - docstore.delete_documents(["non_existing"]) + document_store.write_documents([doc]) + document_store.delete_documents(["non_existing"]) - assert docstore.filter_documents(filters={"id": doc.id}) == [doc] + assert document_store.filter_documents(filters={"id": doc.id}) == [doc] @pytest.mark.integration def test_to_json(self, request): @@ -100,141 +116,143 @@ def test_from_json(self): @pytest.mark.skip(reason="Filter on array contents is not supported.") @pytest.mark.unit - def test_filter_document_array(self, docstore: ChromaDocumentStore, filterable_docs: List[Document]): + def test_filter_document_array(self, document_store: ChromaDocumentStore, filterable_docs: List[Document]): pass @pytest.mark.skip(reason="Filter on dataframe contents is not supported.") @pytest.mark.unit - def test_filter_document_dataframe(self, docstore: ChromaDocumentStore, filterable_docs: List[Document]): + def test_filter_document_dataframe(self, document_store: ChromaDocumentStore, filterable_docs: List[Document]): pass @pytest.mark.skip(reason="Filter on table contents is not supported.") @pytest.mark.unit - def test_eq_filter_table(self, docstore: ChromaDocumentStore, filterable_docs: List[Document]): + def test_eq_filter_table(self, document_store: ChromaDocumentStore, filterable_docs: List[Document]): pass @pytest.mark.skip(reason="Filter on embedding value is not supported.") @pytest.mark.unit - def test_eq_filter_embedding(self, docstore: ChromaDocumentStore, filterable_docs: List[Document]): + def test_eq_filter_embedding(self, document_store: ChromaDocumentStore, filterable_docs: List[Document]): pass @pytest.mark.skip(reason="$in operator is not supported.") @pytest.mark.unit - def test_in_filter_explicit(self, docstore: ChromaDocumentStore, filterable_docs: List[Document]): + def test_in_filter_explicit(self, document_store: ChromaDocumentStore, filterable_docs: List[Document]): pass @pytest.mark.skip(reason="$in operator is not supported. Filter on table contents is not supported.") @pytest.mark.unit - def test_in_filter_table(self, docstore: ChromaDocumentStore, filterable_docs: List[Document]): + def test_in_filter_table(self, document_store: ChromaDocumentStore, filterable_docs: List[Document]): pass @pytest.mark.skip(reason="$in operator is not supported.") @pytest.mark.unit - def test_in_filter_embedding(self, docstore: ChromaDocumentStore, filterable_docs: List[Document]): + def test_in_filter_embedding(self, document_store: ChromaDocumentStore, filterable_docs: List[Document]): pass @pytest.mark.skip(reason="Filter on table contents is not supported.") @pytest.mark.unit - def test_ne_filter_table(self, docstore: ChromaDocumentStore, filterable_docs: List[Document]): + def test_ne_filter_table(self, document_store: ChromaDocumentStore, filterable_docs: List[Document]): pass @pytest.mark.skip(reason="Filter on embedding value is not supported.") @pytest.mark.unit - def test_ne_filter_embedding(self, docstore: ChromaDocumentStore, filterable_docs: List[Document]): + def test_ne_filter_embedding(self, document_store: ChromaDocumentStore, filterable_docs: List[Document]): pass @pytest.mark.skip(reason="$nin operator is not supported. Filter on table contents is not supported.") @pytest.mark.unit - def test_nin_filter_table(self, docstore: ChromaDocumentStore, filterable_docs: List[Document]): + def test_nin_filter_table(self, document_store: ChromaDocumentStore, filterable_docs: List[Document]): pass @pytest.mark.skip(reason="$nin operator is not supported. Filter on embedding value is not supported.") @pytest.mark.unit - def test_nin_filter_embedding(self, docstore: ChromaDocumentStore, filterable_docs: List[Document]): + def test_nin_filter_embedding(self, document_store: ChromaDocumentStore, filterable_docs: List[Document]): pass @pytest.mark.skip(reason="$nin operator is not supported.") @pytest.mark.unit - def test_nin_filter(self, docstore: ChromaDocumentStore, filterable_docs: List[Document]): + def test_nin_filter(self, document_store: ChromaDocumentStore, filterable_docs: List[Document]): pass @pytest.mark.skip(reason="Filter syntax not supported.") @pytest.mark.unit def test_filter_simple_implicit_and_with_multi_key_dict( - self, docstore: ChromaDocumentStore, filterable_docs: List[Document] + self, document_store: ChromaDocumentStore, filterable_docs: List[Document] ): pass @pytest.mark.skip(reason="Filter syntax not supported.") @pytest.mark.unit def test_filter_simple_explicit_and_with_multikey_dict( - self, docstore: ChromaDocumentStore, filterable_docs: List[Document] + self, document_store: ChromaDocumentStore, filterable_docs: List[Document] ): pass @pytest.mark.skip(reason="Filter syntax not supported.") @pytest.mark.unit - def test_filter_simple_explicit_and_with_list(self, docstore: ChromaDocumentStore, filterable_docs: List[Document]): + def test_filter_simple_explicit_and_with_list( + self, document_store: ChromaDocumentStore, filterable_docs: List[Document] + ): pass @pytest.mark.skip(reason="Filter syntax not supported.") @pytest.mark.unit - def test_filter_simple_implicit_and(self, docstore: ChromaDocumentStore, filterable_docs: List[Document]): + def test_filter_simple_implicit_and(self, document_store: ChromaDocumentStore, filterable_docs: List[Document]): pass @pytest.mark.skip(reason="Filter syntax not supported.") @pytest.mark.unit - def test_filter_nested_explicit_and(self, docstore: ChromaDocumentStore, filterable_docs: List[Document]): + def test_filter_nested_explicit_and(self, document_store: ChromaDocumentStore, filterable_docs: List[Document]): pass @pytest.mark.skip(reason="Filter syntax not supported.") @pytest.mark.unit - def test_filter_nested_implicit_and(self, docstore: ChromaDocumentStore, filterable_docs: List[Document]): + def test_filter_nested_implicit_and(self, document_store: ChromaDocumentStore, filterable_docs: List[Document]): pass @pytest.mark.skip(reason="Filter syntax not supported.") @pytest.mark.unit - def test_filter_simple_or(self, docstore: ChromaDocumentStore, filterable_docs: List[Document]): + def test_filter_simple_or(self, document_store: ChromaDocumentStore, filterable_docs: List[Document]): pass @pytest.mark.skip(reason="Filter syntax not supported.") @pytest.mark.unit - def test_filter_nested_or(self, docstore: ChromaDocumentStore, filterable_docs: List[Document]): + def test_filter_nested_or(self, document_store: ChromaDocumentStore, filterable_docs: List[Document]): pass @pytest.mark.skip(reason="Filter on table contents is not supported.") @pytest.mark.unit - def test_filter_nested_and_or_explicit(self, docstore: ChromaDocumentStore, filterable_docs: List[Document]): + def test_filter_nested_and_or_explicit(self, document_store: ChromaDocumentStore, filterable_docs: List[Document]): pass @pytest.mark.skip(reason="Filter syntax not supported.") @pytest.mark.unit - def test_filter_nested_and_or_implicit(self, docstore: ChromaDocumentStore, filterable_docs: List[Document]): + def test_filter_nested_and_or_implicit(self, document_store: ChromaDocumentStore, filterable_docs: List[Document]): pass @pytest.mark.skip(reason="Filter syntax not supported.") @pytest.mark.unit - def test_filter_nested_or_and(self, docstore: ChromaDocumentStore, filterable_docs: List[Document]): + def test_filter_nested_or_and(self, document_store: ChromaDocumentStore, filterable_docs: List[Document]): pass @pytest.mark.skip(reason="Filter syntax not supported.") @pytest.mark.unit def test_filter_nested_multiple_identical_operators_same_level( - self, docstore: ChromaDocumentStore, filterable_docs: List[Document] + self, document_store: ChromaDocumentStore, filterable_docs: List[Document] ): pass @pytest.mark.skip(reason="Duplicate policy not supported.") @pytest.mark.unit - def test_write_duplicate_fail(self, docstore: ChromaDocumentStore): + def test_write_duplicate_fail(self, document_store: ChromaDocumentStore): pass @pytest.mark.skip(reason="Duplicate policy not supported.") @pytest.mark.unit - def test_write_duplicate_skip(self, docstore: ChromaDocumentStore): + def test_write_duplicate_skip(self, document_store: ChromaDocumentStore): pass @pytest.mark.skip(reason="Duplicate policy not supported.") @pytest.mark.unit - def test_write_duplicate_overwrite(self, docstore: ChromaDocumentStore): + def test_write_duplicate_overwrite(self, document_store: ChromaDocumentStore): pass diff --git a/integrations/chroma/tests/test_retriever.py b/integrations/chroma/tests/test_retriever.py index d1bbe5c49..b77dd4ca4 100644 --- a/integrations/chroma/tests/test_retriever.py +++ b/integrations/chroma/tests/test_retriever.py @@ -11,7 +11,7 @@ def test_retriever_to_json(request): ) retriever = ChromaQueryRetriever(ds, filters={"foo": "bar"}, top_k=99) assert retriever.to_dict() == { - "type": "ChromaQueryRetriever", + "type": "chroma_haystack.retriever.ChromaQueryRetriever", "init_parameters": { "filters": {"foo": "bar"}, "top_k": 99, @@ -27,7 +27,7 @@ def test_retriever_to_json(request): @pytest.mark.integration def test_retriever_from_json(request): data = { - "type": "ChromaQueryRetriever", + "type": "chroma_haystack.retriever.ChromaQueryRetriever", "init_parameters": { "filters": {"bar": "baz"}, "top_k": 42, From 108d49bafe59263d3c00765dd50afc3cdf6193e9 Mon Sep 17 00:00:00 2001 From: Massimiliano Pippi Date: Wed, 29 Nov 2023 21:41:35 +0100 Subject: [PATCH 5/5] lint --- integrations/chroma/src/chroma_haystack/document_store.py | 1 - integrations/chroma/tests/test_document_store.py | 2 -- 2 files changed, 3 deletions(-) diff --git a/integrations/chroma/src/chroma_haystack/document_store.py b/integrations/chroma/src/chroma_haystack/document_store.py index 580656f76..b6840b2a7 100644 --- a/integrations/chroma/src/chroma_haystack/document_store.py +++ b/integrations/chroma/src/chroma_haystack/document_store.py @@ -216,7 +216,6 @@ def _normalize_filters(self, filters: Dict[str, Any]) -> Tuple[List[str], Dict[s passed to `ids`, `where` and `where_document` respectively. """ if not isinstance(filters, dict): - print(filters) msg = "'filters' parameter must be a dictionary" raise ChromaDocumentStoreFilterError(msg) diff --git a/integrations/chroma/tests/test_document_store.py b/integrations/chroma/tests/test_document_store.py index a02905ef1..e99a2bbfd 100644 --- a/integrations/chroma/tests/test_document_store.py +++ b/integrations/chroma/tests/test_document_store.py @@ -9,10 +9,8 @@ import pytest from chromadb.api.types import Documents, EmbeddingFunction, Embeddings from haystack import Document -from haystack.document_stores.protocols import DocumentStore from haystack.testing.document_store import ( CountDocumentsTest, - WriteDocumentsTest, DeleteDocumentsTest, LegacyFilterDocumentsTest, )