From 5c46ba13a861c33d589a62b591936f40fe2b4d77 Mon Sep 17 00:00:00 2001 From: "David S. Batista" Date: Fri, 18 Oct 2024 13:33:11 +0200 Subject: [PATCH 1/5] initial import --- .../chroma/tests/test_document_store.py | 22 ++++++++++++++----- 1 file changed, 16 insertions(+), 6 deletions(-) diff --git a/integrations/chroma/tests/test_document_store.py b/integrations/chroma/tests/test_document_store.py index f386b44ba..1fa952376 100644 --- a/integrations/chroma/tests/test_document_store.py +++ b/integrations/chroma/tests/test_document_store.py @@ -282,19 +282,29 @@ def test_contains(self, document_store: ChromaDocumentStore, filterable_docs: Li [doc for doc in filterable_docs if doc.content and "FOO" in doc.content], ) - def test_multiple_contains(self, document_store: ChromaDocumentStore, filterable_docs: List[Document]): - document_store.write_documents(filterable_docs) + def test_multiple_contains(self, document_store: ChromaDocumentStore): + + documents = [ + Document(content="The cat chased the mouse in the garden."), + Document(content="The cat sat on the windowsill watching the birds."), + Document(content="The cat played with a ball of yarn."), + Document(content="The cat napped peacefully in the sun.") + ] + + document_store.write_documents(documents) + filters = { - "operator": "OR", + "operator": "AND", "conditions": [ - {"field": "content", "operator": "contains", "value": "FOO"}, - {"field": "content", "operator": "not contains", "value": "BAR"}, + {"field": "content", "operator": "contains", "value": "cat"}, + {"field": "content", "operator": "not contains", "value": "birds"}, ], } result = document_store.filter_documents(filters=filters) + self.assert_documents_are_equal( result, - [doc for doc in filterable_docs if doc.content and ("FOO" in doc.content or "BAR" not in doc.content)], + [doc for doc in documents if "cat" in doc.content and "birds" not in doc.content], ) def test_nested_logical_filters(self, document_store: ChromaDocumentStore, 
filterable_docs: List[Document]): From a41a4688c2effd6d2707ea367012891b5aafbfce Mon Sep 17 00:00:00 2001 From: "David S. Batista" Date: Fri, 18 Oct 2024 13:47:19 +0200 Subject: [PATCH 2/5] updating tests --- integrations/chroma/tests/test_document_store.py | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/integrations/chroma/tests/test_document_store.py b/integrations/chroma/tests/test_document_store.py index 1fa952376..373e80feb 100644 --- a/integrations/chroma/tests/test_document_store.py +++ b/integrations/chroma/tests/test_document_store.py @@ -3,6 +3,7 @@ # SPDX-License-Identifier: Apache-2.0 import logging import operator +import pandas as pd import sys import uuid from typing import List @@ -288,7 +289,9 @@ def test_multiple_contains(self, document_store: ChromaDocumentStore): Document(content="The cat chased the mouse in the garden."), Document(content="The cat sat on the windowsill watching the birds."), Document(content="The cat played with a ball of yarn."), - Document(content="The cat napped peacefully in the sun.") + Document(content="The cat napped peacefully in the sun."), + Document(content=None), + Document(dataframe=pd.DataFrame({"text": ["Something irrelevant"]})), ] document_store.write_documents(documents) @@ -304,7 +307,7 @@ def test_multiple_contains(self, document_store: ChromaDocumentStore): self.assert_documents_are_equal( result, - [doc for doc in documents if "cat" in doc.content and "birds" not in doc.content], + [doc for doc in documents if doc.content and "cat" in doc.content and "birds" not in doc.content], ) def test_nested_logical_filters(self, document_store: ChromaDocumentStore, filterable_docs: List[Document]): From c8c9837b731a82cbfb134981b2c41f5f3f4208ce Mon Sep 17 00:00:00 2001 From: "David S. 
Batista" Date: Fri, 18 Oct 2024 13:58:46 +0200 Subject: [PATCH 3/5] trying to get the hatch linting to run locally --- integrations/chroma/tests/test_document_store.py | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/integrations/chroma/tests/test_document_store.py b/integrations/chroma/tests/test_document_store.py index 373e80feb..5777051a3 100644 --- a/integrations/chroma/tests/test_document_store.py +++ b/integrations/chroma/tests/test_document_store.py @@ -1,17 +1,19 @@ # SPDX-FileCopyrightText: 2023-present John Doe # # SPDX-License-Identifier: Apache-2.0 +from unittest import mock + import logging +import numpy as np import operator import pandas as pd +import pytest import sys import uuid +from chromadb.api.types import Documents, EmbeddingFunction, Embeddings +from haystack_integrations.document_stores.chroma import ChromaDocumentStore from typing import List -from unittest import mock -import numpy as np -import pytest -from chromadb.api.types import Documents, EmbeddingFunction, Embeddings from haystack import Document from haystack.testing.document_store import ( CountDocumentsTest, @@ -19,8 +21,6 @@ FilterDocumentsTest, ) -from haystack_integrations.document_stores.chroma import ChromaDocumentStore - class _TestEmbeddingFunction(EmbeddingFunction): """ From 91b928e40b9ef6166f0a0ad000036e149425d99a Mon Sep 17 00:00:00 2001 From: "David S. 
Batista" Date: Fri, 18 Oct 2024 17:26:54 +0200 Subject: [PATCH 4/5] simplifying test_multiple_contains - so that it also uses the fixtures --- .../document_stores/chroma/filters.py | 2 +- .../chroma/tests/test_document_store.py | 58 +++++-------------- 2 files changed, 15 insertions(+), 45 deletions(-) diff --git a/integrations/chroma/src/haystack_integrations/document_stores/chroma/filters.py b/integrations/chroma/src/haystack_integrations/document_stores/chroma/filters.py index ef5c920a7..60046b6ad 100644 --- a/integrations/chroma/src/haystack_integrations/document_stores/chroma/filters.py +++ b/integrations/chroma/src/haystack_integrations/document_stores/chroma/filters.py @@ -27,7 +27,7 @@ class ChromaFilter: """ Dataclass to store the converted filter structure used in Chroma queries. - Following filter criterias are supported: + Following filter criteria are supported: - `ids`: A list of document IDs to filter by in Chroma collection. - `where`: A dictionary of metadata filters applied to the documents. - `where_document`: A dictionary of content-based filters applied to the documents' content. 
diff --git a/integrations/chroma/tests/test_document_store.py b/integrations/chroma/tests/test_document_store.py index 5777051a3..c05264d22 100644 --- a/integrations/chroma/tests/test_document_store.py +++ b/integrations/chroma/tests/test_document_store.py @@ -1,19 +1,17 @@ # SPDX-FileCopyrightText: 2023-present John Doe # # SPDX-License-Identifier: Apache-2.0 -from unittest import mock - import logging -import numpy as np import operator -import pandas as pd -import pytest import sys import uuid -from chromadb.api.types import Documents, EmbeddingFunction, Embeddings -from haystack_integrations.document_stores.chroma import ChromaDocumentStore from typing import List +from unittest import mock +import numpy as np +import pandas as pd +import pytest +from chromadb.api.types import Documents, EmbeddingFunction, Embeddings from haystack import Document from haystack.testing.document_store import ( CountDocumentsTest, @@ -21,6 +19,8 @@ FilterDocumentsTest, ) +from haystack_integrations.document_stores.chroma import ChromaDocumentStore + class _TestEmbeddingFunction(EmbeddingFunction): """ @@ -283,31 +283,20 @@ def test_contains(self, document_store: ChromaDocumentStore, filterable_docs: Li [doc for doc in filterable_docs if doc.content and "FOO" in doc.content], ) - def test_multiple_contains(self, document_store: ChromaDocumentStore): - - documents = [ - Document(content="The cat chased the mouse in the garden."), - Document(content="The cat sat on the windowsill watching the birds."), - Document(content="The cat played with a ball of yarn."), - Document(content="The cat napped peacefully in the sun."), - Document(content=None), - Document(dataframe=pd.DataFrame({"text": ["Something irrelevant"]})), - ] - - document_store.write_documents(documents) - + def test_multiple_contains(self, document_store: ChromaDocumentStore, filterable_docs: List[Document]): + filterable_docs = [doc for doc in filterable_docs if doc.content] # remove documents without content + 
document_store.write_documents(filterable_docs) filters = { - "operator": "AND", + "operator": "OR", "conditions": [ - {"field": "content", "operator": "contains", "value": "cat"}, - {"field": "content", "operator": "not contains", "value": "birds"}, + {"field": "content", "operator": "contains", "value": "FOO"}, + {"field": "content", "operator": "not contains", "value": "BAR"}, ], } result = document_store.filter_documents(filters=filters) - self.assert_documents_are_equal( result, - [doc for doc in documents if doc.content and "cat" in doc.content and "birds" not in doc.content], + [doc for doc in filterable_docs if doc.content and ("FOO" in doc.content or "BAR" not in doc.content)], ) def test_nested_logical_filters(self, document_store: ChromaDocumentStore, filterable_docs: List[Document]): @@ -355,25 +344,6 @@ def test_nested_logical_filters(self, document_store: ChromaDocumentStore, filte ], ) - # Override inequality tests from FilterDocumentsTest - # because chroma doesn't return documents with absent meta fields - - def test_comparison_not_equal(self, document_store, filterable_docs): - """Test filter_documents() with != comparator""" - document_store.write_documents(filterable_docs) - result = document_store.filter_documents({"field": "meta.number", "operator": "!=", "value": 100}) - self.assert_documents_are_equal( - result, [d for d in filterable_docs if "number" in d.meta and d.meta.get("number") != 100] - ) - - def test_comparison_not_in(self, document_store, filterable_docs): - """Test filter_documents() with 'not in' comparator""" - document_store.write_documents(filterable_docs) - result = document_store.filter_documents({"field": "meta.number", "operator": "not in", "value": [2, 9]}) - self.assert_documents_are_equal( - result, [d for d in filterable_docs if "number" in d.meta and d.meta.get("number") not in [2, 9]] - ) - @pytest.mark.skip(reason="Filter on dataframe contents is not supported.") def test_comparison_equal_with_dataframe( self, 
document_store: ChromaDocumentStore, filterable_docs: List[Document] From c90872caa2239a70a242ba200b8453a85ebaee56 Mon Sep 17 00:00:00 2001 From: "David S. Batista" Date: Fri, 18 Oct 2024 17:31:53 +0200 Subject: [PATCH 5/5] removing unused imports --- integrations/chroma/tests/test_document_store.py | 1 - 1 file changed, 1 deletion(-) diff --git a/integrations/chroma/tests/test_document_store.py b/integrations/chroma/tests/test_document_store.py index c05264d22..987f6d8b7 100644 --- a/integrations/chroma/tests/test_document_store.py +++ b/integrations/chroma/tests/test_document_store.py @@ -9,7 +9,6 @@ from unittest import mock import numpy as np -import pandas as pd import pytest from chromadb.api.types import Documents, EmbeddingFunction, Embeddings from haystack import Document