Mongodb keyword search (#1228)

* feat: add full-text search capability * feat: add full-text retriever * docs: update docs for mongodb atlas indexes * docs: update usage example for document store * feat: update embedding retrieval example * feat: add hybrid retrieval example * fix: correct typo for parameter name * test: add full-text retrieval test * test: add test for full-text aggregation pipeline * tested examples ; minor refactor adding prints * fix lint * fix test * update test * fix lint * fix fulltext_retriever tests * update workflow to set MONGO_CONNECTION_STRING_2 in env --------- Co-authored-by: kanenorman <[email protected]> Co-authored-by: Kane Norman <[email protected]> Co-authored-by: Vladimir Blagojevic <[email protected]>
deepset-ai · Dec 5, 2024 · 1959ab1 · 1959ab1
1 parent 2c80a0b
commit 1959ab1
Show file tree

Hide file tree

Showing 11 changed files with 744 additions and 18 deletions.
diff --git a/.github/workflows/mongodb_atlas.yml b/.github/workflows/mongodb_atlas.yml
@@ -4,11 +4,11 @@ name: Test / mongodb_atlas
 
 on:
   schedule:
-    - cron: "0 0 * * *"
+    - cron: '0 0 * * *'
   pull_request:
     paths:
-      - "integrations/mongodb_atlas/**"
-      - ".github/workflows/mongodb_atlas.yml"
+      - 'integrations/mongodb_atlas/**'
+      - '.github/workflows/mongodb_atlas.yml'
 
 defaults:
   run:
@@ -19,9 +19,10 @@ concurrency:
   cancel-in-progress: true
 
 env:
-  PYTHONUNBUFFERED: "1"
-  FORCE_COLOR: "1"
+  PYTHONUNBUFFERED: '1'
+  FORCE_COLOR: '1'
   MONGO_CONNECTION_STRING: ${{ secrets.MONGO_CONNECTION_STRING }}
+  MONGO_CONNECTION_STRING_2: ${{ secrets.MONGO_CONNECTION_STRING_2 }}
 
 jobs:
   run:
@@ -31,7 +32,7 @@ jobs:
       fail-fast: false
       matrix:
         os: [ubuntu-latest, windows-latest, macos-latest]
-        python-version: ["3.9", "3.10", "3.11"]
+        python-version: ['3.9', '3.10', '3.11']
 
     steps:
       - uses: actions/checkout@v4

diff --git a/...rations/mongodb_atlas/examples/example.py → ...odb_atlas/examples/embedding_retrieval.py b/...rations/mongodb_atlas/examples/example.py → ...odb_atlas/examples/embedding_retrieval.py
@@ -19,6 +19,8 @@
 
 # To use the MongoDBAtlasDocumentStore, you must have a running MongoDB Atlas database.
 # For details, see https://www.mongodb.com/docs/atlas/getting-started/
+# NOTE: you must manually create both the vector search index and the full-text search
+#       index in your MongoDB Atlas database before running this example.
 
 # Once your database is set, set the environment variable `MONGO_CONNECTION_STRING`
 # with the connection string to your MongoDB Atlas database.
@@ -29,12 +31,17 @@
     database_name="haystack_test",
     collection_name="test_collection",
     vector_search_index="test_vector_search_index",
+    full_text_search_index="test_full_text_search_index",
 )
 
+# This is to avoid duplicates in the collection
+print(f"Cleaning up collection {document_store.collection_name}")
+document_store.collection.delete_many({})
+
 # Create the indexing Pipeline and index some documents
 file_paths = glob.glob("neural-search-pills/pills/*.md")
 
-
+print("Creating indexing pipeline")
 indexing = Pipeline()
 indexing.add_component("converter", MarkdownToDocument())
 indexing.add_component("splitter", DocumentSplitter(split_by="sentence", split_length=2))
@@ -44,17 +51,20 @@
 indexing.connect("splitter", "embedder")
 indexing.connect("embedder", "writer")
 
+print(f"Running indexing pipeline with {len(file_paths)} files")
 indexing.run({"converter": {"sources": file_paths}})
 
-
-# Create the querying Pipeline and try a query
+print("Creating querying pipeline")
 querying = Pipeline()
 querying.add_component("embedder", SentenceTransformersTextEmbedder())
 querying.add_component("retriever", MongoDBAtlasEmbeddingRetriever(document_store=document_store, top_k=3))
 querying.connect("embedder", "retriever")
 
+query = "What is a cross-encoder?"
+print(f"Running querying pipeline with query: '{query}'")
-results = querying.run({"embedder": {"text": "What is a cross-encoder?"}})
+results = querying.run({"embedder": {"text": query}})  # reuse `query` so the printed and executed query agree
 
+print(f"Results: {results}")
 for doc in results["retriever"]["documents"]:
     print(doc)
     print("-" * 10)
diff --git a/integrations/mongodb_atlas/examples/hybrid_retrieval.py b/integrations/mongodb_atlas/examples/hybrid_retrieval.py
@@ -0,0 +1,80 @@
+# Install required packages for this example, including mongodb-atlas-haystack and other libraries needed
+# for Markdown conversion and embeddings generation. Use the following command:
+#
+# pip install mongodb-atlas-haystack markdown-it-py mdit_plain "sentence-transformers>=2.2.0"
+#
+# Download some Markdown files to index.
+# git clone https://github.com/anakin87/neural-search-pills
+
+import glob
+
+from haystack import Pipeline
+from haystack.components.converters import MarkdownToDocument
+from haystack.components.embedders import SentenceTransformersDocumentEmbedder, SentenceTransformersTextEmbedder
+from haystack.components.joiners import DocumentJoiner
+from haystack.components.preprocessors import DocumentSplitter
+from haystack.components.writers import DocumentWriter
+
+from haystack_integrations.components.retrievers.mongodb_atlas import (
+    MongoDBAtlasEmbeddingRetriever,
+    MongoDBAtlasFullTextRetriever,
+)
+from haystack_integrations.document_stores.mongodb_atlas import MongoDBAtlasDocumentStore
+
+# To use the MongoDBAtlasDocumentStore, you must have a running MongoDB Atlas database.
+# For details, see https://www.mongodb.com/docs/atlas/getting-started/
+# NOTE: you must manually create both the vector search index and the full-text search
+#       index in your MongoDB Atlas database before running this example.
+
+# Once your database is set, set the environment variable `MONGO_CONNECTION_STRING`
+# with the connection string to your MongoDB Atlas database.
+# format: "mongodb+srv://{mongo_atlas_username}:{mongo_atlas_password}@{mongo_atlas_host}/?{mongo_atlas_params_string}".
+
+# Initialize the document store; it is configured with a vector search index and a full-text search index
+document_store = MongoDBAtlasDocumentStore(
+    database_name="haystack_test",
+    collection_name="test_collection",
+    vector_search_index="test_vector_search_index",
+    full_text_search_index="test_full_text_search_index",
+)
+
+file_paths = glob.glob("neural-search-pills/pills/*.md")  # Markdown files from the cloned repository
+
+# Empty the collection first so that re-running this example does not index duplicates
+print(f"Cleaning up collection {document_store.collection_name}")
+document_store.collection.delete_many({})
+
+print("Creating indexing pipeline")
+indexing = Pipeline()  # converter -> splitter -> document_embedder -> writer
+indexing.add_component("converter", MarkdownToDocument())
+indexing.add_component("splitter", DocumentSplitter(split_by="sentence", split_length=2))
+indexing.add_component("document_embedder", SentenceTransformersDocumentEmbedder())
+indexing.add_component("writer", DocumentWriter(document_store))
+indexing.connect("converter", "splitter")
+indexing.connect("splitter", "document_embedder")
+indexing.connect("document_embedder", "writer")
+
+print(f"Running indexing pipeline with {len(file_paths)} files")
+indexing.run({"converter": {"sources": file_paths}})
+
+print("Creating querying pipeline")
+querying = Pipeline()  # text_embedder -> embedding_retriever -> joiner <- full_text_retriever
+querying.add_component("text_embedder", SentenceTransformersTextEmbedder())
+querying.add_component("embedding_retriever", MongoDBAtlasEmbeddingRetriever(document_store=document_store, top_k=3))
+querying.add_component("full_text_retriever", MongoDBAtlasFullTextRetriever(document_store=document_store, top_k=3))
+querying.add_component(
+    "joiner",
+    DocumentJoiner(join_mode="reciprocal_rank_fusion", top_k=3),  # joins the outputs of both retrievers
+)
+querying.connect("text_embedder", "embedding_retriever")
+querying.connect("embedding_retriever", "joiner")
+querying.connect("full_text_retriever", "joiner")
+
+query = "cross-encoder"  # passed to both the text embedder and the full-text retriever below
+print(f"Running querying pipeline with query '{query}'")
+results = querying.run({"text_embedder": {"text": query}, "full_text_retriever": {"query": query}})
+
+print(f"Results: {results}")
+for doc in results["joiner"]["documents"]:  # documents emitted by the joiner component
+    print(doc)
+    print("-" * 10)
diff --git a/...s/mongodb_atlas/src/haystack_integrations/components/retrievers/mongodb_atlas/__init__.py b/...s/mongodb_atlas/src/haystack_integrations/components/retrievers/mongodb_atlas/__init__.py
@@ -1,3 +1,4 @@
 from haystack_integrations.components.retrievers.mongodb_atlas.embedding_retriever import MongoDBAtlasEmbeddingRetriever
+from haystack_integrations.components.retrievers.mongodb_atlas.full_text_retriever import MongoDBAtlasFullTextRetriever
 
-__all__ = ["MongoDBAtlasEmbeddingRetriever"]
+__all__ = ["MongoDBAtlasEmbeddingRetriever", "MongoDBAtlasFullTextRetriever"]
diff --git a/...tlas/src/haystack_integrations/components/retrievers/mongodb_atlas/embedding_retriever.py b/...tlas/src/haystack_integrations/components/retrievers/mongodb_atlas/embedding_retriever.py
@@ -28,7 +28,8 @@ class MongoDBAtlasEmbeddingRetriever:
 
     store = MongoDBAtlasDocumentStore(database_name="haystack_integration_test",
                                       collection_name="test_embeddings_collection",
-                                      vector_search_index="cosine_index")
+                                      vector_search_index="cosine_index",
+                                      full_text_search_index="full_text_index")
     retriever = MongoDBAtlasEmbeddingRetriever(document_store=store)
 
     results = retriever.run(query_embedding=np.random.random(768).tolist())

diff --git a/...tlas/src/haystack_integrations/components/retrievers/mongodb_atlas/full_text_retriever.py b/...tlas/src/haystack_integrations/components/retrievers/mongodb_atlas/full_text_retriever.py
@@ -0,0 +1,150 @@
+# SPDX-FileCopyrightText: 2023-present deepset GmbH <[email protected]>
+#
+# SPDX-License-Identifier: Apache-2.0
+from typing import Any, Dict, List, Literal, Optional, Union
+
+from haystack import component, default_from_dict, default_to_dict
+from haystack.dataclasses import Document
+from haystack.document_stores.types import FilterPolicy
+from haystack.document_stores.types.filter_policy import apply_filter_policy
+
+from haystack_integrations.document_stores.mongodb_atlas import MongoDBAtlasDocumentStore
+
+
+@component
+class MongoDBAtlasFullTextRetriever:
+    """
+    Retrieves documents from the MongoDBAtlasDocumentStore by full-text search.
+
+    The full-text search is dependent on the full_text_search_index used in the MongoDBAtlasDocumentStore.
+    See MongoDBAtlasDocumentStore for more information.
+
+    Usage example:
+    ```python
+    from haystack_integrations.document_stores.mongodb_atlas import MongoDBAtlasDocumentStore
+    from haystack_integrations.components.retrievers.mongodb_atlas import MongoDBAtlasFullTextRetriever
+
+    store = MongoDBAtlasDocumentStore(database_name="your_existing_db",
+                                      collection_name="your_existing_collection",
+                                      vector_search_index="your_existing_vector_index",
+                                      full_text_search_index="your_existing_full_text_index")
+    retriever = MongoDBAtlasFullTextRetriever(document_store=store)
+
+    results = retriever.run(query="Lorem ipsum")
+    print(results["documents"])
+    ```
+
+    The example above retrieves the 10 most similar documents to the query "Lorem ipsum" from the
+    MongoDBAtlasDocumentStore (10 is the default `top_k` of the retriever).
+    """
+
+    def __init__(
+        self,
+        *,
+        document_store: MongoDBAtlasDocumentStore,
+        filters: Optional[Dict[str, Any]] = None,
+        top_k: int = 10,
+        filter_policy: Union[str, FilterPolicy] = FilterPolicy.REPLACE,
+    ):
+        """
+        :param document_store: An instance of MongoDBAtlasDocumentStore.
+        :param filters: Filters applied to the retrieved Documents. Make sure that the fields used in the filters are
+            included in the configuration of the `full_text_search_index`. The configuration must be done manually
+            in the Web UI of MongoDB Atlas.
+        :param top_k: Maximum number of Documents to return. Used by `run` whenever no `top_k` is passed at runtime.
+        :param filter_policy: Policy to determine how filters are applied.
+
+        :raises ValueError: If `document_store` is not an instance of MongoDBAtlasDocumentStore.
+        """
+
+        if not isinstance(document_store, MongoDBAtlasDocumentStore):
+            msg = "document_store must be an instance of MongoDBAtlasDocumentStore"
+            raise ValueError(msg)
+
+        self.document_store = document_store
+        self.filters = filters or {}
+        self.top_k = top_k
+        self.filter_policy = (
+            filter_policy if isinstance(filter_policy, FilterPolicy) else FilterPolicy.from_str(filter_policy)
+        )
+
+    def to_dict(self) -> Dict[str, Any]:
+        """
+        Serializes the component to a dictionary.
+
+        :returns:
+            Dictionary with serialized data.
+        """
+        return default_to_dict(
+            self,
+            filters=self.filters,
+            top_k=self.top_k,
+            filter_policy=self.filter_policy.value,
+            document_store=self.document_store.to_dict(),
+        )
+
+    @classmethod
+    def from_dict(cls, data: Dict[str, Any]) -> "MongoDBAtlasFullTextRetriever":
+        """
+        Deserializes the component from a dictionary.
+
+        :param data:
+            Dictionary to deserialize from. Its nested `document_store` entry is replaced in place.
+        :returns:
+            Deserialized component.
+        """
+        data["init_parameters"]["document_store"] = MongoDBAtlasDocumentStore.from_dict(
+            data["init_parameters"]["document_store"]
+        )
+
+        return default_from_dict(cls, data)
+
+    @component.output_types(documents=List[Document])
+    def run(
+        self,
+        query: Union[str, List[str]],
+        fuzzy: Optional[Dict[str, int]] = None,
+        match_criteria: Optional[Literal["any", "all"]] = None,
+        score: Optional[Dict[str, Dict]] = None,
+        synonyms: Optional[str] = None,
+        filters: Optional[Dict[str, Any]] = None,
+        top_k: Optional[int] = None,
+    ) -> Dict[str, List[Document]]:
+        """
+        Retrieve documents from the MongoDBAtlasDocumentStore by full-text search.
+
+        :param query: The query string or a list of query strings to search for.
+            If the query contains multiple terms, Atlas Search evaluates each term separately for matches.
+        :param fuzzy: Enables finding strings similar to the search term(s).
+            Note, `fuzzy` cannot be used with `synonyms`. Configurable options include `maxEdits`, `prefixLength`,
+            and `maxExpansions`. For more details refer to MongoDB Atlas
+            [documentation](https://www.mongodb.com/docs/atlas/atlas-search/text/#fields).
+        :param match_criteria: Defines how terms in the query are matched. Supported options are `"any"` and `"all"`.
+            For more details refer to MongoDB Atlas
+            [documentation](https://www.mongodb.com/docs/atlas/atlas-search/text/#fields).
+        :param score: Specifies the scoring method for matching results. Supported options include `boost`, `constant`,
+            and `function`. For more details refer to MongoDB Atlas
+            [documentation](https://www.mongodb.com/docs/atlas/atlas-search/text/#fields).
+        :param synonyms: The name of the synonym mapping definition in the index. This value cannot be an empty string.
+            Note, `synonyms` can not be used with `fuzzy`.
+        :param filters: Filters applied to the retrieved Documents. The way runtime filters are applied depends on
+            the `filter_policy` chosen at retriever initialization. See init method docstring for more
+            details.
+        :param top_k: Maximum number of Documents to return. If not set, the `top_k` given at initialization is used.
+        :returns: A dictionary with the following keys:
+            - `documents`: List of Documents most similar to the given `query`
+        """
+        filters = apply_filter_policy(self.filter_policy, self.filters, filters)
+        top_k = top_k or self.top_k  # fall back to the init-time value when no runtime top_k is given
+
+        docs = self.document_store._fulltext_retrieval(
+            query=query,
+            fuzzy=fuzzy,
+            match_criteria=match_criteria,
+            score=score,
+            synonyms=synonyms,
+            filters=filters,
+            top_k=top_k,
+        )
+
+        return {"documents": docs}