Skip to content

Commit

Permalink
Mongodb keyword search (#1228)
Browse files Browse the repository at this point in the history
* feat: add full-text search capability

* feat: add full-text retriever

* docs: update docs for mongodb atlas indexes

* docs: update usage example for document store

* feat: update embedding retrieval example

* feat: add hybrid retrieval example

* fix: correct typo for parameter name

* test: add full-text retrieval test

* test: add test for full-text aggregation pipeline

* tested examples ; minor refactor adding prints

* fix lint

* fix test

* update test

* fix lint

* fix fulltext_retriever tests

* update workflow to set MONGO_CONNECTION_STRING_2 in env

---------

Co-authored-by: kanenorman <[email protected]>
Co-authored-by: Kane Norman <[email protected]>
Co-authored-by: Vladimir Blagojevic <[email protected]>
  • Loading branch information
4 people authored Dec 5, 2024
1 parent 2c80a0b commit 1959ab1
Show file tree
Hide file tree
Showing 11 changed files with 744 additions and 18 deletions.
13 changes: 7 additions & 6 deletions .github/workflows/mongodb_atlas.yml
Original file line number Diff line number Diff line change
Expand Up @@ -4,11 +4,11 @@ name: Test / mongodb_atlas

on:
schedule:
- cron: "0 0 * * *"
- cron: '0 0 * * *'
pull_request:
paths:
- "integrations/mongodb_atlas/**"
- ".github/workflows/mongodb_atlas.yml"
- 'integrations/mongodb_atlas/**'
- '.github/workflows/mongodb_atlas.yml'

defaults:
run:
Expand All @@ -19,9 +19,10 @@ concurrency:
cancel-in-progress: true

env:
PYTHONUNBUFFERED: "1"
FORCE_COLOR: "1"
PYTHONUNBUFFERED: '1'
FORCE_COLOR: '1'
MONGO_CONNECTION_STRING: ${{ secrets.MONGO_CONNECTION_STRING }}
MONGO_CONNECTION_STRING_2: ${{ secrets.MONGO_CONNECTION_STRING_2 }}

jobs:
run:
Expand All @@ -31,7 +32,7 @@ jobs:
fail-fast: false
matrix:
os: [ubuntu-latest, windows-latest, macos-latest]
python-version: ["3.9", "3.10", "3.11"]
python-version: ['3.9', '3.10', '3.11']

steps:
- uses: actions/checkout@v4
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -19,6 +19,8 @@

# To use the MongoDBAtlasDocumentStore, you must have a running MongoDB Atlas database.
# For details, see https://www.mongodb.com/docs/atlas/getting-started/
# NOTE: you must manually create both the vector search index and the full-text search
# index in your MongoDB Atlas database.

# Once your database is set, set the environment variable `MONGO_CONNECTION_STRING`
# with the connection string to your MongoDB Atlas database.
Expand All @@ -29,12 +31,17 @@
database_name="haystack_test",
collection_name="test_collection",
vector_search_index="test_vector_search_index",
full_text_search_index="test_full_text_search_index",
)

# This is to avoid duplicates in the collection
print(f"Cleaning up collection {document_store.collection_name}")
document_store.collection.delete_many({})

# Create the indexing Pipeline and index some documents
file_paths = glob.glob("neural-search-pills/pills/*.md")


print("Creating indexing pipeline")
indexing = Pipeline()
indexing.add_component("converter", MarkdownToDocument())
indexing.add_component("splitter", DocumentSplitter(split_by="sentence", split_length=2))
Expand All @@ -44,17 +51,20 @@
indexing.connect("splitter", "embedder")
indexing.connect("embedder", "writer")

print(f"Running indexing pipeline with {len(file_paths)} files")
indexing.run({"converter": {"sources": file_paths}})


# Create the querying Pipeline and try a query
print("Creating querying pipeline")
querying = Pipeline()
querying.add_component("embedder", SentenceTransformersTextEmbedder())
querying.add_component("retriever", MongoDBAtlasEmbeddingRetriever(document_store=document_store, top_k=3))
querying.connect("embedder", "retriever")

query = "What is a cross-encoder?"
print(f"Running querying pipeline with query: '{query}'")
# Feed the pipeline the same `query` that was just logged instead of a duplicated
# literal, so the printed query and the executed query cannot drift apart.
results = querying.run({"embedder": {"text": query}})

print(f"Results: {results}")
for doc in results["retriever"]["documents"]:
    print(doc)
    print("-" * 10)
80 changes: 80 additions & 0 deletions integrations/mongodb_atlas/examples/hybrid_retrieval.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,80 @@
# Install required packages for this example, including mongodb-atlas-haystack and other libraries needed
# for Markdown conversion and embeddings generation. Use the following command:
#
# pip install mongodb-atlas-haystack markdown-it-py mdit_plain "sentence-transformers>=2.2.0"
#
# Download some Markdown files to index.
# git clone https://github.com/anakin87/neural-search-pills

import glob

from haystack import Pipeline
from haystack.components.converters import MarkdownToDocument
from haystack.components.embedders import SentenceTransformersDocumentEmbedder, SentenceTransformersTextEmbedder
from haystack.components.joiners import DocumentJoiner
from haystack.components.preprocessors import DocumentSplitter
from haystack.components.writers import DocumentWriter

from haystack_integrations.components.retrievers.mongodb_atlas import (
MongoDBAtlasEmbeddingRetriever,
MongoDBAtlasFullTextRetriever,
)
from haystack_integrations.document_stores.mongodb_atlas import MongoDBAtlasDocumentStore

# To use the MongoDBAtlasDocumentStore, you must have a running MongoDB Atlas database.
# For details, see https://www.mongodb.com/docs/atlas/getting-started/
# NOTE: you must manually create both the vector search index and the full-text search
# index in your MongoDB Atlas database.

# Once your database is set, set the environment variable `MONGO_CONNECTION_STRING`
# with the connection string to your MongoDB Atlas database.
# format: "mongodb+srv://{mongo_atlas_username}:{mongo_atlas_password}@{mongo_atlas_host}/?{mongo_atlas_params_string}".

# Initialize the document store
document_store = MongoDBAtlasDocumentStore(
    database_name="haystack_test",
    collection_name="test_collection",
    vector_search_index="test_vector_search_index",
    full_text_search_index="test_full_text_search_index",
)

# Markdown sources to index; the path is relative to the current working directory.
file_paths = glob.glob("neural-search-pills/pills/*.md")

# Empty the collection first so repeated runs do not accumulate duplicate documents.
print(f"Cleaning up collection {document_store.collection_name}")
document_store.collection.delete_many({})

# Indexing pipeline: convert Markdown -> split into 2-sentence chunks -> embed -> write.
print("Creating indexing pipeline")
indexing = Pipeline()
converter = MarkdownToDocument()
indexing.add_component("converter", converter)
splitter = DocumentSplitter(split_by="sentence", split_length=2)
indexing.add_component("splitter", splitter)
document_embedder = SentenceTransformersDocumentEmbedder()
indexing.add_component("document_embedder", document_embedder)
writer = DocumentWriter(document_store)
indexing.add_component("writer", writer)
for sender, receiver in (
    ("converter", "splitter"),
    ("splitter", "document_embedder"),
    ("document_embedder", "writer"),
):
    indexing.connect(sender, receiver)

print(f"Running indexing pipeline with {len(file_paths)} files")
indexing_inputs = {"converter": {"sources": file_paths}}
indexing.run(indexing_inputs)

# Querying pipeline: the embedding retriever and the full-text retriever both feed a
# joiner that merges their results with reciprocal rank fusion.
print("Creating querying pipeline")
querying = Pipeline()
text_embedder = SentenceTransformersTextEmbedder()
querying.add_component("text_embedder", text_embedder)
embedding_retriever = MongoDBAtlasEmbeddingRetriever(document_store=document_store, top_k=3)
querying.add_component("embedding_retriever", embedding_retriever)
full_text_retriever = MongoDBAtlasFullTextRetriever(document_store=document_store, top_k=3)
querying.add_component("full_text_retriever", full_text_retriever)
joiner = DocumentJoiner(join_mode="reciprocal_rank_fusion", top_k=3)
querying.add_component("joiner", joiner)
for sender, receiver in (
    ("text_embedder", "embedding_retriever"),
    ("embedding_retriever", "joiner"),
    ("full_text_retriever", "joiner"),
):
    querying.connect(sender, receiver)

query = "cross-encoder"
print(f"Running querying pipeline with query '{query}'")
# The same query string is embedded for vector search and passed verbatim to full-text search.
querying_inputs = {
    "text_embedder": {"text": query},
    "full_text_retriever": {"query": query},
}
results = querying.run(querying_inputs)

print(f"Results: {results}")
joined_documents = results["joiner"]["documents"]
separator = "-" * 10
for doc in joined_documents:
    print(doc)
    print(separator)
Original file line number Diff line number Diff line change
@@ -1,3 +1,4 @@
from haystack_integrations.components.retrievers.mongodb_atlas.embedding_retriever import MongoDBAtlasEmbeddingRetriever
from haystack_integrations.components.retrievers.mongodb_atlas.full_text_retriever import MongoDBAtlasFullTextRetriever

__all__ = ["MongoDBAtlasEmbeddingRetriever"]
__all__ = ["MongoDBAtlasEmbeddingRetriever", "MongoDBAtlasFullTextRetriever"]
Original file line number Diff line number Diff line change
Expand Up @@ -28,7 +28,8 @@ class MongoDBAtlasEmbeddingRetriever:
store = MongoDBAtlasDocumentStore(database_name="haystack_integration_test",
collection_name="test_embeddings_collection",
vector_search_index="cosine_index")
vector_search_index="cosine_index",
full_text_search_index="full_text_index")
retriever = MongoDBAtlasEmbeddingRetriever(document_store=store)
results = retriever.run(query_embedding=np.random.random(768).tolist())
Expand Down
Original file line number Diff line number Diff line change
@@ -0,0 +1,150 @@
# SPDX-FileCopyrightText: 2023-present deepset GmbH <[email protected]>
#
# SPDX-License-Identifier: Apache-2.0
from typing import Any, Dict, List, Literal, Optional, Union

from haystack import component, default_from_dict, default_to_dict
from haystack.dataclasses import Document
from haystack.document_stores.types import FilterPolicy
from haystack.document_stores.types.filter_policy import apply_filter_policy

from haystack_integrations.document_stores.mongodb_atlas import MongoDBAtlasDocumentStore


@component
class MongoDBAtlasFullTextRetriever:
    """
    Retrieves documents from the MongoDBAtlasDocumentStore by full-text search.

    The full-text search is dependent on the `full_text_search_index` used in the MongoDBAtlasDocumentStore.
    See MongoDBAtlasDocumentStore for more information.

    Usage example:
    ```python
    from haystack_integrations.document_stores.mongodb_atlas import MongoDBAtlasDocumentStore
    from haystack_integrations.components.retrievers.mongodb_atlas import MongoDBAtlasFullTextRetriever

    store = MongoDBAtlasDocumentStore(database_name="your_existing_db",
                                      collection_name="your_existing_collection",
                                      vector_search_index="your_existing_vector_index",
                                      full_text_search_index="your_existing_full_text_index")
    retriever = MongoDBAtlasFullTextRetriever(document_store=store)

    results = retriever.run(query="Lorem ipsum")
    print(results["documents"])
    ```

    The example above retrieves up to 10 documents (the default `top_k`) matching the query "Lorem ipsum"
    from the MongoDBAtlasDocumentStore.
    """

    def __init__(
        self,
        *,
        document_store: MongoDBAtlasDocumentStore,
        filters: Optional[Dict[str, Any]] = None,
        top_k: int = 10,
        filter_policy: Union[str, FilterPolicy] = FilterPolicy.REPLACE,
    ):
        """
        Create the retriever.

        :param document_store: An instance of MongoDBAtlasDocumentStore.
        :param filters: Filters applied to the retrieved Documents. Make sure that the fields used in the filters are
            included in the configuration of the `full_text_search_index`. The configuration must be done manually
            in the Web UI of MongoDB Atlas.
        :param top_k: Maximum number of Documents to return. Used whenever `run` is not given its own `top_k`.
        :param filter_policy: Policy to determine how filters are applied. Accepts either a `FilterPolicy` member
            or its string form.

        :raises ValueError: If `document_store` is not an instance of MongoDBAtlasDocumentStore.
        """
        if not isinstance(document_store, MongoDBAtlasDocumentStore):
            msg = "document_store must be an instance of MongoDBAtlasDocumentStore"
            raise ValueError(msg)

        self.document_store = document_store
        self.filters = filters or {}
        self.top_k = top_k
        # Normalize to the enum so that `to_dict` can always rely on `.value`.
        self.filter_policy = (
            filter_policy if isinstance(filter_policy, FilterPolicy) else FilterPolicy.from_str(filter_policy)
        )

    def to_dict(self) -> Dict[str, Any]:
        """
        Serializes the component to a dictionary.

        :returns:
            Dictionary with serialized data.
        """
        return default_to_dict(
            self,
            filters=self.filters,
            top_k=self.top_k,
            filter_policy=self.filter_policy.value,
            document_store=self.document_store.to_dict(),
        )

    @classmethod
    def from_dict(cls, data: Dict[str, Any]) -> "MongoDBAtlasFullTextRetriever":
        """
        Deserializes the component from a dictionary.

        Note that the nested `document_store` entry of `data["init_parameters"]` is replaced in place
        by the deserialized document store instance.

        :param data:
            Dictionary to deserialize from.
        :returns:
            Deserialized component.
        """
        data["init_parameters"]["document_store"] = MongoDBAtlasDocumentStore.from_dict(
            data["init_parameters"]["document_store"]
        )
        return default_from_dict(cls, data)

    @component.output_types(documents=List[Document])
    def run(
        self,
        query: Union[str, List[str]],
        fuzzy: Optional[Dict[str, int]] = None,
        match_criteria: Optional[Literal["any", "all"]] = None,
        score: Optional[Dict[str, Dict]] = None,
        synonyms: Optional[str] = None,
        filters: Optional[Dict[str, Any]] = None,
        top_k: Optional[int] = None,
    ) -> Dict[str, List[Document]]:
        """
        Retrieve documents from the MongoDBAtlasDocumentStore by full-text search.

        :param query: The query string or a list of query strings to search for.
            If the query contains multiple terms, Atlas Search evaluates each term separately for matches.
        :param fuzzy: Enables finding strings similar to the search term(s).
            Note, `fuzzy` cannot be used with `synonyms`. Configurable options include `maxEdits`, `prefixLength`,
            and `maxExpansions`. For more details refer to MongoDB Atlas
            [documentation](https://www.mongodb.com/docs/atlas/atlas-search/text/#fields).
        :param match_criteria: Defines how terms in the query are matched. Supported options are `"any"` and `"all"`.
            For more details refer to MongoDB Atlas
            [documentation](https://www.mongodb.com/docs/atlas/atlas-search/text/#fields).
        :param score: Specifies the scoring method for matching results. Supported options include `boost`, `constant`,
            and `function`. For more details refer to MongoDB Atlas
            [documentation](https://www.mongodb.com/docs/atlas/atlas-search/text/#fields).
        :param synonyms: The name of the synonym mapping definition in the index. This value cannot be an empty string.
            Note, `synonyms` can not be used with `fuzzy`.
        :param filters: Filters applied to the retrieved Documents. The way runtime filters are applied depends on
            the `filter_policy` chosen at retriever initialization. See init method docstring for more
            details.
        :param top_k: Maximum number of Documents to return. Overrides the value specified at initialization;
            when omitted (or falsy), the initialization value is used.

        :returns: A dictionary with the following keys:
            - `documents`: List of Documents most similar to the given `query`
        """
        filters = apply_filter_policy(self.filter_policy, self.filters, filters)
        # `top_k` must default to None here: with a truthy default such as 10, the value
        # configured in `__init__` would never be reached by this fallback.
        top_k = top_k or self.top_k

        docs = self.document_store._fulltext_retrieval(
            query=query,
            fuzzy=fuzzy,
            match_criteria=match_criteria,
            score=score,
            synonyms=synonyms,
            filters=filters,
            top_k=top_k,
        )

        return {"documents": docs}
Loading

0 comments on commit 1959ab1

Please sign in to comment.