-
Notifications
You must be signed in to change notification settings - Fork 127
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
* feat: add full-text search capability * feat: add full-text retriever * docs: update docs for mongodb atlas indexes * docs: update usage example for document store * feat: update embedding retrieval example * feat: add hybrid retrieval example * fix: correct typo for parameter name * test: add full-text retrieval test * test: add test for full-text aggregation pipeline * tested examples ; minor refactor adding prints * fix lint * fix test * update test * fix lint * fix fulltext_retriever tests * update workflow to set MONGO_CONNECTION_STRING_2 in env --------- Co-authored-by: kanenorman <[email protected]> Co-authored-by: Kane Norman <[email protected]> Co-authored-by: Vladimir Blagojevic <[email protected]>
- Loading branch information
1 parent
2c80a0b
commit 1959ab1
Showing
11 changed files
with
744 additions
and
18 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,80 @@ | ||
# Install required packages for this example, including mongodb-atlas-haystack and other libraries needed | ||
# for Markdown conversion and embeddings generation. Use the following command: | ||
# | ||
# pip install mongodb-atlas-haystack markdown-it-py mdit_plain "sentence-transformers>=2.2.0" | ||
# | ||
# Download some Markdown files to index. | ||
# git clone https://github.com/anakin87/neural-search-pills | ||
|
||
import glob | ||
|
||
from haystack import Pipeline | ||
from haystack.components.converters import MarkdownToDocument | ||
from haystack.components.embedders import SentenceTransformersDocumentEmbedder, SentenceTransformersTextEmbedder | ||
from haystack.components.joiners import DocumentJoiner | ||
from haystack.components.preprocessors import DocumentSplitter | ||
from haystack.components.writers import DocumentWriter | ||
|
||
from haystack_integrations.components.retrievers.mongodb_atlas import ( | ||
MongoDBAtlasEmbeddingRetriever, | ||
MongoDBAtlasFullTextRetriever, | ||
) | ||
from haystack_integrations.document_stores.mongodb_atlas import MongoDBAtlasDocumentStore | ||
|
||
# To use the MongoDBAtlasDocumentStore, you must have a running MongoDB Atlas database. | ||
# For details, see https://www.mongodb.com/docs/atlas/getting-started/ | ||
# NOTE: you must manually create both the vector search index and the full-text search | ||
# index in your MongoDB Atlas database. | ||
|
||
# Once your database is set up, set the environment variable `MONGO_CONNECTION_STRING` | ||
# with the connection string to your MongoDB Atlas database. | ||
# format: "mongodb+srv://{mongo_atlas_username}:{mongo_atlas_password}@{mongo_atlas_host}/?{mongo_atlas_params_string}". | ||
|
||
# Create the document store, naming the database, the collection and both search indexes.
store_settings = {
    "database_name": "haystack_test",
    "collection_name": "test_collection",
    "vector_search_index": "test_vector_search_index",
    "full_text_search_index": "test_full_text_search_index",
}
document_store = MongoDBAtlasDocumentStore(**store_settings)

# Collect the Markdown files that will be indexed.
file_paths = glob.glob("neural-search-pills/pills/*.md")

# Empty the collection first so that repeated runs do not accumulate duplicates.
print(f"Cleaning up collection {document_store.collection_name}")
document_store.collection.delete_many({})

print("Creating indexing pipeline")
indexing_pipeline = Pipeline()
indexing_stages = [
    ("converter", MarkdownToDocument()),
    ("splitter", DocumentSplitter(split_by="sentence", split_length=2)),
    ("document_embedder", SentenceTransformersDocumentEmbedder()),
    ("writer", DocumentWriter(document_store)),
]
for stage_name, stage in indexing_stages:
    indexing_pipeline.add_component(stage_name, stage)
# The stages form a straight chain: each one feeds the stage declared right after it.
for (sender, _), (receiver, _) in zip(indexing_stages, indexing_stages[1:]):
    indexing_pipeline.connect(sender, receiver)

print(f"Running indexing pipeline with {len(file_paths)} files")
indexing_pipeline.run({"converter": {"sources": file_paths}})

print("Creating querying pipeline")
querying_pipeline = Pipeline()
querying_pipeline.add_component("text_embedder", SentenceTransformersTextEmbedder())
querying_pipeline.add_component(
    "embedding_retriever",
    MongoDBAtlasEmbeddingRetriever(document_store=document_store, top_k=3),
)
querying_pipeline.add_component(
    "full_text_retriever",
    MongoDBAtlasFullTextRetriever(document_store=document_store, top_k=3),
)
querying_pipeline.add_component("joiner", DocumentJoiner(join_mode="reciprocal_rank_fusion", top_k=3))
# The embedded query feeds the embedding retriever; both retrievers then feed the joiner.
for sender, receiver in (
    ("text_embedder", "embedding_retriever"),
    ("embedding_retriever", "joiner"),
    ("full_text_retriever", "joiner"),
):
    querying_pipeline.connect(sender, receiver)

query = "cross-encoder"
print(f"Running querying pipeline with query '{query}'")
# The same query text goes to both branches: as `text` for the embedder and as `query`
# for the full-text retriever.
pipeline_inputs = {
    "text_embedder": {"text": query},
    "full_text_retriever": {"query": query},
}
results = querying_pipeline.run(pipeline_inputs)

print(f"Results: {results}")
separator = "-" * 10
for joined_doc in results["joiner"]["documents"]:
    print(joined_doc)
    print(separator)
3 changes: 2 additions & 1 deletion
3
...s/mongodb_atlas/src/haystack_integrations/components/retrievers/mongodb_atlas/__init__.py
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -1,3 +1,4 @@ | ||
from haystack_integrations.components.retrievers.mongodb_atlas.embedding_retriever import MongoDBAtlasEmbeddingRetriever | ||
from haystack_integrations.components.retrievers.mongodb_atlas.full_text_retriever import MongoDBAtlasFullTextRetriever | ||
|
||
__all__ = ["MongoDBAtlasEmbeddingRetriever"] | ||
__all__ = ["MongoDBAtlasEmbeddingRetriever", "MongoDBAtlasFullTextRetriever"] |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
150 changes: 150 additions & 0 deletions
150
...tlas/src/haystack_integrations/components/retrievers/mongodb_atlas/full_text_retriever.py
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,150 @@ | ||
# SPDX-FileCopyrightText: 2023-present deepset GmbH <[email protected]> | ||
# | ||
# SPDX-License-Identifier: Apache-2.0 | ||
from typing import Any, Dict, List, Literal, Optional, Union | ||
|
||
from haystack import component, default_from_dict, default_to_dict | ||
from haystack.dataclasses import Document | ||
from haystack.document_stores.types import FilterPolicy | ||
from haystack.document_stores.types.filter_policy import apply_filter_policy | ||
|
||
from haystack_integrations.document_stores.mongodb_atlas import MongoDBAtlasDocumentStore | ||
|
||
|
||
@component
class MongoDBAtlasFullTextRetriever:
    """
    Retrieves documents from the MongoDBAtlasDocumentStore by full-text search.

    The full-text search is dependent on the `full_text_search_index` used in the MongoDBAtlasDocumentStore.
    See MongoDBAtlasDocumentStore for more information.

    Usage example:
    ```python
    from haystack_integrations.document_stores.mongodb_atlas import MongoDBAtlasDocumentStore
    from haystack_integrations.components.retrievers.mongodb_atlas import MongoDBAtlasFullTextRetriever

    store = MongoDBAtlasDocumentStore(database_name="your_existing_db",
                                      collection_name="your_existing_collection",
                                      vector_search_index="your_existing_index",
                                      full_text_search_index="your_existing_index")
    retriever = MongoDBAtlasFullTextRetriever(document_store=store)

    results = retriever.run(query="Lorem ipsum")
    print(results["documents"])
    ```

    The example above retrieves the 10 most similar documents to the query "Lorem ipsum" from the
    MongoDBAtlasDocumentStore (10 is the default `top_k` of the retriever).
    """

    def __init__(
        self,
        *,
        document_store: MongoDBAtlasDocumentStore,
        filters: Optional[Dict[str, Any]] = None,
        top_k: int = 10,
        filter_policy: Union[str, FilterPolicy] = FilterPolicy.REPLACE,
    ):
        """
        Creates the retriever.

        :param document_store: An instance of MongoDBAtlasDocumentStore.
        :param filters: Filters applied to the retrieved Documents. Make sure that the fields used in the filters are
            included in the configuration of the `full_text_search_index`. The configuration must be done manually
            in the Web UI of MongoDB Atlas.
        :param top_k: Maximum number of Documents to return. Used by `run` whenever no `top_k` is passed at runtime.
        :param filter_policy: Policy to determine how filters are applied. Either a `FilterPolicy` member or its
            string form, which is converted with `FilterPolicy.from_str`.

        :raises ValueError: If `document_store` is not an instance of MongoDBAtlasDocumentStore.
        """
        if not isinstance(document_store, MongoDBAtlasDocumentStore):
            msg = "document_store must be an instance of MongoDBAtlasDocumentStore"
            raise ValueError(msg)

        self.document_store = document_store
        self.filters = filters or {}
        self.top_k = top_k
        # Normalize to the enum so the rest of the class can rely on `.value`.
        self.filter_policy = (
            filter_policy if isinstance(filter_policy, FilterPolicy) else FilterPolicy.from_str(filter_policy)
        )

    def to_dict(self) -> Dict[str, Any]:
        """
        Serializes the component to a dictionary.

        :returns:
            Dictionary with serialized data.
        """
        return default_to_dict(
            self,
            filters=self.filters,
            top_k=self.top_k,
            # Store the enum's value rather than the enum member itself.
            filter_policy=self.filter_policy.value,
            document_store=self.document_store.to_dict(),
        )

    @classmethod
    def from_dict(cls, data: Dict[str, Any]) -> "MongoDBAtlasFullTextRetriever":
        """
        Deserializes the component from a dictionary.

        :param data:
            Dictionary to deserialize from. Its `init_parameters["document_store"]` entry is replaced in place
            by the deserialized document store.
        :returns:
            Deserialized component.
        """
        # The nested document store has to be rebuilt first, because `__init__` only accepts a real instance.
        data["init_parameters"]["document_store"] = MongoDBAtlasDocumentStore.from_dict(
            data["init_parameters"]["document_store"]
        )
        return default_from_dict(cls, data)

    @component.output_types(documents=List[Document])
    def run(
        self,
        query: Union[str, List[str]],
        fuzzy: Optional[Dict[str, int]] = None,
        match_criteria: Optional[Literal["any", "all"]] = None,
        score: Optional[Dict[str, Dict]] = None,
        synonyms: Optional[str] = None,
        filters: Optional[Dict[str, Any]] = None,
        top_k: Optional[int] = None,
    ) -> Dict[str, List[Document]]:
        """
        Retrieve documents from the MongoDBAtlasDocumentStore by full-text search.

        :param query: The query string or a list of query strings to search for.
            If the query contains multiple terms, Atlas Search evaluates each term separately for matches.
        :param fuzzy: Enables finding strings similar to the search term(s).
            Note, `fuzzy` cannot be used with `synonyms`. Configurable options include `maxEdits`, `prefixLength`,
            and `maxExpansions`. For more details refer to MongoDB Atlas
            [documentation](https://www.mongodb.com/docs/atlas/atlas-search/text/#fields).
        :param match_criteria: Defines how terms in the query are matched. Supported options are `"any"` and `"all"`.
            For more details refer to MongoDB Atlas
            [documentation](https://www.mongodb.com/docs/atlas/atlas-search/text/#fields).
        :param score: Specifies the scoring method for matching results. Supported options include `boost`, `constant`,
            and `function`. For more details refer to MongoDB Atlas
            [documentation](https://www.mongodb.com/docs/atlas/atlas-search/text/#fields).
        :param synonyms: The name of the synonym mapping definition in the index. This value cannot be an empty string.
            Note, `synonyms` can not be used with `fuzzy`.
        :param filters: Filters applied to the retrieved Documents. The way runtime filters are applied depends on
            the `filter_policy` chosen at retriever initialization. See init method docstring for more
            details.
        :param top_k: Maximum number of Documents to return. Overrides the value specified at initialization;
            when omitted, the initialization value is used.

        :returns: A dictionary with the following keys:
            - `documents`: List of Documents most similar to the given `query`
        """
        filters = apply_filter_policy(self.filter_policy, self.filters, filters)
        # The runtime default is None (not 10) so the value configured in __init__ is honored
        # whenever the caller does not pass top_k explicitly.
        top_k = top_k or self.top_k

        docs = self.document_store._fulltext_retrieval(
            query=query,
            fuzzy=fuzzy,
            match_criteria=match_criteria,
            score=score,
            synonyms=synonyms,
            filters=filters,
            top_k=top_k,
        )

        return {"documents": docs}
Oops, something went wrong.