Commit

Merge branch 'main' into main
ttmenezes authored Nov 13, 2024
2 parents 1cb63f4 + 025a05a commit ae9ca89
Showing 15 changed files with 575 additions and 104 deletions.
2 changes: 1 addition & 1 deletion README.md
@@ -34,7 +34,7 @@ Please check out our [Contribution Guidelines](CONTRIBUTING.md) for all the details
| [cohere-haystack](integrations/cohere/) | Embedder, Generator, Ranker | [![PyPI - Version](https://img.shields.io/pypi/v/cohere-haystack.svg)](https://pypi.org/project/cohere-haystack) | [![Test / cohere](https://github.com/deepset-ai/haystack-core-integrations/actions/workflows/cohere.yml/badge.svg)](https://github.com/deepset-ai/haystack-core-integrations/actions/workflows/cohere.yml) |
| [deepeval-haystack](integrations/deepeval/) | Evaluator | [![PyPI - Version](https://img.shields.io/pypi/v/deepeval-haystack.svg)](https://pypi.org/project/deepeval-haystack) | [![Test / deepeval](https://github.com/deepset-ai/haystack-core-integrations/actions/workflows/deepeval.yml/badge.svg)](https://github.com/deepset-ai/haystack-core-integrations/actions/workflows/deepeval.yml) |
| [elasticsearch-haystack](integrations/elasticsearch/) | Document Store | [![PyPI - Version](https://img.shields.io/pypi/v/elasticsearch-haystack.svg)](https://pypi.org/project/elasticsearch-haystack) | [![Test / elasticsearch](https://github.com/deepset-ai/haystack-core-integrations/actions/workflows/elasticsearch.yml/badge.svg)](https://github.com/deepset-ai/haystack-core-integrations/actions/workflows/elasticsearch.yml) |
| [fastembed-haystack](integrations/fastembed/) | Embedder | [![PyPI - Version](https://img.shields.io/pypi/v/fastembed-haystack.svg)](https://pypi.org/project/fastembed-haystack/) | [![Test / fastembed](https://github.com/deepset-ai/haystack-core-integrations/actions/workflows/fastembed.yml/badge.svg)](https://github.com/deepset-ai/haystack-core-integrations/actions/workflows/fastembed.yml) |
| [fastembed-haystack](integrations/fastembed/) | Embedder, Ranker | [![PyPI - Version](https://img.shields.io/pypi/v/fastembed-haystack.svg)](https://pypi.org/project/fastembed-haystack/) | [![Test / fastembed](https://github.com/deepset-ai/haystack-core-integrations/actions/workflows/fastembed.yml/badge.svg)](https://github.com/deepset-ai/haystack-core-integrations/actions/workflows/fastembed.yml) |
| [google-ai-haystack](integrations/google_ai/) | Generator | [![PyPI - Version](https://img.shields.io/pypi/v/google-ai-haystack.svg)](https://pypi.org/project/google-ai-haystack) | [![Test / google-ai](https://github.com/deepset-ai/haystack-core-integrations/actions/workflows/google_ai.yml/badge.svg)](https://github.com/deepset-ai/haystack-core-integrations/actions/workflows/google_ai.yml) |
| [google-vertex-haystack](integrations/google_vertex/) | Generator | [![PyPI - Version](https://img.shields.io/pypi/v/google-vertex-haystack.svg)](https://pypi.org/project/google-vertex-haystack) | [![Test / google-vertex](https://github.com/deepset-ai/haystack-core-integrations/actions/workflows/google_vertex.yml/badge.svg)](https://github.com/deepset-ai/haystack-core-integrations/actions/workflows/google_vertex.yml) |
| [instructor-embedders-haystack](integrations/instructor_embedders/) | Embedder | [![PyPI - Version](https://img.shields.io/pypi/v/instructor-embedders-haystack.svg)](https://pypi.org/project/instructor-embedders-haystack) | [![Test / instructor-embedders](https://github.com/deepset-ai/haystack-core-integrations/actions/workflows/instructor_embedders.yml/badge.svg)](https://github.com/deepset-ai/haystack-core-integrations/actions/workflows/instructor_embedders.yml) |
6 changes: 6 additions & 0 deletions integrations/fastembed/CHANGELOG.md
@@ -1,5 +1,11 @@
# Changelog

## [integrations/fastembed-v1.4.0] - 2024-11-13

### ⚙️ Miscellaneous Tasks

- Adopt uv as installer (#1142)

## [integrations/fastembed-v1.3.0] - 2024-10-07

### 🚀 Features
22 changes: 22 additions & 0 deletions integrations/fastembed/examples/ranker_example.py
@@ -0,0 +1,22 @@
from haystack import Document

from haystack_integrations.components.rankers.fastembed import FastembedRanker

query = "Who is maintaining Qdrant?"
documents = [
Document(
content="This is built to be faster and lighter than other embedding libraries e.g. Transformers, Sentence-Transformers, etc."
),
Document(content="fastembed is supported by and maintained by Qdrant."),
]

ranker = FastembedRanker(model_name="Xenova/ms-marco-MiniLM-L-6-v2")
ranker.warm_up()
reranked_documents = ranker.run(query=query, documents=documents)["documents"]


print(reranked_documents[0])

# Document(id=...,
# content: 'fastembed is supported by and maintained by Qdrant.',
# score: 5.472434997558594..)
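Beyond the standalone script above, the new ranker can also sit behind a retriever in a Haystack 2.x pipeline. The following is a minimal sketch, not part of this commit, assuming the in-memory document store and BM25 retriever that ship with `haystack-ai`:

```python
from haystack import Document, Pipeline
from haystack.components.retrievers.in_memory import InMemoryBM25Retriever
from haystack.document_stores.in_memory import InMemoryDocumentStore

from haystack_integrations.components.rankers.fastembed import FastembedRanker

# Index a couple of documents in an in-memory store.
document_store = InMemoryDocumentStore()
document_store.write_documents(
    [
        Document(content="fastembed is supported by and maintained by Qdrant."),
        Document(content="Haystack is an open-source framework for LLM applications."),
    ]
)

pipeline = Pipeline()
pipeline.add_component("retriever", InMemoryBM25Retriever(document_store=document_store))
pipeline.add_component("ranker", FastembedRanker(model_name="Xenova/ms-marco-MiniLM-L-6-v2", top_k=1))
# Feed the retrieved documents into the ranker for reranking.
pipeline.connect("retriever.documents", "ranker.documents")

query = "Who is maintaining Qdrant?"
result = pipeline.run({"retriever": {"query": query}, "ranker": {"query": query}})
print(result["ranker"]["documents"][0].content)
# fastembed is supported by and maintained by Qdrant.
```

The pipeline takes care of calling `warm_up()` on the ranker before the first run.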
3 changes: 2 additions & 1 deletion integrations/fastembed/pydoc/config.yml
@@ -6,7 +6,8 @@ loaders:
"haystack_integrations.components.embedders.fastembed.fastembed_document_embedder",
"haystack_integrations.components.embedders.fastembed.fastembed_text_embedder",
"haystack_integrations.components.embedders.fastembed.fastembed_sparse_document_embedder",
"haystack_integrations.components.embedders.fastembed.fastembed_sparse_text_embedder"
"haystack_integrations.components.embedders.fastembed.fastembed_sparse_text_embedder",
"haystack_integrations.components.rankers.fastembed.ranker"
]
ignore_when_discovered: ["__init__"]
processors:
6 changes: 5 additions & 1 deletion integrations/fastembed/pyproject.toml
@@ -23,7 +23,7 @@ classifiers = [
"Programming Language :: Python :: Implementation :: CPython",
"Programming Language :: Python :: Implementation :: PyPy",
]
dependencies = ["haystack-ai>=2.0.1", "fastembed>=0.2.5", "onnxruntime<1.20.0"]
dependencies = ["haystack-ai>=2.0.1", "fastembed>=0.4.2"]

[project.urls]
Source = "https://github.com/deepset-ai/haystack-core-integrations"
@@ -154,6 +154,10 @@ omit = ["*/tests/*", "*/__init__.py"]
show_missing = true
exclude_lines = ["no cov", "if __name__ == .__main__.:", "if TYPE_CHECKING:"]

[tool.pytest.ini_options]
minversion = "6.0"
markers = ["unit: unit tests", "integration: integration tests"]

[[tool.mypy.overrides]]
module = [
"haystack.*",
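The `[tool.pytest.ini_options]` block added above registers the `unit` and `integration` markers so that test subsets can be selected with `pytest -m`. As a minimal sketch, a hypothetical test tagged with the new marker could look like this:

```python
import pytest

from haystack_integrations.components.rankers.fastembed import FastembedRanker


@pytest.mark.unit  # selected with `pytest -m unit`, skipped with `pytest -m "not unit"`
def test_top_k_must_be_positive():
    # FastembedRanker validates top_k at construction time (see ranker.py below).
    with pytest.raises(ValueError):
        FastembedRanker(top_k=0)
```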
@@ -0,0 +1,3 @@
from .ranker import FastembedRanker

__all__ = ["FastembedRanker"]
@@ -0,0 +1,202 @@
from typing import Any, Dict, List, Optional

from haystack import Document, component, default_from_dict, default_to_dict, logging

from fastembed.rerank.cross_encoder import TextCrossEncoder

logger = logging.getLogger(__name__)


@component
class FastembedRanker:
"""
Ranks Documents based on their similarity to the query using
[Fastembed models](https://qdrant.github.io/fastembed/examples/Supported_Models/).
Documents are ranked from most to least semantically relevant to the query.
Usage example:
```python
from haystack import Document
from haystack_integrations.components.rankers.fastembed import FastembedRanker
ranker = FastembedRanker(model_name="Xenova/ms-marco-MiniLM-L-6-v2", top_k=2)
docs = [Document(content="Paris"), Document(content="Berlin")]
query = "What is the capital of germany?"
output = ranker.run(query=query, documents=docs)
print(output["documents"][0].content)
# Berlin
```
"""

def __init__(
self,
model_name: str = "Xenova/ms-marco-MiniLM-L-6-v2",
top_k: int = 10,
cache_dir: Optional[str] = None,
threads: Optional[int] = None,
batch_size: int = 64,
parallel: Optional[int] = None,
local_files_only: bool = False,
meta_fields_to_embed: Optional[List[str]] = None,
meta_data_separator: str = "\n",
):
"""
Creates an instance of the 'FastembedRanker'.
:param model_name: Fastembed model name. Check the list of supported models in the [Fastembed documentation](https://qdrant.github.io/fastembed/examples/Supported_Models/).
:param top_k: The maximum number of documents to return.
:param cache_dir: The path to the cache directory.
Can be set using the `FASTEMBED_CACHE_PATH` env variable.
Defaults to `fastembed_cache` in the system's temp directory.
:param threads: The number of threads a single onnxruntime session can use. Defaults to None.
:param batch_size: Number of strings to encode at once.
:param parallel:
If > 1, data-parallel encoding will be used, recommended for offline encoding of large datasets.
If 0, use all available cores.
If None, don't use data-parallel processing, use default onnxruntime threading instead.
:param local_files_only: If `True`, only use the model files in the `cache_dir`.
:param meta_fields_to_embed: List of meta fields that should be concatenated
with the document content for reranking.
:param meta_data_separator: Separator used to concatenate the meta fields
to the Document content.
"""
if top_k <= 0:
msg = f"top_k must be > 0, but got {top_k}"
raise ValueError(msg)

self.model_name = model_name
self.top_k = top_k
self.cache_dir = cache_dir
self.threads = threads
self.batch_size = batch_size
self.parallel = parallel
self.local_files_only = local_files_only
self.meta_fields_to_embed = meta_fields_to_embed or []
self.meta_data_separator = meta_data_separator
self._model = None

def to_dict(self) -> Dict[str, Any]:
"""
Serializes the component to a dictionary.
:returns:
Dictionary with serialized data.
"""
return default_to_dict(
self,
model_name=self.model_name,
top_k=self.top_k,
cache_dir=self.cache_dir,
threads=self.threads,
batch_size=self.batch_size,
parallel=self.parallel,
local_files_only=self.local_files_only,
meta_fields_to_embed=self.meta_fields_to_embed,
meta_data_separator=self.meta_data_separator,
)

@classmethod
def from_dict(cls, data: Dict[str, Any]) -> "FastembedRanker":
"""
Deserializes the component from a dictionary.
:param data:
The dictionary to deserialize from.
:returns:
The deserialized component.
"""
return default_from_dict(cls, data)

def warm_up(self):
"""
Initializes the component.
"""
if self._model is None:
self._model = TextCrossEncoder(
model_name=self.model_name,
cache_dir=self.cache_dir,
threads=self.threads,
local_files_only=self.local_files_only,
)

def _prepare_fastembed_input_docs(self, documents: List[Document]) -> List[str]:
"""
Prepare the input by concatenating the document text with the metadata fields specified.
:param documents: The list of Document objects.
:return: A list of strings to be given as input to the Fastembed model.
"""
concatenated_input_list = []
for doc in documents:
meta_values_to_embed = [
str(doc.meta[key]) for key in self.meta_fields_to_embed if key in doc.meta and doc.meta.get(key)
]
concatenated_input = self.meta_data_separator.join([*meta_values_to_embed, doc.content or ""])
concatenated_input_list.append(concatenated_input)

return concatenated_input_list

@component.output_types(documents=List[Document])
def run(self, query: str, documents: List[Document], top_k: Optional[int] = None):
"""
Returns a list of documents ranked by their similarity to the given query, using FastEmbed.
:param query:
The input query to compare the documents to.
:param documents:
A list of documents to be ranked.
:param top_k:
The maximum number of documents to return.
:returns:
A dictionary with the following keys:
- `documents`: A list of documents closest to the query, sorted from most similar to least similar.
:raises ValueError: If `top_k` is not > 0.
"""
if not isinstance(documents, list) or documents and not isinstance(documents[0], Document):
msg = "FastembedRanker expects a list of Documents as input. "
raise TypeError(msg)
if query == "":
msg = "No query provided"
raise ValueError(msg)

if not documents:
return {"documents": []}

top_k = top_k or self.top_k
if top_k <= 0:
msg = f"top_k must be > 0, but got {top_k}"
raise ValueError(msg)

if self._model is None:
msg = "The ranker model has not been loaded. Please call warm_up() before running."
raise RuntimeError(msg)

fastembed_input_docs = self._prepare_fastembed_input_docs(documents)

scores = list(
self._model.rerank(
query=query,
documents=fastembed_input_docs,
batch_size=self.batch_size,
parallel=self.parallel,
)
)

# Combine the two lists into a single list of tuples
doc_scores = list(zip(documents, scores))

# Sort the list of tuples by the score in descending order
sorted_doc_scores = sorted(doc_scores, key=lambda x: x[1], reverse=True)

# Get the top_k documents
top_k_documents = []
for doc, score in sorted_doc_scores[:top_k]:
doc.score = score
top_k_documents.append(doc)

return {"documents": top_k_documents}