diff --git a/.github/workflows/fastembed.yml b/.github/workflows/fastembed.yml
index 37ee7f953..7a34378ee 100644
--- a/.github/workflows/fastembed.yml
+++ b/.github/workflows/fastembed.yml
@@ -43,7 +43,7 @@ jobs:
       - name: Run tests
         id: tests
-        run: hatch run cov
+        run: hatch run cov
 
       - name: Nightly - run unit tests with Haystack main branch
         if: github.event_name == 'schedule'
@@ -60,4 +60,4 @@ jobs:
             core-integrations failure:
             ${{ (steps.tests.conclusion == 'nightly-haystack-main') && 'nightly-haystack-main' || 'tests' }}
             - ${{ github.workflow }}
-          api-key: ${{ secrets.CORE_DATADOG_API_KEY }}
\ No newline at end of file
+          api-key: ${{ secrets.CORE_DATADOG_API_KEY }}
diff --git a/integrations/fastembed/README.md b/integrations/fastembed/README.md
index 5ad056af3..c021dec3b 100644
--- a/integrations/fastembed/README.md
+++ b/integrations/fastembed/README.md
@@ -43,6 +43,31 @@ doc = Document(content="fastembed is supported by and maintained by Qdrant.", me
 result = embedder.run(documents=[doc])
 ```
 
+You can use `FastembedSparseTextEmbedder` and `FastembedSparseDocumentEmbedder` as follows:
+
+```python
+from haystack_integrations.components.embedders.fastembed import FastembedSparseTextEmbedder
+
+text = "fastembed is supported by and maintained by Qdrant."
+text_embedder = FastembedSparseTextEmbedder(
+    model="prithvida/Splade_PP_en_v1"
+)
+text_embedder.warm_up()
+embedding = text_embedder.run(text)["sparse_embedding"]
+```
+
+```python
+from haystack_integrations.components.embedders.fastembed import FastembedSparseDocumentEmbedder
+from haystack.dataclasses import Document
+
+embedder = FastembedSparseDocumentEmbedder(
+    model="prithvida/Splade_PP_en_v1",
+)
+embedder.warm_up()
+doc = Document(content="fastembed is supported by and maintained by Qdrant.", meta={"long_answer": "no",})
+result = embedder.run(documents=[doc])
+```
+
 ## License
 
 `fastembed-haystack` is distributed under the terms of the [Apache-2.0](https://spdx.org/licenses/Apache-2.0.html) license.
diff --git a/integrations/fastembed/example/example.py b/integrations/fastembed/examples/example.py
similarity index 95%
rename from integrations/fastembed/example/example.py
rename to integrations/fastembed/examples/example.py
index 3e8077a79..e4d328210 100644
--- a/integrations/fastembed/example/example.py
+++ b/integrations/fastembed/examples/example.py
@@ -26,7 +26,7 @@
 
 result = query_pipeline.run({"text_embedder": {"text": query}})
 
-print(result["retriever"]["documents"][0])  # noqa: T201
+print(result["retriever"]["documents"][0])
 
 # Document(id=...,
 #  content: 'fastembed is supported by and maintained by Qdrant.',
diff --git a/integrations/fastembed/examples/sparse_example.py b/integrations/fastembed/examples/sparse_example.py
new file mode 100644
index 000000000..bce3b363d
--- /dev/null
+++ b/integrations/fastembed/examples/sparse_example.py
@@ -0,0 +1,32 @@
+# Currently, this example shows how to use the FastembedSparseDocumentEmbedder component to embed a list of documents.
+
+# TODO: Once we have a proper SparseEmbeddingRetriever, we should replace this naive example with a more realistic one,
+# involving indexing and retrieval of documents.
+
+from haystack import Document
+from haystack_integrations.components.embedders.fastembed import FastembedSparseDocumentEmbedder
+
+document_list = [
+    Document(
+        content="Oxidative stress generated within inflammatory joints can produce autoimmune phenomena and joint destruction.
Radical species with oxidative activity, including reactive nitrogen species, represent mediators of inflammation and cartilage damage.", + meta={ + "pubid": "25,445,628", + "long_answer": "yes", + }, + ), + Document( + content="Plasma levels of pancreatic polypeptide (PP) rise upon food intake. Although other pancreatic islet hormones, such as insulin and glucagon, have been extensively investigated, PP secretion and actions are still poorly understood.", + meta={ + "pubid": "25,445,712", + "long_answer": "yes", + }, + ), +] + +document_embedder = FastembedSparseDocumentEmbedder() +document_embedder.warm_up() +documents_with_embeddings = document_embedder.run(document_list)["documents"] + +for doc in documents_with_embeddings: + print(f"Document Text: {doc.content}") + print(f"Document Sparse Embedding: {doc.sparse_embedding.to_dict()}") diff --git a/integrations/fastembed/pydoc/config.yml b/integrations/fastembed/pydoc/config.yml index 2a4439b84..c8bd11762 100644 --- a/integrations/fastembed/pydoc/config.yml +++ b/integrations/fastembed/pydoc/config.yml @@ -5,6 +5,8 @@ loaders: [ "haystack_integrations.components.embedders.fastembed.fastembed_document_embedder", "haystack_integrations.components.embedders.fastembed.fastembed_text_embedder", + "haystack_integrations.components.embedders.fastembed.fastembed_sparse_document_embedder", + "haystack_integrations.components.embedders.fastembed.fastembed_sparse_text_embedder" ] ignore_when_discovered: ["__init__"] processors: diff --git a/integrations/fastembed/pyproject.toml b/integrations/fastembed/pyproject.toml index 15dfe907c..f4d005be3 100644 --- a/integrations/fastembed/pyproject.toml +++ b/integrations/fastembed/pyproject.toml @@ -25,8 +25,8 @@ classifiers = [ "Programming Language :: Python :: Implementation :: PyPy", ] dependencies = [ -"haystack-ai", -"fastembed>=0.2", +"haystack-ai>=2.0.1", +"fastembed>=0.2.5", ] [project.urls] @@ -155,7 +155,7 @@ ban-relative-imports = "parents" # Tests can use magic values, assertions, and relative imports "tests/**/*" = ["PLR2004", "S101", "TID252"] # examples can contain "print" commands -"examples/**/*" = ["T201"] +"examples/**/*" = ["T201", "E501"] [tool.coverage.run] source = ["haystack_integrations"] diff --git a/integrations/fastembed/src/haystack_integrations/components/embedders/fastembed/__init__.py b/integrations/fastembed/src/haystack_integrations/components/embedders/fastembed/__init__.py index fdf4dd8de..e943a8ca1 100644 --- a/integrations/fastembed/src/haystack_integrations/components/embedders/fastembed/__init__.py +++ b/integrations/fastembed/src/haystack_integrations/components/embedders/fastembed/__init__.py @@ -2,6 +2,13 @@ # # SPDX-License-Identifier: Apache-2.0 from .fastembed_document_embedder import FastembedDocumentEmbedder +from .fastembed_sparse_document_embedder import FastembedSparseDocumentEmbedder +from .fastembed_sparse_text_embedder import FastembedSparseTextEmbedder from .fastembed_text_embedder import FastembedTextEmbedder -__all__ = ["FastembedDocumentEmbedder", "FastembedTextEmbedder"] +__all__ = [ + "FastembedDocumentEmbedder", + "FastembedTextEmbedder", + "FastembedSparseDocumentEmbedder", + "FastembedSparseTextEmbedder", +] diff --git a/integrations/fastembed/src/haystack_integrations/components/embedders/fastembed/embedding_backend/fastembed_backend.py b/integrations/fastembed/src/haystack_integrations/components/embedders/fastembed/embedding_backend/fastembed_backend.py index e44e50a61..2fc7c5ca2 100644 --- 
a/integrations/fastembed/src/haystack_integrations/components/embedders/fastembed/embedding_backend/fastembed_backend.py
+++ b/integrations/fastembed/src/haystack_integrations/components/embedders/fastembed/embedding_backend/fastembed_backend.py
@@ -1,8 +1,10 @@
 from typing import ClassVar, Dict, List, Optional
 
+from haystack.dataclasses.sparse_embedding import SparseEmbedding
 from tqdm import tqdm
 
 from fastembed import TextEmbedding
+from fastembed.sparse.sparse_text_embedding import SparseTextEmbedding
 
 
 class _FastembedEmbeddingBackendFactory:
@@ -50,3 +52,58 @@ def embed(self, data: List[str], progress_bar=True, **kwargs) -> List[List[float
         ):
             embeddings.append(np_array.tolist())
         return embeddings
+
+
+class _FastembedSparseEmbeddingBackendFactory:
+    """
+    Factory class to create instances of fastembed sparse embedding backends.
+    """
+
+    _instances: ClassVar[Dict[str, "_FastembedSparseEmbeddingBackend"]] = {}
+
+    @staticmethod
+    def get_embedding_backend(
+        model_name: str,
+        cache_dir: Optional[str] = None,
+        threads: Optional[int] = None,
+    ):
+        embedding_backend_id = f"{model_name}{cache_dir}{threads}"
+
+        if embedding_backend_id in _FastembedSparseEmbeddingBackendFactory._instances:
+            return _FastembedSparseEmbeddingBackendFactory._instances[embedding_backend_id]
+
+        embedding_backend = _FastembedSparseEmbeddingBackend(
+            model_name=model_name, cache_dir=cache_dir, threads=threads
+        )
+        _FastembedSparseEmbeddingBackendFactory._instances[embedding_backend_id] = embedding_backend
+        return embedding_backend
+
+
+class _FastembedSparseEmbeddingBackend:
+    """
+    Class to manage fastembed sparse embeddings.
+    """
+
+    def __init__(
+        self,
+        model_name: str,
+        cache_dir: Optional[str] = None,
+        threads: Optional[int] = None,
+    ):
+        self.model = SparseTextEmbedding(model_name=model_name, cache_dir=cache_dir, threads=threads)
+
+    def embed(self, data: List[str], progress_bar=True, **kwargs) -> List[SparseEmbedding]:
+        # The embed method returns an Iterable[SparseEmbedding], so we convert it to the Haystack SparseEmbedding type.
+        # Each SparseEmbedding contains an `indices` field with a list of ints and
+        # a `values` field with a list of floats.
+
+        sparse_embeddings = []
+        sparse_embeddings_iterable = self.model.embed(data, **kwargs)
+        for sparse_embedding in tqdm(
+            sparse_embeddings_iterable, disable=not progress_bar, desc="Calculating sparse embeddings", total=len(data)
+        ):
+            sparse_embeddings.append(
+                SparseEmbedding(indices=sparse_embedding.indices.tolist(), values=sparse_embedding.values.tolist())
+            )
+
+        return sparse_embeddings
diff --git a/integrations/fastembed/src/haystack_integrations/components/embedders/fastembed/fastembed_sparse_document_embedder.py b/integrations/fastembed/src/haystack_integrations/components/embedders/fastembed/fastembed_sparse_document_embedder.py
new file mode 100644
index 000000000..ed5a3208b
--- /dev/null
+++ b/integrations/fastembed/src/haystack_integrations/components/embedders/fastembed/fastembed_sparse_document_embedder.py
@@ -0,0 +1,160 @@
+from typing import Any, Dict, List, Optional
+
+from haystack import Document, component, default_to_dict
+
+from .embedding_backend.fastembed_backend import _FastembedSparseEmbeddingBackendFactory
+
+
+@component
+class FastembedSparseDocumentEmbedder:
+    """
+    FastembedSparseDocumentEmbedder computes Document embeddings using Fastembed sparse models.
+
+    Usage example:
+    ```python
+    # To use this component, install the "fastembed-haystack" package.
+    # pip install fastembed-haystack
+
+    from haystack_integrations.components.embedders.fastembed import FastembedSparseDocumentEmbedder
+    from haystack.dataclasses import Document
+
+    doc_embedder = FastembedSparseDocumentEmbedder(
+        model="prithvida/Splade_PP_en_v1",
+        batch_size=32,
+    )
+
+    doc_embedder.warm_up()
+
+    # Text taken from PubMed QA Dataset (https://huggingface.co/datasets/pubmed_qa)
+    document_list = [
+        Document(
+            content="Oxidative stress generated within inflammatory joints can produce autoimmune phenomena and joint destruction. Radical species with oxidative activity, including reactive nitrogen species, represent mediators of inflammation and cartilage damage.",
+            meta={
+                "pubid": "25,445,628",
+                "long_answer": "yes",
+            },
+        ),
+        Document(
+            content="Plasma levels of pancreatic polypeptide (PP) rise upon food intake. Although other pancreatic islet hormones, such as insulin and glucagon, have been extensively investigated, PP secretion and actions are still poorly understood.",
+            meta={
+                "pubid": "25,445,712",
+                "long_answer": "yes",
+            },
+        ),
+    ]
+
+    result = doc_embedder.run(document_list)
+    print(f"Document Text: {result['documents'][0].content}")
+    print(f"Document Embedding: {result['documents'][0].sparse_embedding}")
+    print(f"Embedding Non-Zero Elements: {len(result['documents'][0].sparse_embedding.indices)}")
+    ```
+    """  # noqa: E501
+
+    def __init__(
+        self,
+        model: str = "prithvida/Splade_PP_en_v1",
+        cache_dir: Optional[str] = None,
+        threads: Optional[int] = None,
+        batch_size: int = 32,
+        progress_bar: bool = True,
+        parallel: Optional[int] = None,
+        meta_fields_to_embed: Optional[List[str]] = None,
+        embedding_separator: str = "\n",
+    ):
+        """
+        Create a FastembedSparseDocumentEmbedder component.
+
+        :param model: Local path or name of the model in Hugging Face's model hub,
+            such as `prithvida/Splade_PP_en_v1`.
+        :param cache_dir: The path to the cache directory.
+            Can be set using the `FASTEMBED_CACHE_PATH` env variable.
+            Defaults to `fastembed_cache` in the system's temp directory.
+        :param threads: The number of threads a single onnxruntime session can use.
+        :param batch_size: Number of strings to encode at once.
+        :param progress_bar: If `True`, displays progress bar during embedding.
+        :param parallel:
+            If > 1, data-parallel encoding will be used, recommended for offline encoding of large datasets.
+            If 0, use all available cores.
+            If None, don't use data-parallel processing, use default onnxruntime threading instead.
+        :param meta_fields_to_embed: List of meta fields that should be embedded along with the Document content.
+        :param embedding_separator: Separator used to concatenate the meta fields to the Document content.
+        """
+
+        self.model_name = model
+        self.cache_dir = cache_dir
+        self.threads = threads
+        self.batch_size = batch_size
+        self.progress_bar = progress_bar
+        self.parallel = parallel
+        self.meta_fields_to_embed = meta_fields_to_embed or []
+        self.embedding_separator = embedding_separator
+
+    def to_dict(self) -> Dict[str, Any]:
+        """
+        Serializes the component to a dictionary.
+        :returns:
+            Dictionary with serialized data.
+        """
+        return default_to_dict(
+            self,
+            model=self.model_name,
+            cache_dir=self.cache_dir,
+            threads=self.threads,
+            batch_size=self.batch_size,
+            progress_bar=self.progress_bar,
+            parallel=self.parallel,
+            meta_fields_to_embed=self.meta_fields_to_embed,
+            embedding_separator=self.embedding_separator,
+        )
+
+    def warm_up(self):
+        """
+        Initializes the component.
+        """
+        if not hasattr(self, "embedding_backend"):
+            self.embedding_backend = _FastembedSparseEmbeddingBackendFactory.get_embedding_backend(
+                model_name=self.model_name, cache_dir=self.cache_dir, threads=self.threads
+            )
+
+    def _prepare_texts_to_embed(self, documents: List[Document]) -> List[str]:
+        texts_to_embed = []
+        for doc in documents:
+            meta_values_to_embed = [
+                str(doc.meta[key]) for key in self.meta_fields_to_embed if key in doc.meta and doc.meta[key] is not None
+            ]
+            text_to_embed = self.embedding_separator.join([*meta_values_to_embed, doc.content or ""])
+
+            texts_to_embed.append(text_to_embed)
+        return texts_to_embed
+
+    @component.output_types(documents=List[Document])
+    def run(self, documents: List[Document]):
+        """
+        Embeds a list of Documents.
+
+        :param documents: List of Documents to embed.
+        :returns: A dictionary with the following keys:
+            - `documents`: List of Documents with each Document's `sparse_embedding`
+                field set to the computed embeddings.
+        """
+        if not isinstance(documents, list) or documents and not isinstance(documents[0], Document):
+            msg = (
+                "FastembedSparseDocumentEmbedder expects a list of Documents as input. "
+                "In case you want to embed a list of strings, please use the FastembedSparseTextEmbedder."
+            )
+            raise TypeError(msg)
+        if not hasattr(self, "embedding_backend"):
+            msg = "The embedding model has not been loaded. Please call warm_up() before running."
+            raise RuntimeError(msg)
+
+        texts_to_embed = self._prepare_texts_to_embed(documents=documents)
+        embeddings = self.embedding_backend.embed(
+            texts_to_embed,
+            batch_size=self.batch_size,
+            show_progress_bar=self.progress_bar,
+            parallel=self.parallel,
+        )
+
+        for doc, emb in zip(documents, embeddings):
+            doc.sparse_embedding = emb
+        return {"documents": documents}
diff --git a/integrations/fastembed/src/haystack_integrations/components/embedders/fastembed/fastembed_sparse_text_embedder.py b/integrations/fastembed/src/haystack_integrations/components/embedders/fastembed/fastembed_sparse_text_embedder.py
new file mode 100644
index 000000000..b31677785
--- /dev/null
+++ b/integrations/fastembed/src/haystack_integrations/components/embedders/fastembed/fastembed_sparse_text_embedder.py
@@ -0,0 +1,117 @@
+from typing import Any, Dict, Optional
+
+from haystack import component, default_to_dict
+from haystack.dataclasses.sparse_embedding import SparseEmbedding
+
+from .embedding_backend.fastembed_backend import _FastembedSparseEmbeddingBackendFactory
+
+
+@component
+class FastembedSparseTextEmbedder:
+    """
+    FastembedSparseTextEmbedder computes string embeddings using fastembed sparse models.
+
+    Usage example:
+    ```python
+    # To use this component, install the "fastembed-haystack" package.
+    # pip install fastembed-haystack
+
+    from haystack_integrations.components.embedders.fastembed import FastembedSparseTextEmbedder
+
+    text = "It clearly says online this will work on a Mac OS system. The disk comes and it does not, only Windows. Do Not order this if you have a Mac!!"
+
+    text_embedder = FastembedSparseTextEmbedder(
+        model="prithvida/Splade_PP_en_v1"
+    )
+    text_embedder.warm_up()
+
+    embedding = text_embedder.run(text)["sparse_embedding"]
+    ```
+    """  # noqa: E501
+
+    def __init__(
+        self,
+        model: str = "prithvida/Splade_PP_en_v1",
+        cache_dir: Optional[str] = None,
+        threads: Optional[int] = None,
+        batch_size: int = 32,
+        progress_bar: bool = True,
+        parallel: Optional[int] = None,
+    ):
+        """
+        Create a FastembedSparseTextEmbedder component.
+
+        :param model: Local path or name of the model in Fastembed's model hub, such as `prithvida/Splade_PP_en_v1`
+        :param cache_dir: The path to the cache directory.
+            Can be set using the `FASTEMBED_CACHE_PATH` env variable.
+            Defaults to `fastembed_cache` in the system's temp directory.
+        :param threads: The number of threads a single onnxruntime session can use. Defaults to None.
+        :param batch_size: Number of strings to encode at once.
+        :param progress_bar: If `True`, displays progress bar during embedding.
+        :param parallel:
+            If > 1, data-parallel encoding will be used, recommended for offline encoding of large datasets.
+            If 0, use all available cores.
+            If None, don't use data-parallel processing, use default onnxruntime threading instead.
+        """
+
+        self.model_name = model
+        self.cache_dir = cache_dir
+        self.threads = threads
+        self.batch_size = batch_size
+        self.progress_bar = progress_bar
+        self.parallel = parallel
+
+    def to_dict(self) -> Dict[str, Any]:
+        """
+        Serializes the component to a dictionary.
+
+        :returns:
+            Dictionary with serialized data.
+        """
+        return default_to_dict(
+            self,
+            model=self.model_name,
+            cache_dir=self.cache_dir,
+            threads=self.threads,
+            batch_size=self.batch_size,
+            progress_bar=self.progress_bar,
+            parallel=self.parallel,
+        )
+
+    def warm_up(self):
+        """
+        Initializes the component.
+        """
+        if not hasattr(self, "embedding_backend"):
+            self.embedding_backend = _FastembedSparseEmbeddingBackendFactory.get_embedding_backend(
+                model_name=self.model_name, cache_dir=self.cache_dir, threads=self.threads
+            )
+
+    @component.output_types(sparse_embedding=SparseEmbedding)
+    def run(self, text: str):
+        """
+        Embeds text using the Fastembed model.
+
+        :param text: A string to embed.
+        :returns: A dictionary with the following keys:
+            - `sparse_embedding`: The sparse embedding of the input text, as a `SparseEmbedding` object.
+        :raises TypeError: If the input is not a string.
+        :raises RuntimeError: If the embedding model has not been loaded.
+        """
+        if not isinstance(text, str):
+            msg = (
+                "FastembedSparseTextEmbedder expects a string as input. "
+                "In case you want to embed a list of Documents, please use the FastembedSparseDocumentEmbedder."
+            )
+            raise TypeError(msg)
+        if not hasattr(self, "embedding_backend"):
+            msg = "The embedding model has not been loaded. Please call warm_up() before running."
+            raise RuntimeError(msg)
+
+        embedding = self.embedding_backend.embed(
+            [text],
+            batch_size=self.batch_size,
+            show_progress_bar=self.progress_bar,
+            parallel=self.parallel,
+        )[0]
+        return {"sparse_embedding": embedding}
diff --git a/integrations/fastembed/tests/test_fastembed_sparse_document_embedder.py b/integrations/fastembed/tests/test_fastembed_sparse_document_embedder.py
new file mode 100644
index 000000000..756eeb4b5
--- /dev/null
+++ b/integrations/fastembed/tests/test_fastembed_sparse_document_embedder.py
@@ -0,0 +1,283 @@
+from unittest.mock import MagicMock, patch
+
+import numpy as np
+import pytest
+from haystack import Document, default_from_dict
+from haystack.dataclasses.sparse_embedding import SparseEmbedding
+from haystack_integrations.components.embedders.fastembed.fastembed_sparse_document_embedder import (
+    FastembedSparseDocumentEmbedder,
+)
+
+
+class TestFastembedSparseDocumentEmbedder:
+    def test_init_default(self):
+        """
+        Test default initialization parameters for FastembedSparseDocumentEmbedder.
+ """ + embedder = FastembedSparseDocumentEmbedder(model="prithvida/Splade_PP_en_v1") + assert embedder.model_name == "prithvida/Splade_PP_en_v1" + assert embedder.cache_dir is None + assert embedder.threads is None + assert embedder.batch_size == 32 + assert embedder.progress_bar is True + assert embedder.parallel is None + assert embedder.meta_fields_to_embed == [] + assert embedder.embedding_separator == "\n" + + def test_init_with_parameters(self): + """ + Test custom initialization parameters for FastembedSparseDocumentEmbedder. + """ + embedder = FastembedSparseDocumentEmbedder( + model="prithvida/Splade_PP_en_v1", + cache_dir="fake_dir", + threads=2, + batch_size=64, + progress_bar=False, + parallel=1, + meta_fields_to_embed=["test_field"], + embedding_separator=" | ", + ) + assert embedder.model_name == "prithvida/Splade_PP_en_v1" + assert embedder.cache_dir == "fake_dir" + assert embedder.threads == 2 + assert embedder.batch_size == 64 + assert embedder.progress_bar is False + assert embedder.parallel == 1 + assert embedder.meta_fields_to_embed == ["test_field"] + assert embedder.embedding_separator == " | " + + def test_to_dict(self): + """ + Test serialization of FastembedSparseDocumentEmbedder to a dictionary, using default initialization parameters. + """ + embedder = FastembedSparseDocumentEmbedder(model="prithvida/Splade_PP_en_v1") + embedder_dict = embedder.to_dict() + assert embedder_dict == { + "type": "haystack_integrations.components.embedders.fastembed.fastembed_sparse_document_embedder.FastembedSparseDocumentEmbedder", # noqa + "init_parameters": { + "model": "prithvida/Splade_PP_en_v1", + "cache_dir": None, + "threads": None, + "batch_size": 32, + "progress_bar": True, + "parallel": None, + "embedding_separator": "\n", + "meta_fields_to_embed": [], + }, + } + + def test_to_dict_with_custom_init_parameters(self): + """ + Test serialization of FastembedSparseDocumentEmbedder to a dictionary, using custom initialization parameters. + """ + embedder = FastembedSparseDocumentEmbedder( + model="prithvida/Splade_PP_en_v1", + cache_dir="fake_dir", + threads=2, + batch_size=64, + progress_bar=False, + parallel=1, + meta_fields_to_embed=["test_field"], + embedding_separator=" | ", + ) + embedder_dict = embedder.to_dict() + assert embedder_dict == { + "type": "haystack_integrations.components.embedders.fastembed.fastembed_sparse_document_embedder.FastembedSparseDocumentEmbedder", # noqa + "init_parameters": { + "model": "prithvida/Splade_PP_en_v1", + "cache_dir": "fake_dir", + "threads": 2, + "batch_size": 64, + "progress_bar": False, + "parallel": 1, + "meta_fields_to_embed": ["test_field"], + "embedding_separator": " | ", + }, + } + + def test_from_dict(self): + """ + Test deserialization of FastembedSparseDocumentEmbedder from a dictionary, + using default initialization parameters. 
+ """ + embedder_dict = { + "type": "haystack_integrations.components.embedders.fastembed.fastembed_sparse_document_embedder.FastembedSparseDocumentEmbedder", # noqa + "init_parameters": { + "model": "prithvida/Splade_PP_en_v1", + "cache_dir": None, + "threads": None, + "batch_size": 32, + "progress_bar": True, + "parallel": None, + "meta_fields_to_embed": [], + "embedding_separator": "\n", + }, + } + embedder = default_from_dict(FastembedSparseDocumentEmbedder, embedder_dict) + assert embedder.model_name == "prithvida/Splade_PP_en_v1" + assert embedder.cache_dir is None + assert embedder.threads is None + assert embedder.batch_size == 32 + assert embedder.progress_bar is True + assert embedder.parallel is None + assert embedder.meta_fields_to_embed == [] + assert embedder.embedding_separator == "\n" + + def test_from_dict_with_custom_init_parameters(self): + """ + Test deserialization of FastembedSparseDocumentEmbedder from a dictionary, + using custom initialization parameters. + """ + embedder_dict = { + "type": "haystack_integrations.components.embedders.fastembed.fastembed_sparse_document_embedder.FastembedSparseDocumentEmbedder", # noqa + "init_parameters": { + "model": "prithvida/Splade_PP_en_v1", + "cache_dir": "fake_dir", + "threads": 2, + "batch_size": 64, + "progress_bar": False, + "parallel": 1, + "meta_fields_to_embed": ["test_field"], + "embedding_separator": " | ", + }, + } + embedder = default_from_dict(FastembedSparseDocumentEmbedder, embedder_dict) + assert embedder.model_name == "prithvida/Splade_PP_en_v1" + assert embedder.cache_dir == "fake_dir" + assert embedder.threads == 2 + assert embedder.batch_size == 64 + assert embedder.progress_bar is False + assert embedder.parallel == 1 + assert embedder.meta_fields_to_embed == ["test_field"] + assert embedder.embedding_separator == " | " + + @patch( + "haystack_integrations.components.embedders.fastembed.fastembed_sparse_document_embedder._FastembedSparseEmbeddingBackendFactory" + ) + def test_warmup(self, mocked_factory): + """ + Test for checking embedder instances after warm-up. + """ + embedder = FastembedSparseDocumentEmbedder(model="prithvida/Splade_PP_en_v1") + mocked_factory.get_embedding_backend.assert_not_called() + embedder.warm_up() + mocked_factory.get_embedding_backend.assert_called_once_with( + model_name="prithvida/Splade_PP_en_v1", cache_dir=None, threads=None + ) + + @patch( + "haystack_integrations.components.embedders.fastembed.fastembed_sparse_document_embedder._FastembedSparseEmbeddingBackendFactory" + ) + def test_warmup_does_not_reload(self, mocked_factory): + """ + Test for checking backend instances after multiple warm-ups. + """ + embedder = FastembedSparseDocumentEmbedder(model="prithvida/Splade_PP_en_v1") + mocked_factory.get_embedding_backend.assert_not_called() + embedder.warm_up() + embedder.warm_up() + mocked_factory.get_embedding_backend.assert_called_once() + + def _generate_mocked_sparse_embedding(self, n): + list_of_sparse_vectors = [] + for _ in range(n): + random_indice_length = np.random.randint(3, 15) + data = { + "indices": list(range(random_indice_length)), + "values": [np.random.random_sample() for _ in range(random_indice_length)], + } + list_of_sparse_vectors.append(data) + return list_of_sparse_vectors + + def test_embed(self): + """ + Test for checking output dimensions and embedding dimensions. 
+ """ + embedder = FastembedSparseDocumentEmbedder(model="prithvida/Splade_PP_en_v1") + embedder.embedding_backend = MagicMock() + embedder.embedding_backend.embed = lambda x, **kwargs: self._generate_mocked_sparse_embedding( # noqa: ARG005 + len(x) + ) + + documents = [Document(content=f"Sample-document text {i}") for i in range(5)] + + result = embedder.run(documents=documents) + + assert isinstance(result["documents"], list) + assert len(result["documents"]) == len(documents) + for doc in result["documents"]: + assert isinstance(doc, Document) + assert isinstance(doc.sparse_embedding, dict) + assert isinstance(doc.sparse_embedding["indices"], list) + assert isinstance(doc.sparse_embedding["indices"][0], int) + assert isinstance(doc.sparse_embedding["values"], list) + assert isinstance(doc.sparse_embedding["values"][0], float) + + def test_embed_incorrect_input_format(self): + """ + Test for checking incorrect input format when creating embedding. + """ + embedder = FastembedSparseDocumentEmbedder(model="prithvida/Splade_PP_en_v1") + + string_input = "text" + list_integers_input = [1, 2, 3] + + with pytest.raises( + TypeError, + match="FastembedSparseDocumentEmbedder expects a list of Documents as input.", + ): + embedder.run(documents=string_input) + + with pytest.raises( + TypeError, + match="FastembedSparseDocumentEmbedder expects a list of Documents as input.", + ): + embedder.run(documents=list_integers_input) + + def test_embed_metadata(self): + """ + Test for checking output dimensions and embedding dimensions for documents + with a custom instruction and metadata. + """ + embedder = FastembedSparseDocumentEmbedder( + model="model", + meta_fields_to_embed=["meta_field"], + embedding_separator="\n", + ) + embedder.embedding_backend = MagicMock() + + documents = [Document(content=f"document-number {i}", meta={"meta_field": f"meta_value {i}"}) for i in range(5)] + + embedder.run(documents=documents) + + embedder.embedding_backend.embed.assert_called_once_with( + [ + "meta_value 0\ndocument-number 0", + "meta_value 1\ndocument-number 1", + "meta_value 2\ndocument-number 2", + "meta_value 3\ndocument-number 3", + "meta_value 4\ndocument-number 4", + ], + batch_size=32, + show_progress_bar=True, + parallel=None, + ) + + @pytest.mark.integration + def test_run(self): + embedder = FastembedSparseDocumentEmbedder( + model="prithvida/Splade_PP_en_v1", + ) + embedder.warm_up() + + doc = Document(content="Parton energy loss in QCD matter") + + result = embedder.run(documents=[doc]) + embedding = result["documents"][0].sparse_embedding + embedding_dict = embedding.to_dict() + assert isinstance(embedding, SparseEmbedding) + assert isinstance(embedding_dict["indices"], list) + assert isinstance(embedding_dict["values"], list) + assert isinstance(embedding_dict["indices"][0], int) + assert isinstance(embedding_dict["values"][0], float) diff --git a/integrations/fastembed/tests/test_fastembed_sparse_text_embedder.py b/integrations/fastembed/tests/test_fastembed_sparse_text_embedder.py new file mode 100644 index 000000000..3751eea14 --- /dev/null +++ b/integrations/fastembed/tests/test_fastembed_sparse_text_embedder.py @@ -0,0 +1,220 @@ +from unittest.mock import MagicMock, patch + +import numpy as np +import pytest +from haystack import default_from_dict +from haystack.dataclasses.sparse_embedding import SparseEmbedding +from haystack_integrations.components.embedders.fastembed.fastembed_sparse_text_embedder import ( + FastembedSparseTextEmbedder, +) + + +class TestFastembedSparseTextEmbedder: + 
def test_init_default(self): + """ + Test default initialization parameters for FastembedSparseTextEmbedder. + """ + embedder = FastembedSparseTextEmbedder(model="prithvida/Splade_PP_en_v1") + assert embedder.model_name == "prithvida/Splade_PP_en_v1" + assert embedder.cache_dir is None + assert embedder.threads is None + assert embedder.batch_size == 32 + assert embedder.progress_bar is True + assert embedder.parallel is None + + def test_init_with_parameters(self): + """ + Test custom initialization parameters for FastembedSparseTextEmbedder. + """ + embedder = FastembedSparseTextEmbedder( + model="prithvida/Splade_PP_en_v1", + cache_dir="fake_dir", + threads=2, + batch_size=64, + progress_bar=False, + parallel=1, + ) + assert embedder.model_name == "prithvida/Splade_PP_en_v1" + assert embedder.cache_dir == "fake_dir" + assert embedder.threads == 2 + assert embedder.batch_size == 64 + assert embedder.progress_bar is False + assert embedder.parallel == 1 + + def test_to_dict(self): + """ + Test serialization of FastembedSparseTextEmbedder to a dictionary, using default initialization parameters. + """ + embedder = FastembedSparseTextEmbedder(model="prithvida/Splade_PP_en_v1") + embedder_dict = embedder.to_dict() + assert embedder_dict == { + "type": "haystack_integrations.components.embedders.fastembed.fastembed_sparse_text_embedder.FastembedSparseTextEmbedder", # noqa + "init_parameters": { + "model": "prithvida/Splade_PP_en_v1", + "cache_dir": None, + "threads": None, + "batch_size": 32, + "progress_bar": True, + "parallel": None, + }, + } + + def test_to_dict_with_custom_init_parameters(self): + """ + Test serialization of FastembedSparseTextEmbedder to a dictionary, using custom initialization parameters. + """ + embedder = FastembedSparseTextEmbedder( + model="prithvida/Splade_PP_en_v1", + cache_dir="fake_dir", + threads=2, + batch_size=64, + progress_bar=False, + parallel=1, + ) + embedder_dict = embedder.to_dict() + assert embedder_dict == { + "type": "haystack_integrations.components.embedders.fastembed.fastembed_sparse_text_embedder.FastembedSparseTextEmbedder", # noqa + "init_parameters": { + "model": "prithvida/Splade_PP_en_v1", + "cache_dir": "fake_dir", + "threads": 2, + "batch_size": 64, + "progress_bar": False, + "parallel": 1, + }, + } + + def test_from_dict(self): + """ + Test deserialization of FastembedSparseTextEmbedder from a dictionary, using default initialization parameters. + """ + embedder_dict = { + "type": "haystack_integrations.components.embedders.fastembed.fastembed_sparse_text_embedder.FastembedSparseTextEmbedder", # noqa + "init_parameters": { + "model": "prithvida/Splade_PP_en_v1", + "cache_dir": None, + "threads": None, + "batch_size": 32, + "progress_bar": True, + "parallel": None, + }, + } + embedder = default_from_dict(FastembedSparseTextEmbedder, embedder_dict) + assert embedder.model_name == "prithvida/Splade_PP_en_v1" + assert embedder.cache_dir is None + assert embedder.threads is None + assert embedder.batch_size == 32 + assert embedder.progress_bar is True + assert embedder.parallel is None + + def test_from_dict_with_custom_init_parameters(self): + """ + Test deserialization of FastembedSparseTextEmbedder from a dictionary, using custom initialization parameters. 
+ """ + embedder_dict = { + "type": "haystack_integrations.components.embedders.fastembed.fastembed_sparse_text_embedder.FastembedSparseTextEmbedder", # noqa + "init_parameters": { + "model": "prithvida/Splade_PP_en_v1", + "cache_dir": "fake_dir", + "threads": 2, + "batch_size": 64, + "progress_bar": False, + "parallel": 1, + }, + } + embedder = default_from_dict(FastembedSparseTextEmbedder, embedder_dict) + assert embedder.model_name == "prithvida/Splade_PP_en_v1" + assert embedder.cache_dir == "fake_dir" + assert embedder.threads == 2 + assert embedder.batch_size == 64 + assert embedder.progress_bar is False + assert embedder.parallel == 1 + + @patch( + "haystack_integrations.components.embedders.fastembed.fastembed_sparse_text_embedder._FastembedSparseEmbeddingBackendFactory" + ) + def test_warmup(self, mocked_factory): + """ + Test for checking embedder instances after warm-up. + """ + embedder = FastembedSparseTextEmbedder(model="prithvida/Splade_PP_en_v1") + mocked_factory.get_embedding_backend.assert_not_called() + embedder.warm_up() + mocked_factory.get_embedding_backend.assert_called_once_with( + model_name="prithvida/Splade_PP_en_v1", cache_dir=None, threads=None + ) + + @patch( + "haystack_integrations.components.embedders.fastembed.fastembed_sparse_text_embedder._FastembedSparseEmbeddingBackendFactory" + ) + def test_warmup_does_not_reload(self, mocked_factory): + """ + Test for checking backend instances after multiple warm-ups. + """ + embedder = FastembedSparseTextEmbedder(model="prithvida/Splade_PP_en_v1") + mocked_factory.get_embedding_backend.assert_not_called() + embedder.warm_up() + embedder.warm_up() + mocked_factory.get_embedding_backend.assert_called_once() + + def _generate_mocked_sparse_embedding(self, n): + list_of_sparse_vectors = [] + for _ in range(n): + random_indice_length = np.random.randint(3, 15) + data = { + "indices": list(range(random_indice_length)), + "values": [np.random.random_sample() for _ in range(random_indice_length)], + } + list_of_sparse_vectors.append(data) + + return list_of_sparse_vectors + + def test_embed(self): + """ + Test for checking output dimensions and embedding dimensions. + """ + embedder = FastembedSparseTextEmbedder(model="BAAI/bge-base-en-v1.5") + embedder.embedding_backend = MagicMock() + embedder.embedding_backend.embed = lambda x, **kwargs: self._generate_mocked_sparse_embedding( # noqa: ARG005 + len(x) + ) + + text = "Good text to embed" + + result = embedder.run(text=text) + embedding = result["sparse_embedding"] + assert isinstance(embedding, dict) + assert isinstance(embedding["indices"], list) + assert isinstance(embedding["indices"][0], int) + assert isinstance(embedding["values"], list) + assert isinstance(embedding["values"][0], float) + + def test_run_wrong_incorrect_format(self): + """ + Test for checking incorrect input format when creating embedding. 
+ """ + embedder = FastembedSparseTextEmbedder(model="BAAI/bge-base-en-v1.5") + embedder.embedding_backend = MagicMock() + + list_integers_input = [1, 2, 3] + + with pytest.raises(TypeError, match="FastembedSparseTextEmbedder expects a string as input"): + embedder.run(text=list_integers_input) + + @pytest.mark.integration + def test_run(self): + embedder = FastembedSparseTextEmbedder( + model="prithvida/Splade_PP_en_v1", + ) + embedder.warm_up() + + text = "Parton energy loss in QCD matter" + + result = embedder.run(text=text) + embedding = result["sparse_embedding"] + embedding_dict = embedding.to_dict() + assert isinstance(embedding, SparseEmbedding) + assert isinstance(embedding_dict["indices"], list) + assert isinstance(embedding_dict["values"], list) + assert isinstance(embedding_dict["indices"][0], int) + assert isinstance(embedding_dict["values"][0], float)
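Reviewer note: a minimal end-to-end sketch of how the two new components fit together, not part of this diff. It uses only the APIs added above; the query text, document contents, and the naive dot-product scoring loop are illustrative stand-ins for the sparse-embedding retriever that, per the TODO in `examples/sparse_example.py`, does not exist yet.

```python
from haystack import Document
from haystack_integrations.components.embedders.fastembed import (
    FastembedSparseDocumentEmbedder,
    FastembedSparseTextEmbedder,
)

# Embed a couple of documents with the new sparse document embedder.
documents = [
    Document(content="fastembed is supported by and maintained by Qdrant."),
    Document(content="Haystack is an open-source framework for building LLM applications."),
]
document_embedder = FastembedSparseDocumentEmbedder(model="prithvida/Splade_PP_en_v1")
document_embedder.warm_up()
documents = document_embedder.run(documents=documents)["documents"]

# Embed the query with the new sparse text embedder.
query_embedder = FastembedSparseTextEmbedder(model="prithvida/Splade_PP_en_v1")
query_embedder.warm_up()
query_embedding = query_embedder.run("Who maintains fastembed?")["sparse_embedding"]

# Naive scoring (stand-in for a future sparse retriever):
# dot product over the indices shared by the query and each document.
query_weights = dict(zip(query_embedding.indices, query_embedding.values))
for doc in documents:
    doc_weights = dict(zip(doc.sparse_embedding.indices, doc.sparse_embedding.values))
    score = sum(weight * doc_weights.get(idx, 0.0) for idx, weight in query_weights.items())
    print(f"{score:.4f}  {doc.content}")
```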