From afc8e791f9f5b1f3b2c1484dc6ee6ff4493faafb Mon Sep 17 00:00:00 2001 From: Corentin Meyer Date: Wed, 6 Mar 2024 13:37:28 +0100 Subject: [PATCH 01/32] fix(opensearch): bulk error without create key --- .../document_stores/opensearch/document_store.py | 3 +++ 1 file changed, 3 insertions(+) diff --git a/integrations/opensearch/src/haystack_integrations/document_stores/opensearch/document_store.py b/integrations/opensearch/src/haystack_integrations/document_stores/opensearch/document_store.py index dc6941854..66109ddf8 100644 --- a/integrations/opensearch/src/haystack_integrations/document_stores/opensearch/document_store.py +++ b/integrations/opensearch/src/haystack_integrations/document_stores/opensearch/document_store.py @@ -168,6 +168,9 @@ def write_documents(self, documents: List[Document], policy: DuplicatePolicy = D duplicate_errors_ids = [] other_errors = [] for e in errors: + if "create" not in e: + other_errors.append(e) + continue error_type = e["create"]["error"]["type"] if policy == DuplicatePolicy.FAIL and error_type == "version_conflict_engine_exception": duplicate_errors_ids.append(e["create"]["_id"]) From aa95f13bf8444b6d1a27660dacb83b7f654c77a0 Mon Sep 17 00:00:00 2001 From: Corentin Meyer Date: Wed, 13 Mar 2024 16:46:16 +0100 Subject: [PATCH 02/32] feat(FastEmbed): Scaffold for SPLADE Sparse Embedding Support --- integrations/fastembed/pyproject.toml | 2 +- .../embedders/fastembed/__init__.py | 3 +- .../embedding_backend/fastembed_backend.py | 44 ++- .../fastembed_document_SPLADE_embedder.py | 175 +++++++++++ .../fastembed_text_SPLADE_embedder.py | 128 ++++++++ ...test_fastembed_document_SPLADE_embedder.py | 285 ++++++++++++++++++ .../test_fastembed_text_SPLADE_embedder.py | 222 ++++++++++++++ 7 files changed, 856 insertions(+), 3 deletions(-) create mode 100644 integrations/fastembed/src/haystack_integrations/components/embedders/fastembed/fastembed_document_SPLADE_embedder.py create mode 100644 
integrations/fastembed/src/haystack_integrations/components/embedders/fastembed/fastembed_text_SPLADE_embedder.py create mode 100644 integrations/fastembed/tests/test_fastembed_document_SPLADE_embedder.py create mode 100644 integrations/fastembed/tests/test_fastembed_text_SPLADE_embedder.py diff --git a/integrations/fastembed/pyproject.toml b/integrations/fastembed/pyproject.toml index 6ebb99142..21d4c7506 100644 --- a/integrations/fastembed/pyproject.toml +++ b/integrations/fastembed/pyproject.toml @@ -26,7 +26,7 @@ classifiers = [ ] dependencies = [ "haystack-ai", -"fastembed>=0.2", +"fastembed>=0.2.3", ] [project.urls] diff --git a/integrations/fastembed/src/haystack_integrations/components/embedders/fastembed/__init__.py b/integrations/fastembed/src/haystack_integrations/components/embedders/fastembed/__init__.py index fdf4dd8de..a4490a24f 100644 --- a/integrations/fastembed/src/haystack_integrations/components/embedders/fastembed/__init__.py +++ b/integrations/fastembed/src/haystack_integrations/components/embedders/fastembed/__init__.py @@ -3,5 +3,6 @@ # SPDX-License-Identifier: Apache-2.0 from .fastembed_document_embedder import FastembedDocumentEmbedder from .fastembed_text_embedder import FastembedTextEmbedder +from .fastembed_document_SPLADE_embedder import FastembedDocumentSPLADEEmbedder -__all__ = ["FastembedDocumentEmbedder", "FastembedTextEmbedder"] +__all__ = ["FastembedDocumentEmbedder", "FastembedTextEmbedder", "FastembedDocumentSPLADEEmbedder"] diff --git a/integrations/fastembed/src/haystack_integrations/components/embedders/fastembed/embedding_backend/fastembed_backend.py b/integrations/fastembed/src/haystack_integrations/components/embedders/fastembed/embedding_backend/fastembed_backend.py index baf21c8a3..81c3c5bb5 100644 --- a/integrations/fastembed/src/haystack_integrations/components/embedders/fastembed/embedding_backend/fastembed_backend.py +++ 
class _FastembedSparseEmbeddingBackendFactory:
    """
    Factory class to create instances of fastembed sparse embedding backends.

    Backends are cached per (model_name, cache_dir, threads) configuration so
    that each underlying model is only loaded once per process.
    """

    _instances: ClassVar[Dict[str, "_FastembedSparseEmbeddingBackend"]] = {}

    @staticmethod
    def get_embedding_backend(
        model_name: str,
        cache_dir: Optional[str] = None,
        threads: Optional[int] = None,
    ):
        """
        Return a cached sparse embedding backend for the given configuration,
        creating it on first use.

        :param model_name: Local path or name of the sparse model.
        :param cache_dir: The path to the model cache directory.
        :param threads: Number of threads a single onnxruntime session can use.
        """
        # Join with an explicit separator so distinct configurations cannot
        # produce the same cache key (e.g. model "a" + cache "bc" vs. model
        # "ab" + cache "c" would collide with plain concatenation).
        embedding_backend_id = f"{model_name}|{cache_dir}|{threads}"

        if embedding_backend_id in _FastembedSparseEmbeddingBackendFactory._instances:
            return _FastembedSparseEmbeddingBackendFactory._instances[embedding_backend_id]

        embedding_backend = _FastembedSparseEmbeddingBackend(
            model_name=model_name, cache_dir=cache_dir, threads=threads
        )
        _FastembedSparseEmbeddingBackendFactory._instances[embedding_backend_id] = embedding_backend
        return embedding_backend


class _FastembedSparseEmbeddingBackend:
    """
    Class to manage fastembed sparse embeddings.
    """

    def __init__(
        self,
        model_name: str,
        cache_dir: Optional[str] = None,
        threads: Optional[int] = None,
    ):
        self.model = SparseTextEmbedding(model_name=model_name, cache_dir=cache_dir, threads=threads)

    def embed(self, data: List[str], **kwargs) -> List[Dict[str, np.ndarray]]:
        """
        Embed each text in *data* and return one sparse embedding per text.

        :param data: The texts to embed (one string per input).
        :returns: One dict per text; SparseEmbedding.as_object() produces a
            dict of numpy arrays (indices/values pairs).
        """
        # The embed method returns an Iterable[SparseEmbedding]; as_object()
        # converts each one to a plain dict of numpy arrays.
        sparse_embeddings = [sparse_embedding.as_object() for sparse_embedding in self.model.embed(data, **kwargs)]
        return sparse_embeddings
@component
class FastembedDocumentSPLADEEmbedder:
    """
    FastembedDocumentSPLADEEmbedder computes Document embeddings using Fastembed SPLADE models.

    The sparse embedding of each Document is stored in the `meta["_sparse_vector"]` field
    of the Document.
    # TODO: check where to store the sparse embedding in the Document object

    Usage example:
    ```python
    # To use this component, install the "fastembed-haystack" package.
    # pip install fastembed-haystack

    from haystack_integrations.components.embedders.fastembed import FastembedDocumentSPLADEEmbedder
    from haystack.dataclasses import Document

    doc_embedder = FastembedDocumentSPLADEEmbedder(
        model="prithvida/SPLADE_PP_en_v1",
        batch_size=256,
    )

    doc_embedder.warm_up()

    # Text taken from PubMed QA Dataset (https://huggingface.co/datasets/pubmed_qa)
    document_list = [
        Document(
            content="Oxidative stress generated within inflammatory joints can produce autoimmune phenomena and joint destruction. Radical species with oxidative activity, including reactive nitrogen species, represent mediators of inflammation and cartilage damage.",
            meta={
                "pubid": "25,445,628",
                "long_answer": "yes",
            },
        ),
        Document(
            content="Plasma levels of pancreatic polypeptide (PP) rise upon food intake. Although other pancreatic islet hormones, such as insulin and glucagon, have been extensively investigated, PP secretion and actions are still poorly understood.",
            meta={
                "pubid": "25,445,712",
                "long_answer": "yes",
            },
        ),
    ]

    result = doc_embedder.run(document_list)
    print(f"Document Text: {result['documents'][0].content}")
    print(f"Document Embedding: {result['documents'][0].meta['_sparse_vector']}")
    ```
    """  # noqa: E501

    def __init__(
        self,
        model: str = "prithvida/SPLADE_PP_en_v1",
        cache_dir: Optional[str] = None,
        threads: Optional[int] = None,
        prefix: str = "",
        suffix: str = "",
        batch_size: int = 256,
        progress_bar: bool = True,
        parallel: Optional[int] = None,
        meta_fields_to_embed: Optional[List[str]] = None,
        embedding_separator: str = "\n",
    ):
        """
        Create a FastembedDocumentSPLADEEmbedder component.

        :param model: Local path or name of the model in Hugging Face's model hub,
            such as `prithvida/SPLADE_PP_en_v1`.
        :param cache_dir: The path to the cache directory.
            Can be set using the `FASTEMBED_CACHE_PATH` env variable.
            Defaults to `fastembed_cache` in the system's temp directory.
        :param threads: The number of threads single onnxruntime session can use. Defaults to None.
        :param prefix: A string to add to the beginning of each text.
        :param suffix: A string to add to the end of each text.
        :param batch_size: Number of strings to encode at once.
        :param progress_bar: If true, displays progress bar during embedding.
        :param parallel:
            If > 1, data-parallel encoding will be used, recommended for offline encoding of large datasets.
            If 0, use all available cores.
            If None, don't use data-parallel processing, use default onnxruntime threading instead.
        :param meta_fields_to_embed: List of meta fields that should be embedded along with the Document content.
        :param embedding_separator: Separator used to concatenate the meta fields to the Document content.
        """

        self.model_name = model
        self.cache_dir = cache_dir
        self.threads = threads
        self.prefix = prefix
        self.suffix = suffix
        self.batch_size = batch_size
        self.progress_bar = progress_bar
        self.parallel = parallel
        self.meta_fields_to_embed = meta_fields_to_embed or []
        self.embedding_separator = embedding_separator

    def to_dict(self) -> Dict[str, Any]:
        """
        Serializes the component to a dictionary.

        :returns:
            Dictionary with serialized data.
        """
        return default_to_dict(
            self,
            model=self.model_name,
            cache_dir=self.cache_dir,
            threads=self.threads,
            prefix=self.prefix,
            suffix=self.suffix,
            batch_size=self.batch_size,
            progress_bar=self.progress_bar,
            parallel=self.parallel,
            meta_fields_to_embed=self.meta_fields_to_embed,
            embedding_separator=self.embedding_separator,
        )

    def warm_up(self):
        """
        Initializes the component by loading the sparse embedding backend.
        Idempotent: subsequent calls reuse the already-created backend.
        """
        if not hasattr(self, "embedding_backend"):
            self.embedding_backend = _FastembedSparseEmbeddingBackendFactory.get_embedding_backend(
                model_name=self.model_name, cache_dir=self.cache_dir, threads=self.threads
            )

    def _prepare_texts_to_embed(self, documents: List[Document]) -> List[str]:
        """
        Build the text to embed for each Document: selected meta fields joined
        with the content via `embedding_separator`, wrapped in prefix/suffix.
        """
        texts_to_embed = []
        for doc in documents:
            meta_values_to_embed = [
                str(doc.meta[key]) for key in self.meta_fields_to_embed if key in doc.meta and doc.meta[key] is not None
            ]
            text_to_embed = (
                self.prefix + self.embedding_separator.join([*meta_values_to_embed, doc.content or ""]) + self.suffix
            )
            texts_to_embed.append(text_to_embed)
        return texts_to_embed

    @component.output_types(documents=List[Document])
    def run(self, documents: List[Document]):
        """
        Embeds a list of Documents.

        :param documents: List of Documents to embed.
        :returns: A dictionary with the following keys:
            - `documents`: List of Documents, each with its sparse embedding
              stored in the `meta["_sparse_vector"]` field.
        :raises TypeError: If the input is not a list of Documents.
        :raises RuntimeError: If the embedding model has not been loaded.
        """
        if not isinstance(documents, list) or documents and not isinstance(documents[0], Document):
            # Class name must match the tests, which assert on this exact message.
            msg = (
                "FastembedDocumentSPLADEEmbedder expects a list of Documents as input. "
                "In case you want to embed a list of strings, please use the FastembedTextSPLADEEmbedder."
            )
            raise TypeError(msg)
        if not hasattr(self, "embedding_backend"):
            msg = "The embedding model has not been loaded. Please call warm_up() before running."
            raise RuntimeError(msg)

        # TODO: once non textual Documents are properly supported, we should also prepare them for embedding here

        texts_to_embed = self._prepare_texts_to_embed(documents=documents)
        embeddings = self.embedding_backend.embed(
            texts_to_embed,
            batch_size=self.batch_size,
            show_progress_bar=self.progress_bar,
            parallel=self.parallel,
        )

        for doc, emb in zip(documents, embeddings):
            # TODO: check where to store the sparse embedding in the Document object
            doc.meta["_sparse_vector"] = emb
        return {"documents": documents}
@component
class FastembedTextSPLADEEmbedder:
    """
    FastembedTextSPLADEEmbedder computes string embedding using fastembed SPLADE models.

    Usage example:
    ```python
    # To use this component, install the "fastembed-haystack" package.
    # pip install fastembed-haystack

    from haystack_integrations.components.embedders.fastembed import FastembedTextSPLADEEmbedder

    text = "It clearly says online this will work on a Mac OS system. The disk comes and it does not, only Windows. Do Not order this if you have a Mac!!"

    text_embedder = FastembedTextSPLADEEmbedder(
        model="prithvida/SPLADE_PP_en_v1"
    )
    text_embedder.warm_up()

    embedding = text_embedder.run(text)["embedding"]
    ```
    """  # noqa: E501

    def __init__(
        self,
        model: str = "prithvida/SPLADE_PP_en_v1",
        cache_dir: Optional[str] = None,
        threads: Optional[int] = None,
        prefix: str = "",
        suffix: str = "",
        batch_size: int = 256,
        progress_bar: bool = True,
        parallel: Optional[int] = None,
    ):
        """
        Create a FastembedTextSPLADEEmbedder component.

        :param model: Local path or name of the model in Fastembed's model hub, such as `prithvida/SPLADE_PP_en_v1`
        :param cache_dir: The path to the cache directory.
            Can be set using the `FASTEMBED_CACHE_PATH` env variable.
            Defaults to `fastembed_cache` in the system's temp directory.
        :param threads: The number of threads single onnxruntime session can use. Defaults to None.
        :param prefix: A string to add to the beginning of each text.
        :param suffix: A string to add to the end of each text.
        :param batch_size: Number of strings to encode at once.
        :param progress_bar: If true, displays progress bar during embedding.
        :param parallel:
            If > 1, data-parallel encoding will be used, recommended for offline encoding of large datasets.
            If 0, use all available cores.
            If None, don't use data-parallel processing, use default onnxruntime threading instead.
        """

        self.model_name = model
        self.cache_dir = cache_dir
        self.threads = threads
        self.prefix = prefix
        self.suffix = suffix
        self.batch_size = batch_size
        self.progress_bar = progress_bar
        self.parallel = parallel

    def to_dict(self) -> Dict[str, Any]:
        """
        Serializes the component to a dictionary.

        :returns:
            Dictionary with serialized data.
        """
        return default_to_dict(
            self,
            model=self.model_name,
            cache_dir=self.cache_dir,
            threads=self.threads,
            prefix=self.prefix,
            suffix=self.suffix,
            batch_size=self.batch_size,
            progress_bar=self.progress_bar,
            parallel=self.parallel,
        )

    def warm_up(self):
        """
        Initializes the component by loading the sparse embedding backend.
        Idempotent: subsequent calls reuse the already-created backend.
        """
        if not hasattr(self, "embedding_backend"):
            self.embedding_backend = _FastembedSparseEmbeddingBackendFactory.get_embedding_backend(
                model_name=self.model_name, cache_dir=self.cache_dir, threads=self.threads
            )

    # NOTE(review): the declared output type is kept for backward compatibility,
    # but the backend produces one sparse-embedding object (a dict of numpy
    # arrays) per input text — confirm the intended output contract.
    @component.output_types(embedding=List[Dict[str, np.ndarray]])
    def run(self, text: str):
        """
        Embeds text using the Fastembed model.

        :param text: A string to embed.
        :returns: A dictionary with the following keys:
            - `embedding`: The sparse embedding of the input text, as produced
              by the backend (SparseEmbedding.as_object()).
        :raises TypeError: If the input is not a string.
        :raises RuntimeError: If the embedding model has not been loaded.
        """
        if not isinstance(text, str):
            msg = (
                "FastembedTextSPLADEEmbedder expects a string as input. "
                "In case you want to embed a list of Documents, please use the FastembedDocumentSPLADEEmbedder."
            )
            raise TypeError(msg)
        if not hasattr(self, "embedding_backend"):
            msg = "The embedding model has not been loaded. Please call warm_up() before running."
            raise RuntimeError(msg)

        text_to_embed = [self.prefix + text + self.suffix]
        # Take the first (and only) result as-is. Do NOT wrap it in list():
        # the sparse result is a dict, and list(dict) would silently reduce it
        # to its keys, discarding the embedding values.
        embedding = self.embedding_backend.embed(
            text_to_embed,
            batch_size=self.batch_size,
            show_progress_bar=self.progress_bar,
            parallel=self.parallel,
        )[0]
        return {"embedding": embedding}
class TestFastembedDocumentSPLADEEmbedderDoc:
    def test_init_default(self):
        """
        Test default initialization parameters for FastembedDocumentSPLADEEmbedder.
        """
        embedder = FastembedDocumentSPLADEEmbedder(model="prithvida/SPLADE_PP_en_v1")
        assert embedder.model_name == "prithvida/SPLADE_PP_en_v1"
        assert embedder.cache_dir is None
        assert embedder.threads is None
        assert embedder.prefix == ""
        assert embedder.suffix == ""
        assert embedder.batch_size == 256
        assert embedder.progress_bar is True
        assert embedder.parallel is None
        assert embedder.meta_fields_to_embed == []
        assert embedder.embedding_separator == "\n"

    def test_init_with_parameters(self):
        """
        Test custom initialization parameters for FastembedDocumentSPLADEEmbedder.
        """
        embedder = FastembedDocumentSPLADEEmbedder(
            model="prithvida/SPLADE_PP_en_v1",
            cache_dir="fake_dir",
            threads=2,
            prefix="prefix",
            suffix="suffix",
            batch_size=64,
            progress_bar=False,
            parallel=1,
            meta_fields_to_embed=["test_field"],
            embedding_separator=" | ",
        )
        assert embedder.model_name == "prithvida/SPLADE_PP_en_v1"
        assert embedder.cache_dir == "fake_dir"
        assert embedder.threads == 2
        assert embedder.prefix == "prefix"
        assert embedder.suffix == "suffix"
        assert embedder.batch_size == 64
        assert embedder.progress_bar is False
        assert embedder.parallel == 1
        assert embedder.meta_fields_to_embed == ["test_field"]
        assert embedder.embedding_separator == " | "

    def test_to_dict(self):
        """
        Test serialization of FastembedDocumentSPLADEEmbedder to a dictionary, using default initialization parameters.
        """
        embedder = FastembedDocumentSPLADEEmbedder(model="prithvida/SPLADE_PP_en_v1")
        embedder_dict = embedder.to_dict()
        assert embedder_dict == {
            "type": "haystack_integrations.components.embedders.fastembed.fastembed_document_SPLADE_embedder.FastembedDocumentSPLADEEmbedder",  # noqa
            "init_parameters": {
                "model": "prithvida/SPLADE_PP_en_v1",
                "cache_dir": None,
                "threads": None,
                "prefix": "",
                "suffix": "",
                "batch_size": 256,
                "progress_bar": True,
                "parallel": None,
                "embedding_separator": "\n",
                "meta_fields_to_embed": [],
            },
        }

    def test_to_dict_with_custom_init_parameters(self):
        """
        Test serialization of FastembedDocumentSPLADEEmbedder to a dictionary, using custom initialization parameters.
        """
        embedder = FastembedDocumentSPLADEEmbedder(
            model="prithvida/SPLADE_PP_en_v1",
            cache_dir="fake_dir",
            threads=2,
            prefix="prefix",
            suffix="suffix",
            batch_size=64,
            progress_bar=False,
            parallel=1,
            meta_fields_to_embed=["test_field"],
            embedding_separator=" | ",
        )
        embedder_dict = embedder.to_dict()
        assert embedder_dict == {
            "type": "haystack_integrations.components.embedders.fastembed.fastembed_document_SPLADE_embedder.FastembedDocumentSPLADEEmbedder",  # noqa
            "init_parameters": {
                "model": "prithvida/SPLADE_PP_en_v1",
                "cache_dir": "fake_dir",
                "threads": 2,
                "prefix": "prefix",
                "suffix": "suffix",
                "batch_size": 64,
                "progress_bar": False,
                "parallel": 1,
                "meta_fields_to_embed": ["test_field"],
                "embedding_separator": " | ",
            },
        }

    def test_from_dict(self):
        """
        Test deserialization of FastembedDocumentSPLADEEmbedder from a dictionary, using default initialization
        parameters.
        """
        embedder_dict = {
            "type": "haystack_integrations.components.embedders.fastembed.fastembed_document_SPLADE_embedder.FastembedDocumentSPLADEEmbedder",  # noqa
            "init_parameters": {
                "model": "prithvida/SPLADE_PP_en_v1",
                "cache_dir": None,
                "threads": None,
                "prefix": "",
                "suffix": "",
                "batch_size": 256,
                "progress_bar": True,
                "parallel": None,
                "meta_fields_to_embed": [],
                "embedding_separator": "\n",
            },
        }
        embedder = default_from_dict(FastembedDocumentSPLADEEmbedder, embedder_dict)
        assert embedder.model_name == "prithvida/SPLADE_PP_en_v1"
        assert embedder.cache_dir is None
        assert embedder.threads is None
        assert embedder.prefix == ""
        assert embedder.suffix == ""
        assert embedder.batch_size == 256
        assert embedder.progress_bar is True
        assert embedder.parallel is None
        assert embedder.meta_fields_to_embed == []
        assert embedder.embedding_separator == "\n"

    def test_from_dict_with_custom_init_parameters(self):
        """
        Test deserialization of FastembedDocumentSPLADEEmbedder from a dictionary, using custom initialization
        parameters.
        """
        embedder_dict = {
            "type": "haystack_integrations.components.embedders.fastembed.fastembed_document_SPLADE_embedder.FastembedDocumentSPLADEEmbedder",  # noqa
            "init_parameters": {
                "model": "prithvida/SPLADE_PP_en_v1",
                "cache_dir": "fake_dir",
                "threads": 2,
                "prefix": "prefix",
                "suffix": "suffix",
                "batch_size": 64,
                "progress_bar": False,
                "parallel": 1,
                "meta_fields_to_embed": ["test_field"],
                "embedding_separator": " | ",
            },
        }
        embedder = default_from_dict(FastembedDocumentSPLADEEmbedder, embedder_dict)
        assert embedder.model_name == "prithvida/SPLADE_PP_en_v1"
        assert embedder.cache_dir == "fake_dir"
        assert embedder.threads == 2
        assert embedder.prefix == "prefix"
        assert embedder.suffix == "suffix"
        assert embedder.batch_size == 64
        assert embedder.progress_bar is False
        assert embedder.parallel == 1
        assert embedder.meta_fields_to_embed == ["test_field"]
        assert embedder.embedding_separator == " | "

    @patch(
        "haystack_integrations.components.embedders.fastembed.fastembed_document_SPLADE_embedder._FastembedSparseEmbeddingBackendFactory"
    )
    def test_warmup(self, mocked_factory):
        """
        Test for checking embedder instances after warm-up.
        """
        embedder = FastembedDocumentSPLADEEmbedder(model="prithvida/SPLADE_PP_en_v1")
        mocked_factory.get_embedding_backend.assert_not_called()
        embedder.warm_up()
        mocked_factory.get_embedding_backend.assert_called_once_with(
            model_name="prithvida/SPLADE_PP_en_v1", cache_dir=None, threads=None
        )

    @patch(
        "haystack_integrations.components.embedders.fastembed.fastembed_document_SPLADE_embedder._FastembedSparseEmbeddingBackendFactory"
    )
    def test_warmup_does_not_reload(self, mocked_factory):
        """
        Test for checking backend instances after multiple warm-ups.
        """
        embedder = FastembedDocumentSPLADEEmbedder(model="prithvida/SPLADE_PP_en_v1")
        mocked_factory.get_embedding_backend.assert_not_called()
        embedder.warm_up()
        embedder.warm_up()
        mocked_factory.get_embedding_backend.assert_called_once()

    def test_embed(self):
        """
        Test for checking output dimensions and embedding dimensions.
        """
        embedder = FastembedDocumentSPLADEEmbedder(model="prithvida/SPLADE_PP_en_v1")
        embedder.embedding_backend = MagicMock()
        # TODO adapt for sparse
        embedder.embedding_backend.embed = lambda x, **kwargs: np.random.rand(len(x), 3).tolist()  # noqa: ARG005

        documents = [Document(content=f"Sample-document text {i}") for i in range(5)]

        result = embedder.run(documents=documents)

        assert isinstance(result["documents"], list)
        assert len(result["documents"]) == len(documents)
        for doc in result["documents"]:
            assert isinstance(doc, Document)
            # TODO adapt for sparse
            assert isinstance(doc.meta["_sparse_vector"], list)
            assert isinstance(doc.meta["_sparse_vector"][0], float)

    def test_embed_incorrect_input_format(self):
        """
        Test for checking incorrect input format when creating embedding.
        """
        embedder = FastembedDocumentSPLADEEmbedder(model="prithvida/SPLADE_PP_en_v1")

        string_input = "text"
        list_integers_input = [1, 2, 3]

        with pytest.raises(
            TypeError,
            match="FastembedDocumentSPLADEEmbedder expects a list of Documents as input.",
        ):
            embedder.run(documents=string_input)

        with pytest.raises(
            TypeError,
            match="FastembedDocumentSPLADEEmbedder expects a list of Documents as input.",
        ):
            embedder.run(documents=list_integers_input)

    def test_embed_metadata(self):
        """
        Test for checking output dimensions and embedding dimensions for documents
        with a custom instruction and metadata.
        """
        embedder = FastembedDocumentSPLADEEmbedder(
            model="model",
            meta_fields_to_embed=["meta_field"],
            embedding_separator="\n",
        )
        embedder.embedding_backend = MagicMock()

        documents = [Document(content=f"document-number {i}", meta={"meta_field": f"meta_value {i}"}) for i in range(5)]

        embedder.run(documents=documents)

        embedder.embedding_backend.embed.assert_called_once_with(
            [
                "meta_value 0\ndocument-number 0",
                "meta_value 1\ndocument-number 1",
                "meta_value 2\ndocument-number 2",
                "meta_value 3\ndocument-number 3",
                "meta_value 4\ndocument-number 4",
            ],
            batch_size=256,
            show_progress_bar=True,
            parallel=None,
        )

    @pytest.mark.integration
    def test_run(self):
        embedder = FastembedDocumentSPLADEEmbedder(
            model="prithvida/SPLADE_PP_en_v1",
        )
        embedder.warm_up()

        doc = Document(content="Parton energy loss in QCD matter")

        result = embedder.run(documents=[doc])
        # The component stores the sparse embedding in meta["_sparse_vector"],
        # not in Document.embedding (which would be None here). The value is a
        # SparseEmbedding.as_object() dict with "indices" and "values" arrays,
        # so a dense 384-float assertion does not apply.
        sparse_vector = result["documents"][0].meta["_sparse_vector"]
        assert isinstance(sparse_vector, dict)
        assert "indices" in sparse_vector
        assert "values" in sparse_vector
        assert len(sparse_vector["indices"]) == len(sparse_vector["values"])
        assert len(sparse_vector["values"]) > 0
class TestFastembedTextSPLADEEmbedder:
    def test_init_default(self):
        """
        Test default initialization parameters for FastembedTextSPLADEEmbedder.
        """
        embedder = FastembedTextSPLADEEmbedder(model="prithvida/SPLADE_PP_en_v1")
        assert embedder.model_name == "prithvida/SPLADE_PP_en_v1"
        assert embedder.cache_dir is None
        assert embedder.threads is None
        assert embedder.prefix == ""
        assert embedder.suffix == ""
        assert embedder.batch_size == 256
        assert embedder.progress_bar is True
        assert embedder.parallel is None

    def test_init_with_parameters(self):
        """
        Test custom initialization parameters for FastembedTextSPLADEEmbedder.
        """
        embedder = FastembedTextSPLADEEmbedder(
            model="prithvida/SPLADE_PP_en_v1",
            cache_dir="fake_dir",
            threads=2,
            prefix="prefix",
            suffix="suffix",
            batch_size=64,
            progress_bar=False,
            parallel=1,
        )
        assert embedder.model_name == "prithvida/SPLADE_PP_en_v1"
        assert embedder.cache_dir == "fake_dir"
        assert embedder.threads == 2
        assert embedder.prefix == "prefix"
        assert embedder.suffix == "suffix"
        assert embedder.batch_size == 64
        assert embedder.progress_bar is False
        assert embedder.parallel == 1

    def test_to_dict(self):
        """
        Test serialization of FastembedTextSPLADEEmbedder to a dictionary, using default initialization parameters.
        """
        embedder = FastembedTextSPLADEEmbedder(model="prithvida/SPLADE_PP_en_v1")
        embedder_dict = embedder.to_dict()
        assert embedder_dict == {
            "type": "haystack_integrations.components.embedders.fastembed.fastembed_text_SPLADE_embedder.FastembedTextSPLADEEmbedder",  # noqa
            "init_parameters": {
                "model": "prithvida/SPLADE_PP_en_v1",
                "cache_dir": None,
                "threads": None,
                "prefix": "",
                "suffix": "",
                "batch_size": 256,
                "progress_bar": True,
                "parallel": None,
            },
        }

    def test_to_dict_with_custom_init_parameters(self):
        """
        Test serialization of FastembedTextSPLADEEmbedder to a dictionary, using custom initialization parameters.
        """
        embedder = FastembedTextSPLADEEmbedder(
            model="prithvida/SPLADE_PP_en_v1",
            cache_dir="fake_dir",
            threads=2,
            prefix="prefix",
            suffix="suffix",
            batch_size=64,
            progress_bar=False,
            parallel=1,
        )
        embedder_dict = embedder.to_dict()
        assert embedder_dict == {
            "type": "haystack_integrations.components.embedders.fastembed.fastembed_text_SPLADE_embedder.FastembedTextSPLADEEmbedder",  # noqa
            "init_parameters": {
                "model": "prithvida/SPLADE_PP_en_v1",
                "cache_dir": "fake_dir",
                "threads": 2,
                "prefix": "prefix",
                "suffix": "suffix",
                "batch_size": 64,
                "progress_bar": False,
                "parallel": 1,
            },
        }

    def test_from_dict(self):
        """
        Test deserialization of FastembedTextSPLADEEmbedder from a dictionary, using default initialization parameters.
        """
        embedder_dict = {
            "type": "haystack_integrations.components.embedders.fastembed.fastembed_text_SPLADE_embedder.FastembedTextSPLADEEmbedder",  # noqa
            "init_parameters": {
                "model": "prithvida/SPLADE_PP_en_v1",
                "cache_dir": None,
                "threads": None,
                "prefix": "",
                "suffix": "",
                "batch_size": 256,
                "progress_bar": True,
                "parallel": None,
            },
        }
        embedder = default_from_dict(FastembedTextSPLADEEmbedder, embedder_dict)
        assert embedder.model_name == "prithvida/SPLADE_PP_en_v1"
        assert embedder.cache_dir is None
        assert embedder.threads is None
        assert embedder.prefix == ""
        assert embedder.suffix == ""
        assert embedder.batch_size == 256
        assert embedder.progress_bar is True
        assert embedder.parallel is None

    def test_from_dict_with_custom_init_parameters(self):
        """
        Test deserialization of FastembedTextSPLADEEmbedder from a dictionary, using custom initialization parameters.
        """
        embedder_dict = {
            "type": "haystack_integrations.components.embedders.fastembed.fastembed_text_SPLADE_embedder.FastembedTextSPLADEEmbedder",  # noqa
            "init_parameters": {
                "model": "prithvida/SPLADE_PP_en_v1",
                "cache_dir": "fake_dir",
                "threads": 2,
                "prefix": "prefix",
                "suffix": "suffix",
                "batch_size": 64,
                "progress_bar": False,
                "parallel": 1,
            },
        }
        embedder = default_from_dict(FastembedTextSPLADEEmbedder, embedder_dict)
        assert embedder.model_name == "prithvida/SPLADE_PP_en_v1"
        assert embedder.cache_dir == "fake_dir"
        assert embedder.threads == 2
        assert embedder.prefix == "prefix"
        assert embedder.suffix == "suffix"
        assert embedder.batch_size == 64
        assert embedder.progress_bar is False
        assert embedder.parallel == 1

    @patch(
        "haystack_integrations.components.embedders.fastembed.fastembed_text_SPLADE_embedder._FastembedSparseEmbeddingBackendFactory"
    )
    def test_warmup(self, mocked_factory):
        """
        Test for checking embedder instances after warm-up.
        """
        embedder = FastembedTextSPLADEEmbedder(model="prithvida/SPLADE_PP_en_v1")
        mocked_factory.get_embedding_backend.assert_not_called()
        embedder.warm_up()
        mocked_factory.get_embedding_backend.assert_called_once_with(
            model_name="prithvida/SPLADE_PP_en_v1", cache_dir=None, threads=None
        )

    @patch(
        "haystack_integrations.components.embedders.fastembed.fastembed_text_SPLADE_embedder._FastembedSparseEmbeddingBackendFactory"
    )
    def test_warmup_does_not_reload(self, mocked_factory):
        """
        Test for checking backend instances after multiple warm-ups.
        """
        embedder = FastembedTextSPLADEEmbedder(model="prithvida/SPLADE_PP_en_v1")
        mocked_factory.get_embedding_backend.assert_not_called()
        embedder.warm_up()
        embedder.warm_up()
        mocked_factory.get_embedding_backend.assert_called_once()

    def test_embed(self):
        """
        Test for checking output dimensions and embedding dimensions.
        """
        # Use the SPLADE model name for consistency: this is a SPLADE embedder
        # test, not a dense BGE one (the backend is mocked either way).
        embedder = FastembedTextSPLADEEmbedder(model="prithvida/SPLADE_PP_en_v1")
        embedder.embedding_backend = MagicMock()
        # TODO adapt to sparse
        embedder.embedding_backend.embed = lambda x, **kwargs: np.random.rand(len(x), 16).tolist()  # noqa: ARG005

        text = "Good text to embed"

        result = embedder.run(text=text)
        embedding = result["embedding"]
        # TODO adapt to sparse
        assert isinstance(embedding, list)
        assert all(isinstance(emb, float) for emb in embedding)

    def test_run_wrong_incorrect_format(self):
        """
        Test for checking incorrect input format when creating embedding.
        """
        embedder = FastembedTextSPLADEEmbedder(model="prithvida/SPLADE_PP_en_v1")
        embedder.embedding_backend = MagicMock()

        list_integers_input = [1, 2, 3]

        with pytest.raises(TypeError, match="FastembedTextSPLADEEmbedder expects a string as input"):
            embedder.run(text=list_integers_input)

    @pytest.mark.integration
    def test_run(self):
        embedder = FastembedTextSPLADEEmbedder(
            model="prithvida/SPLADE_PP_en_v1",
        )
        embedder.warm_up()

        text = "Parton energy loss in QCD matter"

        result = embedder.run(text=text)
        embedding = result["embedding"]
        # A SPLADE embedding is sparse: a SparseEmbedding.as_object() dict with
        # "indices" and "values" arrays, not a dense 384-float list.
        assert isinstance(embedding, dict)
        assert "indices" in embedding
        assert "values" in embedding
        assert len(embedding["indices"]) == len(embedding["values"])
        assert len(embedding["values"]) > 0
--- .../document_stores/opensearch/document_store.py | 3 --- 1 file changed, 3 deletions(-) diff --git a/integrations/opensearch/src/haystack_integrations/document_stores/opensearch/document_store.py b/integrations/opensearch/src/haystack_integrations/document_stores/opensearch/document_store.py index 768da7528..e9c88274c 100644 --- a/integrations/opensearch/src/haystack_integrations/document_stores/opensearch/document_store.py +++ b/integrations/opensearch/src/haystack_integrations/document_stores/opensearch/document_store.py @@ -181,9 +181,6 @@ def write_documents(self, documents: List[Document], policy: DuplicatePolicy = D duplicate_errors_ids = [] other_errors = [] for e in errors: - if "create" not in e: - other_errors.append(e) - continue error_type = e["create"]["error"]["type"] if policy == DuplicatePolicy.FAIL and error_type == "version_conflict_engine_exception": duplicate_errors_ids.append(e["create"]["_id"]) From 62d84780b447f2703a9c38ec8d4b171f13b09eaa Mon Sep 17 00:00:00 2001 From: Corentin Meyer Date: Wed, 13 Mar 2024 16:54:29 +0100 Subject: [PATCH 04/32] feat(FastEmbed): __all__ fix --- .../components/embedders/fastembed/__init__.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/integrations/fastembed/src/haystack_integrations/components/embedders/fastembed/__init__.py b/integrations/fastembed/src/haystack_integrations/components/embedders/fastembed/__init__.py index a4490a24f..fa1ae6043 100644 --- a/integrations/fastembed/src/haystack_integrations/components/embedders/fastembed/__init__.py +++ b/integrations/fastembed/src/haystack_integrations/components/embedders/fastembed/__init__.py @@ -4,5 +4,6 @@ from .fastembed_document_embedder import FastembedDocumentEmbedder from .fastembed_text_embedder import FastembedTextEmbedder from .fastembed_document_SPLADE_embedder import FastembedDocumentSPLADEEmbedder +from .fastembed_text_SPLADE_embedder import FastembedTextSPLADEEmbedder -__all__ = ["FastembedDocumentEmbedder", 
"FastembedTextEmbedder", "FastembedDocumentSPLADEEmbedder"] +__all__ = ["FastembedDocumentEmbedder", "FastembedTextEmbedder", "FastembedDocumentSPLADEEmbedder", "FastembedTextSPLADEEmbedder"] From 0e0968a1a9da5f25a8c06c72d857db72199b629b Mon Sep 17 00:00:00 2001 From: Corentin Meyer Date: Wed, 13 Mar 2024 17:09:01 +0100 Subject: [PATCH 05/32] feat(FastEmbed): fix one test --- .../embedders/fastembed/fastembed_document_SPLADE_embedder.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/integrations/fastembed/src/haystack_integrations/components/embedders/fastembed/fastembed_document_SPLADE_embedder.py b/integrations/fastembed/src/haystack_integrations/components/embedders/fastembed/fastembed_document_SPLADE_embedder.py index df24a2684..df23e5e3d 100644 --- a/integrations/fastembed/src/haystack_integrations/components/embedders/fastembed/fastembed_document_SPLADE_embedder.py +++ b/integrations/fastembed/src/haystack_integrations/components/embedders/fastembed/fastembed_document_SPLADE_embedder.py @@ -151,7 +151,7 @@ def run(self, documents: List[Document]): """ if not isinstance(documents, list) or documents and not isinstance(documents[0], Document): msg = ( - "FastembedDocumentEmbedder expects a list of Documents as input. " + "FastembedDocumentSPLADEEmbedder expects a list of Documents as input. " "In case you want to embed a list of strings, please use the FastembedTextEmbedder." 
) raise TypeError(msg) From 1feea082fb27d6a28970a4624d637791b7cbbb97 Mon Sep 17 00:00:00 2001 From: Corentin Meyer Date: Wed, 13 Mar 2024 17:33:06 +0100 Subject: [PATCH 06/32] feat(FastEmbed): fix one test --- .../fastembed_document_SPLADE_embedder.py | 1 - ...test_fastembed_document_SPLADE_embedder.py | 21 +++++++++++++++---- 2 files changed, 17 insertions(+), 5 deletions(-) diff --git a/integrations/fastembed/src/haystack_integrations/components/embedders/fastembed/fastembed_document_SPLADE_embedder.py b/integrations/fastembed/src/haystack_integrations/components/embedders/fastembed/fastembed_document_SPLADE_embedder.py index df23e5e3d..3b52c40f1 100644 --- a/integrations/fastembed/src/haystack_integrations/components/embedders/fastembed/fastembed_document_SPLADE_embedder.py +++ b/integrations/fastembed/src/haystack_integrations/components/embedders/fastembed/fastembed_document_SPLADE_embedder.py @@ -171,5 +171,4 @@ def run(self, documents: List[Document]): for doc, emb in zip(documents, embeddings): doc.meta["_sparse_vector"] = emb - print(doc) return {"documents": documents} diff --git a/integrations/fastembed/tests/test_fastembed_document_SPLADE_embedder.py b/integrations/fastembed/tests/test_fastembed_document_SPLADE_embedder.py index 2021b0df7..c2b811cbc 100644 --- a/integrations/fastembed/tests/test_fastembed_document_SPLADE_embedder.py +++ b/integrations/fastembed/tests/test_fastembed_document_SPLADE_embedder.py @@ -196,14 +196,24 @@ def test_warmup_does_not_reload(self, mocked_factory): embedder.warm_up() mocked_factory.get_embedding_backend.assert_called_once() + def _generate_mocked_sparse_embedding(self, n): + list_of_sparse_vectors = [] + for _ in range(n): + random_indice_length = np.random.randint(0, 20) + data = { + "indices": [i for i in range(random_indice_length)], + "values": [np.random.random_sample() for _ in range(random_indice_length)] + } + list_of_sparse_vectors.append(data) + return list_of_sparse_vectors + def test_embed(self): """ Test 
for checking output dimensions and embedding dimensions. """ embedder = FastembedDocumentSPLADEEmbedder(model="prithvida/SPLADE_PP_en_v1") embedder.embedding_backend = MagicMock() - # TODO adapt for sparse - embedder.embedding_backend.embed = lambda x, **kwargs: np.random.rand(len(x), 3).tolist() # noqa: ARG005 + embedder.embedding_backend.embed = lambda x, **kwargs: self._generate_mocked_sparse_embedding(len(x)) # noqa: ARG005 documents = [Document(content=f"Sample-document text {i}") for i in range(5)] @@ -214,8 +224,11 @@ def test_embed(self): for doc in result["documents"]: assert isinstance(doc, Document) # TODO adapt for sparse - assert isinstance(doc.meta["_sparse_vector"], list) - assert isinstance(doc.meta["_sparse_vector"][0], float) + assert isinstance(doc.meta["_sparse_vector"], dict) + assert isinstance(doc.meta["_sparse_vector"]["indices"], list) + assert isinstance(doc.meta["_sparse_vector"]["indices"][0], int) + assert isinstance(doc.meta["_sparse_vector"]["values"], list) + assert isinstance(doc.meta["_sparse_vector"]["values"][0], float) def test_embed_incorrect_input_format(self): """ From e1c56022fe423ff83cdbf73358b3abd446efa402 Mon Sep 17 00:00:00 2001 From: Corentin Meyer Date: Wed, 13 Mar 2024 17:40:07 +0100 Subject: [PATCH 07/32] feat(FastEmbed): fix a second test --- .../fastembed_text_SPLADE_embedder.py | 5 +--- .../test_fastembed_text_SPLADE_embedder.py | 23 +++++++++++++++---- 2 files changed, 20 insertions(+), 8 deletions(-) diff --git a/integrations/fastembed/src/haystack_integrations/components/embedders/fastembed/fastembed_text_SPLADE_embedder.py b/integrations/fastembed/src/haystack_integrations/components/embedders/fastembed/fastembed_text_SPLADE_embedder.py index 517fb5ca9..ef8c73803 100644 --- a/integrations/fastembed/src/haystack_integrations/components/embedders/fastembed/fastembed_text_SPLADE_embedder.py +++ b/integrations/fastembed/src/haystack_integrations/components/embedders/fastembed/fastembed_text_SPLADE_embedder.py @@ 
-116,13 +116,10 @@ def run(self, text: str): raise RuntimeError(msg) text_to_embed = [self.prefix + text + self.suffix] - embedding = list( - self.embedding_backend.embed( + embedding = self.embedding_backend.embed( text_to_embed, batch_size=self.batch_size, show_progress_bar=self.progress_bar, parallel=self.parallel, )[0] - ) - print(embedding) return {"embedding": embedding} diff --git a/integrations/fastembed/tests/test_fastembed_text_SPLADE_embedder.py b/integrations/fastembed/tests/test_fastembed_text_SPLADE_embedder.py index f88c5c2a7..7cc5ad573 100644 --- a/integrations/fastembed/tests/test_fastembed_text_SPLADE_embedder.py +++ b/integrations/fastembed/tests/test_fastembed_text_SPLADE_embedder.py @@ -176,22 +176,37 @@ def test_warmup_does_not_reload(self, mocked_factory): embedder.warm_up() mocked_factory.get_embedding_backend.assert_called_once() + def _generate_mocked_sparse_embedding(self, n): + list_of_sparse_vectors = [] + for _ in range(n): + random_indice_length = np.random.randint(0, 20) + data = { + "indices": [i for i in range(random_indice_length)], + "values": [np.random.random_sample() for _ in range(random_indice_length)] + } + list_of_sparse_vectors.append(data) + + return list_of_sparse_vectors + def test_embed(self): """ Test for checking output dimensions and embedding dimensions. 
""" embedder = FastembedTextSPLADEEmbedder(model="BAAI/bge-base-en-v1.5") embedder.embedding_backend = MagicMock() - # TODO adapt to sparse - embedder.embedding_backend.embed = lambda x, **kwargs: np.random.rand(len(x), 16).tolist() # noqa: ARG005 + embedder.embedding_backend.embed = lambda x, **kwargs: self._generate_mocked_sparse_embedding( + len(x)) # noqa: ARG005 text = "Good text to embed" result = embedder.run(text=text) embedding = result["embedding"] # TODO adapt to sparse - assert isinstance(embedding, list) - assert all(isinstance(emb, float) for emb in embedding) + assert isinstance(embedding, dict) + assert isinstance(embedding["indices"], list) + assert isinstance(embedding["indices"][0], int) + assert isinstance(embedding["values"], list) + assert isinstance(embedding["values"][0], float) def test_run_wrong_incorrect_format(self): """ From a9b3827bc16fb54fdc62bf359d8ef98ee70ee550 Mon Sep 17 00:00:00 2001 From: Corentin Meyer Date: Wed, 13 Mar 2024 17:45:05 +0100 Subject: [PATCH 08/32] feat(FastEmbed): removed old TODO (fixed) --- .../embedders/fastembed/fastembed_document_SPLADE_embedder.py | 2 -- .../embedders/fastembed/fastembed_document_embedder.py | 2 -- .../fastembed/tests/test_fastembed_document_SPLADE_embedder.py | 2 -- .../fastembed/tests/test_fastembed_text_SPLADE_embedder.py | 1 - 4 files changed, 7 deletions(-) diff --git a/integrations/fastembed/src/haystack_integrations/components/embedders/fastembed/fastembed_document_SPLADE_embedder.py b/integrations/fastembed/src/haystack_integrations/components/embedders/fastembed/fastembed_document_SPLADE_embedder.py index 3b52c40f1..ef5e79441 100644 --- a/integrations/fastembed/src/haystack_integrations/components/embedders/fastembed/fastembed_document_SPLADE_embedder.py +++ b/integrations/fastembed/src/haystack_integrations/components/embedders/fastembed/fastembed_document_SPLADE_embedder.py @@ -159,8 +159,6 @@ def run(self, documents: List[Document]): msg = "The embedding model has not been 
loaded. Please call warm_up() before running." raise RuntimeError(msg) - # TODO: once non textual Documents are properly supported, we should also prepare them for embedding here - texts_to_embed = self._prepare_texts_to_embed(documents=documents) embeddings = self.embedding_backend.embed( texts_to_embed, diff --git a/integrations/fastembed/src/haystack_integrations/components/embedders/fastembed/fastembed_document_embedder.py b/integrations/fastembed/src/haystack_integrations/components/embedders/fastembed/fastembed_document_embedder.py index b5dd71231..170ac64ab 100644 --- a/integrations/fastembed/src/haystack_integrations/components/embedders/fastembed/fastembed_document_embedder.py +++ b/integrations/fastembed/src/haystack_integrations/components/embedders/fastembed/fastembed_document_embedder.py @@ -157,8 +157,6 @@ def run(self, documents: List[Document]): msg = "The embedding model has not been loaded. Please call warm_up() before running." raise RuntimeError(msg) - # TODO: once non textual Documents are properly supported, we should also prepare them for embedding here - texts_to_embed = self._prepare_texts_to_embed(documents=documents) embeddings = self.embedding_backend.embed( texts_to_embed, diff --git a/integrations/fastembed/tests/test_fastembed_document_SPLADE_embedder.py b/integrations/fastembed/tests/test_fastembed_document_SPLADE_embedder.py index c2b811cbc..e90fae658 100644 --- a/integrations/fastembed/tests/test_fastembed_document_SPLADE_embedder.py +++ b/integrations/fastembed/tests/test_fastembed_document_SPLADE_embedder.py @@ -223,7 +223,6 @@ def test_embed(self): assert len(result["documents"]) == len(documents) for doc in result["documents"]: assert isinstance(doc, Document) - # TODO adapt for sparse assert isinstance(doc.meta["_sparse_vector"], dict) assert isinstance(doc.meta["_sparse_vector"]["indices"], list) assert isinstance(doc.meta["_sparse_vector"]["indices"][0], int) @@ -292,7 +291,6 @@ def test_run(self): result = 
embedder.run(documents=[doc]) # TODO adapt for sparse embedding = result["documents"][0].embedding - # TODO adapt for sparse assert isinstance(embedding, list) assert len(embedding) == 384 assert all(isinstance(emb, float) for emb in embedding) diff --git a/integrations/fastembed/tests/test_fastembed_text_SPLADE_embedder.py b/integrations/fastembed/tests/test_fastembed_text_SPLADE_embedder.py index 7cc5ad573..98e08625f 100644 --- a/integrations/fastembed/tests/test_fastembed_text_SPLADE_embedder.py +++ b/integrations/fastembed/tests/test_fastembed_text_SPLADE_embedder.py @@ -201,7 +201,6 @@ def test_embed(self): result = embedder.run(text=text) embedding = result["embedding"] - # TODO adapt to sparse assert isinstance(embedding, dict) assert isinstance(embedding["indices"], list) assert isinstance(embedding["indices"][0], int) From 69129c8cac1771814ce167c76a43348600b1d27e Mon Sep 17 00:00:00 2001 From: Corentin Meyer Date: Wed, 13 Mar 2024 21:38:29 +0100 Subject: [PATCH 09/32] feat(FastEmbed): fixing all test + doc --- integrations/fastembed/README.md | 26 +++++++++++++++++++ integrations/fastembed/pyproject.toml | 2 +- .../embedding_backend/fastembed_backend.py | 7 ++++- .../fastembed_document_SPLADE_embedder.py | 5 ++-- ...test_fastembed_document_SPLADE_embedder.py | 13 +++++----- .../test_fastembed_text_SPLADE_embedder.py | 11 ++++---- 6 files changed, 48 insertions(+), 16 deletions(-) diff --git a/integrations/fastembed/README.md b/integrations/fastembed/README.md index 5ad056af3..70c10087f 100644 --- a/integrations/fastembed/README.md +++ b/integrations/fastembed/README.md @@ -43,6 +43,32 @@ doc = Document(content="fastembed is supported by and maintained by Qdrant.", me result = embedder.run(documents=[doc]) ``` +You can use `FastembedTextSPLADEEmbedder` and `FastembedDocumentSPLADEEmbedder` by importing as: + +```python +from haystack_integrations.components.embedders.fastembed import FastembedTextSPLADEEmbedder + +text = "fastembed is supported by and 
maintained by Qdrant." +text_embedder = FastembedTextSPLADEEmbedder( + model="prithvida/SPLADE_PP_en_v1" +) +text_embedder.warm_up() +embedding = text_embedder.run(text)["embedding"] +``` + +```python +from haystack_integrations.components.embedders.fastembed import FastembedDocumentSPLADEEmbedder +from haystack.dataclasses import Document + +embedder = FastembedDocumentSPLADEEmbedder( + model="prithvida/SPLADE_PP_en_v1", +) +embedder.warm_up() +doc = Document(content="fastembed is supported by and maintained by Qdrant.", meta={"long_answer": "no",}) +result = embedder.run(documents=[doc]) +# Sparse embeddings are placed in the meta field of the document under the `_sparse_vector` key. +``` + ## License `fastembed-haystack` is distributed under the terms of the [Apache-2.0](https://spdx.org/licenses/Apache-2.0.html) license. diff --git a/integrations/fastembed/pyproject.toml b/integrations/fastembed/pyproject.toml index 21d4c7506..9b9fb9822 100644 --- a/integrations/fastembed/pyproject.toml +++ b/integrations/fastembed/pyproject.toml @@ -26,7 +26,7 @@ classifiers = [ ] dependencies = [ "haystack-ai", -"fastembed>=0.2.3", +"fastembed>=0.2.4", ] [project.urls] diff --git a/integrations/fastembed/src/haystack_integrations/components/embedders/fastembed/embedding_backend/fastembed_backend.py b/integrations/fastembed/src/haystack_integrations/components/embedders/fastembed/embedding_backend/fastembed_backend.py index 81c3c5bb5..d0a32a88c 100644 --- a/integrations/fastembed/src/haystack_integrations/components/embedders/fastembed/embedding_backend/fastembed_backend.py +++ b/integrations/fastembed/src/haystack_integrations/components/embedders/fastembed/embedding_backend/fastembed_backend.py @@ -82,6 +82,11 @@ def __init__( self.model = SparseTextEmbedding(model_name=model_name, cache_dir=cache_dir, threads=threads) def embed(self, data: List[List[str]], **kwargs) -> List[Dict[str, np.ndarray]]: - # The embed method returns a Iterable[SparseEmbedding], so we convert it to 
a list of dictionaries + # The embed method returns a Iterable[SparseEmbedding], so we convert it to a list of dictionaries. + # Each dict contains an `indices` key containing a list of int and an `values` key containing a list of floats. sparse_embeddings = [sparse_embedding.as_object() for sparse_embedding in self.model.embed(data, **kwargs)] + for embedding in sparse_embeddings: + embedding['indices'] = embedding['indices'].tolist() + embedding['values'] = embedding['values'].tolist() + return sparse_embeddings diff --git a/integrations/fastembed/src/haystack_integrations/components/embedders/fastembed/fastembed_document_SPLADE_embedder.py b/integrations/fastembed/src/haystack_integrations/components/embedders/fastembed/fastembed_document_SPLADE_embedder.py index ef5e79441..6ec71ff56 100644 --- a/integrations/fastembed/src/haystack_integrations/components/embedders/fastembed/fastembed_document_SPLADE_embedder.py +++ b/integrations/fastembed/src/haystack_integrations/components/embedders/fastembed/fastembed_document_SPLADE_embedder.py @@ -9,7 +9,7 @@ class FastembedDocumentSPLADEEmbedder: """ FastembedDocumentSPLADEEmbedder computes Document embeddings using Fastembed SPLADE models. - # TODO: check where to store the sparse embedding in the Document object + The embedding of each Document is stored in the `meta["_sparse_vector"]` field of the Document. Usage example: @@ -47,7 +47,6 @@ class FastembedDocumentSPLADEEmbedder: result = doc_embedder.run(document_list) print(f"Document Text: {result['documents'][0].content}") - # TODO: WHERE DO WE STORE THE EMBEDDINGS ? print(f"Document Embedding: {result['documents'][0].meta["_sparse_vector"]}") print(f"Embedding Dimension: {len(result['documents'][0].meta["_sparse_vector"])}") ``` @@ -147,7 +146,7 @@ def run(self, documents: List[Document]): :param documents: List of Documents to embed. 
:returns: A dictionary with the following keys: - - `documents`: List of Documents with each Document's `embedding` field set to the computed embeddings. + - `documents`: List of Documents with each Document's `meta.["_sparse_vector"]` field set to the computed embeddings. """ if not isinstance(documents, list) or documents and not isinstance(documents[0], Document): msg = ( diff --git a/integrations/fastembed/tests/test_fastembed_document_SPLADE_embedder.py b/integrations/fastembed/tests/test_fastembed_document_SPLADE_embedder.py index e90fae658..384b3d9a5 100644 --- a/integrations/fastembed/tests/test_fastembed_document_SPLADE_embedder.py +++ b/integrations/fastembed/tests/test_fastembed_document_SPLADE_embedder.py @@ -199,7 +199,7 @@ def test_warmup_does_not_reload(self, mocked_factory): def _generate_mocked_sparse_embedding(self, n): list_of_sparse_vectors = [] for _ in range(n): - random_indice_length = np.random.randint(0, 20) + random_indice_length = np.random.randint(3, 15) data = { "indices": [i for i in range(random_indice_length)], "values": [np.random.random_sample() for _ in range(random_indice_length)] @@ -289,8 +289,9 @@ def test_run(self): doc = Document(content="Parton energy loss in QCD matter") result = embedder.run(documents=[doc]) - # TODO adapt for sparse - embedding = result["documents"][0].embedding - assert isinstance(embedding, list) - assert len(embedding) == 384 - assert all(isinstance(emb, float) for emb in embedding) + embedding = result["documents"][0].meta["_sparse_vector"] + assert isinstance(embedding, dict) + assert isinstance(embedding["indices"], list) + assert isinstance(embedding["values"], list) + assert isinstance(embedding["indices"][0], int) + assert isinstance(embedding["values"][0], float) diff --git a/integrations/fastembed/tests/test_fastembed_text_SPLADE_embedder.py b/integrations/fastembed/tests/test_fastembed_text_SPLADE_embedder.py index 98e08625f..daa211561 100644 --- 
a/integrations/fastembed/tests/test_fastembed_text_SPLADE_embedder.py +++ b/integrations/fastembed/tests/test_fastembed_text_SPLADE_embedder.py @@ -179,7 +179,7 @@ def test_warmup_does_not_reload(self, mocked_factory): def _generate_mocked_sparse_embedding(self, n): list_of_sparse_vectors = [] for _ in range(n): - random_indice_length = np.random.randint(0, 20) + random_indice_length = np.random.randint(3, 15) data = { "indices": [i for i in range(random_indice_length)], "values": [np.random.random_sample() for _ in range(random_indice_length)] @@ -230,7 +230,8 @@ def test_run(self): result = embedder.run(text=text) embedding = result["embedding"] - # TODO adapt to sparse - assert isinstance(embedding, list) - assert len(embedding) == 384 - assert all(isinstance(emb, float) for emb in embedding) + assert isinstance(embedding, dict) + assert isinstance(embedding["indices"], list) + assert isinstance(embedding["values"], list) + assert isinstance(embedding["indices"][0], int) + assert isinstance(embedding["values"][0], float) From 10ea1293dc4cad23ce4718f7548de2b77a65be12 Mon Sep 17 00:00:00 2001 From: Corentin Date: Wed, 13 Mar 2024 23:38:50 +0100 Subject: [PATCH 10/32] fix output typing --- .../fastembed/embedding_backend/fastembed_backend.py | 4 ++-- .../embedders/fastembed/fastembed_text_SPLADE_embedder.py | 4 ++-- 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/integrations/fastembed/src/haystack_integrations/components/embedders/fastembed/embedding_backend/fastembed_backend.py b/integrations/fastembed/src/haystack_integrations/components/embedders/fastembed/embedding_backend/fastembed_backend.py index d0a32a88c..dbc4fdb64 100644 --- a/integrations/fastembed/src/haystack_integrations/components/embedders/fastembed/embedding_backend/fastembed_backend.py +++ b/integrations/fastembed/src/haystack_integrations/components/embedders/fastembed/embedding_backend/fastembed_backend.py @@ -1,4 +1,4 @@ -from typing import ClassVar, Dict, List, Optional +from 
typing import ClassVar, Dict, List, Optional, Union import numpy as np @@ -81,7 +81,7 @@ def __init__( ): self.model = SparseTextEmbedding(model_name=model_name, cache_dir=cache_dir, threads=threads) - def embed(self, data: List[List[str]], **kwargs) -> List[Dict[str, np.ndarray]]: + def embed(self, data: List[List[str]], **kwargs) -> List[Dict[str, Union[List[int], List[float]]]]: # The embed method returns a Iterable[SparseEmbedding], so we convert it to a list of dictionaries. # Each dict contains an `indices` key containing a list of int and an `values` key containing a list of floats. sparse_embeddings = [sparse_embedding.as_object() for sparse_embedding in self.model.embed(data, **kwargs)] diff --git a/integrations/fastembed/src/haystack_integrations/components/embedders/fastembed/fastembed_text_SPLADE_embedder.py b/integrations/fastembed/src/haystack_integrations/components/embedders/fastembed/fastembed_text_SPLADE_embedder.py index ef8c73803..33700ac12 100644 --- a/integrations/fastembed/src/haystack_integrations/components/embedders/fastembed/fastembed_text_SPLADE_embedder.py +++ b/integrations/fastembed/src/haystack_integrations/components/embedders/fastembed/fastembed_text_SPLADE_embedder.py @@ -1,4 +1,4 @@ -from typing import Any, Dict, List, Optional +from typing import Any, Dict, List, Optional, Union import numpy as np from haystack import component, default_to_dict @@ -94,7 +94,7 @@ def warm_up(self): model_name=self.model_name, cache_dir=self.cache_dir, threads=self.threads ) - @component.output_types(embedding=List[Dict[str, np.ndarray]]) + @component.output_types(embedding=List[Dict[str, Union[List[int], List[float]]]]) def run(self, text: str): """ Embeds text using the Fastembed model. 
From 8e20cee84b3b471a2417c3c8c1b554783939686b Mon Sep 17 00:00:00 2001 From: Corentin Date: Wed, 13 Mar 2024 23:44:56 +0100 Subject: [PATCH 11/32] Fix output component --- .../embedders/fastembed/fastembed_text_SPLADE_embedder.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/integrations/fastembed/src/haystack_integrations/components/embedders/fastembed/fastembed_text_SPLADE_embedder.py b/integrations/fastembed/src/haystack_integrations/components/embedders/fastembed/fastembed_text_SPLADE_embedder.py index 33700ac12..6703efa19 100644 --- a/integrations/fastembed/src/haystack_integrations/components/embedders/fastembed/fastembed_text_SPLADE_embedder.py +++ b/integrations/fastembed/src/haystack_integrations/components/embedders/fastembed/fastembed_text_SPLADE_embedder.py @@ -94,7 +94,7 @@ def warm_up(self): model_name=self.model_name, cache_dir=self.cache_dir, threads=self.threads ) - @component.output_types(embedding=List[Dict[str, Union[List[int], List[float]]]]) + @component.output_types(embedding=Dict[str, Union[List[int], List[float]]]) def run(self, text: str): """ Embeds text using the Fastembed model. From d4f836ae1393c0e621593cfaa037f51cd32a6237 Mon Sep 17 00:00:00 2001 From: Corentin Meyer Date: Thu, 14 Mar 2024 08:59:39 +0100 Subject: [PATCH 12/32] feat(FastEmbed): renaming SPLADE to Sparse because it makes more sense --- integrations/fastembed/README.md | 10 ++-- .../embedders/fastembed/__init__.py | 6 +- ... 
=> fastembed_sparse_document_embedder.py} | 10 ++-- ...r.py => fastembed_sparse_text_embedder.py} | 12 ++-- ...est_fastembed_sparse_document_embedder.py} | 58 +++++++++---------- ...=> test_fastembed_sparse_text_embedder.py} | 54 ++++++++--------- 6 files changed, 75 insertions(+), 75 deletions(-) rename integrations/fastembed/src/haystack_integrations/components/embedders/fastembed/{fastembed_document_SPLADE_embedder.py => fastembed_sparse_document_embedder.py} (95%) rename integrations/fastembed/src/haystack_integrations/components/embedders/fastembed/{fastembed_text_SPLADE_embedder.py => fastembed_sparse_text_embedder.py} (93%) rename integrations/fastembed/tests/{test_fastembed_document_SPLADE_embedder.py => test_fastembed_sparse_document_embedder.py} (84%) rename integrations/fastembed/tests/{test_fastembed_text_SPLADE_embedder.py => test_fastembed_sparse_text_embedder.py} (82%) diff --git a/integrations/fastembed/README.md b/integrations/fastembed/README.md index 70c10087f..d8bd3ce7a 100644 --- a/integrations/fastembed/README.md +++ b/integrations/fastembed/README.md @@ -43,13 +43,13 @@ doc = Document(content="fastembed is supported by and maintained by Qdrant.", me result = embedder.run(documents=[doc]) ``` -You can use `FastembedTextSPLADEEmbedder` and `FastembedDocumentSPLADEEmbedder` by importing as: +You can use `FastembedSparseTextEmbedder` and `FastembedSparseDocumentEmbedder` by importing as: ```python -from haystack_integrations.components.embedders.fastembed import FastembedTextSPLADEEmbedder +from haystack_integrations.components.embedders.fastembed import FastembedSparseTextEmbedder text = "fastembed is supported by and maintained by Qdrant." 
-text_embedder = FastembedTextSPLADEEmbedder( +text_embedder = FastembedSparseTextEmbedder( model="prithvida/SPLADE_PP_en_v1" ) text_embedder.warm_up() @@ -57,10 +57,10 @@ embedding = text_embedder.run(text)["embedding"] ``` ```python -from haystack_integrations.components.embedders.fastembed import FastembedDocumentSPLADEEmbedder +from haystack_integrations.components.embedders.fastembed import FastembedSparseDocumentEmbedder from haystack.dataclasses import Document -embedder = FastembedDocumentSPLADEEmbedder( +embedder = FastembedSparseDocumentEmbedder( model="prithvida/SPLADE_PP_en_v1", ) embedder.warm_up() diff --git a/integrations/fastembed/src/haystack_integrations/components/embedders/fastembed/__init__.py b/integrations/fastembed/src/haystack_integrations/components/embedders/fastembed/__init__.py index fa1ae6043..57c88e3c2 100644 --- a/integrations/fastembed/src/haystack_integrations/components/embedders/fastembed/__init__.py +++ b/integrations/fastembed/src/haystack_integrations/components/embedders/fastembed/__init__.py @@ -3,7 +3,7 @@ # SPDX-License-Identifier: Apache-2.0 from .fastembed_document_embedder import FastembedDocumentEmbedder from .fastembed_text_embedder import FastembedTextEmbedder -from .fastembed_document_SPLADE_embedder import FastembedDocumentSPLADEEmbedder -from .fastembed_text_SPLADE_embedder import FastembedTextSPLADEEmbedder +from .fastembed_sparse_document_embedder import FastembedSparseDocumentEmbedder +from .fastembed_sparse_text_embedder import FastembedSparseTextEmbedder -__all__ = ["FastembedDocumentEmbedder", "FastembedTextEmbedder", "FastembedDocumentSPLADEEmbedder", "FastembedTextSPLADEEmbedder"] +__all__ = ["FastembedDocumentEmbedder", "FastembedTextEmbedder", "FastembedSparseDocumentEmbedder", "FastembedSparseTextEmbedder"] diff --git a/integrations/fastembed/src/haystack_integrations/components/embedders/fastembed/fastembed_document_SPLADE_embedder.py 
b/integrations/fastembed/src/haystack_integrations/components/embedders/fastembed/fastembed_sparse_document_embedder.py similarity index 95% rename from integrations/fastembed/src/haystack_integrations/components/embedders/fastembed/fastembed_document_SPLADE_embedder.py rename to integrations/fastembed/src/haystack_integrations/components/embedders/fastembed/fastembed_sparse_document_embedder.py index 6ec71ff56..be06aa1a9 100644 --- a/integrations/fastembed/src/haystack_integrations/components/embedders/fastembed/fastembed_document_SPLADE_embedder.py +++ b/integrations/fastembed/src/haystack_integrations/components/embedders/fastembed/fastembed_sparse_document_embedder.py @@ -6,9 +6,9 @@ @component -class FastembedDocumentSPLADEEmbedder: +class FastembedSparseDocumentEmbedder: """ - FastembedDocumentSPLADEEmbedder computes Document embeddings using Fastembed SPLADE models. + FastembedSparseDocumentEmbedder computes Document embeddings using Fastembed sparse models. The embedding of each Document is stored in the `meta["_sparse_vector"]` field of the Document. @@ -17,10 +17,10 @@ class FastembedDocumentSPLADEEmbedder: # To use this component, install the "fastembed-haystack" package. # pip install fastembed-haystack - from haystack_integrations.components.embedders.fastembed import FastembedDocumentSPLADEEmbedder + from haystack_integrations.components.embedders.fastembed import FastembedSparseDocumentEmbedder from haystack.dataclasses import Document - doc_embedder = FastembedDocumentSPLADEEmbedder( + doc_embedder = FastembedSparseDocumentEmbedder( model="prithvida/SPLADE_PP_en_v1", batch_size=256, ) @@ -150,7 +150,7 @@ def run(self, documents: List[Document]): """ if not isinstance(documents, list) or documents and not isinstance(documents[0], Document): msg = ( - "FastembedDocumentSPLADEEmbedder expects a list of Documents as input. " + "FastembedSparseDocumentEmbedder expects a list of Documents as input. 
" "In case you want to embed a list of strings, please use the FastembedTextEmbedder." ) raise TypeError(msg) diff --git a/integrations/fastembed/src/haystack_integrations/components/embedders/fastembed/fastembed_text_SPLADE_embedder.py b/integrations/fastembed/src/haystack_integrations/components/embedders/fastembed/fastembed_sparse_text_embedder.py similarity index 93% rename from integrations/fastembed/src/haystack_integrations/components/embedders/fastembed/fastembed_text_SPLADE_embedder.py rename to integrations/fastembed/src/haystack_integrations/components/embedders/fastembed/fastembed_sparse_text_embedder.py index 6703efa19..094f29624 100644 --- a/integrations/fastembed/src/haystack_integrations/components/embedders/fastembed/fastembed_text_SPLADE_embedder.py +++ b/integrations/fastembed/src/haystack_integrations/components/embedders/fastembed/fastembed_sparse_text_embedder.py @@ -6,20 +6,20 @@ @component -class FastembedTextSPLADEEmbedder: +class FastembedSparseTextEmbedder: """ - FastembedTextSPLADEEmbedder computes string embedding using fastembed SPLADE models. + FastembedSparseTextEmbedder computes string embedding using fastembed sparse models. Usage example: ```python # To use this component, install the "fastembed-haystack" package. # pip install fastembed-haystack - from haystack_integrations.components.embedders.fastembed import FastembedTextSPLADEEmbedder + from haystack_integrations.components.embedders.fastembed import FastembedSparseTextEmbedder text = "It clearly says online this will work on a Mac OS system. The disk comes and it does not, only Windows. Do Not order this if you have a Mac!!" - text_embedder = FastembedTextSPLADEEmbedder( + text_embedder = FastembedSparseTextEmbedder( model="prithvida/SPLADE_PP_en_v1" ) text_embedder.warm_up() @@ -40,7 +40,7 @@ def __init__( parallel: Optional[int] = None, ): """ - Create a FastembedTextSPLADEEmbedder component. + Create a FastembedSparseTextEmbedder component. 
:param model: Local path or name of the model in Fastembed's model hub, such as `prithvida/SPLADE_PP_en_v1` :param cache_dir: The path to the cache directory. @@ -107,7 +107,7 @@ def run(self, text: str): """ if not isinstance(text, str): msg = ( - "FastembedTextSPLADEEmbedder expects a string as input. " + "FastembedSparseTextEmbedder expects a string as input. " "In case you want to embed a list of Documents, please use the FastembedDocumentEmbedder." ) raise TypeError(msg) diff --git a/integrations/fastembed/tests/test_fastembed_document_SPLADE_embedder.py b/integrations/fastembed/tests/test_fastembed_sparse_document_embedder.py similarity index 84% rename from integrations/fastembed/tests/test_fastembed_document_SPLADE_embedder.py rename to integrations/fastembed/tests/test_fastembed_sparse_document_embedder.py index 384b3d9a5..35cd521ba 100644 --- a/integrations/fastembed/tests/test_fastembed_document_SPLADE_embedder.py +++ b/integrations/fastembed/tests/test_fastembed_sparse_document_embedder.py @@ -3,17 +3,17 @@ import numpy as np import pytest from haystack import Document, default_from_dict -from haystack_integrations.components.embedders.fastembed.fastembed_document_SPLADE_embedder import ( - FastembedDocumentSPLADEEmbedder, +from haystack_integrations.components.embedders.fastembed.fastembed_sparse_document_embedder import ( + FastembedSparseDocumentEmbedder, ) -class TestFastembedDocumentSPLADEEmbedderDoc: +class TestFastembedSparseDocumentEmbedderDoc: def test_init_default(self): """ - Test default initialization parameters for FastembedDocumentSPLADEEmbedder. + Test default initialization parameters for FastembedSparseDocumentEmbedder. 
""" - embedder = FastembedDocumentSPLADEEmbedder(model="prithvida/SPLADE_PP_en_v1") + embedder = FastembedSparseDocumentEmbedder(model="prithvida/SPLADE_PP_en_v1") assert embedder.model_name == "prithvida/SPLADE_PP_en_v1" assert embedder.cache_dir is None assert embedder.threads is None @@ -27,9 +27,9 @@ def test_init_default(self): def test_init_with_parameters(self): """ - Test custom initialization parameters for FastembedDocumentSPLADEEmbedder. + Test custom initialization parameters for FastembedSparseDocumentEmbedder. """ - embedder = FastembedDocumentSPLADEEmbedder( + embedder = FastembedSparseDocumentEmbedder( model="prithvida/SPLADE_PP_en_v1", cache_dir="fake_dir", threads=2, @@ -54,12 +54,12 @@ def test_init_with_parameters(self): def test_to_dict(self): """ - Test serialization of FastembedDocumentSPLADEEmbedder to a dictionary, using default initialization parameters. + Test serialization of FastembedSparseDocumentEmbedder to a dictionary, using default initialization parameters. """ - embedder = FastembedDocumentSPLADEEmbedder(model="prithvida/SPLADE_PP_en_v1") + embedder = FastembedSparseDocumentEmbedder(model="prithvida/SPLADE_PP_en_v1") embedder_dict = embedder.to_dict() assert embedder_dict == { - "type": "haystack_integrations.components.embedders.fastembed.fastembed_document_SPLADE_embedder.FastembedDocumentSPLADEEmbedder", # noqa + "type": "haystack_integrations.components.embedders.fastembed.fastembed_sparse_document_embedder.FastembedSparseDocumentEmbedder", # noqa "init_parameters": { "model": "prithvida/SPLADE_PP_en_v1", "cache_dir": None, @@ -76,9 +76,9 @@ def test_to_dict(self): def test_to_dict_with_custom_init_parameters(self): """ - Test serialization of FastembedDocumentSPLADEEmbedder to a dictionary, using custom initialization parameters. + Test serialization of FastembedSparseDocumentEmbedder to a dictionary, using custom initialization parameters. 
""" - embedder = FastembedDocumentSPLADEEmbedder( + embedder = FastembedSparseDocumentEmbedder( model="prithvida/SPLADE_PP_en_v1", cache_dir="fake_dir", threads=2, @@ -92,7 +92,7 @@ def test_to_dict_with_custom_init_parameters(self): ) embedder_dict = embedder.to_dict() assert embedder_dict == { - "type": "haystack_integrations.components.embedders.fastembed.fastembed_document_SPLADE_embedder.FastembedDocumentSPLADEEmbedder", # noqa + "type": "haystack_integrations.components.embedders.fastembed.fastembed_sparse_document_embedder.FastembedSparseDocumentEmbedder", # noqa "init_parameters": { "model": "prithvida/SPLADE_PP_en_v1", "cache_dir": "fake_dir", @@ -109,10 +109,10 @@ def test_to_dict_with_custom_init_parameters(self): def test_from_dict(self): """ - Test deserialization of FastembedDocumentSPLADEEmbedder from a dictionary, using default initialization parameters. + Test deserialization of FastembedSparseDocumentEmbedder from a dictionary, using default initialization parameters. """ embedder_dict = { - "type": "haystack_integrations.components.embedders.fastembed.fastembed_document_SPLADE_embedder.FastembedDocumentSPLADEEmbedder", # noqa + "type": "haystack_integrations.components.embedders.fastembed.fastembed_sparse_document_embedder.FastembedSparseDocumentEmbedder", # noqa "init_parameters": { "model": "prithvida/SPLADE_PP_en_v1", "cache_dir": None, @@ -126,7 +126,7 @@ def test_from_dict(self): "embedding_separator": "\n", }, } - embedder = default_from_dict(FastembedDocumentSPLADEEmbedder, embedder_dict) + embedder = default_from_dict(FastembedSparseDocumentEmbedder, embedder_dict) assert embedder.model_name == "prithvida/SPLADE_PP_en_v1" assert embedder.cache_dir is None assert embedder.threads is None @@ -140,10 +140,10 @@ def test_from_dict(self): def test_from_dict_with_custom_init_parameters(self): """ - Test deserialization of FastembedDocumentSPLADEEmbedder from a dictionary, using custom initialization parameters. 
+ Test deserialization of FastembedSparseDocumentEmbedder from a dictionary, using custom initialization parameters. """ embedder_dict = { - "type": "haystack_integrations.components.embedders.fastembed.fastembed_document_SPLADE_embedder.FastembedDocumentSPLADEEmbedder", # noqa + "type": "haystack_integrations.components.embedders.fastembed.fastembed_sparse_document_embedder.FastembedSparseDocumentEmbedder", # noqa "init_parameters": { "model": "prithvida/SPLADE_PP_en_v1", "cache_dir": "fake_dir", @@ -157,7 +157,7 @@ def test_from_dict_with_custom_init_parameters(self): "embedding_separator": " | ", }, } - embedder = default_from_dict(FastembedDocumentSPLADEEmbedder, embedder_dict) + embedder = default_from_dict(FastembedSparseDocumentEmbedder, embedder_dict) assert embedder.model_name == "prithvida/SPLADE_PP_en_v1" assert embedder.cache_dir == "fake_dir" assert embedder.threads == 2 @@ -170,13 +170,13 @@ def test_from_dict_with_custom_init_parameters(self): assert embedder.embedding_separator == " | " @patch( - "haystack_integrations.components.embedders.fastembed.fastembed_document_SPLADE_embedder._FastembedSparseEmbeddingBackendFactory" + "haystack_integrations.components.embedders.fastembed.fastembed_sparse_document_embedder._FastembedSparseEmbeddingBackendFactory" ) def test_warmup(self, mocked_factory): """ Test for checking embedder instances after warm-up. 
""" - embedder = FastembedDocumentSPLADEEmbedder(model="prithvida/SPLADE_PP_en_v1") + embedder = FastembedSparseDocumentEmbedder(model="prithvida/SPLADE_PP_en_v1") mocked_factory.get_embedding_backend.assert_not_called() embedder.warm_up() mocked_factory.get_embedding_backend.assert_called_once_with( @@ -184,13 +184,13 @@ def test_warmup(self, mocked_factory): ) @patch( - "haystack_integrations.components.embedders.fastembed.fastembed_document_SPLADE_embedder._FastembedSparseEmbeddingBackendFactory" + "haystack_integrations.components.embedders.fastembed.fastembed_sparse_document_embedder._FastembedSparseEmbeddingBackendFactory" ) def test_warmup_does_not_reload(self, mocked_factory): """ Test for checking backend instances after multiple warm-ups. """ - embedder = FastembedDocumentSPLADEEmbedder(model="prithvida/SPLADE_PP_en_v1") + embedder = FastembedSparseDocumentEmbedder(model="prithvida/SPLADE_PP_en_v1") mocked_factory.get_embedding_backend.assert_not_called() embedder.warm_up() embedder.warm_up() @@ -211,7 +211,7 @@ def test_embed(self): """ Test for checking output dimensions and embedding dimensions. """ - embedder = FastembedDocumentSPLADEEmbedder(model="prithvida/SPLADE_PP_en_v1") + embedder = FastembedSparseDocumentEmbedder(model="prithvida/SPLADE_PP_en_v1") embedder.embedding_backend = MagicMock() embedder.embedding_backend.embed = lambda x, **kwargs: self._generate_mocked_sparse_embedding(len(x)) # noqa: ARG005 @@ -233,20 +233,20 @@ def test_embed_incorrect_input_format(self): """ Test for checking incorrect input format when creating embedding. 
""" - embedder = FastembedDocumentSPLADEEmbedder(model="prithvida/SPLADE_PP_en_v1") + embedder = FastembedSparseDocumentEmbedder(model="prithvida/SPLADE_PP_en_v1") string_input = "text" list_integers_input = [1, 2, 3] with pytest.raises( TypeError, - match="FastembedDocumentSPLADEEmbedder expects a list of Documents as input.", + match="FastembedSparseDocumentEmbedder expects a list of Documents as input.", ): embedder.run(documents=string_input) with pytest.raises( TypeError, - match="FastembedDocumentSPLADEEmbedder expects a list of Documents as input.", + match="FastembedSparseDocumentEmbedder expects a list of Documents as input.", ): embedder.run(documents=list_integers_input) @@ -255,7 +255,7 @@ def test_embed_metadata(self): Test for checking output dimensions and embedding dimensions for documents with a custom instruction and metadata. """ - embedder = FastembedDocumentSPLADEEmbedder( + embedder = FastembedSparseDocumentEmbedder( model="model", meta_fields_to_embed=["meta_field"], embedding_separator="\n", @@ -281,7 +281,7 @@ def test_embed_metadata(self): @pytest.mark.integration def test_run(self): - embedder = FastembedDocumentSPLADEEmbedder( + embedder = FastembedSparseDocumentEmbedder( model="prithvida/SPLADE_PP_en_v1", ) embedder.warm_up() diff --git a/integrations/fastembed/tests/test_fastembed_text_SPLADE_embedder.py b/integrations/fastembed/tests/test_fastembed_sparse_text_embedder.py similarity index 82% rename from integrations/fastembed/tests/test_fastembed_text_SPLADE_embedder.py rename to integrations/fastembed/tests/test_fastembed_sparse_text_embedder.py index daa211561..fd9ce7b73 100644 --- a/integrations/fastembed/tests/test_fastembed_text_SPLADE_embedder.py +++ b/integrations/fastembed/tests/test_fastembed_sparse_text_embedder.py @@ -3,17 +3,17 @@ import numpy as np import pytest from haystack import default_from_dict -from haystack_integrations.components.embedders.fastembed.fastembed_text_SPLADE_embedder import ( - 
FastembedTextSPLADEEmbedder, +from haystack_integrations.components.embedders.fastembed.fastembed_sparse_text_embedder import ( + FastembedSparseTextEmbedder, ) -class TestFastembedTextSPLADEEmbedder: +class TestFastembedSparseTextEmbedder: def test_init_default(self): """ - Test default initialization parameters for FastembedTextSPLADEEmbedder. + Test default initialization parameters for FastembedSparseTextEmbedder. """ - embedder = FastembedTextSPLADEEmbedder(model="prithvida/SPLADE_PP_en_v1") + embedder = FastembedSparseTextEmbedder(model="prithvida/SPLADE_PP_en_v1") assert embedder.model_name == "prithvida/SPLADE_PP_en_v1" assert embedder.cache_dir is None assert embedder.threads is None @@ -25,9 +25,9 @@ def test_init_default(self): def test_init_with_parameters(self): """ - Test custom initialization parameters for FastembedTextSPLADEEmbedder. + Test custom initialization parameters for FastembedSparseTextEmbedder. """ - embedder = FastembedTextSPLADEEmbedder( + embedder = FastembedSparseTextEmbedder( model="prithvida/SPLADE_PP_en_v1", cache_dir="fake_dir", threads=2, @@ -48,12 +48,12 @@ def test_init_with_parameters(self): def test_to_dict(self): """ - Test serialization of FastembedTextSPLADEEmbedder to a dictionary, using default initialization parameters. + Test serialization of FastembedSparseTextEmbedder to a dictionary, using default initialization parameters. 
""" - embedder = FastembedTextSPLADEEmbedder(model="prithvida/SPLADE_PP_en_v1") + embedder = FastembedSparseTextEmbedder(model="prithvida/SPLADE_PP_en_v1") embedder_dict = embedder.to_dict() assert embedder_dict == { - "type": "haystack_integrations.components.embedders.fastembed.fastembed_text_SPLADE_embedder.FastembedTextSPLADEEmbedder", # noqa + "type": "haystack_integrations.components.embedders.fastembed.fastembed_sparse_text_embedder.FastembedSparseTextEmbedder", # noqa "init_parameters": { "model": "prithvida/SPLADE_PP_en_v1", "cache_dir": None, @@ -68,9 +68,9 @@ def test_to_dict(self): def test_to_dict_with_custom_init_parameters(self): """ - Test serialization of FastembedTextSPLADEEmbedder to a dictionary, using custom initialization parameters. + Test serialization of FastembedSparseTextEmbedder to a dictionary, using custom initialization parameters. """ - embedder = FastembedTextSPLADEEmbedder( + embedder = FastembedSparseTextEmbedder( model="prithvida/SPLADE_PP_en_v1", cache_dir="fake_dir", threads=2, @@ -82,7 +82,7 @@ def test_to_dict_with_custom_init_parameters(self): ) embedder_dict = embedder.to_dict() assert embedder_dict == { - "type": "haystack_integrations.components.embedders.fastembed.fastembed_text_SPLADE_embedder.FastembedTextSPLADEEmbedder", # noqa + "type": "haystack_integrations.components.embedders.fastembed.fastembed_sparse_text_embedder.FastembedSparseTextEmbedder", # noqa "init_parameters": { "model": "prithvida/SPLADE_PP_en_v1", "cache_dir": "fake_dir", @@ -97,10 +97,10 @@ def test_to_dict_with_custom_init_parameters(self): def test_from_dict(self): """ - Test deserialization of FastembedTextSPLADEEmbedder from a dictionary, using default initialization parameters. + Test deserialization of FastembedSparseTextEmbedder from a dictionary, using default initialization parameters. 
""" embedder_dict = { - "type": "haystack_integrations.components.embedders.fastembed.fastembed_text_SPLADE_embedder.FastembedTextSPLADEEmbedder", # noqa + "type": "haystack_integrations.components.embedders.fastembed.fastembed_sparse_text_embedder.FastembedSparseTextEmbedder", # noqa "init_parameters": { "model": "prithvida/SPLADE_PP_en_v1", "cache_dir": None, @@ -112,7 +112,7 @@ def test_from_dict(self): "parallel": None, }, } - embedder = default_from_dict(FastembedTextSPLADEEmbedder, embedder_dict) + embedder = default_from_dict(FastembedSparseTextEmbedder, embedder_dict) assert embedder.model_name == "prithvida/SPLADE_PP_en_v1" assert embedder.cache_dir is None assert embedder.threads is None @@ -124,10 +124,10 @@ def test_from_dict(self): def test_from_dict_with_custom_init_parameters(self): """ - Test deserialization of FastembedTextSPLADEEmbedder from a dictionary, using custom initialization parameters. + Test deserialization of FastembedSparseTextEmbedder from a dictionary, using custom initialization parameters. 
""" embedder_dict = { - "type": "haystack_integrations.components.embedders.fastembed.fastembed_text_SPLADE_embedder.FastembedTextSPLADEEmbedder", # noqa + "type": "haystack_integrations.components.embedders.fastembed.fastembed_sparse_text_embedder.FastembedSparseTextEmbedder", # noqa "init_parameters": { "model": "prithvida/SPLADE_PP_en_v1", "cache_dir": "fake_dir", @@ -139,7 +139,7 @@ def test_from_dict_with_custom_init_parameters(self): "parallel": 1, }, } - embedder = default_from_dict(FastembedTextSPLADEEmbedder, embedder_dict) + embedder = default_from_dict(FastembedSparseTextEmbedder, embedder_dict) assert embedder.model_name == "prithvida/SPLADE_PP_en_v1" assert embedder.cache_dir == "fake_dir" assert embedder.threads == 2 @@ -150,13 +150,13 @@ def test_from_dict_with_custom_init_parameters(self): assert embedder.parallel == 1 @patch( - "haystack_integrations.components.embedders.fastembed.fastembed_text_SPLADE_embedder._FastembedSparseEmbeddingBackendFactory" + "haystack_integrations.components.embedders.fastembed.fastembed_sparse_text_embedder._FastembedSparseEmbeddingBackendFactory" ) def test_warmup(self, mocked_factory): """ Test for checking embedder instances after warm-up. """ - embedder = FastembedTextSPLADEEmbedder(model="prithvida/SPLADE_PP_en_v1") + embedder = FastembedSparseTextEmbedder(model="prithvida/SPLADE_PP_en_v1") mocked_factory.get_embedding_backend.assert_not_called() embedder.warm_up() mocked_factory.get_embedding_backend.assert_called_once_with( @@ -164,13 +164,13 @@ def test_warmup(self, mocked_factory): ) @patch( - "haystack_integrations.components.embedders.fastembed.fastembed_text_SPLADE_embedder._FastembedSparseEmbeddingBackendFactory" + "haystack_integrations.components.embedders.fastembed.fastembed_sparse_text_embedder._FastembedSparseEmbeddingBackendFactory" ) def test_warmup_does_not_reload(self, mocked_factory): """ Test for checking backend instances after multiple warm-ups. 
""" - embedder = FastembedTextSPLADEEmbedder(model="prithvida/SPLADE_PP_en_v1") + embedder = FastembedSparseTextEmbedder(model="prithvida/SPLADE_PP_en_v1") mocked_factory.get_embedding_backend.assert_not_called() embedder.warm_up() embedder.warm_up() @@ -192,7 +192,7 @@ def test_embed(self): """ Test for checking output dimensions and embedding dimensions. """ - embedder = FastembedTextSPLADEEmbedder(model="BAAI/bge-base-en-v1.5") + embedder = FastembedSparseTextEmbedder(model="BAAI/bge-base-en-v1.5") embedder.embedding_backend = MagicMock() embedder.embedding_backend.embed = lambda x, **kwargs: self._generate_mocked_sparse_embedding( len(x)) # noqa: ARG005 @@ -211,17 +211,17 @@ def test_run_wrong_incorrect_format(self): """ Test for checking incorrect input format when creating embedding. """ - embedder = FastembedTextSPLADEEmbedder(model="BAAI/bge-base-en-v1.5") + embedder = FastembedSparseTextEmbedder(model="BAAI/bge-base-en-v1.5") embedder.embedding_backend = MagicMock() list_integers_input = [1, 2, 3] - with pytest.raises(TypeError, match="FastembedTextSPLADEEmbedder expects a string as input"): + with pytest.raises(TypeError, match="FastembedSparseTextEmbedder expects a string as input"): embedder.run(text=list_integers_input) @pytest.mark.integration def test_run(self): - embedder = FastembedTextSPLADEEmbedder( + embedder = FastembedSparseTextEmbedder( model="prithvida/SPLADE_PP_en_v1", ) embedder.warm_up() From 6cb01956f316869dfd7691ba36e7c14a38f9d23b Mon Sep 17 00:00:00 2001 From: Corentin Meyer Date: Thu, 14 Mar 2024 09:14:18 +0100 Subject: [PATCH 13/32] feat(FastEmbed): hatch run all lint --- .../components/embedders/fastembed/__init__.py | 9 +++++++-- .../embedding_backend/fastembed_backend.py | 13 ++++++++----- .../fastembed_sparse_document_embedder.py | 3 ++- .../fastembed/fastembed_sparse_text_embedder.py | 12 ++++++------ .../test_fastembed_sparse_document_embedder.py | 14 +++++++++----- .../tests/test_fastembed_sparse_text_embedder.py | 9 +++++---- 
6 files changed, 37 insertions(+), 23 deletions(-) diff --git a/integrations/fastembed/src/haystack_integrations/components/embedders/fastembed/__init__.py b/integrations/fastembed/src/haystack_integrations/components/embedders/fastembed/__init__.py index 57c88e3c2..e943a8ca1 100644 --- a/integrations/fastembed/src/haystack_integrations/components/embedders/fastembed/__init__.py +++ b/integrations/fastembed/src/haystack_integrations/components/embedders/fastembed/__init__.py @@ -2,8 +2,13 @@ # # SPDX-License-Identifier: Apache-2.0 from .fastembed_document_embedder import FastembedDocumentEmbedder -from .fastembed_text_embedder import FastembedTextEmbedder from .fastembed_sparse_document_embedder import FastembedSparseDocumentEmbedder from .fastembed_sparse_text_embedder import FastembedSparseTextEmbedder +from .fastembed_text_embedder import FastembedTextEmbedder -__all__ = ["FastembedDocumentEmbedder", "FastembedTextEmbedder", "FastembedSparseDocumentEmbedder", "FastembedSparseTextEmbedder"] +__all__ = [ + "FastembedDocumentEmbedder", + "FastembedTextEmbedder", + "FastembedSparseDocumentEmbedder", + "FastembedSparseTextEmbedder", +] diff --git a/integrations/fastembed/src/haystack_integrations/components/embedders/fastembed/embedding_backend/fastembed_backend.py b/integrations/fastembed/src/haystack_integrations/components/embedders/fastembed/embedding_backend/fastembed_backend.py index dbc4fdb64..0c2631787 100644 --- a/integrations/fastembed/src/haystack_integrations/components/embedders/fastembed/embedding_backend/fastembed_backend.py +++ b/integrations/fastembed/src/haystack_integrations/components/embedders/fastembed/embedding_backend/fastembed_backend.py @@ -1,10 +1,9 @@ from typing import ClassVar, Dict, List, Optional, Union -import numpy as np - from fastembed import TextEmbedding from fastembed.sparse.sparse_text_embedding import SparseTextEmbedding + class _FastembedEmbeddingBackendFactory: """ Factory class to create instances of fastembed embedding 
backends. @@ -46,6 +45,7 @@ def embed(self, data: List[List[str]], **kwargs) -> List[List[float]]: embeddings = [np_array.tolist() for np_array in self.model.embed(data, **kwargs)] return embeddings + class _FastembedSparseEmbeddingBackendFactory: """ Factory class to create instances of fastembed sparse embedding backends. @@ -64,10 +64,13 @@ def get_embedding_backend( if embedding_backend_id in _FastembedSparseEmbeddingBackendFactory._instances: return _FastembedSparseEmbeddingBackendFactory._instances[embedding_backend_id] - embedding_backend = _FastembedSparseEmbeddingBackend(model_name=model_name, cache_dir=cache_dir, threads=threads) + embedding_backend = _FastembedSparseEmbeddingBackend( + model_name=model_name, cache_dir=cache_dir, threads=threads + ) _FastembedSparseEmbeddingBackendFactory._instances[embedding_backend_id] = embedding_backend return embedding_backend + class _FastembedSparseEmbeddingBackend: """ Class to manage fastembed sparse embeddings. @@ -86,7 +89,7 @@ def embed(self, data: List[List[str]], **kwargs) -> List[Dict[str, Union[List[in # Each dict contains an `indices` key containing a list of int and an `values` key containing a list of floats. 
sparse_embeddings = [sparse_embedding.as_object() for sparse_embedding in self.model.embed(data, **kwargs)] for embedding in sparse_embeddings: - embedding['indices'] = embedding['indices'].tolist() - embedding['values'] = embedding['values'].tolist() + embedding["indices"] = embedding["indices"].tolist() + embedding["values"] = embedding["values"].tolist() return sparse_embeddings diff --git a/integrations/fastembed/src/haystack_integrations/components/embedders/fastembed/fastembed_sparse_document_embedder.py b/integrations/fastembed/src/haystack_integrations/components/embedders/fastembed/fastembed_sparse_document_embedder.py index be06aa1a9..3e7cf2923 100644 --- a/integrations/fastembed/src/haystack_integrations/components/embedders/fastembed/fastembed_sparse_document_embedder.py +++ b/integrations/fastembed/src/haystack_integrations/components/embedders/fastembed/fastembed_sparse_document_embedder.py @@ -146,7 +146,8 @@ def run(self, documents: List[Document]): :param documents: List of Documents to embed. :returns: A dictionary with the following keys: - - `documents`: List of Documents with each Document's `meta.["_sparse_vector"]` field set to the computed embeddings. + - `documents`: List of Documents with each Document's `meta.["_sparse_vector"]` field set + to the computed embeddings. 
""" if not isinstance(documents, list) or documents and not isinstance(documents[0], Document): msg = ( diff --git a/integrations/fastembed/src/haystack_integrations/components/embedders/fastembed/fastembed_sparse_text_embedder.py b/integrations/fastembed/src/haystack_integrations/components/embedders/fastembed/fastembed_sparse_text_embedder.py index 094f29624..7bc5e6553 100644 --- a/integrations/fastembed/src/haystack_integrations/components/embedders/fastembed/fastembed_sparse_text_embedder.py +++ b/integrations/fastembed/src/haystack_integrations/components/embedders/fastembed/fastembed_sparse_text_embedder.py @@ -1,5 +1,5 @@ from typing import Any, Dict, List, Optional, Union -import numpy as np + from haystack import component, default_to_dict from .embedding_backend.fastembed_backend import _FastembedSparseEmbeddingBackendFactory @@ -117,9 +117,9 @@ def run(self, text: str): text_to_embed = [self.prefix + text + self.suffix] embedding = self.embedding_backend.embed( - text_to_embed, - batch_size=self.batch_size, - show_progress_bar=self.progress_bar, - parallel=self.parallel, - )[0] + text_to_embed, + batch_size=self.batch_size, + show_progress_bar=self.progress_bar, + parallel=self.parallel, + )[0] return {"embedding": embedding} diff --git a/integrations/fastembed/tests/test_fastembed_sparse_document_embedder.py b/integrations/fastembed/tests/test_fastembed_sparse_document_embedder.py index 35cd521ba..573f3979d 100644 --- a/integrations/fastembed/tests/test_fastembed_sparse_document_embedder.py +++ b/integrations/fastembed/tests/test_fastembed_sparse_document_embedder.py @@ -109,7 +109,8 @@ def test_to_dict_with_custom_init_parameters(self): def test_from_dict(self): """ - Test deserialization of FastembedSparseDocumentEmbedder from a dictionary, using default initialization parameters. + Test deserialization of FastembedSparseDocumentEmbedder from a dictionary, + using default initialization parameters. 
""" embedder_dict = { "type": "haystack_integrations.components.embedders.fastembed.fastembed_sparse_document_embedder.FastembedSparseDocumentEmbedder", # noqa @@ -140,7 +141,8 @@ def test_from_dict(self): def test_from_dict_with_custom_init_parameters(self): """ - Test deserialization of FastembedSparseDocumentEmbedder from a dictionary, using custom initialization parameters. + Test deserialization of FastembedSparseDocumentEmbedder from a dictionary, + using custom initialization parameters. """ embedder_dict = { "type": "haystack_integrations.components.embedders.fastembed.fastembed_sparse_document_embedder.FastembedSparseDocumentEmbedder", # noqa @@ -201,8 +203,8 @@ def _generate_mocked_sparse_embedding(self, n): for _ in range(n): random_indice_length = np.random.randint(3, 15) data = { - "indices": [i for i in range(random_indice_length)], - "values": [np.random.random_sample() for _ in range(random_indice_length)] + "indices": list(range(random_indice_length)), + "values": [np.random.random_sample() for _ in range(random_indice_length)], } list_of_sparse_vectors.append(data) return list_of_sparse_vectors @@ -213,7 +215,9 @@ def test_embed(self): """ embedder = FastembedSparseDocumentEmbedder(model="prithvida/SPLADE_PP_en_v1") embedder.embedding_backend = MagicMock() - embedder.embedding_backend.embed = lambda x, **kwargs: self._generate_mocked_sparse_embedding(len(x)) # noqa: ARG005 + embedder.embedding_backend.embed = lambda x, **kwargs: self._generate_mocked_sparse_embedding( # noqa: ARG005 + len(x) + ) documents = [Document(content=f"Sample-document text {i}") for i in range(5)] diff --git a/integrations/fastembed/tests/test_fastembed_sparse_text_embedder.py b/integrations/fastembed/tests/test_fastembed_sparse_text_embedder.py index fd9ce7b73..64871004e 100644 --- a/integrations/fastembed/tests/test_fastembed_sparse_text_embedder.py +++ b/integrations/fastembed/tests/test_fastembed_sparse_text_embedder.py @@ -181,8 +181,8 @@ def 
_generate_mocked_sparse_embedding(self, n): for _ in range(n): random_indice_length = np.random.randint(3, 15) data = { - "indices": [i for i in range(random_indice_length)], - "values": [np.random.random_sample() for _ in range(random_indice_length)] + "indices": list(range(random_indice_length)), + "values": [np.random.random_sample() for _ in range(random_indice_length)], } list_of_sparse_vectors.append(data) @@ -194,8 +194,9 @@ def test_embed(self): """ embedder = FastembedSparseTextEmbedder(model="BAAI/bge-base-en-v1.5") embedder.embedding_backend = MagicMock() - embedder.embedding_backend.embed = lambda x, **kwargs: self._generate_mocked_sparse_embedding( - len(x)) # noqa: ARG005 + embedder.embedding_backend.embed = lambda x, **kwargs: self._generate_mocked_sparse_embedding( # noqa: ARG005 + len(x) + ) text = "Good text to embed" From a6de1e97d9735cc6f7429ef75a9e47f1cbe84ff2 Mon Sep 17 00:00:00 2001 From: Corentin Meyer Date: Wed, 20 Mar 2024 22:41:30 +0100 Subject: [PATCH 14/32] feat(FastEmbed): modify PR for haystack 2.1.0 with proper sparse vectors --- integrations/fastembed/README.md | 1 - integrations/fastembed/pyproject.toml | 2 +- .../fastembed/fastembed_sparse_document_embedder.py | 10 ++++------ .../tests/test_fastembed_sparse_document_embedder.py | 12 ++++++------ 4 files changed, 11 insertions(+), 14 deletions(-) diff --git a/integrations/fastembed/README.md b/integrations/fastembed/README.md index d8bd3ce7a..6d47aa9c4 100644 --- a/integrations/fastembed/README.md +++ b/integrations/fastembed/README.md @@ -66,7 +66,6 @@ embedder = FastembedSparseDocumentEmbedder( embedder.warm_up() doc = Document(content="fastembed is supported by and maintained by Qdrant.", meta={"long_answer": "no",}) result = embedder.run(documents=[doc]) -# Sparse embeddings are placed in the meta field of the document under the `_sparse_vector` key. 
``` ## License diff --git a/integrations/fastembed/pyproject.toml b/integrations/fastembed/pyproject.toml index 9b9fb9822..681a6a20c 100644 --- a/integrations/fastembed/pyproject.toml +++ b/integrations/fastembed/pyproject.toml @@ -25,7 +25,7 @@ classifiers = [ "Programming Language :: Python :: Implementation :: PyPy", ] dependencies = [ -"haystack-ai", +"haystack-ai>=2.1.0", "fastembed>=0.2.4", ] diff --git a/integrations/fastembed/src/haystack_integrations/components/embedders/fastembed/fastembed_sparse_document_embedder.py b/integrations/fastembed/src/haystack_integrations/components/embedders/fastembed/fastembed_sparse_document_embedder.py index 3e7cf2923..2a35353bb 100644 --- a/integrations/fastembed/src/haystack_integrations/components/embedders/fastembed/fastembed_sparse_document_embedder.py +++ b/integrations/fastembed/src/haystack_integrations/components/embedders/fastembed/fastembed_sparse_document_embedder.py @@ -10,8 +10,6 @@ class FastembedSparseDocumentEmbedder: """ FastembedSparseDocumentEmbedder computes Document embeddings using Fastembed sparse models. - The embedding of each Document is stored in the `meta["_sparse_vector"]` field of the Document. - Usage example: ```python # To use this component, install the "fastembed-haystack" package. @@ -47,8 +45,8 @@ class FastembedSparseDocumentEmbedder: result = doc_embedder.run(document_list) print(f"Document Text: {result['documents'][0].content}") - print(f"Document Embedding: {result['documents'][0].meta["_sparse_vector"]}") - print(f"Embedding Dimension: {len(result['documents'][0].meta["_sparse_vector"])}") + print(f"Document Embedding: {result['documents'][0].sparse_embedding}") + print(f"Embedding Dimension: {len(result['documents'][0].sparse_embedding)}") ``` """ # noqa: E501 @@ -146,7 +144,7 @@ def run(self, documents: List[Document]): :param documents: List of Documents to embed. 
:returns: A dictionary with the following keys: - - `documents`: List of Documents with each Document's `meta.["_sparse_vector"]` field set + - `documents`: List of Documents to the computed embeddings. """ if not isinstance(documents, list) or documents and not isinstance(documents[0], Document): @@ -168,5 +166,5 @@ def run(self, documents: List[Document]): ) for doc, emb in zip(documents, embeddings): - doc.meta["_sparse_vector"] = emb + doc.sparse_embedding = emb return {"documents": documents} diff --git a/integrations/fastembed/tests/test_fastembed_sparse_document_embedder.py b/integrations/fastembed/tests/test_fastembed_sparse_document_embedder.py index 573f3979d..3a5c15890 100644 --- a/integrations/fastembed/tests/test_fastembed_sparse_document_embedder.py +++ b/integrations/fastembed/tests/test_fastembed_sparse_document_embedder.py @@ -227,11 +227,11 @@ def test_embed(self): assert len(result["documents"]) == len(documents) for doc in result["documents"]: assert isinstance(doc, Document) - assert isinstance(doc.meta["_sparse_vector"], dict) - assert isinstance(doc.meta["_sparse_vector"]["indices"], list) - assert isinstance(doc.meta["_sparse_vector"]["indices"][0], int) - assert isinstance(doc.meta["_sparse_vector"]["values"], list) - assert isinstance(doc.meta["_sparse_vector"]["values"][0], float) + assert isinstance(doc.sparse_embedding, dict) + assert isinstance(doc.sparse_embedding["indices"], list) + assert isinstance(doc.sparse_embedding["indices"][0], int) + assert isinstance(doc.sparse_embedding["values"], list) + assert isinstance(doc.sparse_embedding["values"][0], float) def test_embed_incorrect_input_format(self): """ @@ -293,7 +293,7 @@ def test_run(self): doc = Document(content="Parton energy loss in QCD matter") result = embedder.run(documents=[doc]) - embedding = result["documents"][0].meta["_sparse_vector"] + embedding = result["documents"][0].sparse_embedding assert isinstance(embedding, dict) assert isinstance(embedding["indices"], list) 
assert isinstance(embedding["values"], list) From d37e788c50908cf43cac80aef24a079bb737b42d Mon Sep 17 00:00:00 2001 From: anakin87 Date: Thu, 21 Mar 2024 12:26:58 +0100 Subject: [PATCH 15/32] try testing with Haystack main branch --- .github/workflows/fastembed.yml | 4 +++- integrations/fastembed/pyproject.toml | 2 +- 2 files changed, 4 insertions(+), 2 deletions(-) diff --git a/.github/workflows/fastembed.yml b/.github/workflows/fastembed.yml index fe736029a..6f927a98e 100644 --- a/.github/workflows/fastembed.yml +++ b/.github/workflows/fastembed.yml @@ -42,4 +42,6 @@ jobs: run: hatch run docs - name: Run tests - run: hatch run cov + run: | + hatch run pip install git+https://github.com/deepset-ai/haystack.git #TODO: rm before merging + hatch run cov diff --git a/integrations/fastembed/pyproject.toml b/integrations/fastembed/pyproject.toml index 681a6a20c..9b9fb9822 100644 --- a/integrations/fastembed/pyproject.toml +++ b/integrations/fastembed/pyproject.toml @@ -25,7 +25,7 @@ classifiers = [ "Programming Language :: Python :: Implementation :: PyPy", ] dependencies = [ -"haystack-ai>=2.1.0", +"haystack-ai", "fastembed>=0.2.4", ] From 0050a6b7d2c711121de94152cf53cf0587888564 Mon Sep 17 00:00:00 2001 From: anakin87 Date: Thu, 21 Mar 2024 12:35:34 +0100 Subject: [PATCH 16/32] update model name --- integrations/fastembed/README.md | 4 +-- .../fastembed_sparse_document_embedder.py | 6 ++-- .../fastembed_sparse_text_embedder.py | 6 ++-- ...test_fastembed_sparse_document_embedder.py | 36 +++++++++---------- .../test_fastembed_sparse_text_embedder.py | 32 ++++++++--------- 5 files changed, 42 insertions(+), 42 deletions(-) diff --git a/integrations/fastembed/README.md b/integrations/fastembed/README.md index 6d47aa9c4..c021dec3b 100644 --- a/integrations/fastembed/README.md +++ b/integrations/fastembed/README.md @@ -50,7 +50,7 @@ from haystack_integrations.components.embedders.fastembed import FastembedSparse text = "fastembed is supported by and maintained by Qdrant." 
text_embedder = FastembedSparseTextEmbedder( - model="prithvida/SPLADE_PP_en_v1" + model="prithvida/Splade_PP_en_v1" ) text_embedder.warm_up() embedding = text_embedder.run(text)["embedding"] @@ -61,7 +61,7 @@ from haystack_integrations.components.embedders.fastembed import FastembedSparse from haystack.dataclasses import Document embedder = FastembedSparseDocumentEmbedder( - model="prithvida/SPLADE_PP_en_v1", + model="prithvida/Splade_PP_en_v1", ) embedder.warm_up() doc = Document(content="fastembed is supported by and maintained by Qdrant.", meta={"long_answer": "no",}) diff --git a/integrations/fastembed/src/haystack_integrations/components/embedders/fastembed/fastembed_sparse_document_embedder.py b/integrations/fastembed/src/haystack_integrations/components/embedders/fastembed/fastembed_sparse_document_embedder.py index 2a35353bb..ad3376329 100644 --- a/integrations/fastembed/src/haystack_integrations/components/embedders/fastembed/fastembed_sparse_document_embedder.py +++ b/integrations/fastembed/src/haystack_integrations/components/embedders/fastembed/fastembed_sparse_document_embedder.py @@ -19,7 +19,7 @@ class FastembedSparseDocumentEmbedder: from haystack.dataclasses import Document doc_embedder = FastembedSparseDocumentEmbedder( - model="prithvida/SPLADE_PP_en_v1", + model="prithvida/Splade_PP_en_v1", batch_size=256, ) @@ -52,7 +52,7 @@ class FastembedSparseDocumentEmbedder: def __init__( self, - model: str = "prithvida/SPLADE_PP_en_v1", + model: str = "prithvida/Splade_PP_en_v1", cache_dir: Optional[str] = None, threads: Optional[int] = None, prefix: str = "", @@ -67,7 +67,7 @@ def __init__( Create an FastembedDocumentEmbedder component. :param model: Local path or name of the model in Hugging Face's model hub, - such as `prithvida/SPLADE_PP_en_v1`. + such as `prithvida/Splade_PP_en_v1`. :param cache_dir: The path to the cache directory. Can be set using the `FASTEMBED_CACHE_PATH` env variable. 
Defaults to `fastembed_cache` in the system's temp directory. diff --git a/integrations/fastembed/src/haystack_integrations/components/embedders/fastembed/fastembed_sparse_text_embedder.py b/integrations/fastembed/src/haystack_integrations/components/embedders/fastembed/fastembed_sparse_text_embedder.py index 7bc5e6553..88227f5c6 100644 --- a/integrations/fastembed/src/haystack_integrations/components/embedders/fastembed/fastembed_sparse_text_embedder.py +++ b/integrations/fastembed/src/haystack_integrations/components/embedders/fastembed/fastembed_sparse_text_embedder.py @@ -20,7 +20,7 @@ class FastembedSparseTextEmbedder: text = "It clearly says online this will work on a Mac OS system. The disk comes and it does not, only Windows. Do Not order this if you have a Mac!!" text_embedder = FastembedSparseTextEmbedder( - model="prithvida/SPLADE_PP_en_v1" + model="prithvida/Splade_PP_en_v1" ) text_embedder.warm_up() @@ -30,7 +30,7 @@ class FastembedSparseTextEmbedder: def __init__( self, - model: str = "prithvida/SPLADE_PP_en_v1", + model: str = "prithvida/Splade_PP_en_v1", cache_dir: Optional[str] = None, threads: Optional[int] = None, prefix: str = "", @@ -42,7 +42,7 @@ def __init__( """ Create a FastembedSparseTextEmbedder component. - :param model: Local path or name of the model in Fastembed's model hub, such as `prithvida/SPLADE_PP_en_v1` + :param model: Local path or name of the model in Fastembed's model hub, such as `prithvida/Splade_PP_en_v1` :param cache_dir: The path to the cache directory. Can be set using the `FASTEMBED_CACHE_PATH` env variable. Defaults to `fastembed_cache` in the system's temp directory. 
diff --git a/integrations/fastembed/tests/test_fastembed_sparse_document_embedder.py b/integrations/fastembed/tests/test_fastembed_sparse_document_embedder.py index 3a5c15890..01800ceae 100644 --- a/integrations/fastembed/tests/test_fastembed_sparse_document_embedder.py +++ b/integrations/fastembed/tests/test_fastembed_sparse_document_embedder.py @@ -13,8 +13,8 @@ def test_init_default(self): """ Test default initialization parameters for FastembedSparseDocumentEmbedder. """ - embedder = FastembedSparseDocumentEmbedder(model="prithvida/SPLADE_PP_en_v1") - assert embedder.model_name == "prithvida/SPLADE_PP_en_v1" + embedder = FastembedSparseDocumentEmbedder(model="prithvida/Splade_PP_en_v1") + assert embedder.model_name == "prithvida/Splade_PP_en_v1" assert embedder.cache_dir is None assert embedder.threads is None assert embedder.prefix == "" @@ -30,7 +30,7 @@ def test_init_with_parameters(self): Test custom initialization parameters for FastembedSparseDocumentEmbedder. """ embedder = FastembedSparseDocumentEmbedder( - model="prithvida/SPLADE_PP_en_v1", + model="prithvida/Splade_PP_en_v1", cache_dir="fake_dir", threads=2, prefix="prefix", @@ -41,7 +41,7 @@ def test_init_with_parameters(self): meta_fields_to_embed=["test_field"], embedding_separator=" | ", ) - assert embedder.model_name == "prithvida/SPLADE_PP_en_v1" + assert embedder.model_name == "prithvida/Splade_PP_en_v1" assert embedder.cache_dir == "fake_dir" assert embedder.threads == 2 assert embedder.prefix == "prefix" @@ -56,12 +56,12 @@ def test_to_dict(self): """ Test serialization of FastembedSparseDocumentEmbedder to a dictionary, using default initialization parameters. 
""" - embedder = FastembedSparseDocumentEmbedder(model="prithvida/SPLADE_PP_en_v1") + embedder = FastembedSparseDocumentEmbedder(model="prithvida/Splade_PP_en_v1") embedder_dict = embedder.to_dict() assert embedder_dict == { "type": "haystack_integrations.components.embedders.fastembed.fastembed_sparse_document_embedder.FastembedSparseDocumentEmbedder", # noqa "init_parameters": { - "model": "prithvida/SPLADE_PP_en_v1", + "model": "prithvida/Splade_PP_en_v1", "cache_dir": None, "threads": None, "prefix": "", @@ -79,7 +79,7 @@ def test_to_dict_with_custom_init_parameters(self): Test serialization of FastembedSparseDocumentEmbedder to a dictionary, using custom initialization parameters. """ embedder = FastembedSparseDocumentEmbedder( - model="prithvida/SPLADE_PP_en_v1", + model="prithvida/Splade_PP_en_v1", cache_dir="fake_dir", threads=2, prefix="prefix", @@ -94,7 +94,7 @@ def test_to_dict_with_custom_init_parameters(self): assert embedder_dict == { "type": "haystack_integrations.components.embedders.fastembed.fastembed_sparse_document_embedder.FastembedSparseDocumentEmbedder", # noqa "init_parameters": { - "model": "prithvida/SPLADE_PP_en_v1", + "model": "prithvida/Splade_PP_en_v1", "cache_dir": "fake_dir", "threads": 2, "prefix": "prefix", @@ -115,7 +115,7 @@ def test_from_dict(self): embedder_dict = { "type": "haystack_integrations.components.embedders.fastembed.fastembed_sparse_document_embedder.FastembedSparseDocumentEmbedder", # noqa "init_parameters": { - "model": "prithvida/SPLADE_PP_en_v1", + "model": "prithvida/Splade_PP_en_v1", "cache_dir": None, "threads": None, "prefix": "", @@ -128,7 +128,7 @@ def test_from_dict(self): }, } embedder = default_from_dict(FastembedSparseDocumentEmbedder, embedder_dict) - assert embedder.model_name == "prithvida/SPLADE_PP_en_v1" + assert embedder.model_name == "prithvida/Splade_PP_en_v1" assert embedder.cache_dir is None assert embedder.threads is None assert embedder.prefix == "" @@ -147,7 +147,7 @@ def 
test_from_dict_with_custom_init_parameters(self): embedder_dict = { "type": "haystack_integrations.components.embedders.fastembed.fastembed_sparse_document_embedder.FastembedSparseDocumentEmbedder", # noqa "init_parameters": { - "model": "prithvida/SPLADE_PP_en_v1", + "model": "prithvida/Splade_PP_en_v1", "cache_dir": "fake_dir", "threads": 2, "prefix": "prefix", @@ -160,7 +160,7 @@ def test_from_dict_with_custom_init_parameters(self): }, } embedder = default_from_dict(FastembedSparseDocumentEmbedder, embedder_dict) - assert embedder.model_name == "prithvida/SPLADE_PP_en_v1" + assert embedder.model_name == "prithvida/Splade_PP_en_v1" assert embedder.cache_dir == "fake_dir" assert embedder.threads == 2 assert embedder.prefix == "prefix" @@ -178,11 +178,11 @@ def test_warmup(self, mocked_factory): """ Test for checking embedder instances after warm-up. """ - embedder = FastembedSparseDocumentEmbedder(model="prithvida/SPLADE_PP_en_v1") + embedder = FastembedSparseDocumentEmbedder(model="prithvida/Splade_PP_en_v1") mocked_factory.get_embedding_backend.assert_not_called() embedder.warm_up() mocked_factory.get_embedding_backend.assert_called_once_with( - model_name="prithvida/SPLADE_PP_en_v1", cache_dir=None, threads=None + model_name="prithvida/Splade_PP_en_v1", cache_dir=None, threads=None ) @patch( @@ -192,7 +192,7 @@ def test_warmup_does_not_reload(self, mocked_factory): """ Test for checking backend instances after multiple warm-ups. """ - embedder = FastembedSparseDocumentEmbedder(model="prithvida/SPLADE_PP_en_v1") + embedder = FastembedSparseDocumentEmbedder(model="prithvida/Splade_PP_en_v1") mocked_factory.get_embedding_backend.assert_not_called() embedder.warm_up() embedder.warm_up() @@ -213,7 +213,7 @@ def test_embed(self): """ Test for checking output dimensions and embedding dimensions. 
""" - embedder = FastembedSparseDocumentEmbedder(model="prithvida/SPLADE_PP_en_v1") + embedder = FastembedSparseDocumentEmbedder(model="prithvida/Splade_PP_en_v1") embedder.embedding_backend = MagicMock() embedder.embedding_backend.embed = lambda x, **kwargs: self._generate_mocked_sparse_embedding( # noqa: ARG005 len(x) @@ -237,7 +237,7 @@ def test_embed_incorrect_input_format(self): """ Test for checking incorrect input format when creating embedding. """ - embedder = FastembedSparseDocumentEmbedder(model="prithvida/SPLADE_PP_en_v1") + embedder = FastembedSparseDocumentEmbedder(model="prithvida/Splade_PP_en_v1") string_input = "text" list_integers_input = [1, 2, 3] @@ -286,7 +286,7 @@ def test_embed_metadata(self): @pytest.mark.integration def test_run(self): embedder = FastembedSparseDocumentEmbedder( - model="prithvida/SPLADE_PP_en_v1", + model="prithvida/Splade_PP_en_v1", ) embedder.warm_up() diff --git a/integrations/fastembed/tests/test_fastembed_sparse_text_embedder.py b/integrations/fastembed/tests/test_fastembed_sparse_text_embedder.py index 64871004e..5861ac209 100644 --- a/integrations/fastembed/tests/test_fastembed_sparse_text_embedder.py +++ b/integrations/fastembed/tests/test_fastembed_sparse_text_embedder.py @@ -13,8 +13,8 @@ def test_init_default(self): """ Test default initialization parameters for FastembedSparseTextEmbedder. """ - embedder = FastembedSparseTextEmbedder(model="prithvida/SPLADE_PP_en_v1") - assert embedder.model_name == "prithvida/SPLADE_PP_en_v1" + embedder = FastembedSparseTextEmbedder(model="prithvida/Splade_PP_en_v1") + assert embedder.model_name == "prithvida/Splade_PP_en_v1" assert embedder.cache_dir is None assert embedder.threads is None assert embedder.prefix == "" @@ -28,7 +28,7 @@ def test_init_with_parameters(self): Test custom initialization parameters for FastembedSparseTextEmbedder. 
""" embedder = FastembedSparseTextEmbedder( - model="prithvida/SPLADE_PP_en_v1", + model="prithvida/Splade_PP_en_v1", cache_dir="fake_dir", threads=2, prefix="prefix", @@ -37,7 +37,7 @@ def test_init_with_parameters(self): progress_bar=False, parallel=1, ) - assert embedder.model_name == "prithvida/SPLADE_PP_en_v1" + assert embedder.model_name == "prithvida/Splade_PP_en_v1" assert embedder.cache_dir == "fake_dir" assert embedder.threads == 2 assert embedder.prefix == "prefix" @@ -50,12 +50,12 @@ def test_to_dict(self): """ Test serialization of FastembedSparseTextEmbedder to a dictionary, using default initialization parameters. """ - embedder = FastembedSparseTextEmbedder(model="prithvida/SPLADE_PP_en_v1") + embedder = FastembedSparseTextEmbedder(model="prithvida/Splade_PP_en_v1") embedder_dict = embedder.to_dict() assert embedder_dict == { "type": "haystack_integrations.components.embedders.fastembed.fastembed_sparse_text_embedder.FastembedSparseTextEmbedder", # noqa "init_parameters": { - "model": "prithvida/SPLADE_PP_en_v1", + "model": "prithvida/Splade_PP_en_v1", "cache_dir": None, "threads": None, "prefix": "", @@ -71,7 +71,7 @@ def test_to_dict_with_custom_init_parameters(self): Test serialization of FastembedSparseTextEmbedder to a dictionary, using custom initialization parameters. 
""" embedder = FastembedSparseTextEmbedder( - model="prithvida/SPLADE_PP_en_v1", + model="prithvida/Splade_PP_en_v1", cache_dir="fake_dir", threads=2, prefix="prefix", @@ -84,7 +84,7 @@ def test_to_dict_with_custom_init_parameters(self): assert embedder_dict == { "type": "haystack_integrations.components.embedders.fastembed.fastembed_sparse_text_embedder.FastembedSparseTextEmbedder", # noqa "init_parameters": { - "model": "prithvida/SPLADE_PP_en_v1", + "model": "prithvida/Splade_PP_en_v1", "cache_dir": "fake_dir", "threads": 2, "prefix": "prefix", @@ -102,7 +102,7 @@ def test_from_dict(self): embedder_dict = { "type": "haystack_integrations.components.embedders.fastembed.fastembed_sparse_text_embedder.FastembedSparseTextEmbedder", # noqa "init_parameters": { - "model": "prithvida/SPLADE_PP_en_v1", + "model": "prithvida/Splade_PP_en_v1", "cache_dir": None, "threads": None, "prefix": "", @@ -113,7 +113,7 @@ def test_from_dict(self): }, } embedder = default_from_dict(FastembedSparseTextEmbedder, embedder_dict) - assert embedder.model_name == "prithvida/SPLADE_PP_en_v1" + assert embedder.model_name == "prithvida/Splade_PP_en_v1" assert embedder.cache_dir is None assert embedder.threads is None assert embedder.prefix == "" @@ -129,7 +129,7 @@ def test_from_dict_with_custom_init_parameters(self): embedder_dict = { "type": "haystack_integrations.components.embedders.fastembed.fastembed_sparse_text_embedder.FastembedSparseTextEmbedder", # noqa "init_parameters": { - "model": "prithvida/SPLADE_PP_en_v1", + "model": "prithvida/Splade_PP_en_v1", "cache_dir": "fake_dir", "threads": 2, "prefix": "prefix", @@ -140,7 +140,7 @@ def test_from_dict_with_custom_init_parameters(self): }, } embedder = default_from_dict(FastembedSparseTextEmbedder, embedder_dict) - assert embedder.model_name == "prithvida/SPLADE_PP_en_v1" + assert embedder.model_name == "prithvida/Splade_PP_en_v1" assert embedder.cache_dir == "fake_dir" assert embedder.threads == 2 assert embedder.prefix == "prefix" @@ 
-156,11 +156,11 @@ def test_warmup(self, mocked_factory): """ Test for checking embedder instances after warm-up. """ - embedder = FastembedSparseTextEmbedder(model="prithvida/SPLADE_PP_en_v1") + embedder = FastembedSparseTextEmbedder(model="prithvida/Splade_PP_en_v1") mocked_factory.get_embedding_backend.assert_not_called() embedder.warm_up() mocked_factory.get_embedding_backend.assert_called_once_with( - model_name="prithvida/SPLADE_PP_en_v1", cache_dir=None, threads=None + model_name="prithvida/Splade_PP_en_v1", cache_dir=None, threads=None ) @patch( @@ -170,7 +170,7 @@ def test_warmup_does_not_reload(self, mocked_factory): """ Test for checking backend instances after multiple warm-ups. """ - embedder = FastembedSparseTextEmbedder(model="prithvida/SPLADE_PP_en_v1") + embedder = FastembedSparseTextEmbedder(model="prithvida/Splade_PP_en_v1") mocked_factory.get_embedding_backend.assert_not_called() embedder.warm_up() embedder.warm_up() @@ -223,7 +223,7 @@ def test_run_wrong_incorrect_format(self): @pytest.mark.integration def test_run(self): embedder = FastembedSparseTextEmbedder( - model="prithvida/SPLADE_PP_en_v1", + model="prithvida/Splade_PP_en_v1", ) embedder.warm_up() From 5ea12b56f180b0e3673e0ec435e0a8e86eedf455 Mon Sep 17 00:00:00 2001 From: Corentin Date: Thu, 21 Mar 2024 13:27:58 +0100 Subject: [PATCH 17/32] Update integrations/fastembed/src/haystack_integrations/components/embedders/fastembed/fastembed_sparse_document_embedder.py Co-authored-by: Stefano Fiorucci --- .../fastembed/fastembed_sparse_document_embedder.py | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) diff --git a/integrations/fastembed/src/haystack_integrations/components/embedders/fastembed/fastembed_sparse_document_embedder.py b/integrations/fastembed/src/haystack_integrations/components/embedders/fastembed/fastembed_sparse_document_embedder.py index ad3376329..0865eac0c 100644 --- 
a/integrations/fastembed/src/haystack_integrations/components/embedders/fastembed/fastembed_sparse_document_embedder.py
+++ b/integrations/fastembed/src/haystack_integrations/components/embedders/fastembed/fastembed_sparse_document_embedder.py
@@ -130,11 +130,9 @@ def _prepare_texts_to_embed(self, documents: List[Document]) -> List[str]:
             meta_values_to_embed = [
                 str(doc.meta[key]) for key in self.meta_fields_to_embed if key in doc.meta and doc.meta[key] is not None
             ]
-            text_to_embed = [
-                self.prefix + self.embedding_separator.join([*meta_values_to_embed, doc.content or ""]) + self.suffix,
-            ]
+            text_to_embed = self.embedding_separator.join([*meta_values_to_embed, doc.content or ""])
 
-            texts_to_embed.append(text_to_embed[0])
+            texts_to_embed.append(text_to_embed)
         return texts_to_embed
 
     @component.output_types(documents=List[Document])

From 14a8c2d9b17282bed8c8c2f0fac9326e5723cadf Mon Sep 17 00:00:00 2001
From: Corentin <corentin.meyer@etu.unistra.fr>
Date: Thu, 21 Mar 2024 13:28:07 +0100
Subject: [PATCH 18/32] Update
 integrations/fastembed/src/haystack_integrations/components/embedders/fastembed/fastembed_sparse_document_embedder.py

Co-authored-by: Stefano Fiorucci <stefanofiorucci@gmail.com>
---
 .../embedders/fastembed/fastembed_sparse_document_embedder.py   | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/integrations/fastembed/src/haystack_integrations/components/embedders/fastembed/fastembed_sparse_document_embedder.py b/integrations/fastembed/src/haystack_integrations/components/embedders/fastembed/fastembed_sparse_document_embedder.py
index 0865eac0c..c5ca4f625 100644
--- a/integrations/fastembed/src/haystack_integrations/components/embedders/fastembed/fastembed_sparse_document_embedder.py
+++ b/integrations/fastembed/src/haystack_integrations/components/embedders/fastembed/fastembed_sparse_document_embedder.py
@@ -71,7 +71,7 @@ def __init__(
         :param cache_dir: The path to the cache directory.
                 Can be set using the `FASTEMBED_CACHE_PATH` env variable.
Defaults to `fastembed_cache` in the system's temp directory. - :param threads: The number of threads single onnxruntime session can use. Defaults to None. + :param threads: The number of threads single onnxruntime session can use. :param prefix: A string to add to the beginning of each text. :param suffix: A string to add to the end of each text. :param batch_size: Number of strings to encode at once. From 709ac1241c7462017c4396ce4b8eca9cfc660e08 Mon Sep 17 00:00:00 2001 From: Corentin Date: Thu, 21 Mar 2024 13:28:14 +0100 Subject: [PATCH 19/32] Update integrations/fastembed/src/haystack_integrations/components/embedders/fastembed/fastembed_sparse_document_embedder.py Co-authored-by: Stefano Fiorucci --- .../embedders/fastembed/fastembed_sparse_document_embedder.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/integrations/fastembed/src/haystack_integrations/components/embedders/fastembed/fastembed_sparse_document_embedder.py b/integrations/fastembed/src/haystack_integrations/components/embedders/fastembed/fastembed_sparse_document_embedder.py index c5ca4f625..41f1cbaa4 100644 --- a/integrations/fastembed/src/haystack_integrations/components/embedders/fastembed/fastembed_sparse_document_embedder.py +++ b/integrations/fastembed/src/haystack_integrations/components/embedders/fastembed/fastembed_sparse_document_embedder.py @@ -75,7 +75,7 @@ def __init__( :param prefix: A string to add to the beginning of each text. :param suffix: A string to add to the end of each text. :param batch_size: Number of strings to encode at once. - :param progress_bar: If true, displays progress bar during embedding. + :param progress_bar: If `True`, displays progress bar during embedding. :param parallel: If > 1, data-parallel encoding will be used, recommended for offline encoding of large datasets. If 0, use all available cores. 
From 11f858421742a852552ab8727539037515863cd4 Mon Sep 17 00:00:00 2001 From: Corentin Date: Thu, 21 Mar 2024 13:28:24 +0100 Subject: [PATCH 20/32] Update integrations/fastembed/src/haystack_integrations/components/embedders/fastembed/fastembed_sparse_document_embedder.py Co-authored-by: Stefano Fiorucci --- .../embedders/fastembed/fastembed_sparse_document_embedder.py | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/integrations/fastembed/src/haystack_integrations/components/embedders/fastembed/fastembed_sparse_document_embedder.py b/integrations/fastembed/src/haystack_integrations/components/embedders/fastembed/fastembed_sparse_document_embedder.py index 41f1cbaa4..3b0a5614e 100644 --- a/integrations/fastembed/src/haystack_integrations/components/embedders/fastembed/fastembed_sparse_document_embedder.py +++ b/integrations/fastembed/src/haystack_integrations/components/embedders/fastembed/fastembed_sparse_document_embedder.py @@ -142,8 +142,7 @@ def run(self, documents: List[Document]): :param documents: List of Documents to embed. :returns: A dictionary with the following keys: - - `documents`: List of Documents - to the computed embeddings. + - `documents`: List of Documents with each Document's `sparse_embedding` field set to the computed embeddings. 
""" if not isinstance(documents, list) or documents and not isinstance(documents[0], Document): msg = ( From 727b5abbcce7bd9825cdb85666cc53e7ecec455e Mon Sep 17 00:00:00 2001 From: Corentin Date: Thu, 21 Mar 2024 13:37:57 +0100 Subject: [PATCH 21/32] Update integrations/fastembed/src/haystack_integrations/components/embedders/fastembed/fastembed_sparse_text_embedder.py Co-authored-by: Stefano Fiorucci --- .../embedders/fastembed/fastembed_sparse_text_embedder.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/integrations/fastembed/src/haystack_integrations/components/embedders/fastembed/fastembed_sparse_text_embedder.py b/integrations/fastembed/src/haystack_integrations/components/embedders/fastembed/fastembed_sparse_text_embedder.py index 88227f5c6..18b728693 100644 --- a/integrations/fastembed/src/haystack_integrations/components/embedders/fastembed/fastembed_sparse_text_embedder.py +++ b/integrations/fastembed/src/haystack_integrations/components/embedders/fastembed/fastembed_sparse_text_embedder.py @@ -94,7 +94,7 @@ def warm_up(self): model_name=self.model_name, cache_dir=self.cache_dir, threads=self.threads ) - @component.output_types(embedding=Dict[str, Union[List[int], List[float]]]) + @component.output_types(sparse_embedding=Dict[str, Union[List[int], List[float]]]) def run(self, text: str): """ Embeds text using the Fastembed model. 
From 40cb5b625c1d895b07a1c53a066fb8b6ea1c0888 Mon Sep 17 00:00:00 2001 From: Corentin Meyer Date: Thu, 21 Mar 2024 13:31:29 +0100 Subject: [PATCH 22/32] feat(FastEmbed): remove prefix/suffix --- .../fastembed_sparse_document_embedder.py | 8 -------- .../fastembed_sparse_text_embedder.py | 11 +--------- ...test_fastembed_sparse_document_embedder.py | 20 ------------------- .../test_fastembed_sparse_text_embedder.py | 20 ------------------- 4 files changed, 1 insertion(+), 58 deletions(-) diff --git a/integrations/fastembed/src/haystack_integrations/components/embedders/fastembed/fastembed_sparse_document_embedder.py b/integrations/fastembed/src/haystack_integrations/components/embedders/fastembed/fastembed_sparse_document_embedder.py index 3b0a5614e..77483e850 100644 --- a/integrations/fastembed/src/haystack_integrations/components/embedders/fastembed/fastembed_sparse_document_embedder.py +++ b/integrations/fastembed/src/haystack_integrations/components/embedders/fastembed/fastembed_sparse_document_embedder.py @@ -55,8 +55,6 @@ def __init__( model: str = "prithvida/Splade_PP_en_v1", cache_dir: Optional[str] = None, threads: Optional[int] = None, - prefix: str = "", - suffix: str = "", batch_size: int = 256, progress_bar: bool = True, parallel: Optional[int] = None, @@ -72,8 +70,6 @@ def __init__( Can be set using the `FASTEMBED_CACHE_PATH` env variable. Defaults to `fastembed_cache` in the system's temp directory. :param threads: The number of threads single onnxruntime session can use. - :param prefix: A string to add to the beginning of each text. - :param suffix: A string to add to the end of each text. :param batch_size: Number of strings to encode at once. :param progress_bar: If `True`, displays progress bar during embedding. 
:param parallel: @@ -87,8 +83,6 @@ def __init__( self.model_name = model self.cache_dir = cache_dir self.threads = threads - self.prefix = prefix - self.suffix = suffix self.batch_size = batch_size self.progress_bar = progress_bar self.parallel = parallel @@ -106,8 +100,6 @@ def to_dict(self) -> Dict[str, Any]: model=self.model_name, cache_dir=self.cache_dir, threads=self.threads, - prefix=self.prefix, - suffix=self.suffix, batch_size=self.batch_size, progress_bar=self.progress_bar, parallel=self.parallel, diff --git a/integrations/fastembed/src/haystack_integrations/components/embedders/fastembed/fastembed_sparse_text_embedder.py b/integrations/fastembed/src/haystack_integrations/components/embedders/fastembed/fastembed_sparse_text_embedder.py index 18b728693..43d59a148 100644 --- a/integrations/fastembed/src/haystack_integrations/components/embedders/fastembed/fastembed_sparse_text_embedder.py +++ b/integrations/fastembed/src/haystack_integrations/components/embedders/fastembed/fastembed_sparse_text_embedder.py @@ -33,8 +33,6 @@ def __init__( model: str = "prithvida/Splade_PP_en_v1", cache_dir: Optional[str] = None, threads: Optional[int] = None, - prefix: str = "", - suffix: str = "", batch_size: int = 256, progress_bar: bool = True, parallel: Optional[int] = None, @@ -48,8 +46,6 @@ def __init__( Defaults to `fastembed_cache` in the system's temp directory. :param threads: The number of threads single onnxruntime session can use. Defaults to None. :param batch_size: Number of strings to encode at once. - :param prefix: A string to add to the beginning of each text. - :param suffix: A string to add to the end of each text. :param progress_bar: If true, displays progress bar during embedding. :param parallel: If > 1, data-parallel encoding will be used, recommended for offline encoding of large datasets. 
@@ -60,8 +56,6 @@ def __init__( self.model_name = model self.cache_dir = cache_dir self.threads = threads - self.prefix = prefix - self.suffix = suffix self.batch_size = batch_size self.progress_bar = progress_bar self.parallel = parallel @@ -78,8 +72,6 @@ def to_dict(self) -> Dict[str, Any]: model=self.model_name, cache_dir=self.cache_dir, threads=self.threads, - prefix=self.prefix, - suffix=self.suffix, batch_size=self.batch_size, progress_bar=self.progress_bar, parallel=self.parallel, @@ -115,9 +107,8 @@ def run(self, text: str): msg = "The embedding model has not been loaded. Please call warm_up() before running." raise RuntimeError(msg) - text_to_embed = [self.prefix + text + self.suffix] embedding = self.embedding_backend.embed( - text_to_embed, + [text], batch_size=self.batch_size, show_progress_bar=self.progress_bar, parallel=self.parallel, diff --git a/integrations/fastembed/tests/test_fastembed_sparse_document_embedder.py b/integrations/fastembed/tests/test_fastembed_sparse_document_embedder.py index 01800ceae..c080ae630 100644 --- a/integrations/fastembed/tests/test_fastembed_sparse_document_embedder.py +++ b/integrations/fastembed/tests/test_fastembed_sparse_document_embedder.py @@ -17,8 +17,6 @@ def test_init_default(self): assert embedder.model_name == "prithvida/Splade_PP_en_v1" assert embedder.cache_dir is None assert embedder.threads is None - assert embedder.prefix == "" - assert embedder.suffix == "" assert embedder.batch_size == 256 assert embedder.progress_bar is True assert embedder.parallel is None @@ -33,8 +31,6 @@ def test_init_with_parameters(self): model="prithvida/Splade_PP_en_v1", cache_dir="fake_dir", threads=2, - prefix="prefix", - suffix="suffix", batch_size=64, progress_bar=False, parallel=1, @@ -44,8 +40,6 @@ def test_init_with_parameters(self): assert embedder.model_name == "prithvida/Splade_PP_en_v1" assert embedder.cache_dir == "fake_dir" assert embedder.threads == 2 - assert embedder.prefix == "prefix" - assert embedder.suffix 
== "suffix" assert embedder.batch_size == 64 assert embedder.progress_bar is False assert embedder.parallel == 1 @@ -64,8 +58,6 @@ def test_to_dict(self): "model": "prithvida/Splade_PP_en_v1", "cache_dir": None, "threads": None, - "prefix": "", - "suffix": "", "batch_size": 256, "progress_bar": True, "parallel": None, @@ -82,8 +74,6 @@ def test_to_dict_with_custom_init_parameters(self): model="prithvida/Splade_PP_en_v1", cache_dir="fake_dir", threads=2, - prefix="prefix", - suffix="suffix", batch_size=64, progress_bar=False, parallel=1, @@ -97,8 +87,6 @@ def test_to_dict_with_custom_init_parameters(self): "model": "prithvida/Splade_PP_en_v1", "cache_dir": "fake_dir", "threads": 2, - "prefix": "prefix", - "suffix": "suffix", "batch_size": 64, "progress_bar": False, "parallel": 1, @@ -118,8 +106,6 @@ def test_from_dict(self): "model": "prithvida/Splade_PP_en_v1", "cache_dir": None, "threads": None, - "prefix": "", - "suffix": "", "batch_size": 256, "progress_bar": True, "parallel": None, @@ -131,8 +117,6 @@ def test_from_dict(self): assert embedder.model_name == "prithvida/Splade_PP_en_v1" assert embedder.cache_dir is None assert embedder.threads is None - assert embedder.prefix == "" - assert embedder.suffix == "" assert embedder.batch_size == 256 assert embedder.progress_bar is True assert embedder.parallel is None @@ -150,8 +134,6 @@ def test_from_dict_with_custom_init_parameters(self): "model": "prithvida/Splade_PP_en_v1", "cache_dir": "fake_dir", "threads": 2, - "prefix": "prefix", - "suffix": "suffix", "batch_size": 64, "progress_bar": False, "parallel": 1, @@ -163,8 +145,6 @@ def test_from_dict_with_custom_init_parameters(self): assert embedder.model_name == "prithvida/Splade_PP_en_v1" assert embedder.cache_dir == "fake_dir" assert embedder.threads == 2 - assert embedder.prefix == "prefix" - assert embedder.suffix == "suffix" assert embedder.batch_size == 64 assert embedder.progress_bar is False assert embedder.parallel == 1 diff --git 
a/integrations/fastembed/tests/test_fastembed_sparse_text_embedder.py b/integrations/fastembed/tests/test_fastembed_sparse_text_embedder.py index 5861ac209..6a57b8e24 100644 --- a/integrations/fastembed/tests/test_fastembed_sparse_text_embedder.py +++ b/integrations/fastembed/tests/test_fastembed_sparse_text_embedder.py @@ -17,8 +17,6 @@ def test_init_default(self): assert embedder.model_name == "prithvida/Splade_PP_en_v1" assert embedder.cache_dir is None assert embedder.threads is None - assert embedder.prefix == "" - assert embedder.suffix == "" assert embedder.batch_size == 256 assert embedder.progress_bar is True assert embedder.parallel is None @@ -31,8 +29,6 @@ def test_init_with_parameters(self): model="prithvida/Splade_PP_en_v1", cache_dir="fake_dir", threads=2, - prefix="prefix", - suffix="suffix", batch_size=64, progress_bar=False, parallel=1, @@ -40,8 +36,6 @@ def test_init_with_parameters(self): assert embedder.model_name == "prithvida/Splade_PP_en_v1" assert embedder.cache_dir == "fake_dir" assert embedder.threads == 2 - assert embedder.prefix == "prefix" - assert embedder.suffix == "suffix" assert embedder.batch_size == 64 assert embedder.progress_bar is False assert embedder.parallel == 1 @@ -58,8 +52,6 @@ def test_to_dict(self): "model": "prithvida/Splade_PP_en_v1", "cache_dir": None, "threads": None, - "prefix": "", - "suffix": "", "batch_size": 256, "progress_bar": True, "parallel": None, @@ -74,8 +66,6 @@ def test_to_dict_with_custom_init_parameters(self): model="prithvida/Splade_PP_en_v1", cache_dir="fake_dir", threads=2, - prefix="prefix", - suffix="suffix", batch_size=64, progress_bar=False, parallel=1, @@ -87,8 +77,6 @@ def test_to_dict_with_custom_init_parameters(self): "model": "prithvida/Splade_PP_en_v1", "cache_dir": "fake_dir", "threads": 2, - "prefix": "prefix", - "suffix": "suffix", "batch_size": 64, "progress_bar": False, "parallel": 1, @@ -105,8 +93,6 @@ def test_from_dict(self): "model": "prithvida/Splade_PP_en_v1", "cache_dir": 
None, "threads": None, - "prefix": "", - "suffix": "", "batch_size": 256, "progress_bar": True, "parallel": None, @@ -116,8 +102,6 @@ def test_from_dict(self): assert embedder.model_name == "prithvida/Splade_PP_en_v1" assert embedder.cache_dir is None assert embedder.threads is None - assert embedder.prefix == "" - assert embedder.suffix == "" assert embedder.batch_size == 256 assert embedder.progress_bar is True assert embedder.parallel is None @@ -132,8 +116,6 @@ def test_from_dict_with_custom_init_parameters(self): "model": "prithvida/Splade_PP_en_v1", "cache_dir": "fake_dir", "threads": 2, - "prefix": "prefix", - "suffix": "suffix", "batch_size": 64, "progress_bar": False, "parallel": 1, @@ -143,8 +125,6 @@ def test_from_dict_with_custom_init_parameters(self): assert embedder.model_name == "prithvida/Splade_PP_en_v1" assert embedder.cache_dir == "fake_dir" assert embedder.threads == 2 - assert embedder.prefix == "prefix" - assert embedder.suffix == "suffix" assert embedder.batch_size == 64 assert embedder.progress_bar is False assert embedder.parallel == 1 From e7e16669e2794eff108bcec9a0d40160937d7c0a Mon Sep 17 00:00:00 2001 From: Corentin Meyer Date: Thu, 21 Mar 2024 13:51:33 +0100 Subject: [PATCH 23/32] feat(FastEmbed): fix linting --- integrations/fastembed/pyproject.toml | 2 +- .../fastembed/fastembed_sparse_document_embedder.py | 5 +++-- 2 files changed, 4 insertions(+), 3 deletions(-) diff --git a/integrations/fastembed/pyproject.toml b/integrations/fastembed/pyproject.toml index d0e92d5cd..fe74b67d0 100644 --- a/integrations/fastembed/pyproject.toml +++ b/integrations/fastembed/pyproject.toml @@ -26,7 +26,7 @@ classifiers = [ ] dependencies = [ "haystack-ai", -"fastembed>=0.2.4", +"fastembed>=0.2.5", ] [project.urls] diff --git a/integrations/fastembed/src/haystack_integrations/components/embedders/fastembed/fastembed_sparse_document_embedder.py 
b/integrations/fastembed/src/haystack_integrations/components/embedders/fastembed/fastembed_sparse_document_embedder.py index 77483e850..a93020bf7 100644 --- a/integrations/fastembed/src/haystack_integrations/components/embedders/fastembed/fastembed_sparse_document_embedder.py +++ b/integrations/fastembed/src/haystack_integrations/components/embedders/fastembed/fastembed_sparse_document_embedder.py @@ -122,7 +122,7 @@ def _prepare_texts_to_embed(self, documents: List[Document]) -> List[str]: meta_values_to_embed = [ str(doc.meta[key]) for key in self.meta_fields_to_embed if key in doc.meta and doc.meta[key] is not None ] - text_to_embed = self.embedding_separator.join([*meta_values_to_embed, doc.content or ""] + text_to_embed = self.embedding_separator.join([*meta_values_to_embed, doc.content or ""]) texts_to_embed.append(text_to_embed) return texts_to_embed @@ -134,7 +134,8 @@ def run(self, documents: List[Document]): :param documents: List of Documents to embed. :returns: A dictionary with the following keys: - - `documents`: List of Documents with each Document's `sparse_embedding` field set to the computed embeddings. + - `documents`: List of Documents with each Document's `sparse_embedding` + field set to the computed embeddings. 
""" if not isinstance(documents, list) or documents and not isinstance(documents[0], Document): msg = ( From 89f857da0a6a7d34f5baa27e0044a724a4f9decb Mon Sep 17 00:00:00 2001 From: Corentin Meyer Date: Thu, 21 Mar 2024 16:03:24 +0100 Subject: [PATCH 24/32] feat(FastEmbed): suggestion for progress bar --- .../embedding_backend/fastembed_backend.py | 18 +++++++++++++----- 1 file changed, 13 insertions(+), 5 deletions(-) diff --git a/integrations/fastembed/src/haystack_integrations/components/embedders/fastembed/embedding_backend/fastembed_backend.py b/integrations/fastembed/src/haystack_integrations/components/embedders/fastembed/embedding_backend/fastembed_backend.py index 9d56d0726..41578a2b4 100644 --- a/integrations/fastembed/src/haystack_integrations/components/embedders/fastembed/embedding_backend/fastembed_backend.py +++ b/integrations/fastembed/src/haystack_integrations/components/embedders/fastembed/embedding_backend/fastembed_backend.py @@ -91,12 +91,20 @@ def __init__( ): self.model = SparseTextEmbedding(model_name=model_name, cache_dir=cache_dir, threads=threads) - def embed(self, data: List[List[str]], **kwargs) -> List[Dict[str, Union[List[int], List[float]]]]: + def embed( + self, data: List[List[str]], progress_bar=True, **kwargs + ) -> List[Dict[str, Union[List[int], List[float]]]]: # The embed method returns a Iterable[SparseEmbedding], so we convert it to a list of dictionaries. # Each dict contains an `indices` key containing a list of int and an `values` key containing a list of floats. 
- sparse_embeddings = [sparse_embedding.as_object() for sparse_embedding in self.model.embed(data, **kwargs)] - for embedding in sparse_embeddings: - embedding["indices"] = embedding["indices"].tolist() - embedding["values"] = embedding["values"].tolist() + + sparse_embeddings = [] + sparse_embeddings_iterable = self.model.embed(data, **kwargs) + for sparse_embedding in tqdm( + sparse_embeddings_iterable, disable=not progress_bar, desc="Calculating sparse embeddings", total=len(data) + ): + sparse_embedding_obj = sparse_embedding.as_object() + sparse_embedding_obj["indices"] = sparse_embedding_obj["indices"].tolist() + sparse_embedding_obj["values"] = sparse_embedding_obj["values"].tolist() + sparse_embeddings.append(sparse_embedding_obj) return sparse_embeddings From 66bc9521f0225ae7a0fbc1b005ef68802513b06e Mon Sep 17 00:00:00 2001 From: Corentin Meyer Date: Fri, 22 Mar 2024 10:40:30 +0100 Subject: [PATCH 25/32] feat(FastEmbed): return Haystack's SparseEmbedding instead of Dict --- .../embedding_backend/fastembed_backend.py | 16 ++++++++++------ .../test_fastembed_sparse_document_embedder.py | 12 +++++++----- .../tests/test_fastembed_sparse_text_embedder.py | 12 +++++++----- 3 files changed, 24 insertions(+), 16 deletions(-) diff --git a/integrations/fastembed/src/haystack_integrations/components/embedders/fastembed/embedding_backend/fastembed_backend.py b/integrations/fastembed/src/haystack_integrations/components/embedders/fastembed/embedding_backend/fastembed_backend.py index 41578a2b4..f3663409a 100644 --- a/integrations/fastembed/src/haystack_integrations/components/embedders/fastembed/embedding_backend/fastembed_backend.py +++ b/integrations/fastembed/src/haystack_integrations/components/embedders/fastembed/embedding_backend/fastembed_backend.py @@ -2,6 +2,8 @@ from tqdm import tqdm +from haystack.dataclasses.sparse_embedding import SparseEmbedding + from fastembed import TextEmbedding from fastembed.sparse.sparse_text_embedding import SparseTextEmbedding @@ 
-94,17 +96,19 @@ def __init__( def embed( self, data: List[List[str]], progress_bar=True, **kwargs ) -> List[Dict[str, Union[List[int], List[float]]]]: - # The embed method returns a Iterable[SparseEmbedding], so we convert it to a list of dictionaries. - # Each dict contains an `indices` key containing a list of int and an `values` key containing a list of floats. + # The embed method returns a Iterable[SparseEmbedding], so we convert to Haystack SparseEmbedding type. + # Each SparseEmbedding contains an `indices` key containing a list of int and + # an `values` key containing a list of floats. sparse_embeddings = [] sparse_embeddings_iterable = self.model.embed(data, **kwargs) for sparse_embedding in tqdm( sparse_embeddings_iterable, disable=not progress_bar, desc="Calculating sparse embeddings", total=len(data) ): - sparse_embedding_obj = sparse_embedding.as_object() - sparse_embedding_obj["indices"] = sparse_embedding_obj["indices"].tolist() - sparse_embedding_obj["values"] = sparse_embedding_obj["values"].tolist() - sparse_embeddings.append(sparse_embedding_obj) + sparse_embeddings.append( + SparseEmbedding( + indices=sparse_embedding.indices.tolist(), values=sparse_embedding.values.tolist() + ) + ) return sparse_embeddings diff --git a/integrations/fastembed/tests/test_fastembed_sparse_document_embedder.py b/integrations/fastembed/tests/test_fastembed_sparse_document_embedder.py index c080ae630..fe74c7bd8 100644 --- a/integrations/fastembed/tests/test_fastembed_sparse_document_embedder.py +++ b/integrations/fastembed/tests/test_fastembed_sparse_document_embedder.py @@ -3,6 +3,7 @@ import numpy as np import pytest from haystack import Document, default_from_dict +from haystack.dataclasses.sparse_embedding import SparseEmbedding from haystack_integrations.components.embedders.fastembed.fastembed_sparse_document_embedder import ( FastembedSparseDocumentEmbedder, ) @@ -274,8 +275,9 @@ def test_run(self): result = embedder.run(documents=[doc]) embedding = 
result["documents"][0].sparse_embedding - assert isinstance(embedding, dict) - assert isinstance(embedding["indices"], list) - assert isinstance(embedding["values"], list) - assert isinstance(embedding["indices"][0], int) - assert isinstance(embedding["values"][0], float) + embedding_dict = embedding.to_dict() + assert isinstance(embedding, SparseEmbedding) + assert isinstance(embedding_dict["indices"], list) + assert isinstance(embedding_dict["values"], list) + assert isinstance(embedding_dict["indices"][0], int) + assert isinstance(embedding_dict["values"][0], float) diff --git a/integrations/fastembed/tests/test_fastembed_sparse_text_embedder.py b/integrations/fastembed/tests/test_fastembed_sparse_text_embedder.py index 6a57b8e24..58e12705a 100644 --- a/integrations/fastembed/tests/test_fastembed_sparse_text_embedder.py +++ b/integrations/fastembed/tests/test_fastembed_sparse_text_embedder.py @@ -3,6 +3,7 @@ import numpy as np import pytest from haystack import default_from_dict +from haystack.dataclasses.sparse_embedding import SparseEmbedding from haystack_integrations.components.embedders.fastembed.fastembed_sparse_text_embedder import ( FastembedSparseTextEmbedder, ) @@ -211,8 +212,9 @@ def test_run(self): result = embedder.run(text=text) embedding = result["embedding"] - assert isinstance(embedding, dict) - assert isinstance(embedding["indices"], list) - assert isinstance(embedding["values"], list) - assert isinstance(embedding["indices"][0], int) - assert isinstance(embedding["values"][0], float) + embedding_dict = embedding.to_dict() + assert isinstance(embedding, SparseEmbedding) + assert isinstance(embedding_dict["indices"], list) + assert isinstance(embedding_dict["values"], list) + assert isinstance(embedding_dict["indices"][0], int) + assert isinstance(embedding_dict["values"][0], float) From 97dd121490c12e8c651fe9fbb9cf08e09e18abf8 Mon Sep 17 00:00:00 2001 From: Corentin Meyer Date: Fri, 22 Mar 2024 10:53:09 +0100 Subject: [PATCH 26/32] 
feat(FastEmbed): fix lint --- .../fastembed/embedding_backend/fastembed_backend.py | 7 ++----- 1 file changed, 2 insertions(+), 5 deletions(-) diff --git a/integrations/fastembed/src/haystack_integrations/components/embedders/fastembed/embedding_backend/fastembed_backend.py b/integrations/fastembed/src/haystack_integrations/components/embedders/fastembed/embedding_backend/fastembed_backend.py index f3663409a..b252bd96c 100644 --- a/integrations/fastembed/src/haystack_integrations/components/embedders/fastembed/embedding_backend/fastembed_backend.py +++ b/integrations/fastembed/src/haystack_integrations/components/embedders/fastembed/embedding_backend/fastembed_backend.py @@ -1,8 +1,7 @@ from typing import ClassVar, Dict, List, Optional, Union -from tqdm import tqdm - from haystack.dataclasses.sparse_embedding import SparseEmbedding +from tqdm import tqdm from fastembed import TextEmbedding from fastembed.sparse.sparse_text_embedding import SparseTextEmbedding @@ -106,9 +105,7 @@ def embed( sparse_embeddings_iterable, disable=not progress_bar, desc="Calculating sparse embeddings", total=len(data) ): sparse_embeddings.append( - SparseEmbedding( - indices=sparse_embedding.indices.tolist(), values=sparse_embedding.values.tolist() - ) + SparseEmbedding(indices=sparse_embedding.indices.tolist(), values=sparse_embedding.values.tolist()) ) return sparse_embeddings From bc3f555a4b169cd582d879f5c38c6baa331710d7 Mon Sep 17 00:00:00 2001 From: Corentin Meyer Date: Fri, 22 Mar 2024 11:19:16 +0100 Subject: [PATCH 27/32] feat(Fastembed): run output type from dict to haystack sparseembedding class --- .../fastembed/embedding_backend/fastembed_backend.py | 6 ++---- .../embedders/fastembed/fastembed_sparse_text_embedder.py | 5 +++-- 2 files changed, 5 insertions(+), 6 deletions(-) diff --git a/integrations/fastembed/src/haystack_integrations/components/embedders/fastembed/embedding_backend/fastembed_backend.py 
b/integrations/fastembed/src/haystack_integrations/components/embedders/fastembed/embedding_backend/fastembed_backend.py index b252bd96c..2fc7c5ca2 100644 --- a/integrations/fastembed/src/haystack_integrations/components/embedders/fastembed/embedding_backend/fastembed_backend.py +++ b/integrations/fastembed/src/haystack_integrations/components/embedders/fastembed/embedding_backend/fastembed_backend.py @@ -1,4 +1,4 @@ -from typing import ClassVar, Dict, List, Optional, Union +from typing import ClassVar, Dict, List, Optional from haystack.dataclasses.sparse_embedding import SparseEmbedding from tqdm import tqdm @@ -92,9 +92,7 @@ def __init__( ): self.model = SparseTextEmbedding(model_name=model_name, cache_dir=cache_dir, threads=threads) - def embed( - self, data: List[List[str]], progress_bar=True, **kwargs - ) -> List[Dict[str, Union[List[int], List[float]]]]: + def embed(self, data: List[List[str]], progress_bar=True, **kwargs) -> List[SparseEmbedding]: # The embed method returns a Iterable[SparseEmbedding], so we convert to Haystack SparseEmbedding type. # Each SparseEmbedding contains an `indices` key containing a list of int and # an `values` key containing a list of floats. 
diff --git a/integrations/fastembed/src/haystack_integrations/components/embedders/fastembed/fastembed_sparse_text_embedder.py b/integrations/fastembed/src/haystack_integrations/components/embedders/fastembed/fastembed_sparse_text_embedder.py index 43d59a148..29539a138 100644 --- a/integrations/fastembed/src/haystack_integrations/components/embedders/fastembed/fastembed_sparse_text_embedder.py +++ b/integrations/fastembed/src/haystack_integrations/components/embedders/fastembed/fastembed_sparse_text_embedder.py @@ -1,6 +1,7 @@ -from typing import Any, Dict, List, Optional, Union +from typing import Any, Dict, Optional from haystack import component, default_to_dict +from haystack.dataclasses.sparse_embedding import SparseEmbedding from .embedding_backend.fastembed_backend import _FastembedSparseEmbeddingBackendFactory @@ -86,7 +87,7 @@ def warm_up(self): model_name=self.model_name, cache_dir=self.cache_dir, threads=self.threads ) - @component.output_types(sparse_embedding=Dict[str, Union[List[int], List[float]]]) + @component.output_types(sparse_embedding=SparseEmbedding) def run(self, text: str): """ Embeds text using the Fastembed model. 
From 92611229ac83d2dc7673d4242472c97be51a2e43 Mon Sep 17 00:00:00 2001 From: Corentin Meyer Date: Fri, 22 Mar 2024 11:52:42 +0100 Subject: [PATCH 28/32] feat(FastEmbed): reduce default sparse batch size --- .../fastembed/fastembed_sparse_document_embedder.py | 4 ++-- .../fastembed/fastembed_sparse_text_embedder.py | 2 +- .../tests/test_fastembed_sparse_document_embedder.py | 10 +++++----- .../tests/test_fastembed_sparse_text_embedder.py | 8 ++++---- 4 files changed, 12 insertions(+), 12 deletions(-) diff --git a/integrations/fastembed/src/haystack_integrations/components/embedders/fastembed/fastembed_sparse_document_embedder.py b/integrations/fastembed/src/haystack_integrations/components/embedders/fastembed/fastembed_sparse_document_embedder.py index a93020bf7..ed5a3208b 100644 --- a/integrations/fastembed/src/haystack_integrations/components/embedders/fastembed/fastembed_sparse_document_embedder.py +++ b/integrations/fastembed/src/haystack_integrations/components/embedders/fastembed/fastembed_sparse_document_embedder.py @@ -20,7 +20,7 @@ class FastembedSparseDocumentEmbedder: doc_embedder = FastembedSparseDocumentEmbedder( model="prithvida/Splade_PP_en_v1", - batch_size=256, + batch_size=32, ) doc_embedder.warm_up() @@ -55,7 +55,7 @@ def __init__( model: str = "prithvida/Splade_PP_en_v1", cache_dir: Optional[str] = None, threads: Optional[int] = None, - batch_size: int = 256, + batch_size: int = 32, progress_bar: bool = True, parallel: Optional[int] = None, meta_fields_to_embed: Optional[List[str]] = None, diff --git a/integrations/fastembed/src/haystack_integrations/components/embedders/fastembed/fastembed_sparse_text_embedder.py b/integrations/fastembed/src/haystack_integrations/components/embedders/fastembed/fastembed_sparse_text_embedder.py index 29539a138..425730b97 100644 --- a/integrations/fastembed/src/haystack_integrations/components/embedders/fastembed/fastembed_sparse_text_embedder.py +++ 
b/integrations/fastembed/src/haystack_integrations/components/embedders/fastembed/fastembed_sparse_text_embedder.py @@ -34,7 +34,7 @@ def __init__( model: str = "prithvida/Splade_PP_en_v1", cache_dir: Optional[str] = None, threads: Optional[int] = None, - batch_size: int = 256, + batch_size: int = 32, progress_bar: bool = True, parallel: Optional[int] = None, ): diff --git a/integrations/fastembed/tests/test_fastembed_sparse_document_embedder.py b/integrations/fastembed/tests/test_fastembed_sparse_document_embedder.py index fe74c7bd8..756eeb4b5 100644 --- a/integrations/fastembed/tests/test_fastembed_sparse_document_embedder.py +++ b/integrations/fastembed/tests/test_fastembed_sparse_document_embedder.py @@ -18,7 +18,7 @@ def test_init_default(self): assert embedder.model_name == "prithvida/Splade_PP_en_v1" assert embedder.cache_dir is None assert embedder.threads is None - assert embedder.batch_size == 256 + assert embedder.batch_size == 32 assert embedder.progress_bar is True assert embedder.parallel is None assert embedder.meta_fields_to_embed == [] @@ -59,7 +59,7 @@ def test_to_dict(self): "model": "prithvida/Splade_PP_en_v1", "cache_dir": None, "threads": None, - "batch_size": 256, + "batch_size": 32, "progress_bar": True, "parallel": None, "embedding_separator": "\n", @@ -107,7 +107,7 @@ def test_from_dict(self): "model": "prithvida/Splade_PP_en_v1", "cache_dir": None, "threads": None, - "batch_size": 256, + "batch_size": 32, "progress_bar": True, "parallel": None, "meta_fields_to_embed": [], @@ -118,7 +118,7 @@ def test_from_dict(self): assert embedder.model_name == "prithvida/Splade_PP_en_v1" assert embedder.cache_dir is None assert embedder.threads is None - assert embedder.batch_size == 256 + assert embedder.batch_size == 32 assert embedder.progress_bar is True assert embedder.parallel is None assert embedder.meta_fields_to_embed == [] @@ -259,7 +259,7 @@ def test_embed_metadata(self): "meta_value 3\ndocument-number 3", "meta_value 4\ndocument-number 4", 
], - batch_size=256, + batch_size=32, show_progress_bar=True, parallel=None, ) diff --git a/integrations/fastembed/tests/test_fastembed_sparse_text_embedder.py b/integrations/fastembed/tests/test_fastembed_sparse_text_embedder.py index 58e12705a..613198567 100644 --- a/integrations/fastembed/tests/test_fastembed_sparse_text_embedder.py +++ b/integrations/fastembed/tests/test_fastembed_sparse_text_embedder.py @@ -18,7 +18,7 @@ def test_init_default(self): assert embedder.model_name == "prithvida/Splade_PP_en_v1" assert embedder.cache_dir is None assert embedder.threads is None - assert embedder.batch_size == 256 + assert embedder.batch_size == 32 assert embedder.progress_bar is True assert embedder.parallel is None @@ -53,7 +53,7 @@ def test_to_dict(self): "model": "prithvida/Splade_PP_en_v1", "cache_dir": None, "threads": None, - "batch_size": 256, + "batch_size": 32, "progress_bar": True, "parallel": None, }, @@ -94,7 +94,7 @@ def test_from_dict(self): "model": "prithvida/Splade_PP_en_v1", "cache_dir": None, "threads": None, - "batch_size": 256, + "batch_size": 32, "progress_bar": True, "parallel": None, }, @@ -103,7 +103,7 @@ def test_from_dict(self): assert embedder.model_name == "prithvida/Splade_PP_en_v1" assert embedder.cache_dir is None assert embedder.threads is None - assert embedder.batch_size == 256 + assert embedder.batch_size == 32 assert embedder.progress_bar is True assert embedder.parallel is None From a6974333c7cd0916bb44192c45a3c120f9fae833 Mon Sep 17 00:00:00 2001 From: Corentin Date: Fri, 22 Mar 2024 13:51:48 +0100 Subject: [PATCH 29/32] Update integrations/fastembed/src/haystack_integrations/components/embedders/fastembed/fastembed_sparse_text_embedder.py Co-authored-by: Stefano Fiorucci --- .../embedders/fastembed/fastembed_sparse_text_embedder.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/integrations/fastembed/src/haystack_integrations/components/embedders/fastembed/fastembed_sparse_text_embedder.py 
b/integrations/fastembed/src/haystack_integrations/components/embedders/fastembed/fastembed_sparse_text_embedder.py index 425730b97..b31677785 100644 --- a/integrations/fastembed/src/haystack_integrations/components/embedders/fastembed/fastembed_sparse_text_embedder.py +++ b/integrations/fastembed/src/haystack_integrations/components/embedders/fastembed/fastembed_sparse_text_embedder.py @@ -114,4 +114,4 @@ def run(self, text: str): show_progress_bar=self.progress_bar, parallel=self.parallel, )[0] - return {"embedding": embedding} + return {"sparse_embedding": embedding} From a16fc9d70b6ca5b67b567f80606ddaefb1f9cc93 Mon Sep 17 00:00:00 2001 From: Corentin Meyer Date: Fri, 22 Mar 2024 13:53:48 +0100 Subject: [PATCH 30/32] feat(FastEmbed): fix test --- .../fastembed/tests/test_fastembed_sparse_text_embedder.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/integrations/fastembed/tests/test_fastembed_sparse_text_embedder.py b/integrations/fastembed/tests/test_fastembed_sparse_text_embedder.py index 613198567..3751eea14 100644 --- a/integrations/fastembed/tests/test_fastembed_sparse_text_embedder.py +++ b/integrations/fastembed/tests/test_fastembed_sparse_text_embedder.py @@ -182,7 +182,7 @@ def test_embed(self): text = "Good text to embed" result = embedder.run(text=text) - embedding = result["embedding"] + embedding = result["sparse_embedding"] assert isinstance(embedding, dict) assert isinstance(embedding["indices"], list) assert isinstance(embedding["indices"][0], int) @@ -211,7 +211,7 @@ def test_run(self): text = "Parton energy loss in QCD matter" result = embedder.run(text=text) - embedding = result["embedding"] + embedding = result["sparse_embedding"] embedding_dict = embedding.to_dict() assert isinstance(embedding, SparseEmbedding) assert isinstance(embedding_dict["indices"], list) From a97c4eda1e5d396e0a3ce7c4606723577c0abe0c Mon Sep 17 00:00:00 2001 From: anakin87 Date: Wed, 10 Apr 2024 11:06:21 +0200 Subject: [PATCH 31/32] updates 
after 2.0.1 release --- .github/workflows/fastembed.yml | 4 +--- integrations/fastembed/pyproject.toml | 2 +- 2 files changed, 2 insertions(+), 4 deletions(-) diff --git a/.github/workflows/fastembed.yml b/.github/workflows/fastembed.yml index dd4d10e4a..7a34378ee 100644 --- a/.github/workflows/fastembed.yml +++ b/.github/workflows/fastembed.yml @@ -43,9 +43,7 @@ jobs: - name: Run tests id: tests - run: | - hatch run pip install git+https://github.com/deepset-ai/haystack.git #TODO: rm before merging - hatch run cov + run: hatch run cov - name: Nightly - run unit tests with Haystack main branch if: github.event_name == 'schedule' diff --git a/integrations/fastembed/pyproject.toml b/integrations/fastembed/pyproject.toml index fe74b67d0..f4adea690 100644 --- a/integrations/fastembed/pyproject.toml +++ b/integrations/fastembed/pyproject.toml @@ -25,7 +25,7 @@ classifiers = [ "Programming Language :: Python :: Implementation :: PyPy", ] dependencies = [ -"haystack-ai", +"haystack-ai>=2.0.1", "fastembed>=0.2.5", ] From 1a8c7076882b6f0f3cc4769902ea6d3b9833c582 Mon Sep 17 00:00:00 2001 From: anakin87 Date: Wed, 10 Apr 2024 11:28:38 +0200 Subject: [PATCH 32/32] small fixes; naive example --- .../{example => examples}/example.py | 2 +- .../fastembed/examples/sparse_example.py | 32 +++++++++++++++++++ integrations/fastembed/pydoc/config.yml | 2 ++ integrations/fastembed/pyproject.toml | 2 +- 4 files changed, 36 insertions(+), 2 deletions(-) rename integrations/fastembed/{example => examples}/example.py (95%) create mode 100644 integrations/fastembed/examples/sparse_example.py diff --git a/integrations/fastembed/example/example.py b/integrations/fastembed/examples/example.py similarity index 95% rename from integrations/fastembed/example/example.py rename to integrations/fastembed/examples/example.py index 3e8077a79..e4d328210 100644 --- a/integrations/fastembed/example/example.py +++ b/integrations/fastembed/examples/example.py @@ -26,7 +26,7 @@ result = 
query_pipeline.run({"text_embedder": {"text": query}}) -print(result["retriever"]["documents"][0]) # noqa: T201 +print(result["retriever"]["documents"][0]) # Document(id=..., # content: 'fastembed is supported by and maintained by Qdrant.', diff --git a/integrations/fastembed/examples/sparse_example.py b/integrations/fastembed/examples/sparse_example.py new file mode 100644 index 000000000..bce3b363d --- /dev/null +++ b/integrations/fastembed/examples/sparse_example.py @@ -0,0 +1,32 @@ +# Currently, this example shows how to use the FastembedSparseDocumentEmbedder component to embed a list of documents. + +# TODO: Once we have a proper SparseEmbeddingRetriever, we should replace this naive example with a more realistic one, +# involving indexing and retrieval of documents. + +from haystack import Document +from haystack_integrations.components.embedders.fastembed import FastembedSparseDocumentEmbedder + +document_list = [ + Document( + content="Oxidative stress generated within inflammatory joints can produce autoimmune phenomena and joint destruction. Radical species with oxidative activity, including reactive nitrogen species, represent mediators of inflammation and cartilage damage.", + meta={ + "pubid": "25,445,628", + "long_answer": "yes", + }, + ), + Document( + content="Plasma levels of pancreatic polypeptide (PP) rise upon food intake. 
Although other pancreatic islet hormones, such as insulin and glucagon, have been extensively investigated, PP secretion and actions are still poorly understood.", + meta={ + "pubid": "25,445,712", + "long_answer": "yes", + }, + ), +] + +document_embedder = FastembedSparseDocumentEmbedder() +document_embedder.warm_up() +documents_with_embeddings = document_embedder.run(document_list)["documents"] + +for doc in documents_with_embeddings: + print(f"Document Text: {doc.content}") + print(f"Document Sparse Embedding: {doc.sparse_embedding.to_dict()}") diff --git a/integrations/fastembed/pydoc/config.yml b/integrations/fastembed/pydoc/config.yml index 2a4439b84..c8bd11762 100644 --- a/integrations/fastembed/pydoc/config.yml +++ b/integrations/fastembed/pydoc/config.yml @@ -5,6 +5,8 @@ loaders: [ "haystack_integrations.components.embedders.fastembed.fastembed_document_embedder", "haystack_integrations.components.embedders.fastembed.fastembed_text_embedder", + "haystack_integrations.components.embedders.fastembed.fastembed_sparse_document_embedder", + "haystack_integrations.components.embedders.fastembed.fastembed_sparse_text_embedder" ] ignore_when_discovered: ["__init__"] processors: diff --git a/integrations/fastembed/pyproject.toml b/integrations/fastembed/pyproject.toml index f4adea690..f4d005be3 100644 --- a/integrations/fastembed/pyproject.toml +++ b/integrations/fastembed/pyproject.toml @@ -155,7 +155,7 @@ ban-relative-imports = "parents" # Tests can use magic values, assertions, and relative imports "tests/**/*" = ["PLR2004", "S101", "TID252"] # examples can contain "print" commands -"examples/**/*" = ["T201"] +"examples/**/*" = ["T201", "E501"] [tool.coverage.run] source = ["haystack_integrations"]