From fe7fa36dce768f29d81448b1934db44af15c6b3e Mon Sep 17 00:00:00 2001
From: awinml <97467100+awinml@users.noreply.github.com>
Date: Mon, 19 Feb 2024 19:52:05 +0530
Subject: [PATCH] Add embedding backend

---
 integrations/optimum/pyproject.toml           |   4 +-
 .../components/embedders/backends/__init__.py |   0
 .../embedders/backends/optimum_backend.py     | 116 ++++++++++++++++++
 .../embedders/optimum_document_embedder.py    | 106 +++-------------
 .../embedders/optimum_text_embedder.py        |  79 +++---------
 .../tests/test_optimum_document_embedder.py   |  32 ++---
 .../tests/test_optimum_text_embedder.py       |  26 ++--
 7 files changed, 183 insertions(+), 180 deletions(-)
 create mode 100644 integrations/optimum/src/haystack_integrations/components/embedders/backends/__init__.py
 create mode 100644 integrations/optimum/src/haystack_integrations/components/embedders/backends/optimum_backend.py

diff --git a/integrations/optimum/pyproject.toml b/integrations/optimum/pyproject.toml
index 616dfc0e5..91c5eb13c 100644
--- a/integrations/optimum/pyproject.toml
+++ b/integrations/optimum/pyproject.toml
@@ -26,9 +26,9 @@ classifiers = [
 ]
 dependencies = [
   "haystack-ai",
-  "transformers[sentencepiece]==4.36.2",
+  "transformers[sentencepiece]",
   "sentence-transformers>=2.2.0",
-  "optimum[onnxruntime]==1.15.0"
+  "optimum[onnxruntime]"
 ]
 
 [project.urls]
diff --git a/integrations/optimum/src/haystack_integrations/components/embedders/backends/__init__.py b/integrations/optimum/src/haystack_integrations/components/embedders/backends/__init__.py
new file mode 100644
index 000000000..e69de29bb
diff --git a/integrations/optimum/src/haystack_integrations/components/embedders/backends/optimum_backend.py b/integrations/optimum/src/haystack_integrations/components/embedders/backends/optimum_backend.py
new file mode 100644
index 000000000..515d92938
--- /dev/null
+++ b/integrations/optimum/src/haystack_integrations/components/embedders/backends/optimum_backend.py
@@ -0,0 +1,116 @@
+from typing import Any, ClassVar, Dict, List, Optional, Union
+
+import numpy as np
+import torch
+from haystack.utils.auth import Secret
+from optimum.onnxruntime import ORTModelForFeatureExtraction
+from tqdm import tqdm
+from transformers import AutoTokenizer
+
+
+class _OptimumEmbeddingBackendFactory:
+    """
+    Factory class to create instances of Optimum embedding backends.
+    """
+
+    _instances: ClassVar[Dict[str, "_OptimumEmbeddingBackend"]] = {}
+
+    @staticmethod
+    def get_embedding_backend(
+        model: str, token: Optional[Secret] = None, model_kwargs: Optional[Dict[str, Any]] = None
+    ):
+        embedding_backend_id = f"{model}{token}"
+
+        if embedding_backend_id in _OptimumEmbeddingBackendFactory._instances:
+            return _OptimumEmbeddingBackendFactory._instances[embedding_backend_id]
+        embedding_backend = _OptimumEmbeddingBackend(model=model, token=token, model_kwargs=model_kwargs)
+        _OptimumEmbeddingBackendFactory._instances[embedding_backend_id] = embedding_backend
+        return embedding_backend
+
+
+class _OptimumEmbeddingBackend:
+    """
+    Class to manage Optimum embeddings.
+ """ + + def __init__(self, model: str, token: Optional[Secret] = None, model_kwargs: Optional[Dict[str, Any]] = None): + # export=True converts the model to ONNX on the fly + self.model = ORTModelForFeatureExtraction.from_pretrained(**model_kwargs, export=True) + self.tokenizer = AutoTokenizer.from_pretrained(model, token=token) + + def mean_pooling(self, model_output: torch.tensor, attention_mask: torch.tensor) -> torch.tensor: + """ + Perform Mean Pooling on the output of the Embedding model. + + :param model_output: The output of the embedding model. + :param attention_mask: The attention mask of the tokenized text. + :return: The embeddings of the text after mean pooling. + """ + # First element of model_output contains all token embeddings + token_embeddings = model_output[0] + input_mask_expanded = attention_mask.unsqueeze(-1).expand(token_embeddings.size()).float() + sum_embeddings = torch.sum(token_embeddings * input_mask_expanded, 1) + sum_mask = torch.clamp(input_mask_expanded.sum(1), min=1e-9) + return sum_embeddings / sum_mask + + def embed( + self, + texts_to_embed: Union[str, List[str]], + normalize_embeddings: bool, + progress_bar: bool = False, + batch_size: int = 1, + ) -> Union[List[List[float]], List[float]]: + """ + Embed text or list of texts using the Optimum model. + + :param texts_to_embed: T + :param normalize_embeddings: Whether to normalize the embeddings to unit length. + :param progress_bar: Whether to show a progress bar or not, defaults to False. + :param batch_size: Batch size to use, defaults to 1. + :return: A single embedding if the input is a single string. A list of embeddings if the input is a list of + strings. + """ + if isinstance(texts_to_embed, str): + texts = [texts_to_embed] + else: + texts = texts_to_embed + + # Determine device for tokenizer output + device = self.model.device + + # Sorting by length + length_sorted_idx = np.argsort([-len(sen) for sen in texts]) + sentences_sorted = [texts[idx] for idx in length_sorted_idx] + + all_embeddings = [] + for i in tqdm( + range(0, len(sentences_sorted), batch_size), disable=not progress_bar, desc="Calculating embeddings" + ): + batch = sentences_sorted[i : i + batch_size] + encoded_input = self.tokenizer(batch, padding=True, truncation=True, return_tensors="pt").to(device) + + # Only pass required inputs otherwise onnxruntime can raise an error + inputs_to_remove = set(encoded_input.keys()).difference(self.model.inputs_names) + for key in inputs_to_remove: + encoded_input.pop(key) + + # Compute token embeddings + model_output = self.model(**encoded_input) + + # Perform mean pooling + sentence_embeddings = self.mean_pooling(model_output, encoded_input["attention_mask"].to(device)) + + all_embeddings.extend(sentence_embeddings.tolist()) + + # Reorder embeddings according to original order + all_embeddings = [all_embeddings[idx] for idx in np.argsort(length_sorted_idx)] + + # Normalize all embeddings + if normalize_embeddings: + all_embeddings = torch.nn.functional.normalize(torch.tensor(all_embeddings), p=2, dim=1).tolist() + + if isinstance(texts_to_embed, str): + # Return the embedding if only one text was passed + all_embeddings = all_embeddings[0] + + return all_embeddings diff --git a/integrations/optimum/src/haystack_integrations/components/embedders/optimum_document_embedder.py b/integrations/optimum/src/haystack_integrations/components/embedders/optimum_document_embedder.py index 1aaa65bb4..9e3ae2e61 100644 --- 
a/integrations/optimum/src/haystack_integrations/components/embedders/optimum_document_embedder.py +++ b/integrations/optimum/src/haystack_integrations/components/embedders/optimum_document_embedder.py @@ -1,13 +1,11 @@ from typing import Any, Dict, List, Optional -import numpy as np -import torch from haystack import Document, component, default_from_dict, default_to_dict from haystack.utils import Secret, deserialize_secrets_inplace from haystack.utils.hf import HFModelType, check_valid_model, deserialize_hf_model_kwargs, serialize_hf_model_kwargs -from optimum.onnxruntime import ORTModelForFeatureExtraction -from tqdm import tqdm -from transformers import AutoTokenizer +from haystack_integrations.components.embedders.backends.optimum_backend import ( + _OptimumEmbeddingBackendFactory, +) class OptimumDocumentEmbedder: @@ -24,7 +22,7 @@ class OptimumDocumentEmbedder: doc = Document(content="I love pizza!") - document_embedder = OptimumDocumentEmbedder(model="BAAI/bge-small-en-v1.5") + document_embedder = OptimumDocumentEmbedder(model="sentence-transformers/all-mpnet-base-v2") document_embedder.warm_up() result = document_embedder.run([doc]) @@ -48,7 +46,7 @@ class OptimumDocumentEmbedder: def __init__( self, - model: str = "BAAI/bge-small-en-v1.5", + model: str = "sentence-transformers/all-mpnet-base-v2", token: Optional[Secret] = Secret.from_env_var("HF_API_TOKEN", strict=False), # noqa: B008 prefix: str = "", suffix: str = "", @@ -63,13 +61,14 @@ def __init__( """ Create a OptimumDocumentEmbedder component. - :param model: A string representing the model id on HF Hub. Default is "BAAI/bge-small-en-v1.5". + :param model: A string representing the model id on HF Hub. Defaults to + "sentence-transformers/all-mpnet-base-v2". :param token: The HuggingFace token to use as HTTP bearer authorization. :param prefix: A string to add to the beginning of each text. :param suffix: A string to add to the end of each text. :param normalize_embeddings: Whether to normalize the embeddings to unit length. :param onnx_execution_provider: The execution provider to use for ONNX models. Defaults to - "CPUExecutionProvider". + "CPUExecutionProvider". See https://onnxruntime.ai/docs/execution-providers/ for possible providers. :param model_kwargs: Dictionary containing additional keyword arguments to pass to the model. In case of duplication, these kwargs override `model`, `onnx_execution_provider`, and `token` initialization parameters. @@ -107,26 +106,12 @@ def __init__( def warm_up(self): """ - Convert the model to ONNX. - - The model is cached if the "TensorrtExecutionProvider" is used, since it takes a while to to build the TensorRT - engine. + Load the embedding backend. 
""" - if self.embedding_model is None: - self.tokenizer = AutoTokenizer.from_pretrained(self.model) - - if self.onnx_execution_provider == "TensorrtExecutionProvider": - # Cache engine for TensorRT - provider_options = { - "trt_engine_cache_enable": True, - "trt_engine_cache_path": f"tmp/trt_cache_{self.model}", - } - self.embedding_model = ORTModelForFeatureExtraction.from_pretrained( - **self.model_kwargs, use_cache=False, provider_options=provider_options - ) - else: - # export=True converts the model to ONNX on the fly - self.embedding_model = ORTModelForFeatureExtraction.from_pretrained(**self.model_kwargs, export=True) + if not hasattr(self, "embedding_backend"): + self.embedding_backend = _OptimumEmbeddingBackendFactory.get_embedding_backend( + model=self.model, token=self.token, model_kwargs=self.model_kwargs + ) def to_dict(self) -> Dict[str, Any]: """ @@ -162,21 +147,6 @@ def from_dict(cls, data: Dict[str, Any]) -> "OptimumDocumentEmbedder": deserialize_hf_model_kwargs(data["init_parameters"]["model_kwargs"]) return default_from_dict(cls, data) - def mean_pooling(self, model_output: torch.tensor, attention_mask: torch.tensor) -> torch.tensor: - """ - Perform Mean Pooling on the output of the Embedding model. - - :param model_output: The output of the embedding model. - :param attention_mask: The attention mask of the tokenized text. - :return: The embeddings of the text after mean pooling. - """ - # First element of model_output contains all token embeddings - token_embeddings = model_output[0] - input_mask_expanded = attention_mask.unsqueeze(-1).expand(token_embeddings.size()).float() - sum_embeddings = torch.sum(token_embeddings * input_mask_expanded, 1) - sum_mask = torch.clamp(input_mask_expanded.sum(1), min=1e-9) - return sum_embeddings / sum_mask - def _prepare_texts_to_embed(self, documents: List[Document]) -> List[str]: """ Prepare the texts to embed by concatenating the Document text with the metadata fields to embed. @@ -194,47 +164,6 @@ def _prepare_texts_to_embed(self, documents: List[Document]) -> List[str]: texts_to_embed.append(text_to_embed) return texts_to_embed - def _embed_batch(self, texts_to_embed: List[str], batch_size: int) -> List[List[float]]: - """ - Embed a list of texts in batches. 
- """ - # Determine device for tokenizer output - device = ( - "cuda" - if self.onnx_execution_provider - in ["CUDAExecutionProvider", "ROCMExecutionProvider", "TensorrtExecutionProvider"] - else "cpu" - ) - - # Sorting by length - length_sorted_idx = np.argsort([-len(sen) for sen in texts_to_embed]) - sentences_sorted = [texts_to_embed[idx] for idx in length_sorted_idx] - - all_embeddings = [] - for i in tqdm( - range(0, len(sentences_sorted), batch_size), disable=not self.progress_bar, desc="Calculating embeddings" - ): - batch = sentences_sorted[i : i + batch_size] - encoded_input = self.tokenizer(batch, padding=True, truncation=True, return_tensors="pt").to(device) # type: ignore - - # Compute token embeddings - with torch.no_grad(): - model_output = self.embedding_model(**encoded_input) # type: ignore - - # Perform mean pooling - sentence_embeddings = self.mean_pooling(model_output, encoded_input["attention_mask"].cpu()) - - all_embeddings.extend(sentence_embeddings.tolist()) - - # Reorder embeddings according to original order - all_embeddings = [all_embeddings[idx] for idx in np.argsort(length_sorted_idx)] - - # Normalize all embeddings - if self.normalize_embeddings: - all_embeddings = torch.nn.functional.normalize(torch.tensor(all_embeddings), p=2, dim=1).tolist() - - return all_embeddings - @component.output_types(documents=List[Document]) def run(self, documents: List[Document]): """ @@ -251,7 +180,7 @@ def run(self, documents: List[Document]): ) raise TypeError(msg) - if not (self.embedding_model and self.tokenizer): + if not hasattr(self, "embedding_backend"): msg = "The embedding model has not been loaded. Please call warm_up() before running." raise RuntimeError(msg) @@ -261,7 +190,12 @@ def run(self, documents: List[Document]): texts_to_embed = self._prepare_texts_to_embed(documents=documents) - embeddings = self._embed_batch(texts_to_embed=texts_to_embed, batch_size=self.batch_size) + embeddings = self.embedding_backend.embed( + texts_to_embed=texts_to_embed, + normalize_embeddings=self.normalize_embeddings, + progress_bar=self.progress_bar, + batch_size=self.batch_size, + ) for doc, emb in zip(documents, embeddings): doc.embedding = emb diff --git a/integrations/optimum/src/haystack_integrations/components/embedders/optimum_text_embedder.py b/integrations/optimum/src/haystack_integrations/components/embedders/optimum_text_embedder.py index 8b6a3b48e..1531c85ef 100644 --- a/integrations/optimum/src/haystack_integrations/components/embedders/optimum_text_embedder.py +++ b/integrations/optimum/src/haystack_integrations/components/embedders/optimum_text_embedder.py @@ -1,11 +1,11 @@ from typing import Any, Dict, List, Optional -import torch from haystack import component, default_from_dict, default_to_dict from haystack.utils import Secret, deserialize_secrets_inplace from haystack.utils.hf import HFModelType, check_valid_model, deserialize_hf_model_kwargs, serialize_hf_model_kwargs -from optimum.onnxruntime import ORTModelForFeatureExtraction -from transformers import AutoTokenizer +from haystack_integrations.components.embedders.backends.optimum_backend import ( + _OptimumEmbeddingBackendFactory, +) class OptimumTextEmbedder: @@ -19,7 +19,7 @@ class OptimumTextEmbedder: text_to_embed = "I love pizza!" 
-    text_embedder = OptimumTextEmbedder(model="BAAI/bge-small-en-v1.5")
+    text_embedder = OptimumTextEmbedder(model="sentence-transformers/all-mpnet-base-v2")
     text_embedder.warm_up()
 
     print(text_embedder.run(text_to_embed))
@@ -42,7 +42,7 @@ class OptimumTextEmbedder:
 
     def __init__(
         self,
-        model: str = "BAAI/bge-small-en-v1.5",
+        model: str = "sentence-transformers/all-mpnet-base-v2",
         token: Optional[Secret] = Secret.from_env_var("HF_API_TOKEN", strict=False),  # noqa: B008
         prefix: str = "",
         suffix: str = "",
@@ -53,13 +53,14 @@ def __init__(
         """
         Create a OptimumTextEmbedder component.
 
-        :param model: A string representing the model id on HF Hub. Default is "BAAI/bge-small-en-v1.5".
+        :param model: A string representing the model id on HF Hub. Defaults to
+            "sentence-transformers/all-mpnet-base-v2".
         :param token: The HuggingFace token to use as HTTP bearer authorization.
         :param prefix: A string to add to the beginning of each text.
         :param suffix: A string to add to the end of each text.
        :param normalize_embeddings: Whether to normalize the embeddings to unit length.
         :param onnx_execution_provider: The execution provider to use for ONNX models. Defaults to
-            "CPUExecutionProvider".
+            "CPUExecutionProvider". See https://onnxruntime.ai/docs/execution-providers/ for possible providers.
         :param model_kwargs: Dictionary containing additional keyword arguments to pass to the model.
             In case of duplication, these kwargs override `model`, `onnx_execution_provider`, and `token` initialization
             parameters.
@@ -88,26 +89,12 @@ def __init__(
 
     def warm_up(self):
         """
-        Convert the model to ONNX.
-
-        The model is cached if the "TensorrtExecutionProvider" is used, since it takes a while to build the TensorRT
-        engine.
+        Load the embedding backend.
         """
-        if self.embedding_model is None:
-            self.tokenizer = AutoTokenizer.from_pretrained(self.model)
-
-            if self.onnx_execution_provider == "TensorrtExecutionProvider":
-                # Cache engine for TensorRT
-                provider_options = {
-                    "trt_engine_cache_enable": True,
-                    "trt_engine_cache_path": f"tmp/trt_cache_{self.model}",
-                }
-                self.embedding_model = ORTModelForFeatureExtraction.from_pretrained(
-                    **self.model_kwargs, use_cache=False, provider_options=provider_options
-                )
-            else:
-                # export=True converts the model to ONNX on the fly
-                self.embedding_model = ORTModelForFeatureExtraction.from_pretrained(**self.model_kwargs, export=True)
+        if not hasattr(self, "embedding_backend"):
+            self.embedding_backend = _OptimumEmbeddingBackendFactory.get_embedding_backend(
+                model=self.model, token=self.token, model_kwargs=self.model_kwargs
+            )
 
     def to_dict(self) -> Dict[str, Any]:
         """
@@ -139,21 +126,6 @@ def from_dict(cls, data: Dict[str, Any]) -> "OptimumTextEmbedder":
         deserialize_hf_model_kwargs(data["init_parameters"]["model_kwargs"])
         return default_from_dict(cls, data)
 
-    def mean_pooling(self, model_output: torch.tensor, attention_mask: torch.tensor) -> torch.tensor:
-        """
-        Perform Mean Pooling on the output of the Embedding model.
-
-        :param model_output: The output of the embedding model.
-        :param attention_mask: The attention mask of the tokenized text.
-        :return: The embeddings of the text after mean pooling.
- """ - # First element of model_output contains all token embeddings - token_embeddings = model_output[0] - input_mask_expanded = attention_mask.unsqueeze(-1).expand(token_embeddings.size()).float() - sum_embeddings = torch.sum(token_embeddings * input_mask_expanded, 1) - sum_mask = torch.clamp(input_mask_expanded.sum(1), min=1e-9) - return sum_embeddings / sum_mask - @component.output_types(embedding=List[float]) def run(self, text: str): """Embed a string. @@ -168,33 +140,14 @@ def run(self, text: str): ) raise TypeError(msg) - if not (self.embedding_model and self.tokenizer): + if not hasattr(self, "embedding_backend"): msg = "The embedding model has not been loaded. Please call warm_up() before running." raise RuntimeError(msg) text_to_embed = self.prefix + text + self.suffix - # Determine device for tokenizer output - device = ( - "cuda" - if self.onnx_execution_provider - in ["CUDAExecutionProvider", "ROCMExecutionProvider", "TensorrtExecutionProvider"] - else "cpu" + embedding = self.embedding_backend.embed( + texts_to_embed=text_to_embed, normalize_embeddings=self.normalize_embeddings ) - encoded_input = self.tokenizer([text_to_embed], padding=True, truncation=True, return_tensors="pt").to(device) - - # Compute token embeddings - with torch.no_grad(): - model_output = self.embedding_model(**encoded_input) - - # Perform mean pooling - sentence_embeddings = self.mean_pooling(model_output, encoded_input["attention_mask"].cpu()) - - # Normalize Embeddings - if self.normalize_embeddings: - sentence_embeddings = torch.nn.functional.normalize(sentence_embeddings, p=2, dim=1) - - embedding = sentence_embeddings.tolist()[0] - return {"embedding": embedding} diff --git a/integrations/optimum/tests/test_optimum_document_embedder.py b/integrations/optimum/tests/test_optimum_document_embedder.py index b0d26d392..b7d285196 100644 --- a/integrations/optimum/tests/test_optimum_document_embedder.py +++ b/integrations/optimum/tests/test_optimum_document_embedder.py @@ -21,7 +21,7 @@ def test_init_default(self, monkeypatch, mock_check_valid_model): # noqa: ARG00 monkeypatch.setenv("HF_API_TOKEN", "fake-api-token") embedder = OptimumDocumentEmbedder() - assert embedder.model == "BAAI/bge-small-en-v1.5" + assert embedder.model == "sentence-transformers/all-mpnet-base-v2" assert embedder.token == Secret.from_env_var("HF_API_TOKEN", strict=False) assert embedder.prefix == "" assert embedder.suffix == "" @@ -32,14 +32,14 @@ def test_init_default(self, monkeypatch, mock_check_valid_model): # noqa: ARG00 assert embedder.meta_fields_to_embed == [] assert embedder.embedding_separator == "\n" assert embedder.model_kwargs == { - "model_id": "BAAI/bge-small-en-v1.5", + "model_id": "sentence-transformers/all-mpnet-base-v2", "provider": "CPUExecutionProvider", "use_auth_token": "fake-api-token", } def test_init_with_parameters(self, mock_check_valid_model): # noqa: ARG002 embedder = OptimumDocumentEmbedder( - model="sentence-transformers/all-mpnet-base-v2", + model="sentence-transformers/all-minilm-l6-v2", token=Secret.from_token("fake-api-token"), prefix="prefix", suffix="suffix", @@ -52,7 +52,7 @@ def test_init_with_parameters(self, mock_check_valid_model): # noqa: ARG002 model_kwargs={"trust_remote_code": True}, ) - assert embedder.model == "sentence-transformers/all-mpnet-base-v2" + assert embedder.model == "sentence-transformers/all-minilm-l6-v2" assert embedder.token == Secret.from_token("fake-api-token") assert embedder.prefix == "prefix" assert embedder.suffix == "suffix" @@ -64,7 +64,7 @@ def 
test_init_with_parameters(self, mock_check_valid_model): # noqa: ARG002 assert embedder.onnx_execution_provider == "CUDAExecutionProvider" assert embedder.model_kwargs == { "trust_remote_code": True, - "model_id": "sentence-transformers/all-mpnet-base-v2", + "model_id": "sentence-transformers/all-minilm-l6-v2", "provider": "CUDAExecutionProvider", "use_auth_token": "fake-api-token", } @@ -81,7 +81,7 @@ def test_to_dict(self, mock_check_valid_model): # noqa: ARG002 assert data == { "type": "haystack_integrations.components.embedders.optimum_document_embedder.OptimumDocumentEmbedder", "init_parameters": { - "model": "BAAI/bge-small-en-v1.5", + "model": "sentence-transformers/all-mpnet-base-v2", "token": {"env_vars": ["HF_API_TOKEN"], "strict": False, "type": "env_var"}, "prefix": "", "suffix": "", @@ -92,7 +92,7 @@ def test_to_dict(self, mock_check_valid_model): # noqa: ARG002 "normalize_embeddings": True, "onnx_execution_provider": "CPUExecutionProvider", "model_kwargs": { - "model_id": "BAAI/bge-small-en-v1.5", + "model_id": "sentence-transformers/all-mpnet-base-v2", "provider": "CPUExecutionProvider", "use_auth_token": None, }, @@ -101,7 +101,7 @@ def test_to_dict(self, mock_check_valid_model): # noqa: ARG002 def test_to_dict_with_custom_init_parameters(self, mock_check_valid_model): # noqa: ARG002 component = OptimumDocumentEmbedder( - model="sentence-transformers/all-mpnet-base-v2", + model="sentence-transformers/all-minilm-l6-v2", token=Secret.from_env_var("ENV_VAR", strict=False), prefix="prefix", suffix="suffix", @@ -118,7 +118,7 @@ def test_to_dict_with_custom_init_parameters(self, mock_check_valid_model): # n assert data == { "type": "haystack_integrations.components.embedders.optimum_document_embedder.OptimumDocumentEmbedder", "init_parameters": { - "model": "sentence-transformers/all-mpnet-base-v2", + "model": "sentence-transformers/all-minilm-l6-v2", "token": {"env_vars": ["ENV_VAR"], "strict": False, "type": "env_var"}, "prefix": "prefix", "suffix": "suffix", @@ -130,7 +130,7 @@ def test_to_dict_with_custom_init_parameters(self, mock_check_valid_model): # n "onnx_execution_provider": "CUDAExecutionProvider", "model_kwargs": { "trust_remote_code": True, - "model_id": "sentence-transformers/all-mpnet-base-v2", + "model_id": "sentence-transformers/all-minilm-l6-v2", "provider": "CUDAExecutionProvider", "use_auth_token": None, }, @@ -143,7 +143,7 @@ def test_prepare_texts_to_embed_w_metadata(self, mock_check_valid_model): # noq ] embedder = OptimumDocumentEmbedder( - model="sentence-transformers/all-mpnet-base-v2", + model="sentence-transformers/all-minilm-l6-v2", meta_fields_to_embed=["meta_field"], embedding_separator=" | ", ) @@ -162,7 +162,7 @@ def test_prepare_texts_to_embed_w_suffix(self, mock_check_valid_model): # noqa: documents = [Document(content=f"document number {i}") for i in range(5)] embedder = OptimumDocumentEmbedder( - model="sentence-transformers/all-mpnet-base-v2", + model="sentence-transformers/all-minilm-l6-v2", prefix="my_prefix ", suffix=" my_suffix", ) @@ -179,7 +179,7 @@ def test_prepare_texts_to_embed_w_suffix(self, mock_check_valid_model): # noqa: def test_run_wrong_input_format(self, mock_check_valid_model): # noqa: ARG002 embedder = OptimumDocumentEmbedder( - model="BAAI/bge-small-en-v1.5", + model="sentence-transformers/all-mpnet-base-v2", ) embedder.warm_up() # wrong formats @@ -194,7 +194,7 @@ def test_run_wrong_input_format(self, mock_check_valid_model): # noqa: ARG002 def test_run_on_empty_list(self, mock_check_valid_model): # noqa: ARG002 embedder 
= OptimumDocumentEmbedder( - model="BAAI/bge-small-en-v1.5", + model="sentence-transformers/all-mpnet-base-v2", ) embedder.warm_up() empty_list_input = [] @@ -211,7 +211,7 @@ def test_run(self): ] embedder = OptimumDocumentEmbedder( - model="BAAI/bge-small-en-v1.5", + model="sentence-transformers/all-mpnet-base-v2", prefix="prefix ", suffix=" suffix", meta_fields_to_embed=["topic"], @@ -229,5 +229,5 @@ def test_run(self): for doc in documents_with_embeddings: assert isinstance(doc, Document) assert isinstance(doc.embedding, list) - assert len(doc.embedding) == 384 + assert len(doc.embedding) == 768 assert all(isinstance(x, float) for x in doc.embedding) diff --git a/integrations/optimum/tests/test_optimum_text_embedder.py b/integrations/optimum/tests/test_optimum_text_embedder.py index 79aaff6b8..2f5b60b45 100644 --- a/integrations/optimum/tests/test_optimum_text_embedder.py +++ b/integrations/optimum/tests/test_optimum_text_embedder.py @@ -20,21 +20,21 @@ def test_init_default(self, monkeypatch, mock_check_valid_model): # noqa: ARG00 monkeypatch.setenv("HF_API_TOKEN", "fake-api-token") embedder = OptimumTextEmbedder() - assert embedder.model == "BAAI/bge-small-en-v1.5" + assert embedder.model == "sentence-transformers/all-mpnet-base-v2" assert embedder.token == Secret.from_env_var("HF_API_TOKEN", strict=False) assert embedder.prefix == "" assert embedder.suffix == "" assert embedder.normalize_embeddings is True assert embedder.onnx_execution_provider == "CPUExecutionProvider" assert embedder.model_kwargs == { - "model_id": "BAAI/bge-small-en-v1.5", + "model_id": "sentence-transformers/all-mpnet-base-v2", "provider": "CPUExecutionProvider", "use_auth_token": "fake-api-token", } def test_init_with_parameters(self, mock_check_valid_model): # noqa: ARG002 embedder = OptimumTextEmbedder( - model="sentence-transformers/all-mpnet-base-v2", + model="sentence-transformers/all-minilm-l6-v2", token=Secret.from_token("fake-api-token"), prefix="prefix", suffix="suffix", @@ -43,7 +43,7 @@ def test_init_with_parameters(self, mock_check_valid_model): # noqa: ARG002 model_kwargs={"trust_remote_code": True}, ) - assert embedder.model == "sentence-transformers/all-mpnet-base-v2" + assert embedder.model == "sentence-transformers/all-minilm-l6-v2" assert embedder.token == Secret.from_token("fake-api-token") assert embedder.prefix == "prefix" assert embedder.suffix == "suffix" @@ -51,7 +51,7 @@ def test_init_with_parameters(self, mock_check_valid_model): # noqa: ARG002 assert embedder.onnx_execution_provider == "CUDAExecutionProvider" assert embedder.model_kwargs == { "trust_remote_code": True, - "model_id": "sentence-transformers/all-mpnet-base-v2", + "model_id": "sentence-transformers/all-minilm-l6-v2", "provider": "CUDAExecutionProvider", "use_auth_token": "fake-api-token", } @@ -68,14 +68,14 @@ def test_to_dict(self, mock_check_valid_model): # noqa: ARG002 assert data == { "type": "haystack_integrations.components.embedders.optimum_text_embedder.OptimumTextEmbedder", "init_parameters": { - "model": "BAAI/bge-small-en-v1.5", + "model": "sentence-transformers/all-mpnet-base-v2", "token": {"env_vars": ["HF_API_TOKEN"], "strict": False, "type": "env_var"}, "prefix": "", "suffix": "", "normalize_embeddings": True, "onnx_execution_provider": "CPUExecutionProvider", "model_kwargs": { - "model_id": "BAAI/bge-small-en-v1.5", + "model_id": "sentence-transformers/all-mpnet-base-v2", "provider": "CPUExecutionProvider", "use_auth_token": None, }, @@ -84,7 +84,7 @@ def test_to_dict(self, mock_check_valid_model): # noqa: 
ARG002 def test_to_dict_with_custom_init_parameters(self, mock_check_valid_model): # noqa: ARG002 component = OptimumTextEmbedder( - model="sentence-transformers/all-mpnet-base-v2", + model="sentence-transformers/all-minilm-l6-v2", token=Secret.from_env_var("ENV_VAR", strict=False), prefix="prefix", suffix="suffix", @@ -97,7 +97,7 @@ def test_to_dict_with_custom_init_parameters(self, mock_check_valid_model): # n assert data == { "type": "haystack_integrations.components.embedders.optimum_text_embedder.OptimumTextEmbedder", "init_parameters": { - "model": "sentence-transformers/all-mpnet-base-v2", + "model": "sentence-transformers/all-minilm-l6-v2", "token": {"env_vars": ["ENV_VAR"], "strict": False, "type": "env_var"}, "prefix": "prefix", "suffix": "suffix", @@ -105,7 +105,7 @@ def test_to_dict_with_custom_init_parameters(self, mock_check_valid_model): # n "onnx_execution_provider": "CUDAExecutionProvider", "model_kwargs": { "trust_remote_code": True, - "model_id": "sentence-transformers/all-mpnet-base-v2", + "model_id": "sentence-transformers/all-minilm-l6-v2", "provider": "CUDAExecutionProvider", "use_auth_token": None, }, @@ -114,7 +114,7 @@ def test_to_dict_with_custom_init_parameters(self, mock_check_valid_model): # n def test_run_wrong_input_format(self, mock_check_valid_model): # noqa: ARG002 embedder = OptimumTextEmbedder( - model="BAAI/bge-small-en-v1.5", + model="sentence-transformers/all-mpnet-base-v2", token=Secret.from_token("fake-api-token"), ) embedder.warm_up() @@ -127,7 +127,7 @@ def test_run_wrong_input_format(self, mock_check_valid_model): # noqa: ARG002 @pytest.mark.integration def test_run(self): embedder = OptimumTextEmbedder( - model="BAAI/bge-small-en-v1.5", + model="sentence-transformers/all-mpnet-base-v2", prefix="prefix ", suffix=" suffix", ) @@ -135,5 +135,5 @@ def test_run(self): result = embedder.run(text="The food was delicious") - assert len(result["embedding"]) == 384 + assert len(result["embedding"]) == 768 assert all(isinstance(x, float) for x in result["embedding"])
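A usage sketch for the new backend follows. It is illustrative, not part of the patch: the model id is only an assumption, and the shape of model_kwargs mirrors the dictionary the embedders above build for themselves. Running it downloads the model and exports it to ONNX.

from haystack_integrations.components.embedders.backends.optimum_backend import (
    _OptimumEmbeddingBackendFactory,
)

# Assumed model id and kwargs shape; the embedders normally assemble this dict.
model = "sentence-transformers/all-mpnet-base-v2"
model_kwargs = {"model_id": model, "provider": "CPUExecutionProvider"}

# The first call loads the tokenizer and exports the model to ONNX on the fly.
backend = _OptimumEmbeddingBackendFactory.get_embedding_backend(model=model, model_kwargs=model_kwargs)

# The factory caches by (model, token), so a second call returns the same instance.
assert backend is _OptimumEmbeddingBackendFactory.get_embedding_backend(model=model, model_kwargs=model_kwargs)

# A single string yields one embedding (768 floats for this model);
# a list of strings yields a list of embeddings.
single = backend.embed("I love pizza!", normalize_embeddings=True)
batch = backend.embed(["first text", "second text"], normalize_embeddings=True, batch_size=2)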
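The backend's mean pooling averages token embeddings while masking out padding, with a clamp guarding against division by zero. A minimal self-contained check of that formula on dummy numbers (not taken from the patch):

import torch

# Shape (batch=1, seq_len=3, hidden=2); the third token is padding.
token_embeddings = torch.tensor([[[1.0, 2.0], [3.0, 4.0], [9.0, 9.0]]])
attention_mask = torch.tensor([[1, 1, 0]])

# Same steps as mean_pooling() in optimum_backend.py above.
mask = attention_mask.unsqueeze(-1).expand(token_embeddings.size()).float()
summed = torch.sum(token_embeddings * mask, 1)
counts = torch.clamp(mask.sum(1), min=1e-9)
print(summed / counts)  # tensor([[2., 3.]]): the padded token is ignored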