From 2c79d7c4062bff42806c596626f5431711661432 Mon Sep 17 00:00:00 2001 From: awinml <97467100+awinml@users.noreply.github.com> Date: Thu, 28 Sep 2023 17:29:16 +0530 Subject: [PATCH 01/14] Add INSTRUCTOR Embedders --- .../embedding_backend/__init__.py | 3 + .../embedding_backend/instructor_backend.py | 45 +++ .../instructor_document_embedder.py | 129 +++++++++ .../instructor_text_embedder.py | 98 +++++++ .../tests/test_instructor_backend.py | 42 +++ .../test_instructor_document_embedder.py | 257 ++++++++++++++++++ .../tests/test_instructor_text_embedder.py | 199 ++++++++++++++ 7 files changed, 773 insertions(+) create mode 100644 components/instructor-embedders/instructor_embedders/embedding_backend/__init__.py create mode 100644 components/instructor-embedders/instructor_embedders/embedding_backend/instructor_backend.py create mode 100644 components/instructor-embedders/instructor_embedders/instructor_document_embedder.py create mode 100644 components/instructor-embedders/instructor_embedders/instructor_text_embedder.py create mode 100644 components/instructor-embedders/tests/test_instructor_backend.py create mode 100644 components/instructor-embedders/tests/test_instructor_document_embedder.py create mode 100644 components/instructor-embedders/tests/test_instructor_text_embedder.py diff --git a/components/instructor-embedders/instructor_embedders/embedding_backend/__init__.py b/components/instructor-embedders/instructor_embedders/embedding_backend/__init__.py new file mode 100644 index 000000000..e873bc332 --- /dev/null +++ b/components/instructor-embedders/instructor_embedders/embedding_backend/__init__.py @@ -0,0 +1,3 @@ +# SPDX-FileCopyrightText: 2023-present deepset GmbH +# +# SPDX-License-Identifier: Apache-2.0 diff --git a/components/instructor-embedders/instructor_embedders/embedding_backend/instructor_backend.py b/components/instructor-embedders/instructor_embedders/embedding_backend/instructor_backend.py new file mode 100644 index 000000000..cdde82394 --- /dev/null +++ b/components/instructor-embedders/instructor_embedders/embedding_backend/instructor_backend.py @@ -0,0 +1,45 @@ +from typing import List, Optional, Union, Dict + +from haystack.preview.lazy_imports import LazyImport + +with LazyImport(message="Run 'pip install InstructorEmbedding'") as instructor_embeddings_import: + from InstructorEmbedding import INSTRUCTOR + + +class _InstructorEmbeddingBackendFactory: + """ + Factory class to create instances of INSTRUCTOR embedding backends. + """ + + _instances: Dict[str, "_InstructorEmbeddingBackend"] = {} + + @staticmethod + def get_embedding_backend( + model_name_or_path: str, device: Optional[str] = None, use_auth_token: Union[bool, str, None] = None + ): + embedding_backend_id = f"{model_name_or_path}{device}{use_auth_token}" + + if embedding_backend_id in _InstructorEmbeddingBackendFactory._instances: + return _InstructorEmbeddingBackendFactory._instances[embedding_backend_id] + + embedding_backend = _InstructorEmbeddingBackend( + model_name_or_path=model_name_or_path, device=device, use_auth_token=use_auth_token + ) + _InstructorEmbeddingBackendFactory._instances[embedding_backend_id] = embedding_backend + return embedding_backend + + +class _InstructorEmbeddingBackend: + """ + Class to manage INSTRUCTOR embeddings. 
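+
+    Illustrative sketch of the expected input shape (values are made up; the real
+    model must be available locally before ``embed`` can run):
+
+        backend = _InstructorEmbeddingBackendFactory.get_embedding_backend("hkunlp/instructor-base")
+        vectors = backend.embed([["Represent the sentence", "hello world"]])
+        # -> [[0.01, -0.27, ...]]: one list of floats per (instruction, text) pair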
+ """ + + def __init__( + self, model_name_or_path: str, device: Optional[str] = None, use_auth_token: Union[bool, str, None] = None + ): + instructor_embeddings_import.check() + self.model = INSTRUCTOR(model_name_or_path=model_name_or_path, device=device, use_auth_token=use_auth_token) + + def embed(self, data: List[List[str]], **kwargs) -> List[List[float]]: + embeddings = self.model.encode(data, **kwargs).tolist() + return embeddings diff --git a/components/instructor-embedders/instructor_embedders/instructor_document_embedder.py b/components/instructor-embedders/instructor_embedders/instructor_document_embedder.py new file mode 100644 index 000000000..a1c049bce --- /dev/null +++ b/components/instructor-embedders/instructor_embedders/instructor_document_embedder.py @@ -0,0 +1,129 @@ +from typing import List, Optional, Union, Dict, Any + +from haystack.preview import component, Document, default_to_dict, default_from_dict +from instructor_embedders.embedding_backend.instructor_backend import _InstructorEmbeddingBackendFactory + + +@component +class InstructorDocumentEmbedder: + """ + A component for computing Document embeddings using INSTRUCTOR embedding models. + The embedding of each Document is stored in the `embedding` field of the Document. + """ + + def __init__( + self, + model_name_or_path: str = "hkunlp/instructor-base", + device: Optional[str] = None, + use_auth_token: Union[bool, str, None] = None, + instruction: str = "Represent the 'domain' 'text_type' for 'task_objective'", + batch_size: int = 32, + progress_bar: bool = True, + normalize_embeddings: bool = False, + metadata_fields_to_embed: Optional[List[str]] = None, + embedding_separator: str = "\n", + ): + """ + Create a InstructorDocumentEmbedder component. + + :param model_name_or_path: Local path or name of the model in Hugging Face's model hub, such as ``'sentence-transformers/all-mpnet-base-v2'``. + :param device: Device (like 'cuda' / 'cpu') that should be used for computation. If None, checks if a GPU can be used. + :param use_auth_token: The API token used to download private models from Hugging Face. + If this parameter is set to `True`, then the token generated when running + `transformers-cli login` (stored in ~/.huggingface) will be used. + :param instruction: The instruction string to be used while computing domain specific embeddings. The instruction follows the unified template of the form: "Represent the 'domain' 'text_type' for 'task_objective'", where + - "domain" is optional, and it specifies the domain of the text, e.g., science, finance, medicine, etc. + - "text_type" is required, and it specifies the encoding unit, e.g., sentence, document, paragraph, etc. + - "task_objective" is optional, and it specifies the objective of embedding, e.g., retrieve a document, classify the sentence, etc. + :param batch_size: Number of strings to encode at once. + :param progress_bar: If true, displays progress bar during embedding. + :param normalize_embeddings: If set to true, returned vectors will have length 1. + :param metadata_fields_to_embed: List of meta fields that should be embedded along with the Document content. + :param embedding_separator: Separator used to concatenate the meta fields to the Document content. 
+ """ + + self.model_name_or_path = model_name_or_path + # TODO: remove device parameter and use Haystack's device management once migrated + self.device = device or "cpu" + self.use_auth_token = use_auth_token + self.instruction = instruction + self.batch_size = batch_size + self.progress_bar = progress_bar + self.normalize_embeddings = normalize_embeddings + self.metadata_fields_to_embed = metadata_fields_to_embed or [] + self.embedding_separator = embedding_separator + + def to_dict(self) -> Dict[str, Any]: + """ + Serialize this component to a dictionary. + """ + return default_to_dict( + self, + model_name_or_path=self.model_name_or_path, + device=self.device, + use_auth_token=self.use_auth_token, + instruction=self.instruction, + batch_size=self.batch_size, + progress_bar=self.progress_bar, + normalize_embeddings=self.normalize_embeddings, + metadata_fields_to_embed=self.metadata_fields_to_embed, + embedding_separator=self.embedding_separator, + ) + + @classmethod + def from_dict(cls, data: Dict[str, Any]) -> "InstructorDocumentEmbedder": + """ + Deserialize this component from a dictionary. + """ + return default_from_dict(cls, data) + + def warm_up(self): + """ + Load the embedding backend. + """ + if not hasattr(self, "embedding_backend"): + self.embedding_backend = _InstructorEmbeddingBackendFactory.get_embedding_backend( + model_name_or_path=self.model_name_or_path, device=self.device, use_auth_token=self.use_auth_token + ) + + @component.output_types(documents=List[Document]) + def run(self, documents: List[Document]): + """ + Embed a list of Documents. + The embedding of each Document is stored in the `embedding` field of the Document. + """ + if not isinstance(documents, list) or not isinstance(documents[0], Document): + raise TypeError( + "InstructorDocumentEmbedder expects a list of Documents as input." + "In case you want to embed a list of strings, please use the InstructorTextEmbedder." + ) + if not hasattr(self, "embedding_backend"): + raise RuntimeError("The embedding model has not been loaded. 
Please call warm_up() before running.") + + # TODO: once non textual Documents are properly supported, we should also prepare them for embedding here + + texts_to_embed = [] + for doc in documents: + meta_values_to_embed = [ + str(doc.metadata[key]) + for key in self.metadata_fields_to_embed + if key in doc.metadata and doc.metadata[key] + ] + text_to_embed = [self.instruction, self.embedding_separator.join(meta_values_to_embed + [doc.text or ""])] + texts_to_embed.append(text_to_embed) + + embeddings = self.embedding_backend.embed( + texts_to_embed, + batch_size=self.batch_size, + show_progress_bar=self.progress_bar, + normalize_embeddings=self.normalize_embeddings, + ) + + documents_with_embeddings = [] + for doc, emb in zip(documents, embeddings): + doc_as_dict = doc.to_dict() + doc_as_dict["embedding"] = emb + del doc_as_dict["id"] + documents_with_embeddings.append(Document.from_dict(doc_as_dict)) + + return {"documents": documents_with_embeddings} diff --git a/components/instructor-embedders/instructor_embedders/instructor_text_embedder.py b/components/instructor-embedders/instructor_embedders/instructor_text_embedder.py new file mode 100644 index 000000000..ccbdd5e3e --- /dev/null +++ b/components/instructor-embedders/instructor_embedders/instructor_text_embedder.py @@ -0,0 +1,98 @@ +from typing import List, Optional, Union, Dict, Any + +from haystack.preview import component, default_to_dict, default_from_dict +from instructor_embedders.embedding_backend.instructor_backend import _InstructorEmbeddingBackendFactory + + +@component +class InstructorTextEmbedder: + """ + A component for embedding strings using Sentence Transformers models. + """ + + def __init__( + self, + model_name_or_path: str = "hkunlp/instructor-base", + device: Optional[str] = None, + use_auth_token: Union[bool, str, None] = None, + instruction: str = "Represent the 'domain' 'text_type' for 'task_objective'", + batch_size: int = 32, + progress_bar: bool = True, + normalize_embeddings: bool = False, + ): + """ + Create a InstructorTextEmbedder component. + + :param model_name_or_path: Local path or name of the model in Hugging Face's model hub, such as ``'sentence-transformers/all-mpnet-base-v2'``. + :param device: Device (like 'cuda' / 'cpu') that should be used for computation. If None, checks if a GPU can be used. + :param use_auth_token: The API token used to download private models from Hugging Face. + If this parameter is set to `True`, then the token generated when running + `transformers-cli login` (stored in ~/.huggingface) will be used. + :param instruction: The instruction string to be used while computing domain specific embeddings. The instruction follows the unified template of the form: "Represent the 'domain' 'text_type' for 'task_objective'", where + - "domain" is optional, and it specifies the domain of the text, e.g., science, finance, medicine, etc. + - "text_type" is required, and it specifies the encoding unit, e.g., sentence, document, paragraph, etc. + - "task_objective" is optional, and it specifies the objective of embedding, e.g., retrieve a document, classify the sentence, etc. + :param batch_size: Number of strings to encode at once. + :param progress_bar: If true, displays progress bar during embedding. + :param normalize_embeddings: If set to true, returned vectors will have length 1. 
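+
+        Usage sketch (illustrative; assumes the model can be downloaded and loaded locally):
+
+            embedder = InstructorTextEmbedder(
+                model_name_or_path="hkunlp/instructor-base",
+                instruction="Represent the Science sentence for retrieval",
+            )
+            embedder.warm_up()  # loads the INSTRUCTOR model
+            embedding = embedder.run(text="Parton energy loss in QCD matter")["embedding"]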
+ """ + + self.model_name_or_path = model_name_or_path + # TODO: remove device parameter and use Haystack's device management once migrated + self.device = device or "cpu" + self.use_auth_token = use_auth_token + self.instruction = instruction + self.batch_size = batch_size + self.progress_bar = progress_bar + self.normalize_embeddings = normalize_embeddings + + def to_dict(self) -> Dict[str, Any]: + """ + Serialize this component to a dictionary. + """ + return default_to_dict( + self, + model_name_or_path=self.model_name_or_path, + device=self.device, + use_auth_token=self.use_auth_token, + instruction=self.instruction, + batch_size=self.batch_size, + progress_bar=self.progress_bar, + normalize_embeddings=self.normalize_embeddings, + ) + + @classmethod + def from_dict(cls, data: Dict[str, Any]) -> "InstructorTextEmbedder": + """ + Deserialize this component from a dictionary. + """ + return default_from_dict(cls, data) + + def warm_up(self): + """ + Load the embedding backend. + """ + if not hasattr(self, "embedding_backend"): + self.embedding_backend = _InstructorEmbeddingBackendFactory.get_embedding_backend( + model_name_or_path=self.model_name_or_path, device=self.device, use_auth_token=self.use_auth_token + ) + + @component.output_types(embedding=List[float]) + def run(self, text: str): + """Embed a string.""" + if not isinstance(text, str): + raise TypeError( + "InstructorTextEmbedder expects a string as input." + "In case you want to embed a list of Documents, please use the InstructorDocumentEmbedder." + ) + if not hasattr(self, "embedding_backend"): + raise RuntimeError("The embedding model has not been loaded. Please call warm_up() before running.") + + text_to_embed = [self.instruction, text] + embedding = self.embedding_backend.embed( + [text_to_embed], + batch_size=self.batch_size, + show_progress_bar=self.progress_bar, + normalize_embeddings=self.normalize_embeddings, + )[0] + return {"embedding": embedding} diff --git a/components/instructor-embedders/tests/test_instructor_backend.py b/components/instructor-embedders/tests/test_instructor_backend.py new file mode 100644 index 000000000..276eea2bb --- /dev/null +++ b/components/instructor-embedders/tests/test_instructor_backend.py @@ -0,0 +1,42 @@ +from unittest.mock import patch +import pytest +from instructor_embedders.embedding_backend.instructor_backend import _InstructorEmbeddingBackendFactory + + +@pytest.mark.unit +@patch("instructor_embedders.embedding_backend.instructor_backend.INSTRUCTOR") +def test_factory_behavior(mock_instructor): + embedding_backend = _InstructorEmbeddingBackendFactory.get_embedding_backend( + model_name_or_path="hkunlp/instructor-large", device="cpu" + ) + same_embedding_backend = _InstructorEmbeddingBackendFactory.get_embedding_backend("hkunlp/instructor-large", "cpu") + another_embedding_backend = _InstructorEmbeddingBackendFactory.get_embedding_backend( + model_name_or_path="hkunlp/instructor-base", device="cpu" + ) + + assert same_embedding_backend is embedding_backend + assert another_embedding_backend is not embedding_backend + + +@pytest.mark.unit +@patch("instructor_embedders.embedding_backend.instructor_backend.INSTRUCTOR") +def test_model_initialization(mock_instructor): + _InstructorEmbeddingBackendFactory.get_embedding_backend( + model_name_or_path="hkunlp/instructor-base", device="cpu", use_auth_token="huggingface_auth_token" + ) + mock_instructor.assert_called_once_with( + model_name_or_path="hkunlp/instructor-base", device="cpu", use_auth_token="huggingface_auth_token" + ) + + 
+@pytest.mark.unit +@patch("instructor_embedders.embedding_backend.instructor_backend.INSTRUCTOR") +def test_embedding_function_with_kwargs(mock_instructor): + embedding_backend = _InstructorEmbeddingBackendFactory.get_embedding_backend( + model_name_or_path="hkunlp/instructor-base" + ) + + data = [["instruction", "sentence1"], ["instruction", "sentence2"]] + embedding_backend.embed(data=data, normalize_embeddings=True) + + embedding_backend.model.encode.assert_called_once_with(data, normalize_embeddings=True) diff --git a/components/instructor-embedders/tests/test_instructor_document_embedder.py b/components/instructor-embedders/tests/test_instructor_document_embedder.py new file mode 100644 index 000000000..ecff4672a --- /dev/null +++ b/components/instructor-embedders/tests/test_instructor_document_embedder.py @@ -0,0 +1,257 @@ +from unittest.mock import patch, MagicMock +import pytest +import numpy as np + +from haystack.preview import Document +from instructor_embedders.instructor_document_embedder import InstructorDocumentEmbedder + + +class TestInstructorDocumentEmbedder: + @pytest.mark.unit + def test_init_default(self): + """ + Test default initialization parameters for InstructorDocumentEmbedder. + """ + embedder = InstructorDocumentEmbedder(model_name_or_path="hkunlp/instructor-base") + assert embedder.model_name_or_path == "hkunlp/instructor-base" + assert embedder.device == "cpu" + assert embedder.use_auth_token is None + assert embedder.instruction == "Represent the 'domain' 'text_type' for 'task_objective'" + assert embedder.batch_size == 32 + assert embedder.progress_bar is True + assert embedder.normalize_embeddings is False + assert embedder.metadata_fields_to_embed == [] + assert embedder.embedding_separator == "\n" + + @pytest.mark.unit + def test_init_with_parameters(self): + """ + Test custom initialization parameters for InstructorDocumentEmbedder. + """ + embedder = InstructorDocumentEmbedder( + model_name_or_path="hkunlp/instructor-base", + device="cuda", + use_auth_token=True, + instruction="Represent the 'domain' 'text_type' for 'task_objective'", + batch_size=64, + progress_bar=False, + normalize_embeddings=True, + metadata_fields_to_embed=["test_field"], + embedding_separator=" | ", + ) + assert embedder.model_name_or_path == "hkunlp/instructor-base" + assert embedder.device == "cuda" + assert embedder.use_auth_token is True + assert embedder.instruction == "Represent the 'domain' 'text_type' for 'task_objective'" + assert embedder.batch_size == 64 + assert embedder.progress_bar is False + assert embedder.normalize_embeddings is True + assert embedder.metadata_fields_to_embed == ["test_field"] + assert embedder.embedding_separator == " | " + + @pytest.mark.unit + def test_to_dict(self): + """ + Test serialization of InstructorDocumentEmbedder to a dictionary, using default initialization parameters. 
+ """ + embedder = InstructorDocumentEmbedder(model_name_or_path="hkunlp/instructor-base") + embedder_dict = embedder.to_dict() + assert embedder_dict == { + "type": "InstructorDocumentEmbedder", + "init_parameters": { + "model_name_or_path": "hkunlp/instructor-base", + "device": "cpu", + "use_auth_token": None, + "instruction": "Represent the 'domain' 'text_type' for 'task_objective'", + "batch_size": 32, + "progress_bar": True, + "normalize_embeddings": False, + "embedding_separator": "\n", + "metadata_fields_to_embed": [], + }, + } + + @pytest.mark.unit + def test_to_dict_with_custom_init_parameters(self): + """ + Test serialization of InstructorDocumentEmbedder to a dictionary, using custom initialization parameters. + """ + embedder = InstructorDocumentEmbedder( + model_name_or_path="hkunlp/instructor-base", + device="cuda", + use_auth_token=True, + instruction="Represent the financial document for retrieval", + batch_size=64, + progress_bar=False, + normalize_embeddings=True, + metadata_fields_to_embed=["test_field"], + embedding_separator=" | ", + ) + embedder_dict = embedder.to_dict() + assert embedder_dict == { + "type": "InstructorDocumentEmbedder", + "init_parameters": { + "model_name_or_path": "hkunlp/instructor-base", + "device": "cuda", + "use_auth_token": True, + "instruction": "Represent the financial document for retrieval", + "batch_size": 64, + "progress_bar": False, + "normalize_embeddings": True, + "metadata_fields_to_embed": ["test_field"], + "embedding_separator": " | ", + }, + } + + @pytest.mark.unit + def test_from_dict(self): + """ + Test deserialization of InstructorDocumentEmbedder from a dictionary, using default initialization parameters. + """ + embedder_dict = { + "type": "InstructorDocumentEmbedder", + "init_parameters": { + "model_name_or_path": "hkunlp/instructor-base", + "device": "cpu", + "use_auth_token": None, + "instruction": "Represent the 'domain' 'text_type' for 'task_objective'", + "batch_size": 32, + "progress_bar": True, + "normalize_embeddings": False, + "metadata_fields_to_embed": [], + "embedding_separator": "\n", + }, + } + embedder = InstructorDocumentEmbedder.from_dict(embedder_dict) + assert embedder.model_name_or_path == "hkunlp/instructor-base" + assert embedder.device == "cpu" + assert embedder.use_auth_token is None + assert embedder.instruction == "Represent the 'domain' 'text_type' for 'task_objective'" + assert embedder.batch_size == 32 + assert embedder.progress_bar is True + assert embedder.normalize_embeddings is False + assert embedder.metadata_fields_to_embed == [] + assert embedder.embedding_separator == "\n" + + @pytest.mark.unit + def test_from_dict_with_custom_init_parameters(self): + """ + Test deserialization of InstructorDocumentEmbedder from a dictionary, using custom initialization parameters. 
+ """ + embedder_dict = { + "type": "InstructorDocumentEmbedder", + "init_parameters": { + "model_name_or_path": "hkunlp/instructor-base", + "device": "cuda", + "use_auth_token": True, + "instruction": "Represent the financial document for retrieval", + "batch_size": 64, + "progress_bar": False, + "normalize_embeddings": True, + "metadata_fields_to_embed": ["test_field"], + "embedding_separator": " | ", + }, + } + embedder = InstructorDocumentEmbedder.from_dict(embedder_dict) + assert embedder.model_name_or_path == "hkunlp/instructor-base" + assert embedder.device == "cuda" + assert embedder.use_auth_token is True + assert embedder.instruction == "Represent the financial document for retrieval" + assert embedder.batch_size == 64 + assert embedder.progress_bar is False + assert embedder.normalize_embeddings is True + assert embedder.metadata_fields_to_embed == ["test_field"] + assert embedder.embedding_separator == " | " + + @pytest.mark.unit + @patch("instructor_embedders.instructor_document_embedder._InstructorEmbeddingBackendFactory") + def test_warmup(self, mocked_factory): + """ + Test for checking embedder instances after warm-up. + """ + embedder = InstructorDocumentEmbedder(model_name_or_path="hkunlp/instructor-base") + mocked_factory.get_embedding_backend.assert_not_called() + embedder.warm_up() + mocked_factory.get_embedding_backend.assert_called_once_with( + model_name_or_path="hkunlp/instructor-base", device="cpu", use_auth_token=None + ) + + @pytest.mark.unit + @patch("instructor_embedders.instructor_document_embedder._InstructorEmbeddingBackendFactory") + def test_warmup_does_not_reload(self, mocked_factory): + """ + Test for checking backend instances after multiple warm-ups. + """ + embedder = InstructorDocumentEmbedder(model_name_or_path="hkunlp/instructor-base") + mocked_factory.get_embedding_backend.assert_not_called() + embedder.warm_up() + embedder.warm_up() + mocked_factory.get_embedding_backend.assert_called_once() + + @pytest.mark.unit + def test_embed(self): + """ + Test for checking output dimensions and embedding dimensions. + """ + embedder = InstructorDocumentEmbedder(model_name_or_path="hkunlp/instructor-base") + embedder.embedding_backend = MagicMock() + embedder.embedding_backend.embed = lambda x, **kwargs: np.random.rand(len(x), 16).tolist() + + documents = [Document(text=f"Sample-document text {i}") for i in range(5)] + + result = embedder.run(documents=documents) + + assert isinstance(result["documents"], list) + assert len(result["documents"]) == len(documents) + for doc in result["documents"]: + assert isinstance(doc, Document) + assert isinstance(doc.embedding, list) + assert isinstance(doc.embedding[0], float) + + @pytest.mark.unit + def test_embed_incorrect_input_format(self): + """ + Test for checking incorrect input format when creating embedding. + """ + embedder = InstructorDocumentEmbedder(model_name_or_path="hkunlp/instructor-base") + + string_input = "text" + list_integers_input = [1, 2, 3] + + with pytest.raises(TypeError, match="InstructorDocumentEmbedder expects a list of Documents as input."): + embedder.run(documents=string_input) + + with pytest.raises(TypeError, match="InstructorDocumentEmbedder expects a list of Documents as input."): + embedder.run(documents=list_integers_input) + + @pytest.mark.unit + def test_embed_metadata(self): + """ + Test for checking output dimensions and embedding dimensions for documents with a custom instruction and metadata. 
+ """ + embedder = InstructorDocumentEmbedder( + model_name_or_path="model", + instruction="Represent the financial document for retrieval", + metadata_fields_to_embed=["meta_field"], + embedding_separator="\n", + ) + embedder.embedding_backend = MagicMock() + + documents = [ + Document(text=f"document-number {i}", metadata={"meta_field": f"meta_value {i}"}) for i in range(5) + ] + + embedder.run(documents=documents) + + embedder.embedding_backend.embed.assert_called_once_with( + [ + ["Represent the financial document for retrieval", "meta_value 0\ndocument-number 0"], + ["Represent the financial document for retrieval", "meta_value 1\ndocument-number 1"], + ["Represent the financial document for retrieval", "meta_value 2\ndocument-number 2"], + ["Represent the financial document for retrieval", "meta_value 3\ndocument-number 3"], + ["Represent the financial document for retrieval", "meta_value 4\ndocument-number 4"], + ], + batch_size=32, + show_progress_bar=True, + normalize_embeddings=False, + ) diff --git a/components/instructor-embedders/tests/test_instructor_text_embedder.py b/components/instructor-embedders/tests/test_instructor_text_embedder.py new file mode 100644 index 000000000..296dfa65d --- /dev/null +++ b/components/instructor-embedders/tests/test_instructor_text_embedder.py @@ -0,0 +1,199 @@ +from unittest.mock import patch, MagicMock +import pytest + +import numpy as np + +from instructor_embedders.instructor_text_embedder import InstructorTextEmbedder + + +class TestInstructorTextEmbedder: + @pytest.mark.unit + def test_init_default(self): + """ + Test default initialization parameters for InstructorTextEmbedder. + """ + embedder = InstructorTextEmbedder(model_name_or_path="hkunlp/instructor-base") + assert embedder.model_name_or_path == "hkunlp/instructor-base" + assert embedder.device == "cpu" + assert embedder.use_auth_token is None + assert embedder.instruction == "Represent the 'domain' 'text_type' for 'task_objective'" + assert embedder.batch_size == 32 + assert embedder.progress_bar is True + assert embedder.normalize_embeddings is False + + @pytest.mark.unit + def test_init_with_parameters(self): + """ + Test custom initialization parameters for InstructorTextEmbedder. + """ + embedder = InstructorTextEmbedder( + model_name_or_path="hkunlp/instructor-base", + device="cuda", + use_auth_token=True, + instruction="Represent the 'domain' 'text_type' for 'task_objective'", + batch_size=64, + progress_bar=False, + normalize_embeddings=True, + ) + assert embedder.model_name_or_path == "hkunlp/instructor-base" + assert embedder.device == "cuda" + assert embedder.use_auth_token is True + assert embedder.instruction == "Represent the 'domain' 'text_type' for 'task_objective'" + assert embedder.batch_size == 64 + assert embedder.progress_bar is False + assert embedder.normalize_embeddings is True + + @pytest.mark.unit + def test_to_dict(self): + """ + Test serialization of InstructorTextEmbedder to a dictionary, using default initialization parameters. 
+ """ + embedder = InstructorTextEmbedder(model_name_or_path="hkunlp/instructor-base") + embedder_dict = embedder.to_dict() + assert embedder_dict == { + "type": "InstructorTextEmbedder", + "init_parameters": { + "model_name_or_path": "hkunlp/instructor-base", + "device": "cpu", + "use_auth_token": None, + "instruction": "Represent the 'domain' 'text_type' for 'task_objective'", + "batch_size": 32, + "progress_bar": True, + "normalize_embeddings": False, + }, + } + + @pytest.mark.unit + def test_to_dict_with_custom_init_parameters(self): + """ + Test serialization of InstructorTextEmbedder to a dictionary, using custom initialization parameters. + """ + embedder = InstructorTextEmbedder( + model_name_or_path="hkunlp/instructor-base", + device="cuda", + use_auth_token=True, + instruction="Represent the financial document for retrieval", + batch_size=64, + progress_bar=False, + normalize_embeddings=True, + ) + embedder_dict = embedder.to_dict() + assert embedder_dict == { + "type": "InstructorTextEmbedder", + "init_parameters": { + "model_name_or_path": "hkunlp/instructor-base", + "device": "cuda", + "use_auth_token": True, + "instruction": "Represent the financial document for retrieval", + "batch_size": 64, + "progress_bar": False, + "normalize_embeddings": True, + }, + } + + @pytest.mark.unit + def test_from_dict(self): + """ + Test deserialization of InstructorTextEmbedder from a dictionary, using default initialization parameters. + """ + embedder_dict = { + "type": "InstructorTextEmbedder", + "init_parameters": { + "model_name_or_path": "hkunlp/instructor-base", + "device": "cpu", + "use_auth_token": None, + "instruction": "Represent the 'domain' 'text_type' for 'task_objective'", + "batch_size": 32, + "progress_bar": True, + "normalize_embeddings": False, + }, + } + embedder = InstructorTextEmbedder.from_dict(embedder_dict) + assert embedder.model_name_or_path == "hkunlp/instructor-base" + assert embedder.device == "cpu" + assert embedder.use_auth_token is None + assert embedder.instruction == "Represent the 'domain' 'text_type' for 'task_objective'" + assert embedder.batch_size == 32 + assert embedder.progress_bar is True + assert embedder.normalize_embeddings is False + + @pytest.mark.unit + def test_from_dict_with_custom_init_parameters(self): + """ + Test deserialization of InstructorTextEmbedder from a dictionary, using custom initialization parameters. + """ + embedder_dict = { + "type": "InstructorTextEmbedder", + "init_parameters": { + "model_name_or_path": "hkunlp/instructor-base", + "device": "cuda", + "use_auth_token": True, + "instruction": "Represent the financial document for retrieval", + "batch_size": 64, + "progress_bar": False, + "normalize_embeddings": True, + }, + } + embedder = InstructorTextEmbedder.from_dict(embedder_dict) + assert embedder.model_name_or_path == "hkunlp/instructor-base" + assert embedder.device == "cuda" + assert embedder.use_auth_token is True + assert embedder.instruction == "Represent the financial document for retrieval" + assert embedder.batch_size == 64 + assert embedder.progress_bar is False + assert embedder.normalize_embeddings is True + + @pytest.mark.unit + @patch("instructor_embedders.instructor_text_embedder._InstructorEmbeddingBackendFactory") + def test_warmup(self, mocked_factory): + """ + Test for checking embedder instances after warm-up. 
+ """ + embedder = InstructorTextEmbedder(model_name_or_path="hkunlp/instructor-base") + mocked_factory.get_embedding_backend.assert_not_called() + embedder.warm_up() + mocked_factory.get_embedding_backend.assert_called_once_with( + model_name_or_path="hkunlp/instructor-base", device="cpu", use_auth_token=None + ) + + @pytest.mark.unit + @patch("instructor_embedders.instructor_text_embedder._InstructorEmbeddingBackendFactory") + def test_warmup_does_not_reload(self, mocked_factory): + """ + Test for checking backend instances after multiple warm-ups. + """ + embedder = InstructorTextEmbedder(model_name_or_path="hkunlp/instructor-base") + mocked_factory.get_embedding_backend.assert_not_called() + embedder.warm_up() + embedder.warm_up() + mocked_factory.get_embedding_backend.assert_called_once() + + @pytest.mark.unit + def test_embed(self): + """ + Test for checking output dimensions and embedding dimensions. + """ + embedder = InstructorTextEmbedder(model_name_or_path="hkunlp/instructor-base") + embedder.embedding_backend = MagicMock() + embedder.embedding_backend.embed = lambda x, **kwargs: np.random.rand(len(x), 16).tolist() + + text = "Good text to embed" + + result = embedder.run(text=text) + embedding = result["embedding"] + + assert isinstance(embedding, list) + assert all(isinstance(emb, float) for emb in embedding) + + @pytest.mark.unit + def test_run_wrong_incorrect_format(self): + """ + Test for checking incorrect input format when creating embedding. + """ + embedder = InstructorTextEmbedder(model_name_or_path="hkunlp/instructor-base") + embedder.embedding_backend = MagicMock() + + list_integers_input = [1, 2, 3] + + with pytest.raises(TypeError, match="InstructorTextEmbedder expects a string as input"): + embedder.run(text=list_integers_input) From 9c7f0f71aa5a8bff35d7d374c9fd929fb34e4967 Mon Sep 17 00:00:00 2001 From: awinml <97467100+awinml@users.noreply.github.com> Date: Thu, 28 Sep 2023 17:30:02 +0530 Subject: [PATCH 02/14] Update Source URL in pyproject.toml --- components/instructor-embedders/pyproject.toml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/components/instructor-embedders/pyproject.toml b/components/instructor-embedders/pyproject.toml index d419e3de8..b5e2c2128 100644 --- a/components/instructor-embedders/pyproject.toml +++ b/components/instructor-embedders/pyproject.toml @@ -49,7 +49,7 @@ dependencies = [ [project.urls] Documentation = "https://github.com/deepset-ai/haystack-extras/tree/main/components/instructor-embedders#readme" Issues = "https://github.com/deepset-ai/haystack-extras/issues" -Source = "https://github.com/deepset-ai/haystack-extras/tree/main/components/text2speech" +Source = "https://github.com/deepset-ai/haystack-extras/tree/main/components/instructor-embedders" [tool.hatch.version] path = "instructor_embedders/__about__.py" From f14b334b37562998bdd31d1c0f868e34fc176393 Mon Sep 17 00:00:00 2001 From: anakin87 Date: Mon, 2 Oct 2023 15:56:19 +0200 Subject: [PATCH 03/14] make ruff happy --- .../embedding_backend/instructor_backend.py | 4 +- .../instructor_document_embedder.py | 38 +++++++++++-------- .../instructor_text_embedder.py | 32 +++++++++------- .../instructor-embedders/pyproject.toml | 1 - .../tests/test_instructor_backend.py | 6 ++- .../test_instructor_document_embedder.py | 12 +++--- .../tests/test_instructor_embedders.py | 1 + .../tests/test_instructor_text_embedder.py | 6 +-- 8 files changed, 58 insertions(+), 42 deletions(-) diff --git 
a/components/instructor-embedders/instructor_embedders/embedding_backend/instructor_backend.py b/components/instructor-embedders/instructor_embedders/embedding_backend/instructor_backend.py index cdde82394..c3ff3a79b 100644 --- a/components/instructor-embedders/instructor_embedders/embedding_backend/instructor_backend.py +++ b/components/instructor-embedders/instructor_embedders/embedding_backend/instructor_backend.py @@ -1,4 +1,4 @@ -from typing import List, Optional, Union, Dict +from typing import ClassVar, Dict, List, Optional, Union from haystack.preview.lazy_imports import LazyImport @@ -11,7 +11,7 @@ class _InstructorEmbeddingBackendFactory: Factory class to create instances of INSTRUCTOR embedding backends. """ - _instances: Dict[str, "_InstructorEmbeddingBackend"] = {} + _instances: ClassVar[Dict[str, "_InstructorEmbeddingBackend"]] = {} @staticmethod def get_embedding_backend( diff --git a/components/instructor-embedders/instructor_embedders/instructor_document_embedder.py b/components/instructor-embedders/instructor_embedders/instructor_document_embedder.py index a1c049bce..7618cda93 100644 --- a/components/instructor-embedders/instructor_embedders/instructor_document_embedder.py +++ b/components/instructor-embedders/instructor_embedders/instructor_document_embedder.py @@ -1,6 +1,7 @@ -from typing import List, Optional, Union, Dict, Any +from typing import Any, Dict, List, Optional, Union + +from haystack.preview import Document, component, default_from_dict, default_to_dict -from haystack.preview import component, Document, default_to_dict, default_from_dict from instructor_embedders.embedding_backend.instructor_backend import _InstructorEmbeddingBackendFactory @@ -26,15 +27,20 @@ def __init__( """ Create a InstructorDocumentEmbedder component. - :param model_name_or_path: Local path or name of the model in Hugging Face's model hub, such as ``'sentence-transformers/all-mpnet-base-v2'``. - :param device: Device (like 'cuda' / 'cpu') that should be used for computation. If None, checks if a GPU can be used. + :param model_name_or_path: Local path or name of the model in Hugging Face's model hub, + such as ``'hkunlp/instructor-base'``. + :param device: Device (like 'cuda' / 'cpu') that should be used for computation. + If None, checks if a GPU can be used. :param use_auth_token: The API token used to download private models from Hugging Face. - If this parameter is set to `True`, then the token generated when running - `transformers-cli login` (stored in ~/.huggingface) will be used. - :param instruction: The instruction string to be used while computing domain specific embeddings. The instruction follows the unified template of the form: "Represent the 'domain' 'text_type' for 'task_objective'", where - - "domain" is optional, and it specifies the domain of the text, e.g., science, finance, medicine, etc. - - "text_type" is required, and it specifies the encoding unit, e.g., sentence, document, paragraph, etc. - - "task_objective" is optional, and it specifies the objective of embedding, e.g., retrieve a document, classify the sentence, etc. + If this parameter is set to `True`, then the token generated when running + `transformers-cli login` (stored in ~/.huggingface) will be used. + :param instruction: The instruction string to be used while computing domain specific embeddings. 
+ The instruction follows the unified template of the form: + "Represent the 'domain' 'text_type' for 'task_objective'", where + - "domain" is optional, and it specifies the domain of the text, e.g., science, finance, medicine, etc. + - "text_type" is required, and it specifies the encoding unit, e.g., sentence, document, paragraph, etc. + - "task_objective" is optional, and it specifies the objective of embedding, e.g., retrieve a document, + classify the sentence, etc. :param batch_size: Number of strings to encode at once. :param progress_bar: If true, displays progress bar during embedding. :param normalize_embeddings: If set to true, returned vectors will have length 1. @@ -93,12 +99,12 @@ def run(self, documents: List[Document]): The embedding of each Document is stored in the `embedding` field of the Document. """ if not isinstance(documents, list) or not isinstance(documents[0], Document): - raise TypeError( - "InstructorDocumentEmbedder expects a list of Documents as input." - "In case you want to embed a list of strings, please use the InstructorTextEmbedder." - ) + msg = ("InstructorDocumentEmbedder expects a list of Documents as input. " + "In case you want to embed a list of strings, please use the InstructorTextEmbedder.") + raise TypeError(msg) if not hasattr(self, "embedding_backend"): - raise RuntimeError("The embedding model has not been loaded. Please call warm_up() before running.") + msg = "The embedding model has not been loaded. Please call warm_up() before running." + raise RuntimeError(msg) # TODO: once non textual Documents are properly supported, we should also prepare them for embedding here @@ -109,7 +115,7 @@ def run(self, documents: List[Document]): for key in self.metadata_fields_to_embed if key in doc.metadata and doc.metadata[key] ] - text_to_embed = [self.instruction, self.embedding_separator.join(meta_values_to_embed + [doc.text or ""])] + text_to_embed = [self.instruction, self.embedding_separator.join([*meta_values_to_embed, doc.text or ""])] texts_to_embed.append(text_to_embed) embeddings = self.embedding_backend.embed( diff --git a/components/instructor-embedders/instructor_embedders/instructor_text_embedder.py b/components/instructor-embedders/instructor_embedders/instructor_text_embedder.py index ccbdd5e3e..51e491ed4 100644 --- a/components/instructor-embedders/instructor_embedders/instructor_text_embedder.py +++ b/components/instructor-embedders/instructor_embedders/instructor_text_embedder.py @@ -1,6 +1,7 @@ -from typing import List, Optional, Union, Dict, Any +from typing import Any, Dict, List, Optional, Union + +from haystack.preview import component, default_from_dict, default_to_dict -from haystack.preview import component, default_to_dict, default_from_dict from instructor_embedders.embedding_backend.instructor_backend import _InstructorEmbeddingBackendFactory @@ -23,15 +24,20 @@ def __init__( """ Create a InstructorTextEmbedder component. - :param model_name_or_path: Local path or name of the model in Hugging Face's model hub, such as ``'sentence-transformers/all-mpnet-base-v2'``. - :param device: Device (like 'cuda' / 'cpu') that should be used for computation. If None, checks if a GPU can be used. + :param model_name_or_path: Local path or name of the model in Hugging Face's model hub, + such as ``'hkunlp/instructor-base'``. + :param device: Device (like 'cuda' / 'cpu') that should be used for computation. + If None, checks if a GPU can be used. :param use_auth_token: The API token used to download private models from Hugging Face. 
If this parameter is set to `True`, then the token generated when running `transformers-cli login` (stored in ~/.huggingface) will be used. - :param instruction: The instruction string to be used while computing domain specific embeddings. The instruction follows the unified template of the form: "Represent the 'domain' 'text_type' for 'task_objective'", where - - "domain" is optional, and it specifies the domain of the text, e.g., science, finance, medicine, etc. - - "text_type" is required, and it specifies the encoding unit, e.g., sentence, document, paragraph, etc. - - "task_objective" is optional, and it specifies the objective of embedding, e.g., retrieve a document, classify the sentence, etc. + :param instruction: The instruction string to be used while computing domain specific embeddings. + The instruction follows the unified template of the form: + "Represent the 'domain' 'text_type' for 'task_objective'", where + - "domain" is optional, and it specifies the domain of the text, e.g., science, finance, medicine, etc. + - "text_type" is required, and it specifies the encoding unit, e.g., sentence, document, paragraph, etc. + - "task_objective" is optional, and it specifies the objective of embedding, e.g., retrieve a document, + classify the sentence, etc. :param batch_size: Number of strings to encode at once. :param progress_bar: If true, displays progress bar during embedding. :param normalize_embeddings: If set to true, returned vectors will have length 1. @@ -81,12 +87,12 @@ def warm_up(self): def run(self, text: str): """Embed a string.""" if not isinstance(text, str): - raise TypeError( - "InstructorTextEmbedder expects a string as input." - "In case you want to embed a list of Documents, please use the InstructorDocumentEmbedder." - ) + msg = ("InstructorTextEmbedder expects a string as input. " + "In case you want to embed a list of Documents, please use the InstructorDocumentEmbedder.") + raise TypeError(msg) if not hasattr(self, "embedding_backend"): - raise RuntimeError("The embedding model has not been loaded. Please call warm_up() before running.") + msg = "The embedding model has not been loaded. Please call warm_up() before running." 
+ raise RuntimeError(msg) text_to_embed = [self.instruction, text] embedding = self.embedding_backend.embed( diff --git a/components/instructor-embedders/pyproject.toml b/components/instructor-embedders/pyproject.toml index bea7cc53a..efe6cca9e 100644 --- a/components/instructor-embedders/pyproject.toml +++ b/components/instructor-embedders/pyproject.toml @@ -87,7 +87,6 @@ select = [ "E", "EM", "F", - "FBT", "I", "ICN", "ISC", diff --git a/components/instructor-embedders/tests/test_instructor_backend.py b/components/instructor-embedders/tests/test_instructor_backend.py index 276eea2bb..334e02f6f 100644 --- a/components/instructor-embedders/tests/test_instructor_backend.py +++ b/components/instructor-embedders/tests/test_instructor_backend.py @@ -1,11 +1,13 @@ from unittest.mock import patch + import pytest + from instructor_embedders.embedding_backend.instructor_backend import _InstructorEmbeddingBackendFactory @pytest.mark.unit @patch("instructor_embedders.embedding_backend.instructor_backend.INSTRUCTOR") -def test_factory_behavior(mock_instructor): +def test_factory_behavior(mock_instructor): # noqa: ARG001 embedding_backend = _InstructorEmbeddingBackendFactory.get_embedding_backend( model_name_or_path="hkunlp/instructor-large", device="cpu" ) @@ -31,7 +33,7 @@ def test_model_initialization(mock_instructor): @pytest.mark.unit @patch("instructor_embedders.embedding_backend.instructor_backend.INSTRUCTOR") -def test_embedding_function_with_kwargs(mock_instructor): +def test_embedding_function_with_kwargs(mock_instructor): # noqa: ARG001 embedding_backend = _InstructorEmbeddingBackendFactory.get_embedding_backend( model_name_or_path="hkunlp/instructor-base" ) diff --git a/components/instructor-embedders/tests/test_instructor_document_embedder.py b/components/instructor-embedders/tests/test_instructor_document_embedder.py index ecff4672a..d580c83cc 100644 --- a/components/instructor-embedders/tests/test_instructor_document_embedder.py +++ b/components/instructor-embedders/tests/test_instructor_document_embedder.py @@ -1,8 +1,9 @@ -from unittest.mock import patch, MagicMock -import pytest -import numpy as np +from unittest.mock import MagicMock, patch +import numpy as np +import pytest from haystack.preview import Document + from instructor_embedders.instructor_document_embedder import InstructorDocumentEmbedder @@ -195,7 +196,7 @@ def test_embed(self): """ embedder = InstructorDocumentEmbedder(model_name_or_path="hkunlp/instructor-base") embedder.embedding_backend = MagicMock() - embedder.embedding_backend.embed = lambda x, **kwargs: np.random.rand(len(x), 16).tolist() + embedder.embedding_backend.embed = lambda x, **kwargs: np.random.rand(len(x), 16).tolist() # noqa: ARG005 documents = [Document(text=f"Sample-document text {i}") for i in range(5)] @@ -227,7 +228,8 @@ def test_embed_incorrect_input_format(self): @pytest.mark.unit def test_embed_metadata(self): """ - Test for checking output dimensions and embedding dimensions for documents with a custom instruction and metadata. + Test for checking output dimensions and embedding dimensions for documents + with a custom instruction and metadata. 
""" embedder = InstructorDocumentEmbedder( model_name_or_path="model", diff --git a/components/instructor-embedders/tests/test_instructor_embedders.py b/components/instructor-embedders/tests/test_instructor_embedders.py index 129c33f3c..1abbe3b32 100644 --- a/components/instructor-embedders/tests/test_instructor_embedders.py +++ b/components/instructor-embedders/tests/test_instructor_embedders.py @@ -2,5 +2,6 @@ # # SPDX-License-Identifier: Apache-2.0 + def test_instructor_embedders(): assert True diff --git a/components/instructor-embedders/tests/test_instructor_text_embedder.py b/components/instructor-embedders/tests/test_instructor_text_embedder.py index 296dfa65d..adb6be31a 100644 --- a/components/instructor-embedders/tests/test_instructor_text_embedder.py +++ b/components/instructor-embedders/tests/test_instructor_text_embedder.py @@ -1,7 +1,7 @@ -from unittest.mock import patch, MagicMock -import pytest +from unittest.mock import MagicMock, patch import numpy as np +import pytest from instructor_embedders.instructor_text_embedder import InstructorTextEmbedder @@ -175,7 +175,7 @@ def test_embed(self): """ embedder = InstructorTextEmbedder(model_name_or_path="hkunlp/instructor-base") embedder.embedding_backend = MagicMock() - embedder.embedding_backend.embed = lambda x, **kwargs: np.random.rand(len(x), 16).tolist() + embedder.embedding_backend.embed = lambda x, **kwargs: np.random.rand(len(x), 16).tolist() # noqa: ARG005 text = "Good text to embed" From 50128e61aeb314ab3daced67b3548fb3447ab53e Mon Sep 17 00:00:00 2001 From: anakin87 Date: Mon, 2 Oct 2023 16:14:14 +0200 Subject: [PATCH 04/14] little improvements --- .../instructor_embedders/instructor_document_embedder.py | 5 +++-- .../instructor_embedders/instructor_text_embedder.py | 1 + 2 files changed, 4 insertions(+), 2 deletions(-) diff --git a/components/instructor-embedders/instructor_embedders/instructor_document_embedder.py b/components/instructor-embedders/instructor_embedders/instructor_document_embedder.py index 7618cda93..af463aeb3 100644 --- a/components/instructor-embedders/instructor_embedders/instructor_document_embedder.py +++ b/components/instructor-embedders/instructor_embedders/instructor_document_embedder.py @@ -41,6 +41,7 @@ def __init__( - "text_type" is required, and it specifies the encoding unit, e.g., sentence, document, paragraph, etc. - "task_objective" is optional, and it specifies the objective of embedding, e.g., retrieve a document, classify the sentence, etc. + Check some examples of instructions here: https://github.com/xlang-ai/instructor-embedding#use-cases :param batch_size: Number of strings to encode at once. :param progress_bar: If true, displays progress bar during embedding. :param normalize_embeddings: If set to true, returned vectors will have length 1. @@ -98,7 +99,7 @@ def run(self, documents: List[Document]): Embed a list of Documents. The embedding of each Document is stored in the `embedding` field of the Document. """ - if not isinstance(documents, list) or not isinstance(documents[0], Document): + if not isinstance(documents, list) or documents and not isinstance(documents[0], Document): msg = ("InstructorDocumentEmbedder expects a list of Documents as input. 
" "In case you want to embed a list of strings, please use the InstructorTextEmbedder.") raise TypeError(msg) @@ -113,7 +114,7 @@ def run(self, documents: List[Document]): meta_values_to_embed = [ str(doc.metadata[key]) for key in self.metadata_fields_to_embed - if key in doc.metadata and doc.metadata[key] + if key in doc.metadata and doc.metadata[key] is not None ] text_to_embed = [self.instruction, self.embedding_separator.join([*meta_values_to_embed, doc.text or ""])] texts_to_embed.append(text_to_embed) diff --git a/components/instructor-embedders/instructor_embedders/instructor_text_embedder.py b/components/instructor-embedders/instructor_embedders/instructor_text_embedder.py index 51e491ed4..decca441d 100644 --- a/components/instructor-embedders/instructor_embedders/instructor_text_embedder.py +++ b/components/instructor-embedders/instructor_embedders/instructor_text_embedder.py @@ -38,6 +38,7 @@ def __init__( - "text_type" is required, and it specifies the encoding unit, e.g., sentence, document, paragraph, etc. - "task_objective" is optional, and it specifies the objective of embedding, e.g., retrieve a document, classify the sentence, etc. + Check some examples of instructions here: https://github.com/xlang-ai/instructor-embedding#use-cases :param batch_size: Number of strings to encode at once. :param progress_bar: If true, displays progress bar during embedding. :param normalize_embeddings: If set to true, returned vectors will have length 1. From 217067c8e8ce0dd132af5a6863499678774577cd Mon Sep 17 00:00:00 2001 From: anakin87 Date: Mon, 2 Oct 2023 17:37:38 +0200 Subject: [PATCH 05/14] little improvements --- .../instructor_document_embedder.py | 2 +- .../instructor_text_embedder.py | 2 +- .../test_instructor_document_embedder.py | 22 ++++++++++++++--- .../tests/test_instructor_text_embedder.py | 24 +++++++++++++++---- 4 files changed, 41 insertions(+), 9 deletions(-) diff --git a/components/instructor-embedders/instructor_embedders/instructor_document_embedder.py b/components/instructor-embedders/instructor_embedders/instructor_document_embedder.py index af463aeb3..5719ff88e 100644 --- a/components/instructor-embedders/instructor_embedders/instructor_document_embedder.py +++ b/components/instructor-embedders/instructor_embedders/instructor_document_embedder.py @@ -17,7 +17,7 @@ def __init__( model_name_or_path: str = "hkunlp/instructor-base", device: Optional[str] = None, use_auth_token: Union[bool, str, None] = None, - instruction: str = "Represent the 'domain' 'text_type' for 'task_objective'", + instruction: str = "Represent the document", batch_size: int = 32, progress_bar: bool = True, normalize_embeddings: bool = False, diff --git a/components/instructor-embedders/instructor_embedders/instructor_text_embedder.py b/components/instructor-embedders/instructor_embedders/instructor_text_embedder.py index decca441d..bbf85462c 100644 --- a/components/instructor-embedders/instructor_embedders/instructor_text_embedder.py +++ b/components/instructor-embedders/instructor_embedders/instructor_text_embedder.py @@ -16,7 +16,7 @@ def __init__( model_name_or_path: str = "hkunlp/instructor-base", device: Optional[str] = None, use_auth_token: Union[bool, str, None] = None, - instruction: str = "Represent the 'domain' 'text_type' for 'task_objective'", + instruction: str = "Represent the sentence", batch_size: int = 32, progress_bar: bool = True, normalize_embeddings: bool = False, diff --git a/components/instructor-embedders/tests/test_instructor_document_embedder.py 
b/components/instructor-embedders/tests/test_instructor_document_embedder.py index d580c83cc..5519059c9 100644 --- a/components/instructor-embedders/tests/test_instructor_document_embedder.py +++ b/components/instructor-embedders/tests/test_instructor_document_embedder.py @@ -17,7 +17,7 @@ def test_init_default(self): assert embedder.model_name_or_path == "hkunlp/instructor-base" assert embedder.device == "cpu" assert embedder.use_auth_token is None - assert embedder.instruction == "Represent the 'domain' 'text_type' for 'task_objective'" + assert embedder.instruction == "Represent the document" assert embedder.batch_size == 32 assert embedder.progress_bar is True assert embedder.normalize_embeddings is False @@ -63,7 +63,7 @@ def test_to_dict(self): "model_name_or_path": "hkunlp/instructor-base", "device": "cpu", "use_auth_token": None, - "instruction": "Represent the 'domain' 'text_type' for 'task_objective'", + "instruction": "Represent the document", "batch_size": 32, "progress_bar": True, "normalize_embeddings": False, @@ -194,7 +194,7 @@ def test_embed(self): """ Test for checking output dimensions and embedding dimensions. """ - embedder = InstructorDocumentEmbedder(model_name_or_path="hkunlp/instructor-base") + embedder = InstructorDocumentEmbedder(model_name_or_path="hkunlp/instructor-large") embedder.embedding_backend = MagicMock() embedder.embedding_backend.embed = lambda x, **kwargs: np.random.rand(len(x), 16).tolist() # noqa: ARG005 @@ -257,3 +257,19 @@ def test_embed_metadata(self): show_progress_bar=True, normalize_embeddings=False, ) + + @pytest.mark.integration + def test_run(self): + embedder = InstructorDocumentEmbedder(model_name_or_path="hkunlp/instructor-base", + device="cpu", + instruction="Represent the Science document for retrieval") + embedder.warm_up() + + doc = Document(text="Parton energy loss in QCD matter") + + result = embedder.run(documents=[doc]) + embedding = result['documents'][0].embedding + + assert isinstance(embedding, list) + assert len(embedding) == 768 + assert all(isinstance(emb, float) for emb in embedding) diff --git a/components/instructor-embedders/tests/test_instructor_text_embedder.py b/components/instructor-embedders/tests/test_instructor_text_embedder.py index adb6be31a..e3afe91e8 100644 --- a/components/instructor-embedders/tests/test_instructor_text_embedder.py +++ b/components/instructor-embedders/tests/test_instructor_text_embedder.py @@ -16,7 +16,7 @@ def test_init_default(self): assert embedder.model_name_or_path == "hkunlp/instructor-base" assert embedder.device == "cpu" assert embedder.use_auth_token is None - assert embedder.instruction == "Represent the 'domain' 'text_type' for 'task_objective'" + assert embedder.instruction == "Represent the sentence" assert embedder.batch_size == 32 assert embedder.progress_bar is True assert embedder.normalize_embeddings is False @@ -56,7 +56,7 @@ def test_to_dict(self): "model_name_or_path": "hkunlp/instructor-base", "device": "cpu", "use_auth_token": None, - "instruction": "Represent the 'domain' 'text_type' for 'task_objective'", + "instruction": "Represent the sentence", "batch_size": 32, "progress_bar": True, "normalize_embeddings": False, @@ -173,7 +173,7 @@ def test_embed(self): """ Test for checking output dimensions and embedding dimensions. 
""" - embedder = InstructorTextEmbedder(model_name_or_path="hkunlp/instructor-base") + embedder = InstructorTextEmbedder(model_name_or_path="hkunlp/instructor-large") embedder.embedding_backend = MagicMock() embedder.embedding_backend.embed = lambda x, **kwargs: np.random.rand(len(x), 16).tolist() # noqa: ARG005 @@ -190,10 +190,26 @@ def test_run_wrong_incorrect_format(self): """ Test for checking incorrect input format when creating embedding. """ - embedder = InstructorTextEmbedder(model_name_or_path="hkunlp/instructor-base") + embedder = InstructorTextEmbedder(model_name_or_path="hkunlp/instructor-large") embedder.embedding_backend = MagicMock() list_integers_input = [1, 2, 3] with pytest.raises(TypeError, match="InstructorTextEmbedder expects a string as input"): embedder.run(text=list_integers_input) + + @pytest.mark.integration + def test_run(self): + embedder = InstructorTextEmbedder(model_name_or_path="hkunlp/instructor-base", + device="cpu", + instruction="Represent the Science sentence for retrieval") + embedder.warm_up() + + text = "Parton energy loss in QCD matter" + + result = embedder.run(text=text) + embedding = result["embedding"] + + assert isinstance(embedding, list) + assert len(embedding) == 768 + assert all(isinstance(emb, float) for emb in embedding) From 0807ae5aa63d65436946dd92e4563e1d85093155 Mon Sep 17 00:00:00 2001 From: anakin87 Date: Mon, 2 Oct 2023 17:40:12 +0200 Subject: [PATCH 06/14] fix ruff --- .../tests/test_instructor_document_embedder.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/components/instructor-embedders/tests/test_instructor_document_embedder.py b/components/instructor-embedders/tests/test_instructor_document_embedder.py index 5519059c9..faa9d715b 100644 --- a/components/instructor-embedders/tests/test_instructor_document_embedder.py +++ b/components/instructor-embedders/tests/test_instructor_document_embedder.py @@ -268,7 +268,7 @@ def test_run(self): doc = Document(text="Parton energy loss in QCD matter") result = embedder.run(documents=[doc]) - embedding = result['documents'][0].embedding + embedding = result["documents"][0].embedding assert isinstance(embedding, list) assert len(embedding) == 768 From a0c0c9dc5ce879dc5e30034adaa143fe9f3e9caa Mon Sep 17 00:00:00 2001 From: anakin87 Date: Mon, 2 Oct 2023 18:18:09 +0200 Subject: [PATCH 07/14] separate unit and integration test --- .github/workflows/components_instructor_embedders.yml | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) diff --git a/.github/workflows/components_instructor_embedders.yml b/.github/workflows/components_instructor_embedders.yml index 710776d12..0aa0bb30d 100644 --- a/.github/workflows/components_instructor_embedders.yml +++ b/.github/workflows/components_instructor_embedders.yml @@ -34,6 +34,10 @@ jobs: run: | pip install -e .[dev] - - name: Run tests + - name: Run unit tests run: | - pytest + pytest -v -m unit + + - name: Run integration tests + run: | + pytest -v -m integration From bf5dca60678ca32a3debb59aa37adf7e4ab9c10b Mon Sep 17 00:00:00 2001 From: Daria Fokina Date: Tue, 3 Oct 2023 10:36:13 +0200 Subject: [PATCH 08/14] document embedder docstrings cleanup --- .../instructor_document_embedder.py | 18 +++++++++--------- 1 file changed, 9 insertions(+), 9 deletions(-) diff --git a/components/instructor-embedders/instructor_embedders/instructor_document_embedder.py b/components/instructor-embedders/instructor_embedders/instructor_document_embedder.py index 5719ff88e..0044e2e63 100644 --- 
+++ b/components/instructor-embedders/instructor_embedders/instructor_document_embedder.py
@@ -25,26 +25,26 @@ def __init__(
         embedding_separator: str = "\n",
     ):
         """
-        Create a InstructorDocumentEmbedder component.
+        Create an InstructorDocumentEmbedder component.

         :param model_name_or_path: Local path or name of the model in Hugging Face's model hub,
             such as ``'hkunlp/instructor-base'``.
         :param device: Device (like 'cuda' / 'cpu') that should be used for computation.
             If None, checks if a GPU can be used.
-        :param use_auth_token: The API token used to download private models from Hugging Face.
+        :param use_auth_token: An API token used to download private models from Hugging Face.
             If this parameter is set to `True`, then the token generated when running
             `transformers-cli login` (stored in ~/.huggingface) will be used.
-        :param instruction: The instruction string to be used while computing domain specific embeddings.
+        :param instruction: The instruction string to be used while computing domain-specific embeddings.
             The instruction follows the unified template of the form:
-            "Represent the 'domain' 'text_type' for 'task_objective'", where
-          - "domain" is optional, and it specifies the domain of the text, e.g., science, finance, medicine, etc.
-          - "text_type" is required, and it specifies the encoding unit, e.g., sentence, document, paragraph, etc.
-          - "task_objective" is optional, and it specifies the objective of embedding, e.g., retrieve a document,
-            classify the sentence, etc.
+            "Represent the 'domain' 'text_type' for 'task_objective'", where:
+          - "domain" is optional, and it specifies the domain of the text, for example, science, finance, medicine, and so on.
+          - "text_type" is required, and it specifies the encoding unit, for example, sentence, document, paragraph, and so on.
+          - "task_objective" is optional, and it specifies the objective of embedding, for example, retrieve a document,
+            classify the sentence, and so on.
         Check some examples of instructions here: https://github.com/xlang-ai/instructor-embedding#use-cases
         :param batch_size: Number of strings to encode at once.
         :param progress_bar: If true, displays progress bar during embedding.
-        :param normalize_embeddings: If set to true, returned vectors will have length 1.
+        :param normalize_embeddings: If set to true, returned vectors will have a length of 1.
         :param metadata_fields_to_embed: List of meta fields that should be embedded along with the Document content.
         :param embedding_separator: Separator used to concatenate the meta fields to the Document content.
         """

From 42e696b66a6d9dd3880b699c9b0ad70aed230780 Mon Sep 17 00:00:00 2001
From: Daria Fokina
Date: Tue, 3 Oct 2023 10:37:58 +0200
Subject: [PATCH 09/14] text embedder docstrings cleanup

---
 .../instructor_text_embedder.py | 18 +++++++++---------
 1 file changed, 9 insertions(+), 9 deletions(-)

diff --git a/components/instructor-embedders/instructor_embedders/instructor_text_embedder.py b/components/instructor-embedders/instructor_embedders/instructor_text_embedder.py
index bbf85462c..a2732e2de 100644
--- a/components/instructor-embedders/instructor_embedders/instructor_text_embedder.py
+++ b/components/instructor-embedders/instructor_embedders/instructor_text_embedder.py
@@ -22,7 +22,7 @@ def __init__(
         normalize_embeddings: bool = False,
     ):
         """
-        Create a InstructorTextEmbedder component.
+        Create an InstructorTextEmbedder component.

         :param model_name_or_path: Local path or name of the model in Hugging Face's model hub,
             such as ``'hkunlp/instructor-base'``.
@@ -31,17 +31,17 @@ def __init__(
         :param use_auth_token: The API token used to download private models from Hugging Face.
             If this parameter is set to `True`, then the token generated when running
             `transformers-cli login` (stored in ~/.huggingface) will be used.
-        :param instruction: The instruction string to be used while computing domain specific embeddings.
+        :param instruction: The instruction string to be used while computing domain-specific embeddings.
             The instruction follows the unified template of the form:
-            "Represent the 'domain' 'text_type' for 'task_objective'", where
-          - "domain" is optional, and it specifies the domain of the text, e.g., science, finance, medicine, etc.
-          - "text_type" is required, and it specifies the encoding unit, e.g., sentence, document, paragraph, etc.
-          - "task_objective" is optional, and it specifies the objective of embedding, e.g., retrieve a document,
-            classify the sentence, etc.
-        Check some examples of instructions here: https://github.com/xlang-ai/instructor-embedding#use-cases
+            "Represent the 'domain' 'text_type' for 'task_objective'", where:
+          - "domain" is optional, and it specifies the domain of the text, for example, science, finance, medicine, and so on.
+          - "text_type" is required, and it specifies the encoding unit, for example, sentence, document, paragraph, and so on.
+          - "task_objective" is optional, and it specifies the objective of embedding, for example, retrieve a document,
+            classify the sentence, and so on.
+        Check some examples of instructions here: https://github.com/xlang-ai/instructor-embedding#use-cases.
         :param batch_size: Number of strings to encode at once.
         :param progress_bar: If true, displays progress bar during embedding.
-        :param normalize_embeddings: If set to true, returned vectors will have length 1.
+        :param normalize_embeddings: If set to true, returned vectors will have a length of 1.
         """

         self.model_name_or_path = model_name_or_path

From cb200f183974ffd992557f251ab871b00480158f Mon Sep 17 00:00:00 2001
From: Daria Fokina
Date: Tue, 3 Oct 2023 10:40:20 +0200
Subject: [PATCH 10/14] cut docstrings

---
 .../instructor_document_embedder.py | 10 ++++++----
 1 file changed, 6 insertions(+), 4 deletions(-)

diff --git a/components/instructor-embedders/instructor_embedders/instructor_document_embedder.py b/components/instructor-embedders/instructor_embedders/instructor_document_embedder.py
index 0044e2e63..e1da5648b 100644
--- a/components/instructor-embedders/instructor_embedders/instructor_document_embedder.py
+++ b/components/instructor-embedders/instructor_embedders/instructor_document_embedder.py
@@ -37,10 +37,12 @@ def __init__(
         :param instruction: The instruction string to be used while computing domain-specific embeddings.
             The instruction follows the unified template of the form:
             "Represent the 'domain' 'text_type' for 'task_objective'", where:
-          - "domain" is optional, and it specifies the domain of the text, for example, science, finance, medicine, and so on.
-          - "text_type" is required, and it specifies the encoding unit, for example, sentence, document, paragraph, and so on.
-          - "task_objective" is optional, and it specifies the objective of embedding, for example, retrieve a document,
-            classify the sentence, and so on.
+          - "domain" is optional, and it specifies the domain of the text, for example, science, finance,
+            medicine, and so on.
+ - "text_type" is required, and it specifies the encoding unit, for example, sentence, document, + paragraph, and so on. + - "task_objective" is optional, and it specifies the objective of embedding, for example, + retrieve a document, classify the sentence, and so on. Check some examples of instructions here: https://github.com/xlang-ai/instructor-embedding#use-cases :param batch_size: Number of strings to encode at once. :param progress_bar: If true, displays progress bar during embedding. From 110cafd41ca11253113ff883f8e783547a339e64 Mon Sep 17 00:00:00 2001 From: Daria Fokina Date: Tue, 3 Oct 2023 10:42:40 +0200 Subject: [PATCH 11/14] cut docstrings 2 --- .../instructor_document_embedder.py | 10 ++++------ 1 file changed, 4 insertions(+), 6 deletions(-) diff --git a/components/instructor-embedders/instructor_embedders/instructor_document_embedder.py b/components/instructor-embedders/instructor_embedders/instructor_document_embedder.py index e1da5648b..3724ef9c4 100644 --- a/components/instructor-embedders/instructor_embedders/instructor_document_embedder.py +++ b/components/instructor-embedders/instructor_embedders/instructor_document_embedder.py @@ -37,12 +37,10 @@ def __init__( :param instruction: The instruction string to be used while computing domain-specific embeddings. The instruction follows the unified template of the form: "Represent the 'domain' 'text_type' for 'task_objective'", where: - - "domain" is optional, and it specifies the domain of the text, for example, science, finance, - medicine, and so on. - - "text_type" is required, and it specifies the encoding unit, for example, sentence, document, - paragraph, and so on. - - "task_objective" is optional, and it specifies the objective of embedding, for example, - retrieve a document, classify the sentence, and so on. + - "domain" is optional, and it specifies the domain of the text, e.g., science, finance, medicine, etc. + - "text_type" is required, and it specifies the encoding unit, e.g., sentence, document, paragraph, etc. + - "task_objective" is optional, and it specifies the objective of embedding, e.g., retrieve a document, + classify the sentence, etc. Check some examples of instructions here: https://github.com/xlang-ai/instructor-embedding#use-cases :param batch_size: Number of strings to encode at once. :param progress_bar: If true, displays progress bar during embedding. From 94c7ef12a43b3216dc6c4ced0225fe0825510627 Mon Sep 17 00:00:00 2001 From: Daria Fokina Date: Tue, 3 Oct 2023 10:43:27 +0200 Subject: [PATCH 12/14] cut docstrings 3 --- .../instructor_embedders/instructor_text_embedder.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/components/instructor-embedders/instructor_embedders/instructor_text_embedder.py b/components/instructor-embedders/instructor_embedders/instructor_text_embedder.py index a2732e2de..dd0ec48c3 100644 --- a/components/instructor-embedders/instructor_embedders/instructor_text_embedder.py +++ b/components/instructor-embedders/instructor_embedders/instructor_text_embedder.py @@ -34,10 +34,10 @@ def __init__( :param instruction: The instruction string to be used while computing domain-specific embeddings. The instruction follows the unified template of the form: "Represent the 'domain' 'text_type' for 'task_objective'", where: - - "domain" is optional, and it specifies the domain of the text, for example, science, finance, medicine, and so on. 
- - "text_type" is required, and it specifies the encoding unit, for example, sentence, document, paragraph, and so on. - - "task_objective" is optional, and it specifies the objective of embedding, for example, retrieve a document, - classify the sentence, and so on. + - "domain" is optional, and it specifies the domain of the text, e.g., science, finance, medicine, etc. + - "text_type" is required, and it specifies the encoding unit, e.g., sentence, document, paragraph, etc. + - "task_objective" is optional, and it specifies the objective of embedding, e.g., retrieve a document, + classify the sentence, etc. Check some examples of instructions here: https://github.com/xlang-ai/instructor-embedding#use-cases. :param batch_size: Number of strings to encode at once. :param progress_bar: If true, displays progress bar during embedding. From 3302b5071001cf392b0158a46390ddb77bd2aa54 Mon Sep 17 00:00:00 2001 From: Daria Fokina Date: Tue, 3 Oct 2023 10:44:58 +0200 Subject: [PATCH 13/14] doc cleanup (whitespace) --- .../instructor_embedders/instructor_document_embedder.py | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/components/instructor-embedders/instructor_embedders/instructor_document_embedder.py b/components/instructor-embedders/instructor_embedders/instructor_document_embedder.py index 3724ef9c4..5981b80f2 100644 --- a/components/instructor-embedders/instructor_embedders/instructor_document_embedder.py +++ b/components/instructor-embedders/instructor_embedders/instructor_document_embedder.py @@ -39,8 +39,7 @@ def __init__( "Represent the 'domain' 'text_type' for 'task_objective'", where: - "domain" is optional, and it specifies the domain of the text, e.g., science, finance, medicine, etc. - "text_type" is required, and it specifies the encoding unit, e.g., sentence, document, paragraph, etc. - - "task_objective" is optional, and it specifies the objective of embedding, e.g., retrieve a document, - classify the sentence, etc. + - "task_objective" is optional, and it specifies the objective of embedding, e.g., retrieve a document, classify the sentence, etc. Check some examples of instructions here: https://github.com/xlang-ai/instructor-embedding#use-cases :param batch_size: Number of strings to encode at once. :param progress_bar: If true, displays progress bar during embedding. From 3e65f9a0043279314855aba5bf9ad088f47adafe Mon Sep 17 00:00:00 2001 From: Daria Fokina Date: Tue, 3 Oct 2023 10:46:09 +0200 Subject: [PATCH 14/14] cleanup whitespace 2 --- .../instructor_embedders/instructor_document_embedder.py | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/components/instructor-embedders/instructor_embedders/instructor_document_embedder.py b/components/instructor-embedders/instructor_embedders/instructor_document_embedder.py index 5981b80f2..7fd369cd2 100644 --- a/components/instructor-embedders/instructor_embedders/instructor_document_embedder.py +++ b/components/instructor-embedders/instructor_embedders/instructor_document_embedder.py @@ -39,8 +39,9 @@ def __init__( "Represent the 'domain' 'text_type' for 'task_objective'", where: - "domain" is optional, and it specifies the domain of the text, e.g., science, finance, medicine, etc. - "text_type" is required, and it specifies the encoding unit, e.g., sentence, document, paragraph, etc. - - "task_objective" is optional, and it specifies the objective of embedding, e.g., retrieve a document, classify the sentence, etc. 
-        Check some examples of instructions here: https://github.com/xlang-ai/instructor-embedding#use-cases
+          - "task_objective" is optional, and it specifies the objective of embedding, e.g., retrieve a document,
+            classify the sentence, etc.
+        Check some examples of instructions here: https://github.com/xlang-ai/instructor-embedding#use-cases.
         :param batch_size: Number of strings to encode at once.
         :param progress_bar: If true, displays progress bar during embedding.
         :param normalize_embeddings: If set to true, returned vectors will have a length of 1.
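
As a usage reference for this series, here is a minimal sketch of the two components the patches add. It is not part of the patches themselves: the import paths mirror the module layout above, and the model name, instruction strings, and sample text are illustrative, taken from the integration tests.

from haystack.preview import Document

from instructor_embedders.instructor_document_embedder import InstructorDocumentEmbedder
from instructor_embedders.instructor_text_embedder import InstructorTextEmbedder

# Text embedder: one instruction-conditioned vector per query string.
text_embedder = InstructorTextEmbedder(
    model_name_or_path="hkunlp/instructor-base",
    device="cpu",
    instruction="Represent the Science sentence for retrieval",
)
text_embedder.warm_up()  # loads the INSTRUCTOR model
query_embedding = text_embedder.run(text="Parton energy loss in QCD matter")["embedding"]
assert len(query_embedding) == 768  # instructor-base returns 768-dimensional vectors

# Document embedder: each vector is written to the Document's `embedding` field.
document_embedder = InstructorDocumentEmbedder(
    model_name_or_path="hkunlp/instructor-base",
    device="cpu",
    instruction="Represent the Science document for retrieval",
)
document_embedder.warm_up()
documents = document_embedder.run(documents=[Document(text="Parton energy loss in QCD matter")])["documents"]
assert documents[0].embedding is not None

Note the asymmetry in the instruction strings ("sentence" for queries, "document" for stored text); it matches the "text_type" slot of the unified template documented in the docstrings above.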