deepset-ai · awinml · Sep 19, 2023 · Sep 19, 2023
@@ -0,0 +1,129 @@
+from typing import List, Optional, Union, Dict, Any
+
+from haystack.preview import component, Document, default_to_dict, default_from_dict
+from haystack.preview.embedding_backends.instructor_backend import _InstructorEmbeddingBackendFactory
+
+
+@component
+class InstructorDocumentEmbedder:
+    """
+    A component for computing Document embeddings using INSTRUCTOR embedding models.
+    The embedding of each Document is stored in the `embedding` field of the Document.
+    """
+
+    def __init__(
+        self,
+        model_name_or_path: str = "hkunlp/instructor-base",
+        device: Optional[str] = None,
+        use_auth_token: Union[bool, str, None] = None,
+        instruction: str = "Represent the 'domain' 'text_type' for 'task_objective'",
+        batch_size: int = 32,
+        progress_bar: bool = True,
+        normalize_embeddings: bool = False,
+        metadata_fields_to_embed: Optional[List[str]] = None,
+        embedding_separator: str = "\n",
+    ):
+        """
+        Create a InstructorDocumentEmbedder component.
+
+        :param model_name_or_path: Local path or name of the model in Hugging Face's model hub, such as ``'sentence-transformers/all-mpnet-base-v2'``.
+        :param device: Device (like 'cuda' / 'cpu') that should be used for computation. If None, checks if a GPU can be used.
+        :param use_auth_token: The API token used to download private models from Hugging Face.
+                        If this parameter is set to `True`, then the token generated when running
+                        `transformers-cli login` (stored in ~/.huggingface) will be used.
+        :param instruction: The instruction string to be used while computing domain specific embeddings. The instruction follows the unified template of the form: "Represent the 'domain' 'text_type' for 'task_objective'", where
+        - "domain" is optional, and it specifies the domain of the text, e.g., science, finance, medicine, etc.
+        - "text_type" is required, and it specifies the encoding unit, e.g., sentence, document, paragraph, etc.
+        - "task_objective" is optional, and it specifies the objective of embedding, e.g., retrieve a document, classify the sentence, etc.
+        :param batch_size: Number of strings to encode at once.
+        :param progress_bar: If true, displays progress bar during embedding.
+        :param normalize_embeddings: If set to true, returned vectors will have length 1.
+        :param metadata_fields_to_embed: List of meta fields that should be embedded along with the Document content.
+        :param embedding_separator: Separator used to concatenate the meta fields to the Document content.
+        """
+
+        self.model_name_or_path = model_name_or_path
+        # TODO: remove device parameter and use Haystack's device management once migrated
+        self.device = device or "cpu"
+        self.use_auth_token = use_auth_token
+        self.instruction = instruction
+        self.batch_size = batch_size
+        self.progress_bar = progress_bar
+        self.normalize_embeddings = normalize_embeddings
+        self.metadata_fields_to_embed = metadata_fields_to_embed or []
+        self.embedding_separator = embedding_separator
+
+    def to_dict(self) -> Dict[str, Any]:
+        """
+        Serialize this component to a dictionary.
+        """
+        return default_to_dict(
+            self,
+            model_name_or_path=self.model_name_or_path,
+            device=self.device,
+            use_auth_token=self.use_auth_token,
+            instruction=self.instruction,
+            batch_size=self.batch_size,
+            progress_bar=self.progress_bar,
+            normalize_embeddings=self.normalize_embeddings,
+            metadata_fields_to_embed=self.metadata_fields_to_embed,
+            embedding_separator=self.embedding_separator,
+        )
+
+    @classmethod
+    def from_dict(cls, data: Dict[str, Any]) -> "InstructorDocumentEmbedder":
+        """
+        Deserialize this component from a dictionary.
+        """
+        return default_from_dict(cls, data)
+
+    def warm_up(self):
+        """
+        Load the embedding backend.
+        """
+        if not hasattr(self, "embedding_backend"):
+            self.embedding_backend = _InstructorEmbeddingBackendFactory.get_embedding_backend(
+                model_name_or_path=self.model_name_or_path, device=self.device, use_auth_token=self.use_auth_token
+            )
+
+    @component.output_types(documents=List[Document])
+    def run(self, documents: List[Document]):
+        """
+        Embed a list of Documents.
+        The embedding of each Document is stored in the `embedding` field of the Document.
+        """
+        if not isinstance(documents, list) or not isinstance(documents[0], Document):
+            raise TypeError(
+                "InstructorDocumentEmbedder expects a list of Documents as input."
+                "In case you want to embed a list of strings, please use the InstructorTextEmbedder."
+            )
+        if not hasattr(self, "embedding_backend"):
+            raise RuntimeError("The embedding model has not been loaded. Please call warm_up() before running.")
+
+        # TODO: once non textual Documents are properly supported, we should also prepare them for embedding here
+
+        texts_to_embed = []
+        for doc in documents:
+            meta_values_to_embed = [
+                str(doc.metadata[key])
+                for key in self.metadata_fields_to_embed
+                if key in doc.metadata and doc.metadata[key]
+            ]
+            text_to_embed = [self.instruction, self.embedding_separator.join(meta_values_to_embed + [doc.text or ""])]
+            texts_to_embed.append(text_to_embed)
+
+        embeddings = self.embedding_backend.embed(
+            texts_to_embed,
+            batch_size=self.batch_size,
+            show_progress_bar=self.progress_bar,
+            normalize_embeddings=self.normalize_embeddings,
+        )
+
+        documents_with_embeddings = []
+        for doc, emb in zip(documents, embeddings):
+            doc_as_dict = doc.to_dict()
+            doc_as_dict["embedding"] = emb
+            del doc_as_dict["id"]
+            documents_with_embeddings.append(Document.from_dict(doc_as_dict))
+
+        return {"documents": documents_with_embeddings}
@@ -0,0 +1,98 @@
+from typing import List, Optional, Union, Dict, Any
+
+from haystack.preview import component, default_to_dict, default_from_dict
+from haystack.preview.embedding_backends.instructor_backend import _InstructorEmbeddingBackendFactory
+
+
+@component
+class InstructorTextEmbedder:
+    """
+    A component for embedding strings using Sentence Transformers models.
+    """
+
+    def __init__(
+        self,
+        model_name_or_path: str = "hkunlp/instructor-base",
+        device: Optional[str] = None,
+        use_auth_token: Union[bool, str, None] = None,
+        instruction: str = "Represent the 'domain' 'text_type' for 'task_objective'",
+        batch_size: int = 32,
+        progress_bar: bool = True,
+        normalize_embeddings: bool = False,
+    ):
+        """
+        Create a InstructorTextEmbedder component.
+
+        :param model_name_or_path: Local path or name of the model in Hugging Face's model hub, such as ``'sentence-transformers/all-mpnet-base-v2'``.
+        :param device: Device (like 'cuda' / 'cpu') that should be used for computation. If None, checks if a GPU can be used.
+        :param use_auth_token: The API token used to download private models from Hugging Face.
+                        If this parameter is set to `True`, then the token generated when running
+                        `transformers-cli login` (stored in ~/.huggingface) will be used.
+        :param instruction: The instruction string to be used while computing domain specific embeddings. The instruction follows the unified template of the form: "Represent the 'domain' 'text_type' for 'task_objective'", where
+        - "domain" is optional, and it specifies the domain of the text, e.g., science, finance, medicine, etc.
+        - "text_type" is required, and it specifies the encoding unit, e.g., sentence, document, paragraph, etc.
+        - "task_objective" is optional, and it specifies the objective of embedding, e.g., retrieve a document, classify the sentence, etc.
+        :param batch_size: Number of strings to encode at once.
+        :param progress_bar: If true, displays progress bar during embedding.
+        :param normalize_embeddings: If set to true, returned vectors will have length 1.
+        """
+
+        self.model_name_or_path = model_name_or_path
+        # TODO: remove device parameter and use Haystack's device management once migrated
+        self.device = device or "cpu"
+        self.use_auth_token = use_auth_token
+        self.instruction = instruction
+        self.batch_size = batch_size
+        self.progress_bar = progress_bar
+        self.normalize_embeddings = normalize_embeddings
+
+    def to_dict(self) -> Dict[str, Any]:
+        """
+        Serialize this component to a dictionary.
+        """
+        return default_to_dict(
+            self,
+            model_name_or_path=self.model_name_or_path,
+            device=self.device,
+            use_auth_token=self.use_auth_token,
+            instruction=self.instruction,
+            batch_size=self.batch_size,
+            progress_bar=self.progress_bar,
+            normalize_embeddings=self.normalize_embeddings,
+        )
+
+    @classmethod
+    def from_dict(cls, data: Dict[str, Any]) -> "InstructorTextEmbedder":
+        """
+        Deserialize this component from a dictionary.
+        """
+        return default_from_dict(cls, data)
+
+    def warm_up(self):
+        """
+        Load the embedding backend.
+        """
+        if not hasattr(self, "embedding_backend"):
+            self.embedding_backend = _InstructorEmbeddingBackendFactory.get_embedding_backend(
+                model_name_or_path=self.model_name_or_path, device=self.device, use_auth_token=self.use_auth_token
+            )
+
+    @component.output_types(embedding=List[float])
+    def run(self, text: str):
+        """Embed a string."""
+        if not isinstance(text, str):
+            raise TypeError(
+                "InstructorTextEmbedder expects a string as input."
+                "In case you want to embed a list of Documents, please use the InstructorDocumentEmbedder."
+            )
+        if not hasattr(self, "embedding_backend"):
+            raise RuntimeError("The embedding model has not been loaded. Please call warm_up() before running.")
+
+        text_to_embed = [self.instruction, text]
+        embedding = self.embedding_backend.embed(
+            [text_to_embed],
+            batch_size=self.batch_size,
+            show_progress_bar=self.progress_bar,
+            normalize_embeddings=self.normalize_embeddings,
+        )[0]
+        return {"embedding": embedding}
@@ -0,0 +1,45 @@
+from typing import List, Optional, Union, Dict
+
+from haystack.preview.lazy_imports import LazyImport
+
+with LazyImport(message="Run 'pip install InstructorEmbedding'") as instructor_embeddings_import:
+    from InstructorEmbedding import INSTRUCTOR
+
+
+class _InstructorEmbeddingBackendFactory:
+    """
+    Factory class to create instances of INSTRUCTOR embedding backends.
+    """
+
+    _instances: Dict[str, "_InstructorEmbeddingBackend"] = {}
+
+    @staticmethod
+    def get_embedding_backend(
+        model_name_or_path: str, device: Optional[str] = None, use_auth_token: Union[bool, str, None] = None
+    ):
+        embedding_backend_id = f"{model_name_or_path}{device}{use_auth_token}"
+
+        if embedding_backend_id in _InstructorEmbeddingBackendFactory._instances:
+            return _InstructorEmbeddingBackendFactory._instances[embedding_backend_id]
+
+        embedding_backend = _InstructorEmbeddingBackend(
+            model_name_or_path=model_name_or_path, device=device, use_auth_token=use_auth_token
+        )
+        _InstructorEmbeddingBackendFactory._instances[embedding_backend_id] = embedding_backend
+        return embedding_backend
+
+
+class _InstructorEmbeddingBackend:
+    """
+    Class to manage INSTRUCTOR embeddings.
+    """
+
+    def __init__(
+        self, model_name_or_path: str, device: Optional[str] = None, use_auth_token: Union[bool, str, None] = None
+    ):
+        instructor_embeddings_import.check()
+        self.model = INSTRUCTOR(model_name_or_path=model_name_or_path, device=device, use_auth_token=use_auth_token)
+
+    def embed(self, data: List[List[str]], **kwargs) -> List[List[float]]:
+        embeddings = self.model.encode(data, **kwargs).tolist()
+        return embeddings
@@ -0,0 +1,5 @@
+---
+preview:
+  - |
+    Add Instructor Embedders (`InstructorTextEmbedder` and `InstructorDocumentEmbedder`).
+    Adds support for the INSTRUCTOR family of embedding models, which tailor embeddings to different tasks and domains by pairing each input with a prompt (instruction).