Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

feat: Add INSTRUCTOR Embedder (v2) #5836

Closed
wants to merge 2 commits into from
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Original file line number Diff line number Diff line change
@@ -0,0 +1,129 @@
from typing import List, Optional, Union, Dict, Any

from haystack.preview import component, Document, default_to_dict, default_from_dict
from haystack.preview.embedding_backends.instructor_backend import _InstructorEmbeddingBackendFactory


@component
class InstructorDocumentEmbedder:
    """
    A component for computing Document embeddings using INSTRUCTOR embedding models.
    The embedding of each Document is stored in the `embedding` field of the Document.
    """

    def __init__(
        self,
        model_name_or_path: str = "hkunlp/instructor-base",
        device: Optional[str] = None,
        use_auth_token: Union[bool, str, None] = None,
        instruction: str = "Represent the 'domain' 'text_type' for 'task_objective'",
        batch_size: int = 32,
        progress_bar: bool = True,
        normalize_embeddings: bool = False,
        metadata_fields_to_embed: Optional[List[str]] = None,
        embedding_separator: str = "\n",
    ):
        """
        Create an InstructorDocumentEmbedder component.

        :param model_name_or_path: Local path or name of the model in Hugging Face's model hub,
            such as ``'hkunlp/instructor-base'``.
        :param device: Device (like 'cuda' / 'cpu') that should be used for computation.
            If None (or empty), 'cpu' is used.
        :param use_auth_token: The API token used to download private models from Hugging Face.
            If this parameter is set to `True`, then the token generated when running
            `transformers-cli login` (stored in ~/.huggingface) will be used.
        :param instruction: The instruction string to be used while computing domain specific embeddings.
            The instruction follows the unified template of the form:
            "Represent the 'domain' 'text_type' for 'task_objective'", where
            - "domain" is optional, and it specifies the domain of the text, e.g., science, finance, medicine, etc.
            - "text_type" is required, and it specifies the encoding unit, e.g., sentence, document, paragraph, etc.
            - "task_objective" is optional, and it specifies the objective of embedding, e.g., retrieve a document,
              classify the sentence, etc.
        :param batch_size: Number of strings to encode at once.
        :param progress_bar: If true, displays progress bar during embedding.
        :param normalize_embeddings: If set to true, returned vectors will have length 1.
        :param metadata_fields_to_embed: List of meta fields that should be embedded along with the Document content.
            Defaults to no fields.
        :param embedding_separator: Separator used to concatenate the meta fields to the Document content.
        """

        self.model_name_or_path = model_name_or_path
        # TODO: remove device parameter and use Haystack's device management once migrated
        self.device = device or "cpu"
        self.use_auth_token = use_auth_token
        self.instruction = instruction
        self.batch_size = batch_size
        self.progress_bar = progress_bar
        self.normalize_embeddings = normalize_embeddings
        self.metadata_fields_to_embed = metadata_fields_to_embed or []
        self.embedding_separator = embedding_separator

    def to_dict(self) -> Dict[str, Any]:
        """
        Serialize this component to a dictionary.

        All init parameters are passed to `default_to_dict` with their current values.
        """
        # NOTE(review): `use_auth_token` is passed through verbatim, so a string token
        # ends up in the serialized dictionary.
        return default_to_dict(
            self,
            model_name_or_path=self.model_name_or_path,
            device=self.device,
            use_auth_token=self.use_auth_token,
            instruction=self.instruction,
            batch_size=self.batch_size,
            progress_bar=self.progress_bar,
            normalize_embeddings=self.normalize_embeddings,
            metadata_fields_to_embed=self.metadata_fields_to_embed,
            embedding_separator=self.embedding_separator,
        )

    @classmethod
    def from_dict(cls, data: Dict[str, Any]) -> "InstructorDocumentEmbedder":
        """
        Deserialize this component from a dictionary.
        """
        return default_from_dict(cls, data)

    def warm_up(self):
        """
        Load the embedding backend.

        Does nothing if the backend has already been loaded on this instance.
        """
        if not hasattr(self, "embedding_backend"):
            self.embedding_backend = _InstructorEmbeddingBackendFactory.get_embedding_backend(
                model_name_or_path=self.model_name_or_path, device=self.device, use_auth_token=self.use_auth_token
            )

    @component.output_types(documents=List[Document])
    def run(self, documents: List[Document]):
        """
        Embed a list of Documents.
        The embedding of each Document is stored in the `embedding` field of the Document.

        :param documents: The Documents to embed. An empty list is accepted and yields an empty result.
        :return: A dictionary with a single key, `documents`, holding new Documents that carry the embeddings.
        :raises TypeError: If `documents` is not a list, or its first element is not a Document.
        :raises RuntimeError: If `warm_up()` has not been called yet.
        """
        # Only inspect the first element when there is one: an empty list is valid input
        # and must not fail with an IndexError.
        if not isinstance(documents, list) or (documents and not isinstance(documents[0], Document)):
            raise TypeError(
                "InstructorDocumentEmbedder expects a list of Documents as input. "
                "In case you want to embed a list of strings, please use the InstructorTextEmbedder."
            )
        if not hasattr(self, "embedding_backend"):
            raise RuntimeError("The embedding model has not been loaded. Please call warm_up() before running.")

        # Nothing to embed: skip the backend call entirely.
        if not documents:
            return {"documents": []}

        # TODO: once non textual Documents are properly supported, we should also prepare them for embedding here

        texts_to_embed = []
        for doc in documents:
            # Only metadata fields that are present and truthy are prepended to the text.
            meta_values_to_embed = [
                str(doc.metadata[key])
                for key in self.metadata_fields_to_embed
                if key in doc.metadata and doc.metadata[key]
            ]
            # Each item handed to the backend is an [instruction, text] pair.
            text_to_embed = [self.instruction, self.embedding_separator.join(meta_values_to_embed + [doc.text or ""])]
            texts_to_embed.append(text_to_embed)

        embeddings = self.embedding_backend.embed(
            texts_to_embed,
            batch_size=self.batch_size,
            show_progress_bar=self.progress_bar,
            normalize_embeddings=self.normalize_embeddings,
        )

        documents_with_embeddings = []
        for doc, emb in zip(documents, embeddings):
            # Build a new Document instead of mutating the input one.
            doc_as_dict = doc.to_dict()
            doc_as_dict["embedding"] = emb
            # The old id is dropped before rebuilding — presumably so that
            # Document.from_dict assigns a fresh one; confirm against Document.
            del doc_as_dict["id"]
            documents_with_embeddings.append(Document.from_dict(doc_as_dict))

        return {"documents": documents_with_embeddings}
Original file line number Diff line number Diff line change
@@ -0,0 +1,98 @@
from typing import List, Optional, Union, Dict, Any

from haystack.preview import component, default_to_dict, default_from_dict
from haystack.preview.embedding_backends.instructor_backend import _InstructorEmbeddingBackendFactory


@component
class InstructorTextEmbedder:
    """
    A component for embedding strings using INSTRUCTOR embedding models.
    """

    def __init__(
        self,
        model_name_or_path: str = "hkunlp/instructor-base",
        device: Optional[str] = None,
        use_auth_token: Union[bool, str, None] = None,
        instruction: str = "Represent the 'domain' 'text_type' for 'task_objective'",
        batch_size: int = 32,
        progress_bar: bool = True,
        normalize_embeddings: bool = False,
    ):
        """
        Create an InstructorTextEmbedder component.

        :param model_name_or_path: Local path or name of the model in Hugging Face's model hub,
            such as ``'hkunlp/instructor-base'``.
        :param device: Device (like 'cuda' / 'cpu') that should be used for computation.
            If None (or empty), 'cpu' is used.
        :param use_auth_token: The API token used to download private models from Hugging Face.
            If this parameter is set to `True`, then the token generated when running
            `transformers-cli login` (stored in ~/.huggingface) will be used.
        :param instruction: The instruction string to be used while computing domain specific embeddings.
            The instruction follows the unified template of the form:
            "Represent the 'domain' 'text_type' for 'task_objective'", where
            - "domain" is optional, and it specifies the domain of the text, e.g., science, finance, medicine, etc.
            - "text_type" is required, and it specifies the encoding unit, e.g., sentence, document, paragraph, etc.
            - "task_objective" is optional, and it specifies the objective of embedding, e.g., retrieve a document,
              classify the sentence, etc.
        :param batch_size: Number of strings to encode at once.
        :param progress_bar: If true, displays progress bar during embedding.
        :param normalize_embeddings: If set to true, returned vectors will have length 1.
        """

        self.model_name_or_path = model_name_or_path
        # TODO: remove device parameter and use Haystack's device management once migrated
        self.device = device or "cpu"
        self.use_auth_token = use_auth_token
        self.instruction = instruction
        self.batch_size = batch_size
        self.progress_bar = progress_bar
        self.normalize_embeddings = normalize_embeddings

    def to_dict(self) -> Dict[str, Any]:
        """
        Serialize this component to a dictionary.

        All init parameters are passed to `default_to_dict` with their current values.
        """
        # NOTE(review): `use_auth_token` is passed through verbatim, so a string token
        # ends up in the serialized dictionary.
        return default_to_dict(
            self,
            model_name_or_path=self.model_name_or_path,
            device=self.device,
            use_auth_token=self.use_auth_token,
            instruction=self.instruction,
            batch_size=self.batch_size,
            progress_bar=self.progress_bar,
            normalize_embeddings=self.normalize_embeddings,
        )

    @classmethod
    def from_dict(cls, data: Dict[str, Any]) -> "InstructorTextEmbedder":
        """
        Deserialize this component from a dictionary.
        """
        return default_from_dict(cls, data)

    def warm_up(self):
        """
        Load the embedding backend.

        Does nothing if the backend has already been loaded on this instance.
        """
        if not hasattr(self, "embedding_backend"):
            self.embedding_backend = _InstructorEmbeddingBackendFactory.get_embedding_backend(
                model_name_or_path=self.model_name_or_path, device=self.device, use_auth_token=self.use_auth_token
            )

    @component.output_types(embedding=List[float])
    def run(self, text: str):
        """
        Embed a string.

        :param text: The string to embed.
        :return: A dictionary with a single key, `embedding`, holding the embedding of `text`.
        :raises TypeError: If `text` is not a string.
        :raises RuntimeError: If `warm_up()` has not been called yet.
        """
        if not isinstance(text, str):
            raise TypeError(
                "InstructorTextEmbedder expects a string as input. "
                "In case you want to embed a list of Documents, please use the InstructorDocumentEmbedder."
            )
        if not hasattr(self, "embedding_backend"):
            raise RuntimeError("The embedding model has not been loaded. Please call warm_up() before running.")

        # The backend takes a batch of [instruction, text] pairs; send a batch of one
        # and unwrap the single result.
        text_to_embed = [self.instruction, text]
        embedding = self.embedding_backend.embed(
            [text_to_embed],
            batch_size=self.batch_size,
            show_progress_bar=self.progress_bar,
            normalize_embeddings=self.normalize_embeddings,
        )[0]
        return {"embedding": embedding}
45 changes: 45 additions & 0 deletions haystack/preview/embedding_backends/instructor_backend.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,45 @@
from typing import List, Optional, Union, Dict

from haystack.preview.lazy_imports import LazyImport

with LazyImport(message="Run 'pip install InstructorEmbedding'") as instructor_embeddings_import:
from InstructorEmbedding import INSTRUCTOR


class _InstructorEmbeddingBackendFactory:
"""
Factory class to create instances of INSTRUCTOR embedding backends.
"""

_instances: Dict[str, "_InstructorEmbeddingBackend"] = {}

@staticmethod
def get_embedding_backend(
model_name_or_path: str, device: Optional[str] = None, use_auth_token: Union[bool, str, None] = None
):
embedding_backend_id = f"{model_name_or_path}{device}{use_auth_token}"

if embedding_backend_id in _InstructorEmbeddingBackendFactory._instances:
return _InstructorEmbeddingBackendFactory._instances[embedding_backend_id]

embedding_backend = _InstructorEmbeddingBackend(
model_name_or_path=model_name_or_path, device=device, use_auth_token=use_auth_token
)
_InstructorEmbeddingBackendFactory._instances[embedding_backend_id] = embedding_backend
return embedding_backend


class _InstructorEmbeddingBackend:
    """
    Wrapper that holds an INSTRUCTOR model and exposes it through `embed`.
    """

    def __init__(
        self,
        model_name_or_path: str,
        device: Optional[str] = None,
        use_auth_token: Union[bool, str, None] = None,
    ):
        """
        Load the INSTRUCTOR model.

        :param model_name_or_path: Local path or name of the model; forwarded to `INSTRUCTOR`.
        :param device: Device to load the model on; forwarded to `INSTRUCTOR`.
        :param use_auth_token: Hugging Face auth token (or `True`); forwarded to `INSTRUCTOR`.
        """
        # Verify the lazily imported InstructorEmbedding package before touching it
        # (presumably raises with the install hint when it is missing — confirm in LazyImport).
        instructor_embeddings_import.check()
        self.model = INSTRUCTOR(
            model_name_or_path=model_name_or_path,
            device=device,
            use_auth_token=use_auth_token,
        )

    def embed(self, data: List[List[str]], **kwargs) -> List[List[float]]:
        """
        Encode `data` with the model and return the result converted via `.tolist()`.

        :param data: Items to encode; callers in this package pass [instruction, text] pairs.
        :param kwargs: Extra keyword arguments forwarded unchanged to the model's `encode`.
        :return: The encoded output as nested Python lists.
        """
        return self.model.encode(data, **kwargs).tolist()
Original file line number Diff line number Diff line change
@@ -0,0 +1,5 @@
---
preview:
- |
Add INSTRUCTOR embedders: `InstructorTextEmbedder` and `InstructorDocumentEmbedder`.
These components add support for the INSTRUCTOR family of embedding models, which tailor the embeddings for different tasks and domains using a prompt (instruction) for each embedding.
Loading