Skip to content

Commit

Permalink
instructor - new devices mgmt (#441)
Browse files Browse the repository at this point in the history
  • Loading branch information
anakin87 authored Feb 21, 2024
1 parent 17faf53 commit cc32ee6
Show file tree
Hide file tree
Showing 4 changed files with 58 additions and 50 deletions.
Original file line number Diff line number Diff line change
Expand Up @@ -4,7 +4,7 @@
from typing import Any, Dict, List, Optional

from haystack import Document, component, default_from_dict, default_to_dict
from haystack.utils import Secret, deserialize_secrets_inplace
from haystack.utils import ComponentDevice, Secret, deserialize_secrets_inplace

from .embedding_backend.instructor_backend import _InstructorEmbeddingBackendFactory

Expand All @@ -20,8 +20,9 @@ class InstructorDocumentEmbedder:
# To use this component, install the "instructor-embedders-haystack" package.
# pip install instructor-embedders-haystack
from instructor_embedders_haystack.instructor_document_embedder import InstructorDocumentEmbedder
from haystack_integrations.components.embedders.instructor_embedders import InstructorDocumentEmbedder
from haystack.dataclasses import Document
from haystack.utils import ComponentDevice
doc_embedding_instruction = "Represent the Medical Document for retrieval:"
Expand All @@ -30,7 +31,7 @@ class InstructorDocumentEmbedder:
model="hkunlp/instructor-base",
instruction=doc_embedding_instruction,
batch_size=32,
device="cpu",
device=ComponentDevice.from_str("cpu"),
)
doc_embedder.warm_up()
Expand Down Expand Up @@ -62,7 +63,7 @@ class InstructorDocumentEmbedder:
def __init__(
self,
model: str = "hkunlp/instructor-base",
device: Optional[str] = None,
device: Optional[ComponentDevice] = None,
token: Optional[Secret] = Secret.from_env_var("HF_API_TOKEN", strict=False), # noqa: B008
instruction: str = "Represent the document",
batch_size: int = 32,
Expand All @@ -76,8 +77,8 @@ def __init__(
:param model: Local path or name of the model in Hugging Face's model hub,
such as ``'hkunlp/instructor-base'``.
:param device: Device (like 'cuda' / 'cpu') that should be used for computation.
If None, checks if a GPU can be used.
:param device: The device on which the model is loaded. If `None`, the default device is automatically
selected.
:param token: An API token used to download private models from Hugging Face.
If this parameter is set to `True`, then the token generated when running
`transformers-cli login` (stored in ~/.huggingface) will be used.
Expand All @@ -97,8 +98,7 @@ def __init__(
"""

self.model = model
# TODO: remove device parameter and use Haystack's device management once migrated
self.device = device or "cpu"
self.device = ComponentDevice.resolve_device(device)
self.token = token
self.instruction = instruction
self.batch_size = batch_size
Expand All @@ -114,7 +114,7 @@ def to_dict(self) -> Dict[str, Any]:
return default_to_dict(
self,
model=self.model,
device=self.device,
device=self.device.to_dict(),
token=self.token.to_dict() if self.token else None,
instruction=self.instruction,
batch_size=self.batch_size,
Expand All @@ -129,6 +129,9 @@ def from_dict(cls, data: Dict[str, Any]) -> "InstructorDocumentEmbedder":
"""
Deserialize this component from a dictionary.
"""
serialized_device = data["init_parameters"]["device"]
data["init_parameters"]["device"] = ComponentDevice.from_dict(serialized_device)

deserialize_secrets_inplace(data["init_parameters"], keys=["token"])
return default_from_dict(cls, data)

Expand All @@ -138,7 +141,7 @@ def warm_up(self):
"""
if not hasattr(self, "embedding_backend"):
self.embedding_backend = _InstructorEmbeddingBackendFactory.get_embedding_backend(
model=self.model, device=self.device, token=self.token
model=self.model, device=self.device.to_torch_str(), token=self.token
)

@component.output_types(documents=List[Document])
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -4,7 +4,7 @@
from typing import Any, Dict, List, Optional

from haystack import component, default_from_dict, default_to_dict
from haystack.utils import Secret, deserialize_secrets_inplace
from haystack.utils import ComponentDevice, Secret, deserialize_secrets_inplace

from .embedding_backend.instructor_backend import _InstructorEmbeddingBackendFactory

Expand All @@ -19,26 +19,29 @@ class InstructorTextEmbedder:
# To use this component, install the "instructor-embedders-haystack" package.
# pip install instructor-embedders-haystack
from instructor_embedders_haystack.instructor_text_embedder import InstructorTextEmbedder
from haystack.utils.device import ComponentDevice
from haystack_integrations.components.embedders.instructor_embedders import InstructorTextEmbedder
text = "It clearly says online this will work on a Mac OS system. The disk comes and it does not, only Windows. Do Not order this if you have a Mac!!"
text = ("It clearly says online this will work on a Mac OS system. The disk comes and it does not, only Windows. "
"Do Not order this if you have a Mac!!")
instruction = (
"Represent the Amazon comment for classifying the sentence as positive or negative"
)
text_embedder = InstructorTextEmbedder(
model="hkunlp/instructor-base", instruction=instruction,
device="cpu"
device=ComponentDevice.from_str("cpu")
)
text_embedder.warm_up()
embedding = text_embedder.run(text)
```
""" # noqa: E501
"""

def __init__(
self,
model: str = "hkunlp/instructor-base",
device: Optional[str] = None,
device: Optional[ComponentDevice] = None,
token: Optional[Secret] = Secret.from_env_var("HF_API_TOKEN", strict=False), # noqa: B008
instruction: str = "Represent the sentence",
batch_size: int = 32,
Expand All @@ -50,8 +53,8 @@ def __init__(
:param model: Local path or name of the model in Hugging Face's model hub,
such as ``'hkunlp/instructor-base'``.
:param device: Device (like 'cuda' / 'cpu') that should be used for computation.
If None, checks if a GPU can be used.
:param device: The device on which the model is loaded. If `None`, the default device is automatically
selected.
:param token: The API token used to download private models from Hugging Face.
:param instruction: The instruction string to be used while computing domain-specific embeddings.
The instruction follows the unified template of the form:
Expand All @@ -67,8 +70,7 @@ def __init__(
"""

self.model = model
# TODO: remove device parameter and use Haystack's device management once migrated
self.device = device or "cpu"
self.device = ComponentDevice.resolve_device(device)
self.token = token
self.instruction = instruction
self.batch_size = batch_size
Expand All @@ -82,7 +84,7 @@ def to_dict(self) -> Dict[str, Any]:
return default_to_dict(
self,
model=self.model,
device=self.device,
device=self.device.to_dict(),
token=self.token.to_dict() if self.token else None,
instruction=self.instruction,
batch_size=self.batch_size,
Expand All @@ -95,6 +97,9 @@ def from_dict(cls, data: Dict[str, Any]) -> "InstructorTextEmbedder":
"""
Deserialize this component from a dictionary.
"""
serialized_device = data["init_parameters"]["device"]
data["init_parameters"]["device"] = ComponentDevice.from_dict(serialized_device)

deserialize_secrets_inplace(data["init_parameters"], keys=["token"])
return default_from_dict(cls, data)

Expand All @@ -104,7 +109,7 @@ def warm_up(self):
"""
if not hasattr(self, "embedding_backend"):
self.embedding_backend = _InstructorEmbeddingBackendFactory.get_embedding_backend(
model=self.model, device=self.device, token=self.token
model=self.model, device=self.device.to_torch_str(), token=self.token
)

@component.output_types(embedding=List[float])
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -3,7 +3,7 @@
import numpy as np
import pytest
from haystack import Document
from haystack.utils import Secret
from haystack.utils import ComponentDevice, Secret
from haystack_integrations.components.embedders.instructor_embedders import InstructorDocumentEmbedder


Expand All @@ -14,7 +14,7 @@ def test_init_default(self):
"""
embedder = InstructorDocumentEmbedder(model="hkunlp/instructor-base")
assert embedder.model == "hkunlp/instructor-base"
assert embedder.device == "cpu"
assert embedder.device == ComponentDevice.resolve_device(None)
assert embedder.token == Secret.from_env_var("HF_API_TOKEN", strict=False)
assert embedder.instruction == "Represent the document"
assert embedder.batch_size == 32
Expand All @@ -29,7 +29,7 @@ def test_init_with_parameters(self):
"""
embedder = InstructorDocumentEmbedder(
model="hkunlp/instructor-base",
device="cuda",
device=ComponentDevice.from_str("cuda:0"),
token=Secret.from_token("fake-api-token"),
instruction="Represent the 'domain' 'text_type' for 'task_objective'",
batch_size=64,
Expand All @@ -39,7 +39,7 @@ def test_init_with_parameters(self):
embedding_separator=" | ",
)
assert embedder.model == "hkunlp/instructor-base"
assert embedder.device == "cuda"
assert embedder.device == ComponentDevice.from_str("cuda:0")
assert embedder.token == Secret.from_token("fake-api-token")
assert embedder.instruction == "Represent the 'domain' 'text_type' for 'task_objective'"
assert embedder.batch_size == 64
Expand All @@ -52,13 +52,13 @@ def test_to_dict(self):
"""
Test serialization of InstructorDocumentEmbedder to a dictionary, using default initialization parameters.
"""
embedder = InstructorDocumentEmbedder(model="hkunlp/instructor-base")
embedder = InstructorDocumentEmbedder(model="hkunlp/instructor-base", device=ComponentDevice.from_str("cpu"))
embedder_dict = embedder.to_dict()
assert embedder_dict == {
"type": "haystack_integrations.components.embedders.instructor_embedders.instructor_document_embedder.InstructorDocumentEmbedder", # noqa
"init_parameters": {
"model": "hkunlp/instructor-base",
"device": "cpu",
"device": ComponentDevice.from_str("cpu").to_dict(),
"token": {"env_vars": ["HF_API_TOKEN"], "strict": False, "type": "env_var"},
"instruction": "Represent the document",
"batch_size": 32,
Expand All @@ -75,7 +75,7 @@ def test_to_dict_with_custom_init_parameters(self):
"""
embedder = InstructorDocumentEmbedder(
model="hkunlp/instructor-base",
device="cuda",
device=ComponentDevice.from_str("cuda:0"),
instruction="Represent the financial document for retrieval",
batch_size=64,
progress_bar=False,
Expand All @@ -88,7 +88,7 @@ def test_to_dict_with_custom_init_parameters(self):
"type": "haystack_integrations.components.embedders.instructor_embedders.instructor_document_embedder.InstructorDocumentEmbedder", # noqa
"init_parameters": {
"model": "hkunlp/instructor-base",
"device": "cuda",
"device": ComponentDevice.from_str("cuda:0").to_dict(),
"token": {"env_vars": ["HF_API_TOKEN"], "strict": False, "type": "env_var"},
"instruction": "Represent the financial document for retrieval",
"batch_size": 64,
Expand All @@ -107,7 +107,7 @@ def test_from_dict(self):
"type": "haystack_integrations.components.embedders.instructor_embedders.instructor_document_embedder.InstructorDocumentEmbedder", # noqa
"init_parameters": {
"model": "hkunlp/instructor-base",
"device": "cpu",
"device": ComponentDevice.from_str("cpu").to_dict(),
"token": {"env_vars": ["HF_API_TOKEN"], "strict": False, "type": "env_var"},
"instruction": "Represent the 'domain' 'text_type' for 'task_objective'",
"batch_size": 32,
Expand All @@ -119,7 +119,7 @@ def test_from_dict(self):
}
embedder = InstructorDocumentEmbedder.from_dict(embedder_dict)
assert embedder.model == "hkunlp/instructor-base"
assert embedder.device == "cpu"
assert embedder.device == ComponentDevice.from_str("cpu")
assert embedder.token == Secret.from_env_var("HF_API_TOKEN", strict=False)
assert embedder.instruction == "Represent the 'domain' 'text_type' for 'task_objective'"
assert embedder.batch_size == 32
Expand All @@ -136,7 +136,7 @@ def test_from_dict_with_custom_init_parameters(self):
"type": "haystack_integrations.components.embedders.instructor_embedders.instructor_document_embedder.InstructorDocumentEmbedder", # noqa
"init_parameters": {
"model": "hkunlp/instructor-base",
"device": "cuda",
"device": ComponentDevice.from_str("cuda:0").to_dict(),
"token": {"env_vars": ["HF_API_TOKEN"], "strict": False, "type": "env_var"},
"instruction": "Represent the financial document for retrieval",
"batch_size": 64,
Expand All @@ -148,7 +148,7 @@ def test_from_dict_with_custom_init_parameters(self):
}
embedder = InstructorDocumentEmbedder.from_dict(embedder_dict)
assert embedder.model == "hkunlp/instructor-base"
assert embedder.device == "cuda"
assert embedder.device == ComponentDevice.from_str("cuda:0")
assert embedder.token == Secret.from_env_var("HF_API_TOKEN", strict=False)
assert embedder.instruction == "Represent the financial document for retrieval"
assert embedder.batch_size == 64
Expand All @@ -164,7 +164,7 @@ def test_warmup(self, mocked_factory):
"""
Test for checking embedder instances after warm-up.
"""
embedder = InstructorDocumentEmbedder(model="hkunlp/instructor-base")
embedder = InstructorDocumentEmbedder(model="hkunlp/instructor-base", device=ComponentDevice.from_str("cpu"))
mocked_factory.get_embedding_backend.assert_not_called()
embedder.warm_up()
mocked_factory.get_embedding_backend.assert_called_once_with(
Expand Down Expand Up @@ -254,7 +254,7 @@ def test_embed_metadata(self):
def test_run(self):
embedder = InstructorDocumentEmbedder(
model="hkunlp/instructor-base",
device="cpu",
device=ComponentDevice.from_str("cpu"),
instruction="Represent the Science document for retrieval",
)
embedder.warm_up()
Expand Down
Loading

0 comments on commit cc32ee6

Please sign in to comment.