instructor - new devices mgmt (#441)

deepset-ai · Feb 21, 2024 · cc32ee6 · cc32ee6
1 parent 17faf53
commit cc32ee6
Show file tree

Hide file tree

Showing 4 changed files with 58 additions and 50 deletions.
diff --git a/...ck_integrations/components/embedders/instructor_embedders/instructor_document_embedder.py b/...ck_integrations/components/embedders/instructor_embedders/instructor_document_embedder.py
@@ -4,7 +4,7 @@
 from typing import Any, Dict, List, Optional
 
 from haystack import Document, component, default_from_dict, default_to_dict
-from haystack.utils import Secret, deserialize_secrets_inplace
+from haystack.utils import ComponentDevice, Secret, deserialize_secrets_inplace
 
 from .embedding_backend.instructor_backend import _InstructorEmbeddingBackendFactory
 
@@ -20,8 +20,9 @@ class InstructorDocumentEmbedder:
     # To use this component, install the "instructor-embedders-haystack" package.
     # pip install instructor-embedders-haystack
 
-    from instructor_embedders_haystack.instructor_document_embedder import InstructorDocumentEmbedder
+    from haystack_integrations.components.embedders.instructor_embedders import InstructorDocumentEmbedder
     from haystack.dataclasses import Document
+    from haystack.utils import ComponentDevice
 
 
     doc_embedding_instruction = "Represent the Medical Document for retrieval:"
@@ -30,7 +31,7 @@ class InstructorDocumentEmbedder:
         model="hkunlp/instructor-base",
         instruction=doc_embedding_instruction,
         batch_size=32,
-        device="cpu",
+        device=ComponentDevice.from_str("cpu"),
     )
 
     doc_embedder.warm_up()
@@ -62,7 +63,7 @@ class InstructorDocumentEmbedder:
     def __init__(
         self,
         model: str = "hkunlp/instructor-base",
-        device: Optional[str] = None,
+        device: Optional[ComponentDevice] = None,
         token: Optional[Secret] = Secret.from_env_var("HF_API_TOKEN", strict=False),  # noqa: B008
         instruction: str = "Represent the document",
         batch_size: int = 32,
@@ -76,8 +77,8 @@ def __init__(
 
         :param model: Local path or name of the model in Hugging Face's model hub,
             such as ``'hkunlp/instructor-base'``.
-        :param device: Device (like 'cuda' / 'cpu') that should be used for computation.
-            If None, checks if a GPU can be used.
+        :param device: The device on which the model is loaded. If `None`, the default device is automatically
+            selected.
         :param token: The API token used to download private models from Hugging Face.
             It is passed as a `Secret`; the default is built with
             `Secret.from_env_var("HF_API_TOKEN", strict=False)`.
@@ -97,8 +98,7 @@ def __init__(
         """
 
         self.model = model
-        # TODO: remove device parameter and use Haystack's device management once migrated
-        self.device = device or "cpu"
+        self.device = ComponentDevice.resolve_device(device)
         self.token = token
         self.instruction = instruction
         self.batch_size = batch_size
@@ -114,7 +114,7 @@ def to_dict(self) -> Dict[str, Any]:
         return default_to_dict(
             self,
             model=self.model,
-            device=self.device,
+            device=self.device.to_dict(),
             token=self.token.to_dict() if self.token else None,
             instruction=self.instruction,
             batch_size=self.batch_size,
@@ -129,6 +129,9 @@ def from_dict(cls, data: Dict[str, Any]) -> "InstructorDocumentEmbedder":
         """
         Deserialize this component from a dictionary.
         """
+        serialized_device = data["init_parameters"]["device"]
+        data["init_parameters"]["device"] = ComponentDevice.from_dict(serialized_device)
+
         deserialize_secrets_inplace(data["init_parameters"], keys=["token"])
         return default_from_dict(cls, data)
 
@@ -138,7 +141,7 @@ def warm_up(self):
         """
         if not hasattr(self, "embedding_backend"):
             self.embedding_backend = _InstructorEmbeddingBackendFactory.get_embedding_backend(
-                model=self.model, device=self.device, token=self.token
+                model=self.model, device=self.device.to_torch_str(), token=self.token
             )
 
     @component.output_types(documents=List[Document])

diff --git a/...ystack_integrations/components/embedders/instructor_embedders/instructor_text_embedder.py b/...ystack_integrations/components/embedders/instructor_embedders/instructor_text_embedder.py
@@ -4,7 +4,7 @@
 from typing import Any, Dict, List, Optional
 
 from haystack import component, default_from_dict, default_to_dict
-from haystack.utils import Secret, deserialize_secrets_inplace
+from haystack.utils import ComponentDevice, Secret, deserialize_secrets_inplace
 
 from .embedding_backend.instructor_backend import _InstructorEmbeddingBackendFactory
 
@@ -19,26 +19,29 @@ class InstructorTextEmbedder:
     # To use this component, install the "instructor-embedders-haystack" package.
     # pip install instructor-embedders-haystack
 
-    from instructor_embedders_haystack.instructor_text_embedder import InstructorTextEmbedder
+    from haystack.utils.device import ComponentDevice
+    from haystack_integrations.components.embedders.instructor_embedders import InstructorTextEmbedder
 
-    text = "It clearly says online this will work on a Mac OS system. The disk comes and it does not, only Windows. Do Not order this if you have a Mac!!"
+    text = ("It clearly says online this will work on a Mac OS system. The disk comes and it does not, only Windows. "
+            "Do Not order this if you have a Mac!!")
     instruction = (
         "Represent the Amazon comment for classifying the sentence as positive or negative"
     )
 
     text_embedder = InstructorTextEmbedder(
         model="hkunlp/instructor-base", instruction=instruction,
-        device="cpu"
+        device=ComponentDevice.from_str("cpu")
     )
+    text_embedder.warm_up()
 
     embedding = text_embedder.run(text)
     ```
-    """  # noqa: E501
+    """
 
     def __init__(
         self,
         model: str = "hkunlp/instructor-base",
-        device: Optional[str] = None,
+        device: Optional[ComponentDevice] = None,
         token: Optional[Secret] = Secret.from_env_var("HF_API_TOKEN", strict=False),  # noqa: B008
         instruction: str = "Represent the sentence",
         batch_size: int = 32,
@@ -50,8 +53,8 @@ def __init__(
 
         :param model: Local path or name of the model in Hugging Face's model hub,
             such as ``'hkunlp/instructor-base'``.
-        :param device: Device (like 'cuda' / 'cpu') that should be used for computation.
-            If None, checks if a GPU can be used.
+        :param device: The device on which the model is loaded. If `None`, the default device is automatically
+            selected.
         :param token: The API token used to download private models from Hugging Face.
         :param instruction: The instruction string to be used while computing domain-specific embeddings.
             The instruction follows the unified template of the form:
@@ -67,8 +70,7 @@ def __init__(
         """
 
         self.model = model
-        # TODO: remove device parameter and use Haystack's device management once migrated
-        self.device = device or "cpu"
+        self.device = ComponentDevice.resolve_device(device)
         self.token = token
         self.instruction = instruction
         self.batch_size = batch_size
@@ -82,7 +84,7 @@ def to_dict(self) -> Dict[str, Any]:
         return default_to_dict(
             self,
             model=self.model,
-            device=self.device,
+            device=self.device.to_dict(),
             token=self.token.to_dict() if self.token else None,
             instruction=self.instruction,
             batch_size=self.batch_size,
@@ -95,6 +97,9 @@ def from_dict(cls, data: Dict[str, Any]) -> "InstructorTextEmbedder":
         """
         Deserialize this component from a dictionary.
         """
+        serialized_device = data["init_parameters"]["device"]
+        data["init_parameters"]["device"] = ComponentDevice.from_dict(serialized_device)
+
         deserialize_secrets_inplace(data["init_parameters"], keys=["token"])
         return default_from_dict(cls, data)
 
@@ -104,7 +109,7 @@ def warm_up(self):
         """
         if not hasattr(self, "embedding_backend"):
             self.embedding_backend = _InstructorEmbeddingBackendFactory.get_embedding_backend(
-                model=self.model, device=self.device, token=self.token
+                model=self.model, device=self.device.to_torch_str(), token=self.token
             )
 
     @component.output_types(embedding=List[float])

diff --git a/integrations/instructor_embedders/tests/test_instructor_document_embedder.py b/integrations/instructor_embedders/tests/test_instructor_document_embedder.py
@@ -3,7 +3,7 @@
 import numpy as np
 import pytest
 from haystack import Document
-from haystack.utils import Secret
+from haystack.utils import ComponentDevice, Secret
 from haystack_integrations.components.embedders.instructor_embedders import InstructorDocumentEmbedder
 
 
@@ -14,7 +14,7 @@ def test_init_default(self):
         """
         embedder = InstructorDocumentEmbedder(model="hkunlp/instructor-base")
         assert embedder.model == "hkunlp/instructor-base"
-        assert embedder.device == "cpu"
+        assert embedder.device == ComponentDevice.resolve_device(None)
         assert embedder.token == Secret.from_env_var("HF_API_TOKEN", strict=False)
         assert embedder.instruction == "Represent the document"
         assert embedder.batch_size == 32
@@ -29,7 +29,7 @@ def test_init_with_parameters(self):
         """
         embedder = InstructorDocumentEmbedder(
             model="hkunlp/instructor-base",
-            device="cuda",
+            device=ComponentDevice.from_str("cuda:0"),
             token=Secret.from_token("fake-api-token"),
             instruction="Represent the 'domain' 'text_type' for 'task_objective'",
             batch_size=64,
@@ -39,7 +39,7 @@ def test_init_with_parameters(self):
             embedding_separator=" | ",
         )
         assert embedder.model == "hkunlp/instructor-base"
-        assert embedder.device == "cuda"
+        assert embedder.device == ComponentDevice.from_str("cuda:0")
         assert embedder.token == Secret.from_token("fake-api-token")
         assert embedder.instruction == "Represent the 'domain' 'text_type' for 'task_objective'"
         assert embedder.batch_size == 64
@@ -52,13 +52,13 @@ def test_to_dict(self):
         """
         Test serialization of InstructorDocumentEmbedder to a dictionary, using default initialization parameters.
         """
-        embedder = InstructorDocumentEmbedder(model="hkunlp/instructor-base")
+        embedder = InstructorDocumentEmbedder(model="hkunlp/instructor-base", device=ComponentDevice.from_str("cpu"))
         embedder_dict = embedder.to_dict()
         assert embedder_dict == {
             "type": "haystack_integrations.components.embedders.instructor_embedders.instructor_document_embedder.InstructorDocumentEmbedder",  #  noqa
             "init_parameters": {
                 "model": "hkunlp/instructor-base",
-                "device": "cpu",
+                "device": ComponentDevice.from_str("cpu").to_dict(),
                 "token": {"env_vars": ["HF_API_TOKEN"], "strict": False, "type": "env_var"},
                 "instruction": "Represent the document",
                 "batch_size": 32,
@@ -75,7 +75,7 @@ def test_to_dict_with_custom_init_parameters(self):
         """
         embedder = InstructorDocumentEmbedder(
             model="hkunlp/instructor-base",
-            device="cuda",
+            device=ComponentDevice.from_str("cuda:0"),
             instruction="Represent the financial document for retrieval",
             batch_size=64,
             progress_bar=False,
@@ -88,7 +88,7 @@ def test_to_dict_with_custom_init_parameters(self):
             "type": "haystack_integrations.components.embedders.instructor_embedders.instructor_document_embedder.InstructorDocumentEmbedder",  #  noqa
             "init_parameters": {
                 "model": "hkunlp/instructor-base",
-                "device": "cuda",
+                "device": ComponentDevice.from_str("cuda:0").to_dict(),
                 "token": {"env_vars": ["HF_API_TOKEN"], "strict": False, "type": "env_var"},
                 "instruction": "Represent the financial document for retrieval",
                 "batch_size": 64,
@@ -107,7 +107,7 @@ def test_from_dict(self):
             "type": "haystack_integrations.components.embedders.instructor_embedders.instructor_document_embedder.InstructorDocumentEmbedder",  #  noqa
             "init_parameters": {
                 "model": "hkunlp/instructor-base",
-                "device": "cpu",
+                "device": ComponentDevice.from_str("cpu").to_dict(),
                 "token": {"env_vars": ["HF_API_TOKEN"], "strict": False, "type": "env_var"},
                 "instruction": "Represent the 'domain' 'text_type' for 'task_objective'",
                 "batch_size": 32,
@@ -119,7 +119,7 @@ def test_from_dict(self):
         }
         embedder = InstructorDocumentEmbedder.from_dict(embedder_dict)
         assert embedder.model == "hkunlp/instructor-base"
-        assert embedder.device == "cpu"
+        assert embedder.device == ComponentDevice.from_str("cpu")
         assert embedder.token == Secret.from_env_var("HF_API_TOKEN", strict=False)
         assert embedder.instruction == "Represent the 'domain' 'text_type' for 'task_objective'"
         assert embedder.batch_size == 32
@@ -136,7 +136,7 @@ def test_from_dict_with_custom_init_parameters(self):
             "type": "haystack_integrations.components.embedders.instructor_embedders.instructor_document_embedder.InstructorDocumentEmbedder",  #  noqa
             "init_parameters": {
                 "model": "hkunlp/instructor-base",
-                "device": "cuda",
+                "device": ComponentDevice.from_str("cuda:0").to_dict(),
                 "token": {"env_vars": ["HF_API_TOKEN"], "strict": False, "type": "env_var"},
                 "instruction": "Represent the financial document for retrieval",
                 "batch_size": 64,
@@ -148,7 +148,7 @@ def test_from_dict_with_custom_init_parameters(self):
         }
         embedder = InstructorDocumentEmbedder.from_dict(embedder_dict)
         assert embedder.model == "hkunlp/instructor-base"
-        assert embedder.device == "cuda"
+        assert embedder.device == ComponentDevice.from_str("cuda:0")
         assert embedder.token == Secret.from_env_var("HF_API_TOKEN", strict=False)
         assert embedder.instruction == "Represent the financial document for retrieval"
         assert embedder.batch_size == 64
@@ -164,7 +164,7 @@ def test_warmup(self, mocked_factory):
         """
         Test for checking embedder instances after warm-up.
         """
-        embedder = InstructorDocumentEmbedder(model="hkunlp/instructor-base")
+        embedder = InstructorDocumentEmbedder(model="hkunlp/instructor-base", device=ComponentDevice.from_str("cpu"))
         mocked_factory.get_embedding_backend.assert_not_called()
         embedder.warm_up()
         mocked_factory.get_embedding_backend.assert_called_once_with(
@@ -254,7 +254,7 @@ def test_embed_metadata(self):
     def test_run(self):
         embedder = InstructorDocumentEmbedder(
             model="hkunlp/instructor-base",
-            device="cpu",
+            device=ComponentDevice.from_str("cpu"),
             instruction="Represent the Science document for retrieval",
         )
         embedder.warm_up()