
Commit

Merge branch 'deepset-ai:main' into main
Corentin authored Mar 6, 2024
2 parents 9800f2b + 1032b2c commit bf0221c
Showing 10 changed files with 104 additions and 76 deletions.
33 changes: 21 additions & 12 deletions .github/workflows/ollama.yml
@@ -33,16 +33,31 @@ jobs:
      matrix:
        os: [ubuntu-latest]
        python-version: ["3.9","3.10","3.11"]
-    services:
-      ollama:
-        image: ollama/ollama:latest
-        options: --name ollama
-        ports:
-          - 11434:11434

    steps:
      - uses: actions/checkout@v4

+      - name: Install Ollama and pull the required models
+        run: |
+          curl -fsSL https://ollama.com/install.sh | sh
+          ollama serve &
+          # Check if the service is up and running with a timeout of 60 seconds
+          timeout=60
+          while [ $timeout -gt 0 ] && ! curl -sSf http://localhost:11434/ > /dev/null; do
+            echo "Waiting for Ollama service to start..."
+            sleep 5
+            ((timeout-=5))
+          done
+          if [ $timeout -eq 0 ]; then
+            echo "Timed out waiting for Ollama service to start."
+            exit 1
+          fi
+          ollama pull ${{ env.LLM_FOR_TESTS }}
+          ollama pull ${{ env.EMBEDDER_FOR_TESTS }}
      - name: Set up Python ${{ matrix.python-version }}
        uses: actions/setup-python@v5
        with:
@@ -55,12 +70,6 @@ jobs:
        if: matrix.python-version == '3.9'
        run: hatch run lint:all

-      - name: Pull the LLM in the Ollama service
-        run: docker exec ollama ollama pull ${{ env.LLM_FOR_TESTS }}
-
-      - name: Pull the Embedding Model in the Ollama service
-        run: docker exec ollama ollama pull ${{ env.EMBEDDER_FOR_TESTS }}
-
      - name: Generate docs
        if: matrix.python-version == '3.9' && runner.os == 'Linux'
        run: hatch run docs
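For reference, the readiness loop in the new workflow step can be expressed in Python. A minimal sketch (hypothetical helper, assuming Ollama's default endpoint on port 11434):

    import time
    import urllib.request
    from urllib.error import URLError

    def wait_for_ollama(url: str = "http://localhost:11434/", timeout: int = 60, interval: int = 5) -> None:
        # Poll the endpoint until it answers, mirroring the shell loop above.
        deadline = time.monotonic() + timeout
        while time.monotonic() < deadline:
            try:
                with urllib.request.urlopen(url, timeout=interval):
                    return  # service is up
            except (URLError, OSError):
                print("Waiting for Ollama service to start...")
                time.sleep(interval)
        raise TimeoutError("Timed out waiting for Ollama service to start.")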
@@ -1,6 +1,3 @@
-from typing import Optional
-
-
class AmazonBedrockError(Exception):
"""
Any error generated by the Amazon Bedrock integration.
@@ -10,41 +7,14 @@ class AmazonBedrockError(Exception):
`AmazonBedrockError.message` will exist and have the expected content.
"""

-    def __init__(
-        self,
-        message: Optional[str] = None,
-    ):
-        super().__init__()
-        if message:
-            self.message = message
-
-    def __getattr__(self, attr):
-        # If self.__cause__ is None, it will raise the expected AttributeError
-        getattr(self.__cause__, attr)
-
-    def __str__(self):
-        return self.message
-
-    def __repr__(self):
-        return str(self)


class AWSConfigurationError(AmazonBedrockError):
"""Exception raised when AWS is not configured correctly"""

-    def __init__(self, message: Optional[str] = None):
-        super().__init__(message=message)


class AmazonBedrockConfigurationError(AmazonBedrockError):
"""Exception raised when AmazonBedrock node is not configured correctly"""

-    def __init__(self, message: Optional[str] = None):
-        super().__init__(message=message)


class AmazonBedrockInferenceError(AmazonBedrockError):
"""Exception for issues that occur in the Bedrock inference node"""

-    def __init__(self, message: Optional[str] = None):
-        super().__init__(message=message)
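The net effect of these removals is that each exception subclass now inherits everything it needs from the base class. A minimal sketch of how the slimmed-down hierarchy behaves (illustrative only, not the integration's full module):

    class AmazonBedrockError(Exception):
        """Any error generated by the Amazon Bedrock integration."""

    class AmazonBedrockConfigurationError(AmazonBedrockError):
        """Exception raised when AmazonBedrock node is not configured correctly"""

    try:
        raise AmazonBedrockConfigurationError("AWS region is not set")
    except AmazonBedrockError as err:
        print(err)  # prints "AWS region is not set"; Exception already stores and renders the message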
@@ -77,20 +77,21 @@ def __init__(
:param model: The embedding model to use. The model has to be specified in the format outlined in the Amazon
Bedrock [documentation](https://docs.aws.amazon.com/bedrock/latest/userguide/model-ids.html).
:type model: Literal["amazon.titan-embed-text-v1", "cohere.embed-english-v3", "cohere.embed-multilingual-v3"]
:param aws_access_key_id: AWS access key ID.
:param aws_secret_access_key: AWS secret access key.
:param aws_session_token: AWS session token.
:param aws_region_name: AWS region name.
:param aws_profile_name: AWS profile name.
-:param batch_size: Number of Documents to encode at once. Default is 32.
+:param batch_size: Number of Documents to encode at once.
+    Only Cohere models support batch inference. This parameter is ignored for Amazon Titan models.
:param progress_bar: Whether to show a progress bar or not. Can be helpful to disable in production deployments
to keep the logs clean.
:param meta_fields_to_embed: List of meta fields that should be embedded along with the Document text.
:param embedding_separator: Separator used to concatenate the meta fields to the Document text.
:param kwargs: Additional parameters to pass for model inference. For example, `input_type` and `truncate` for
Cohere models.
:raises ValueError: If the model is not supported.
:raises AmazonBedrockConfigurationError: If the AWS environment is not configured correctly.
"""

if not model or model not in SUPPORTED_EMBEDDING_MODELS:
@@ -218,6 +219,13 @@ def _embed_titan(self, documents: List[Document]) -> List[Document]:

@component.output_types(documents=List[Document])
def run(self, documents: List[Document]):
"""Embed the provided `Document`s using the specified model.
:param documents: The `Document`s to embed.
:returns: A dictionary with the following keys:
- `documents`: The `Document`s with the `embedding` field populated.
:raises AmazonBedrockInferenceError: If the inference fails.
"""
if not isinstance(documents, list) or documents and not isinstance(documents[0], Document):
msg = (
"AmazonBedrockDocumentEmbedder expects a list of Documents as input."
@@ -234,8 +242,10 @@ def run(self, documents: List[Document]):

def to_dict(self) -> Dict[str, Any]:
"""
-Serialize this component to a dictionary.
-:returns: The serialized component as a dictionary.
+Serializes the component to a dictionary.
+
+:returns:
+    Dictionary with serialized data.
"""
return default_to_dict(
self,
@@ -255,7 +265,12 @@ def to_dict(self) -> Dict[str, Any]:
@classmethod
def from_dict(cls, data: Dict[str, Any]) -> "AmazonBedrockDocumentEmbedder":
"""
-Deserialize this component from a dictionary.
+Deserializes the component from a dictionary.
+
+:param data:
+    Dictionary to deserialize from.
+:returns:
+    Deserialized component.
"""
deserialize_secrets_inplace(
data["init_parameters"],
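A short usage sketch for the document embedder described above (the import path and model ID are assumptions based on the package layout; AWS credentials are expected to be configured in the environment):

    from haystack import Document
    from haystack_integrations.components.embedders.amazon_bedrock import AmazonBedrockDocumentEmbedder

    embedder = AmazonBedrockDocumentEmbedder(model="cohere.embed-english-v3", batch_size=32)
    docs = [Document(content="Amazon Bedrock hosts several embedding models.")]
    result = embedder.run(documents=docs)
    print(len(result["documents"][0].embedding))  # dimensionality of the embedding vector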
@@ -66,14 +66,15 @@ def __init__(
:param model: The embedding model to use. The model has to be specified in the format outlined in the Amazon
Bedrock [documentation](https://docs.aws.amazon.com/bedrock/latest/userguide/model-ids.html).
:type model: Literal["amazon.titan-embed-text-v1", "cohere.embed-english-v3", "cohere.embed-multilingual-v3"]
:param aws_access_key_id: AWS access key ID.
:param aws_secret_access_key: AWS secret access key.
:param aws_session_token: AWS session token.
:param aws_region_name: AWS region name.
:param aws_profile_name: AWS profile name.
:param kwargs: Additional parameters to pass for model inference. For example, `input_type` and `truncate` for
Cohere models.
:raises ValueError: If the model is not supported.
:raises AmazonBedrockConfigurationError: If the AWS environment is not configured correctly.
"""
if not model or model not in SUPPORTED_EMBEDDING_MODELS:
msg = "Please provide a valid model from the list of supported models: " + ", ".join(
@@ -110,6 +111,14 @@ def resolve_secret(secret: Optional[Secret]) -> Optional[str]:

@component.output_types(embedding=List[float])
def run(self, text: str):
"""Embeds the input text using the Amazon Bedrock model.
:param text: The input text to embed.
:returns: A dictionary with the following keys:
- `embedding`: The embedding of the input text.
:raises TypeError: If the input text is not a string.
:raises AmazonBedrockInferenceError: If the model inference fails.
"""
if not isinstance(text, str):
msg = (
"AmazonBedrockTextEmbedder expects a string as an input."
@@ -153,8 +162,10 @@ def run(self, text: str):

def to_dict(self) -> Dict[str, Any]:
"""
-Serialize this component to a dictionary.
-:returns: The serialized component as a dictionary.
+Serializes the component to a dictionary.
+
+:returns:
+    Dictionary with serialized data.
"""
return default_to_dict(
self,
@@ -170,9 +181,12 @@ def to_dict(self) -> Dict[str, Any]:
@classmethod
def from_dict(cls, data: Dict[str, Any]) -> "AmazonBedrockTextEmbedder":
"""
-Deserialize this component from a dictionary.
-:param data: The dictionary representation of this component.
-:returns: The deserialized component instance.
+Deserializes the component from a dictionary.
+
+:param data:
+    Dictionary to deserialize from.
+:returns:
+    Deserialized component.
"""
deserialize_secrets_inplace(
data["init_parameters"],
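The matching text embedder, sketched under the same assumptions:

    from haystack_integrations.components.embedders.amazon_bedrock import AmazonBedrockTextEmbedder

    text_embedder = AmazonBedrockTextEmbedder(model="amazon.titan-embed-text-v1")
    result = text_embedder.run(text="What is the capital of France?")
    print(result["embedding"][:5])  # first few floats of the vector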
@@ -103,6 +103,10 @@ def prepare_body(self, prompt: str, **inference_kwargs) -> Dict[str, Any]:
Prepares the body for the Claude model
:param prompt: The prompt to be sent to the model.
:param inference_kwargs: Additional keyword arguments passed to the handler.
+:returns: A dictionary with the following keys:
+    - `prompt`: The prompt to be sent to the model.
+    - specified inference parameters.
"""
default_params = {
"max_tokens_to_sample": self.max_length,
@@ -146,7 +150,9 @@ def prepare_body(self, prompt: str, **inference_kwargs) -> Dict[str, Any]:
:param prompt: The prompt to be sent to the model.
:param inference_kwargs: Additional keyword arguments passed to the handler.
-:returns: A dictionary containing the body for the request.
+:returns: A dictionary with the following keys:
+    - `prompt`: The prompt to be sent to the model.
+    - specified inference parameters.
"""
default_params = {
"max_tokens": self.max_length,
@@ -191,6 +197,14 @@ class AI21LabsJurassic2Adapter(BedrockModelAdapter):
"""

def prepare_body(self, prompt: str, **inference_kwargs) -> Dict[str, Any]:
"""Prepares the body for the Jurassic 2 model.
:param prompt: The prompt to be sent to the model.
:param inference_kwargs: Additional keyword arguments passed to the handler.
:returns: A dictionary with the following keys:
- `prompt`: The prompt to be sent to the model.
- specified inference parameters.
"""
default_params = {
"maxTokens": self.max_length,
"stopSequences": None,
@@ -226,7 +240,9 @@ def prepare_body(self, prompt: str, **inference_kwargs) -> Dict[str, Any]:
:param prompt: The prompt to be sent to the model.
:param inference_kwargs: Additional keyword arguments passed to the handler.
-:returns: A dictionary containing the body for the request.
+:returns: A dictionary with the following keys:
+    - `inputText`: The prompt to be sent to the model.
+    - specified inference parameters.
"""
default_params = {
"maxTokenCount": self.max_length,
@@ -270,7 +286,9 @@ def prepare_body(self, prompt: str, **inference_kwargs) -> Dict[str, Any]:
:param prompt: The prompt to be sent to the model.
:param inference_kwargs: Additional keyword arguments passed to the handler.
-:returns: A dictionary containing the body for the request.
+:returns: A dictionary with the following keys:
+    - `prompt`: The prompt to be sent to the model.
+    - specified inference parameters.
"""
default_params = {
"max_gen_len": self.max_length,
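All four adapters follow the same shape: merge model-specific defaults with the caller's inference kwargs, drop unset values, and attach the prompt under the key the model expects. A standalone sketch of that pattern (hypothetical function, not the integration's actual helper):

    from typing import Any, Dict

    def prepare_body(prompt: str, defaults: Dict[str, Any], prompt_key: str = "prompt", **inference_kwargs) -> Dict[str, Any]:
        # Caller-supplied kwargs override the adapter defaults; None values are dropped.
        params = {k: v for k, v in {**defaults, **inference_kwargs}.items() if v is not None}
        return {prompt_key: prompt, **params}

    body = prepare_body("Hello", {"max_tokens_to_sample": 99, "stop_sequences": None}, temperature=0.5)
    # -> {"prompt": "Hello", "max_tokens_to_sample": 99, "temperature": 0.5}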
@@ -84,7 +84,9 @@ def __init__(
:param aws_profile_name: The AWS profile name.
:param max_length: The maximum length of the generated text.
:param kwargs: Additional keyword arguments to be passed to the model.
:raises ValueError: If the model name is empty or None.
:raises AmazonBedrockConfigurationError: If the AWS environment is not configured correctly or the model is
not supported.
"""
if not model:
msg = "'model' cannot be None or empty string"
@@ -226,7 +228,9 @@ def run(self, prompt: str, generation_kwargs: Optional[Dict[str, Any]] = None):
:param prompt: The prompt to generate a response for.
:param generation_kwargs: Additional keyword arguments passed to the generator.
:returns: A dictionary with the following keys:
-    - `replies`: A list of generated responses (strings).
+    - `replies`: A list of generated responses.
+:raises ValueError: If the prompt is empty or None.
+:raises AmazonBedrockInferenceError: If the model cannot be invoked.
"""
return {"replies": self.invoke(prompt=prompt, **(generation_kwargs or {}))}

@@ -269,7 +273,7 @@ def from_dict(cls, data: Dict[str, Any]) -> "AmazonBedrockGenerator":
:param data:
Dictionary to deserialize from.
:returns:
-Deserialized component.
+    Deserialized component.
"""
deserialize_secrets_inplace(
data["init_parameters"],
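A usage sketch for the generator (import path and model ID are assumptions; any Bedrock model ID supported by the integration works):

    from haystack_integrations.components.generators.amazon_bedrock import AmazonBedrockGenerator

    generator = AmazonBedrockGenerator(model="anthropic.claude-v2", max_length=256)
    result = generator.run(prompt="Explain retrieval-augmented generation in one sentence.")
    print(result["replies"][0])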
@@ -15,6 +15,8 @@ def __init__(self, tokenizer: Union[str, PreTrainedTokenizerBase], model_max_len
:param tokenizer: The tokenizer to be used to tokenize the prompt.
:param model_max_length: The maximum length of the prompt and answer tokens combined.
:param max_length: The maximum length of the answer tokens.
+:raises ValueError: If the tokenizer is not a string or a `PreTrainedTokenizer` or `PreTrainedTokenizerFast`
+    instance.
"""
if isinstance(tokenizer, str):
self.tokenizer = AutoTokenizer.from_pretrained(tokenizer)
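The handler's job is to leave room for the answer: keep at most model_max_length - max_length prompt tokens. A standalone sketch of that logic (hypothetical function, using the standard transformers API):

    from transformers import AutoTokenizer

    def truncate_prompt(prompt: str, tokenizer_name: str, model_max_length: int, max_length: int) -> str:
        tokenizer = AutoTokenizer.from_pretrained(tokenizer_name)
        budget = model_max_length - max_length  # tokens reserved for the answer
        token_ids = tokenizer.encode(prompt, add_special_tokens=False)
        return tokenizer.decode(token_ids[:budget])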
@@ -68,11 +68,11 @@ def __init__(
Create a FastembedDocumentEmbedder component.
:param model: Local path or name of the model in Hugging Face's model hub,
-    such as ``'BAAI/bge-small-en-v1.5'``.
-:param cache_dir (str, optional): The path to the cache directory.
+    such as `BAAI/bge-small-en-v1.5`.
+:param cache_dir: The path to the cache directory.
Can be set using the `FASTEMBED_CACHE_PATH` env variable.
Defaults to `fastembed_cache` in the system's temp directory.
-:param threads (int, optional): The number of threads single onnxruntime session can use. Defaults to None.
+:param threads: The number of threads single onnxruntime session can use. Defaults to None.
:param prefix: A string to add to the beginning of each text.
:param suffix: A string to add to the end of each text.
:param batch_size: Number of strings to encode at once.
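A usage sketch for this component (the warm_up/run flow follows the usual Haystack component pattern and is an assumption here):

    from haystack import Document
    from haystack_integrations.components.embedders.fastembed import FastembedDocumentEmbedder

    embedder = FastembedDocumentEmbedder(model="BAAI/bge-small-en-v1.5")
    embedder.warm_up()  # loads the ONNX model on first use
    result = embedder.run(documents=[Document(content="fastembed runs embeddings locally via onnxruntime.")])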
@@ -42,12 +42,11 @@ def __init__(
"""
Create a FastembedTextEmbedder component.
-:param model: Local path or name of the model in Fastembed's model hub,
-    such as ``'BAAI/bge-small-en-v1.5'``.
-:param cache_dir (str, optional): The path to the cache directory.
+:param model: Local path or name of the model in Fastembed's model hub, such as `BAAI/bge-small-en-v1.5`
+:param cache_dir: The path to the cache directory.
Can be set using the `FASTEMBED_CACHE_PATH` env variable.
Defaults to `fastembed_cache` in the system's temp directory.
-:param threads (int, optional): The number of threads single onnxruntime session can use. Defaults to None.
+:param threads: The number of threads single onnxruntime session can use. Defaults to None.
:param batch_size: Number of strings to encode at once.
:param prefix: A string to add to the beginning of each text.
:param suffix: A string to add to the end of each text.
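And the matching text embedder, under the same assumptions:

    from haystack_integrations.components.embedders.fastembed import FastembedTextEmbedder

    text_embedder = FastembedTextEmbedder(model="BAAI/bge-small-en-v1.5")
    text_embedder.warm_up()
    embedding = text_embedder.run(text="How big is the moon?")["embedding"]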
17 changes: 7 additions & 10 deletions integrations/ollama/pyproject.toml
@@ -157,20 +157,17 @@ ban-relative-imports = "parents"


[tool.coverage.run]
-source_pkgs = ["src", "tests"]
+source = ["haystack_integrations"]
branch = true
-parallel = true
-
-
-[tool.coverage.paths]
-ollama_haystack = ["src/haystack_integrations", "*/ollama-haystack/src"]
-tests = ["tests", "*/ollama-haystack/tests"]
+parallel = false

[tool.coverage.report]
+omit = ["*/tests/*", "*/__init__.py"]
+show_missing=true
exclude_lines = [
-"no cov",
-"if __name__ == .__main__.:",
-"if TYPE_CHECKING:",
+  "no cov",
+  "if __name__ == .__main__.:",
+  "if TYPE_CHECKING:",
]

[tool.pytest.ini_options]
