
feat: metadata extractor based on a LLM #92

Merged: 56 commits merged into main from add-metadata-extractor on Sep 30, 2024.
The diff below shows the changes from 8 of the 56 commits.

Commits
c0d128c
initial import
davidsbatista Sep 13, 2024
83ee863
adding tests
davidsbatista Sep 13, 2024
8687071
adding docstrings
davidsbatista Sep 13, 2024
0c25951
handlint liting
davidsbatista Sep 13, 2024
2569584
fixing tests
davidsbatista Sep 13, 2024
598904d
improving live run test
davidsbatista Sep 13, 2024
eb0c893
fixing docstring
davidsbatista Sep 13, 2024
435260f
fixing tests
davidsbatista Sep 13, 2024
77a5808
fixing tests
davidsbatista Sep 13, 2024
c239c6c
fixing tests
davidsbatista Sep 13, 2024
35b948d
fixing tests
davidsbatista Sep 13, 2024
c561664
PR reviews/comments
davidsbatista Sep 17, 2024
110ab02
linting
davidsbatista Sep 17, 2024
1615139
fixing some tests
davidsbatista Sep 17, 2024
c7bf4c8
using renamed util function
davidsbatista Sep 17, 2024
a03b0cc
adding dependencies for tests
davidsbatista Sep 18, 2024
71655c9
fixing generators dependencies
davidsbatista Sep 18, 2024
117f4fb
fixing types
davidsbatista Sep 18, 2024
8150f7b
reverting function name until PR is merged on haystack core
davidsbatista Sep 18, 2024
3d64426
reverting function name until PR is merged on haystack core
davidsbatista Sep 18, 2024
c221fe4
fixing serialization tests
davidsbatista Sep 18, 2024
504e832
adding pydocs
davidsbatista Sep 18, 2024
83345ce
Update docs/pydoc/config/extractors_api.yml
davidsbatista Sep 18, 2024
bae4b32
Update docs/pydoc/config/extractors_api.yml
davidsbatista Sep 18, 2024
e4fbd0c
refactoring handling the supported LLMs
davidsbatista Sep 20, 2024
30d2879
Merge branch 'main' into add-metadata-extractor
davidsbatista Sep 20, 2024
5241057
missing comma in init
davidsbatista Sep 20, 2024
80bb0b7
Merge branch 'main' into add-metadata-extractor
davidsbatista Sep 24, 2024
db3ee37
fixing README
davidsbatista Sep 24, 2024
d6deda3
fixing README
davidsbatista Sep 24, 2024
4f42b33
Merge branch 'main' into add-metadata-extractor
davidsbatista Sep 24, 2024
502a45c
chaning sede approach, saving all the related LLM params
davidsbatista Sep 25, 2024
0ed8692
Merge branch 'main' into add-metadata-extractor
davidsbatista Sep 25, 2024
40f784d
reverting example notebooks
davidsbatista Sep 25, 2024
cc14b44
forcing OpenAI model version in tests
davidsbatista Sep 25, 2024
185fa90
disabling too-many-arguments for class
davidsbatista Sep 25, 2024
f6679f7
Update haystack_experimental/components/extractors/llm_metadata_extra…
davidsbatista Sep 26, 2024
99e9a9e
adding check prompt to init
davidsbatista Sep 26, 2024
74ff3b6
updating tests
davidsbatista Sep 26, 2024
2e74dc4
Update README.md
davidsbatista Sep 26, 2024
ba51ca1
Update haystack_experimental/components/extractors/llm_metadata_extra…
davidsbatista Sep 26, 2024
3e4b28e
Update haystack_experimental/components/extractors/llm_metadata_extra…
davidsbatista Sep 26, 2024
237eb72
Update haystack_experimental/components/extractors/llm_metadata_extra…
davidsbatista Sep 26, 2024
13030b7
fixes
davidsbatista Sep 26, 2024
46af362
fixing linted files
davidsbatista Sep 26, 2024
a826f11
fixing
davidsbatista Sep 26, 2024
b999b5b
Merge branch 'main' into add-metadata-extractor
davidsbatista Sep 26, 2024
423f489
removing tuples from the output two aligned lists
davidsbatista Sep 27, 2024
194a3fd
Merge branch 'main' into add-metadata-extractor
davidsbatista Sep 27, 2024
0da5082
removed unused import
davidsbatista Sep 27, 2024
a9fa803
chaning errors to a dictionary
davidsbatista Sep 27, 2024
1973bd3
...
davidsbatista Sep 27, 2024
359e1aa
fixing LLMProvider
davidsbatista Sep 27, 2024
693a09c
Update haystack_experimental/components/extractors/llm_metadata_extra…
davidsbatista Sep 30, 2024
8cfb58f
more fixes
davidsbatista Sep 30, 2024
bf6adf5
fixing linting issue
davidsbatista Sep 30, 2024
2 changes: 1 addition & 1 deletion README.md
@@ -48,7 +48,7 @@ The latest version of the package contains the following experiments:
| [`ChatMessageRetriever`][6] | Memory Component | December 2024 | None | <a href="https://colab.research.google.com/github/deepset-ai/haystack-cookbook/blob/main/notebooks/conversational_rag_using_memory.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/> | [Discuss](https://github.com/deepset-ai/haystack-experimental/discussions/75) |
| [`InMemoryChatMessageStore`][7] | Memory Store | December 2024 | None | <a href="https://colab.research.google.com/github/deepset-ai/haystack-cookbook/blob/main/notebooks/conversational_rag_using_memory.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/> | [Discuss](https://github.com/deepset-ai/haystack-experimental/discussions/75) |
| [`Auto-Merging Retriever`][8] & [`HierarchicalDocumentSplitter`][9]| Document Splitting & Retrieval Technique | December 2024 | None | <a href="https://colab.research.google.com/github/deepset-ai/haystack-cookbook/blob/main/notebooks/auto_merging_retriever.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/> | [Discuss](https://github.com/deepset-ai/haystack-experimental/discussions/78) |
-| [`LLMetadataExtractor`][13] | Metadata extraction with LLM | Dezember 2025 | None | | |
+| [`LLMetadataExtractor`][13] | Metadata extraction with LLM | December 2024 | None | | |

[1]: https://github.com/deepset-ai/haystack-experimental/tree/main/haystack_experimental/evaluation/harness
[2]: https://github.com/deepset-ai/haystack-experimental/tree/main/haystack_experimental/components/tools/openai
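For orientation, here is a minimal usage sketch of the component this PR adds. It is assembled from names visible in the diff below (`prompt`, `expected_keys`, `input_text`, `generator_api`, and the `documents`/`errors` outputs); the import path, prompt text, and document content are illustrative assumptions, not taken from a release:

```python
from haystack import Document
from haystack_experimental.components.extractors import LLMMetadataExtractor, LLMProvider

# The prompt must contain the variable named by `input_text`, otherwise
# __init__ raises a ValueError (see _check_prompt in the diff below).
NER_PROMPT = 'Extract the named entities as JSON with the key "entities". Text: {{ input_text }}'

extractor = LLMMetadataExtractor(
    prompt=NER_PROMPT,
    expected_keys=["entities"],        # keys the LLM's JSON answer must contain
    input_text="input_text",           # prompt variable filled with each document's content
    generator_api=LLMProvider.OPENAI,  # needs OPENAI_API_KEY set in the environment
)

docs = [Document(content="deepset was founded in 2018 in Berlin.")]
result = extractor.run(documents=docs)

# "documents" carries the enriched documents; "errors" pairs document IDs
# with an error message (or None on success), per the docstring in this PR.
print(result["documents"][0].meta)
print(result["errors"])
```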
haystack_experimental/components/extractors/llm_metadata_extractor.py

@@ -3,12 +3,10 @@
 # SPDX-License-Identifier: Apache-2.0

 import json
-import logging
 from enum import Enum
 from typing import Any, Dict, List, Optional, Tuple, Union
-from warnings import warn

-from haystack import Document, component, default_from_dict, default_to_dict
+from haystack import Document, component, default_from_dict, default_to_dict, logging
 from haystack.components.builders import PromptBuilder
 from haystack.components.generators import AzureOpenAIGenerator, OpenAIGenerator
 from haystack.lazy_imports import LazyImport
@@ -49,18 +47,6 @@ def from_str(string: str) -> "LLMProvider":
             raise ValueError(msg)
         return provider

-    @classmethod
-    def from_dict(cls, data: str) -> "LLMProvider":
-        """
-        Deserializes the component from a dictionary.
-
-        :param data:
-            Dictionary with serialized data.
-        :returns:
-            An instance of the component.
-        """
-        return cls.from_str(data)


 @component
 class LLMMetadataExtractor:
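The removed `from_dict` was a thin alias for `from_str`, which remains the single entry point for resolving provider names. A sketch of the expected behaviour (the exact accepted strings and the import path are assumptions):

```python
from haystack_experimental.components.extractors import LLMProvider

provider = LLMProvider.from_str("openai")  # presumably resolves to LLMProvider.OPENAI
assert provider == LLMProvider.OPENAI

# Unknown names raise a ValueError, per the `raise ValueError(msg)` branch above.
try:
    LLMProvider.from_str("no-such-provider")
except ValueError as e:
    print(e)
```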
@@ -157,14 +143,14 @@ def __init__( # pylint: disable=R0917
         self.generator_api = generator_api
         self.generator_api_params = generator_api_params or {}
         self.llm_provider = self._init_generator(generator_api, self.generator_api_params)
+        self._check_prompt()

     def _check_prompt(self):
         if self.input_text not in self.prompt:
-            raise ValueError(f"{self.input_text} must be in the prompt.")
+            raise ValueError(f"Input text '{self.input_text}' must be in the prompt.")

     @staticmethod
-    def _init_generator(generator_api: LLMProvider, generator_api_params: Optional[Dict[str, Any]]):
+    def _init_generator(
+        generator_api: LLMProvider,
+        generator_api_params: Optional[Dict[str, Any]]
+    ) -> Union[OpenAIGenerator, AzureOpenAIGenerator, AmazonBedrockGenerator, VertexAIGeminiGenerator]:
         """
         Initialize the chat generator based on the specified API provider and parameters.
         """
@@ -200,18 +186,16 @@ def is_valid_json_and_has_expected_keys(self, expected: List[str], received: str
         try:
             parsed_output = json.loads(received)
         except json.JSONDecodeError:
-            msg = "Response from LLM evaluator is not a valid JSON."
+            msg = "Response from LLM is not a valid JSON."
             if self.raise_on_failure:
                 raise ValueError(msg)
-            warn(msg)
+            logger.warning(msg)
             return False

         if not all(output in parsed_output for output in expected):
-            msg = f"Expected response from LLM evaluator to be JSON with keys {expected}, got {received}."
+            msg = f"Expected response from LLM to be a JSON with keys {expected}, got {received}."
             if self.raise_on_failure:
                 raise ValueError(msg)
-            warn(msg)
+            logger.warning(msg)
             return False
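With the switch to `logger.warning`, validation failures stay non-fatal by default. A sketch of the expected behaviour with the default `raise_on_failure=False`, using constructor arguments taken from the tests below (import path assumed, and constructing the extractor needs OPENAI_API_KEY set):

```python
from haystack_experimental.components.extractors import LLMMetadataExtractor, LLMProvider

extractor = LLMMetadataExtractor(
    prompt="prompt {{test}}",
    expected_keys=["key1", "key2"],
    input_text="test",
    generator_api=LLMProvider.OPENAI,
)

# Well-formed JSON containing the expected keys -> True
assert extractor.is_valid_json_and_has_expected_keys(
    expected=["key1", "key2"], received='{"key1": 1, "key2": 2}'
)

# Malformed JSON (or missing keys) -> a warning is logged and False is
# returned instead of raising, because raise_on_failure defaults to False.
assert not extractor.is_valid_json_and_has_expected_keys(
    expected=["key1", "key2"], received="not json"
)
```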

@@ -257,13 +241,15 @@ def from_dict(cls, data: Dict[str, Any]) -> "LLMMetadataExtractor":


     @component.output_types(documents=List[Document], errors=List[Tuple[str,Any]])
-    def run(self, documents: List[Document]) -> Dict[str, Union[List[Document], List[Tuple[str, Any]]]]:
+    def run(self, documents: List[Document]) -> Dict[str, Any]:
         """
         Extract metadata from documents using a Language Model.

         :param documents: List of documents to extract metadata from.
         :returns:
-            A dictionary with the key "documents_meta" containing the documents with extracted metadata.
+            A dictionary with the keys:
+            - "documents": List of documents with extracted metadata.
+            - "errors": List of tuples with document ID and error message or None if successful.
         """
         errors = []
         for document in documents:
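Continuing the usage sketch from the README section above, downstream code can consume both outputs. The tuple shape follows the docstring in this hunk; whether successful documents also appear in "errors" with a None message is an assumption based on that wording:

```python
result = extractor.run(documents=docs)  # extractor and docs as in the earlier sketch

for doc in result["documents"]:
    print(doc.id, doc.meta)  # meta now includes the LLM-extracted keys

for doc_id, error in result["errors"]:
    if error is not None:
        print(f"Extraction failed for document {doc_id}: {error}")
```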
7 changes: 5 additions & 2 deletions test/components/extractors/test_llm_metadata_extractor.py
@@ -11,7 +11,8 @@

 class TestLLMMetadataExtractor:

-    def test_init_default(self):
+    def test_init_default(self, monkeypatch):
+        monkeypatch.setenv("OPENAI_API_KEY", "test-api-key")
         extractor = LLMMetadataExtractor(
             prompt="prompt {{test}}",
             expected_keys=["key1", "key2"],
@@ -24,7 +25,8 @@ def test_init_default(self):
         assert extractor.raise_on_failure is False
         assert extractor.input_text == "test"

-    def test_init_with_parameters(self):
+    def test_init_with_parameters(self, monkeypatch):
+        monkeypatch.setenv("OPENAI_API_KEY", "test-api-key")
         extractor = LLMMetadataExtractor(
             prompt="prompt {{test}}",
             expected_keys=["key1", "key2"],
@@ -101,6 +103,7 @@ def test_from_dict(self, monkeypatch):
         assert extractor.prompt == "some prompt that was used with the LLM {{test}}"
         assert extractor.generator_api == LLMProvider.OPENAI

+    @pytest.mark.integration
     @pytest.mark.skipif(
         not os.environ.get("OPENAI_API_KEY", None),
         reason="Export an env var called OPENAI_API_KEY containing the OpenAI API key to run this test.",
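The switch to `monkeypatch.setenv` lets the constructor tests run without a real key: pytest sets the variable for the duration of the test and restores the environment on teardown. The same pattern in isolation, with a hypothetical helper standing in for any code that reads the key:

```python
import os


def read_openai_key() -> str:
    # Hypothetical helper for illustration; any code reading the env var works.
    return os.environ["OPENAI_API_KEY"]


def test_reads_key_from_env(monkeypatch):
    # setenv is scoped to this test; pytest undoes the change afterwards.
    monkeypatch.setenv("OPENAI_API_KEY", "test-api-key")
    assert read_openai_key() == "test-api-key"
```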