From 9d176a2d8ee2f006584f2df54e1ea9747137f746 Mon Sep 17 00:00:00 2001 From: shadeMe Date: Wed, 29 May 2024 16:43:45 +0200 Subject: [PATCH] feat: Implement `RAGEvaluationHarness` and related classes --- haystack_experimental/evaluation/__init__.py | 4 +- .../evaluation/harness/__init__.py | 4 +- .../evaluation/harness/evalution_harness.py | 15 +- .../evaluation/harness/rag/__init__.py | 23 + .../harness/rag/evaluation_pipeline.py | 49 ++ .../evaluation/harness/rag/harness.py | 355 +++++++++++ .../evaluation/harness/rag/parameters.py | 150 +++++ test/evaluation/harness/__init__.py | 3 + test/evaluation/harness/rag/__init__.py | 3 + test/evaluation/harness/rag/test_harness.py | 560 ++++++++++++++++++ test/test_experimental.py | 2 - 11 files changed, 1155 insertions(+), 13 deletions(-) create mode 100644 haystack_experimental/evaluation/harness/rag/__init__.py create mode 100644 haystack_experimental/evaluation/harness/rag/evaluation_pipeline.py create mode 100644 haystack_experimental/evaluation/harness/rag/harness.py create mode 100644 haystack_experimental/evaluation/harness/rag/parameters.py create mode 100644 test/evaluation/harness/__init__.py create mode 100644 test/evaluation/harness/rag/__init__.py create mode 100644 test/evaluation/harness/rag/test_harness.py delete mode 100644 test/test_experimental.py diff --git a/haystack_experimental/evaluation/__init__.py b/haystack_experimental/evaluation/__init__.py index 4546e497..b5119922 100644 --- a/haystack_experimental/evaluation/__init__.py +++ b/haystack_experimental/evaluation/__init__.py @@ -2,6 +2,6 @@ # # SPDX-License-Identifier: Apache-2.0 -from .harness import EvalRunOverrides, EvaluationHarness +from .harness import EvaluationHarness, EvaluationRunOverrides -_all_ = ["EvaluationHarness", "EvalRunOverrides"] +_all_ = ["EvaluationHarness", "EvaluationRunOverrides"] diff --git a/haystack_experimental/evaluation/harness/__init__.py b/haystack_experimental/evaluation/harness/__init__.py index 90792912..6d761a05 100644 --- a/haystack_experimental/evaluation/harness/__init__.py +++ b/haystack_experimental/evaluation/harness/__init__.py @@ -2,6 +2,6 @@ # # SPDX-License-Identifier: Apache-2.0 -from .evalution_harness import EvalRunOverrides, EvaluationHarness +from .evalution_harness import EvaluationHarness, EvaluationRunOverrides -_all_ = ["EvaluationHarness", "EvalRunOverrides"] +_all_ = ["EvaluationHarness", "EvaluationRunOverrides"] diff --git a/haystack_experimental/evaluation/harness/evalution_harness.py b/haystack_experimental/evaluation/harness/evalution_harness.py index e5977ee5..f1015139 100644 --- a/haystack_experimental/evaluation/harness/evalution_harness.py +++ b/haystack_experimental/evaluation/harness/evalution_harness.py @@ -8,11 +8,10 @@ from haystack import Pipeline from haystack.core.serialization import DeserializationCallbacks -from haystack.evaluation.eval_run_result import BaseEvaluationRunResult @dataclass -class EvalRunOverrides: +class EvaluationRunOverrides: """ Overrides for an evaluation run. 
@@ -32,7 +31,7 @@ class EvalRunOverrides: EvalRunInputT = TypeVar("EvalRunInputT") -EvalRunOutputT = TypeVar("EvalRunOutputT", bound=BaseEvaluationRunResult) +EvalRunOutputT = TypeVar("EvalRunOutputT") EvalRunOverridesT = TypeVar("EvalRunOverridesT") @@ -43,9 +42,7 @@ class EvaluationHarness(ABC, Generic[EvalRunInputT, EvalRunOverridesT, EvalRunOu @staticmethod def _override_pipeline(pipeline: Pipeline, parameter_overrides: Optional[Dict[str, Any]]) -> Pipeline: - def component_pre_init_callback( - name: str, cls: Type, init_params: Dict[str, Any] - ): # pylint: disable=unused-argument + def component_pre_init_callback(name: str, cls: Type, init_params: Dict[str, Any]): # pylint: disable=unused-argument assert parameter_overrides is not None overrides = parameter_overrides.get(name) if overrides: @@ -70,7 +67,11 @@ def validate_overrides(): @abstractmethod def run( - self, inputs: EvalRunInputT, *, overrides: Optional[EvalRunOverridesT] = None, run_name: Optional[str] = None + self, + inputs: EvalRunInputT, + *, + overrides: Optional[EvalRunOverridesT] = None, + run_name: Optional[str] = None, ) -> EvalRunOutputT: """ Launch a evaluation run. diff --git a/haystack_experimental/evaluation/harness/rag/__init__.py b/haystack_experimental/evaluation/harness/rag/__init__.py new file mode 100644 index 00000000..cc714697 --- /dev/null +++ b/haystack_experimental/evaluation/harness/rag/__init__.py @@ -0,0 +1,23 @@ +# SPDX-FileCopyrightText: 2022-present deepset GmbH +# +# SPDX-License-Identifier: Apache-2.0 + +from .harness import RAGEvaluationHarness +from .parameters import ( + RAGEvaluationInput, + RAGEvaluationMetric, + RAGEvaluationOutput, + RAGEvaluationOverrides, + RAGExpectedComponent, + RAGExpectedComponentMetadata, +) + +_all_ = [ + "RAGEvaluationHarness", + "RAGExpectedComponent", + "RAGExpectedComponentMetadata", + "RAGEvaluationMetric", + "RAGEvaluationOutput", + "RAGEvaluationOverrides", + "RAGEvaluationInput", +] diff --git a/haystack_experimental/evaluation/harness/rag/evaluation_pipeline.py b/haystack_experimental/evaluation/harness/rag/evaluation_pipeline.py new file mode 100644 index 00000000..1ba64bee --- /dev/null +++ b/haystack_experimental/evaluation/harness/rag/evaluation_pipeline.py @@ -0,0 +1,49 @@ +# SPDX-FileCopyrightText: 2022-present deepset GmbH +# +# SPDX-License-Identifier: Apache-2.0 + +from functools import partial +from typing import Set + +from haystack import Pipeline +from haystack.components.evaluators import ( + DocumentMAPEvaluator, + DocumentMRREvaluator, + DocumentRecallEvaluator, + FaithfulnessEvaluator, + SASEvaluator, +) +from haystack.components.evaluators.document_recall import RecallMode + +from .parameters import RAGEvaluationMetric + + +def default_rag_evaluation_pipeline( + metrics: Set[RAGEvaluationMetric], +) -> Pipeline: + """ + Builds the default evaluation pipeline for RAG. + + :param metrics: + The set of metrics to include in the pipeline. + :returns: + The evaluation pipeline. 
+ """ + pipeline = Pipeline() + + metric_ctors = { + RAGEvaluationMetric.DOCUMENT_MAP: DocumentMAPEvaluator, + RAGEvaluationMetric.DOCUMENT_MRR: DocumentMRREvaluator, + RAGEvaluationMetric.DOCUMENT_RECALL_SINGLE_HIT: partial(DocumentRecallEvaluator, mode=RecallMode.SINGLE_HIT), + RAGEvaluationMetric.DOCUMENT_RECALL_MULTI_HIT: partial(DocumentRecallEvaluator, mode=RecallMode.MULTI_HIT), + RAGEvaluationMetric.SEMANTIC_ANSWER_SIMILARITY: partial( + SASEvaluator, model="sentence-transformers/all-MiniLM-L6-v2" + ), + RAGEvaluationMetric.ANSWER_FAITHFULNESS: FaithfulnessEvaluator, + } + + for metric in metrics: + ctor = metric_ctors[metric] + pipeline.add_component(metric.value, ctor()) + + return pipeline diff --git a/haystack_experimental/evaluation/harness/rag/harness.py b/haystack_experimental/evaluation/harness/rag/harness.py new file mode 100644 index 00000000..156383e8 --- /dev/null +++ b/haystack_experimental/evaluation/harness/rag/harness.py @@ -0,0 +1,355 @@ +# SPDX-FileCopyrightText: 2022-present deepset GmbH +# +# SPDX-License-Identifier: Apache-2.0 + +from copy import deepcopy +from typing import Any, Dict, List, Optional, Set + +from haystack import Pipeline +from haystack.evaluation.eval_run_result import EvaluationRunResult + +from ...util.helpers import ( + aggregate_batched_pipeline_outputs, + deaggregate_batched_pipeline_inputs, +) +from ...util.pipeline_pair import PipelinePair +from ..evalution_harness import EvaluationHarness +from .evaluation_pipeline import default_rag_evaluation_pipeline +from .parameters import ( + RAGEvaluationInput, + RAGEvaluationMetric, + RAGEvaluationOutput, + RAGEvaluationOverrides, + RAGExpectedComponent, + RAGExpectedComponentMetadata, +) + + +class RAGEvaluationHarness(EvaluationHarness[RAGEvaluationInput, RAGEvaluationOverrides, RAGEvaluationOutput]): + """ + Evaluation harness for evaluating RAG pipelines. + """ + + def __init__( + self, + rag_pipeline: Pipeline, + rag_components: Dict[RAGExpectedComponent, RAGExpectedComponentMetadata], + metrics: Set[RAGEvaluationMetric], + ): + """ + Create a evaluation harness for evaluating basic RAG pipelines. + + :param rag_pipeline: + The RAG pipeline to evaluate. + :param rag_components: + A mapping of expected components to their metadata. + :param metrics: + The metrics to use during evaluation. + """ + super().__init__() + + self._validate_rag_components(rag_pipeline, rag_components) + + self.rag_pipeline = rag_pipeline + self.rag_components = rag_components + self.metrics = metrics + self.evaluation_pipeline = default_rag_evaluation_pipeline(metrics) + + @classmethod + def default_with_embedding_retriever( + cls, rag_pipeline: Pipeline, metrics: Set[RAGEvaluationMetric] + ) -> "RAGEvaluationHarness": + """ + Create a default evaluation harness for evaluating RAG pipelines with a query embedder. + + :param rag_pipeline: + The RAG pipeline to evaluate. The following assumptions are made: + - The query embedder component is named 'query_embedder' and has a 'text' input. + - The document retriever component is named 'retriever' and has a 'documents' output. + - The response generator component is named 'generator' and has a 'replies' output. + :param metrics: + The metrics to use during evaluation. 
+ """ + rag_components = { + RAGExpectedComponent.QUERY_PROCESSOR: RAGExpectedComponentMetadata( + name="query_embedder", input_mapping={"query": "text"} + ), + RAGExpectedComponent.DOCUMENT_RETRIEVER: RAGExpectedComponentMetadata( + name="retriever", output_mapping={"retrieved_documents": "documents"} + ), + RAGExpectedComponent.RESPONSE_GENERATOR: RAGExpectedComponentMetadata( + name="generator", output_mapping={"replies": "replies"} + ), + } + + return cls(rag_pipeline, rag_components, deepcopy(metrics)) + + @classmethod + def default_with_keyword_retriever( + cls, rag_pipeline: Pipeline, metrics: Set[RAGEvaluationMetric] + ) -> "RAGEvaluationHarness": + """ + Create a default evaluation harness for evaluating RAG pipelines with a keyword retriever. + + :param rag_pipeline: + The RAG pipeline to evaluate. The following assumptions are made: + - The document retriever component is named 'retriever' and has a 'query' input and a 'documents' output. + - The response generator component is named 'generator' and has a 'replies' output. + :param metrics: + The metrics to use during evaluation. + """ + rag_components = { + RAGExpectedComponent.QUERY_PROCESSOR: RAGExpectedComponentMetadata( + name="retriever", input_mapping={"query": "query"} + ), + RAGExpectedComponent.DOCUMENT_RETRIEVER: RAGExpectedComponentMetadata( + name="retriever", output_mapping={"retrieved_documents": "documents"} + ), + RAGExpectedComponent.RESPONSE_GENERATOR: RAGExpectedComponentMetadata( + name="generator", output_mapping={"replies": "replies"} + ), + } + + return cls(rag_pipeline, rag_components, deepcopy(metrics)) + + def run( # noqa: D102 + self, + inputs: RAGEvaluationInput, + *, + overrides: Optional[RAGEvaluationOverrides] = None, + run_name: Optional[str] = "RAG Evaluation", + ) -> RAGEvaluationOutput: + rag_inputs = self._prepare_rag_pipeline_inputs(inputs) + eval_inputs = self._prepare_eval_pipeline_additional_inputs(inputs) + pipeline_pair = self._generate_eval_run_pipelines(overrides) + + pipeline_outputs = pipeline_pair.run_first_as_batch(rag_inputs, eval_inputs) + rag_outputs, eval_outputs = ( + pipeline_outputs["first"], + pipeline_outputs["second"], + ) + + assert run_name is not None + run_results = EvaluationRunResult( + run_name, + inputs={ + "questions": inputs.queries, + "contexts": [ + [doc.content for doc in docs] + for docs in self._lookup_component_output( + RAGExpectedComponent.DOCUMENT_RETRIEVER, + rag_outputs, + "retrieved_documents", + ) + ], + "responses": self._lookup_component_output( + RAGExpectedComponent.RESPONSE_GENERATOR, rag_outputs, "replies" + ), + }, + results=eval_outputs, + ) + + return RAGEvaluationOutput( + evaluated_pipeline=pipeline_pair.first.dumps(), + evaluation_pipeline=pipeline_pair.second.dumps(), + inputs=deepcopy(inputs), + results=run_results, + ) + + def _lookup_component_output( + self, + component: RAGExpectedComponent, + outputs: Dict[str, Dict[str, Any]], + output_name: str, + ) -> Any: + name = self.rag_components[component].name + mapping = self.rag_components[component].output_mapping + output_name = mapping[output_name] + return outputs[name][output_name] + + def _generate_eval_run_pipelines(self, overrides: Optional[RAGEvaluationOverrides]) -> PipelinePair: + if overrides is None: + rag_overrides = None + eval_overrides = None + else: + rag_overrides = overrides.rag_pipeline + eval_overrides = overrides.eval_pipeline + + if eval_overrides is not None: + for metric in eval_overrides.keys(): + if metric not in self.metrics: + raise ValueError(f"Cannot 
override parameters of unused evaluation metric '{metric.value}'") + + eval_overrides = {k.value: v for k, v in eval_overrides.items()} # type: ignore + + rag_pipeline = self._override_pipeline(self.rag_pipeline, rag_overrides) + eval_pipeline = self._override_pipeline(self.evaluation_pipeline, eval_overrides) # type: ignore + + return PipelinePair( + first=rag_pipeline, + second=eval_pipeline, + outputs_to_inputs=self._map_rag_eval_pipeline_io(), + map_first_outputs=lambda x: self._aggregate_rag_outputs( # pylint: disable=unnecessary-lambda + x + ), + included_first_outputs={ + RAGExpectedComponent.DOCUMENT_RETRIEVER.value, + RAGExpectedComponent.RESPONSE_GENERATOR.value, + }, + ) + + def _aggregate_rag_outputs(self, outputs: List[Dict[str, Dict[str, Any]]]) -> Dict[str, Dict[str, Any]]: + aggregate = aggregate_batched_pipeline_outputs(outputs) + + # We only care about the first response from the generator. + generator_name = self.rag_components[RAGExpectedComponent.RESPONSE_GENERATOR].name + replies_output_name = self.rag_components[RAGExpectedComponent.RESPONSE_GENERATOR].output_mapping["replies"] + aggregate[generator_name][replies_output_name] = [r[0] for r in aggregate[generator_name][replies_output_name]] + + return aggregate + + def _map_rag_eval_pipeline_io(self) -> Dict[str, List[str]]: + # We currently only have metric components in the eval pipeline. + # So, we just map those inputs to the outputs of the rag pipeline. + metric_inputs_to_component_outputs = { + RAGEvaluationMetric.DOCUMENT_MAP: { + "retrieved_documents": ( + RAGExpectedComponent.DOCUMENT_RETRIEVER, + "retrieved_documents", + ) + }, + RAGEvaluationMetric.DOCUMENT_MRR: { + "retrieved_documents": ( + RAGExpectedComponent.DOCUMENT_RETRIEVER, + "retrieved_documents", + ) + }, + RAGEvaluationMetric.DOCUMENT_RECALL_SINGLE_HIT: { + "retrieved_documents": ( + RAGExpectedComponent.DOCUMENT_RETRIEVER, + "retrieved_documents", + ) + }, + RAGEvaluationMetric.DOCUMENT_RECALL_MULTI_HIT: { + "retrieved_documents": ( + RAGExpectedComponent.DOCUMENT_RETRIEVER, + "retrieved_documents", + ) + }, + RAGEvaluationMetric.SEMANTIC_ANSWER_SIMILARITY: { + "predicted_answers": ( + RAGExpectedComponent.RESPONSE_GENERATOR, + "replies", + ) + }, + RAGEvaluationMetric.ANSWER_FAITHFULNESS: { + "contexts": ( + RAGExpectedComponent.DOCUMENT_RETRIEVER, + "retrieved_documents", + ), + "responses": (RAGExpectedComponent.RESPONSE_GENERATOR, "replies"), + }, + } + + outputs_to_inputs: Dict[str, List[str]] = {} + for metric in self.metrics: + io = metric_inputs_to_component_outputs[metric] + for metric_input_name, (component, component_output_name) in io.items(): + component_out = ( + f"{self.rag_components[component].name}." + f"{self.rag_components[component].output_mapping[component_output_name]}" + ) + metric_in = f"{metric.value}.{metric_input_name}" + if component_out not in outputs_to_inputs: + outputs_to_inputs[component_out] = [] + outputs_to_inputs[component_out].append(metric_in) + + return outputs_to_inputs + + def _prepare_rag_pipeline_inputs(self, inputs: RAGEvaluationInput) -> List[Dict[str, Dict[str, Any]]]: + query_embedder_name = self.rag_components[RAGExpectedComponent.QUERY_PROCESSOR].name + query_embedder_text_input = self.rag_components[RAGExpectedComponent.QUERY_PROCESSOR].input_mapping["query"] + + if inputs.additional_rag_inputs is not None: + # Ensure that the query embedder input is not provided as additional input. 
+ existing = inputs.additional_rag_inputs.get(query_embedder_name) + if existing is not None: + existing = existing.get(query_embedder_text_input) # type: ignore + if existing is not None: + raise ValueError( + f"Query embedder input '{query_embedder_text_input}' cannot be provided as additional input." + ) + + # Add the queries as an aggregate input. + rag_inputs = deepcopy(inputs.additional_rag_inputs) + if query_embedder_name not in rag_inputs: + rag_inputs[query_embedder_name] = {} + rag_inputs[query_embedder_name][query_embedder_text_input] = deepcopy(inputs.queries) + else: + rag_inputs = {query_embedder_name: {query_embedder_text_input: deepcopy(inputs.queries)}} + + separate_rag_inputs = deaggregate_batched_pipeline_inputs(rag_inputs) + return separate_rag_inputs + + def _prepare_eval_pipeline_additional_inputs(self, inputs: RAGEvaluationInput) -> Dict[str, Dict[str, Any]]: + eval_inputs: Dict[str, Dict[str, List[Any]]] = {} + + for metric in self.metrics: + if metric in ( + RAGEvaluationMetric.DOCUMENT_MAP, + RAGEvaluationMetric.DOCUMENT_MRR, + RAGEvaluationMetric.DOCUMENT_RECALL_SINGLE_HIT, + RAGEvaluationMetric.DOCUMENT_RECALL_MULTI_HIT, + ): + if inputs.ground_truth_documents is None: + raise ValueError(f"Ground truth documents required for metric '{metric.value}'.") + if len(inputs.ground_truth_documents) != len(inputs.queries): + raise ValueError("Length of ground truth documents should match the number of queries.") + + eval_inputs[metric.value] = {"ground_truth_documents": inputs.ground_truth_documents} + elif metric == RAGEvaluationMetric.SEMANTIC_ANSWER_SIMILARITY: + if inputs.ground_truth_answers is None: + raise ValueError(f"Ground truth answers required for metric '{metric.value}'.") + if len(inputs.ground_truth_answers) != len(inputs.queries): + raise ValueError("Length of ground truth answers should match the number of queries.") + + eval_inputs[metric.value] = {"ground_truth_answers": inputs.ground_truth_answers} + elif metric == RAGEvaluationMetric.ANSWER_FAITHFULNESS: + eval_inputs[metric.value] = {"questions": inputs.queries} + + return eval_inputs + + @staticmethod + def _validate_rag_components( + pipeline: Pipeline, + components: Dict[RAGExpectedComponent, RAGExpectedComponentMetadata], + ): + for e in RAGExpectedComponent: + if e not in components: + raise ValueError(f"RAG evaluation harness requires metadata for the '{e.value}' component.") + + pipeline_outputs = pipeline.outputs(include_components_with_connected_outputs=True) + pipeline_inputs = pipeline.inputs(include_components_with_connected_inputs=True) + + for component, metadata in components.items(): + if metadata.name not in pipeline_outputs or metadata.name not in pipeline_inputs: + raise ValueError( + f"Expected '{component.value}' component named '{metadata.name}' not found in pipeline." + ) + + comp_inputs = pipeline_inputs[metadata.name] + comp_outputs = pipeline_outputs[metadata.name] + + for needle in metadata.input_mapping.values(): + if needle not in comp_inputs: + raise ValueError( + f"Required input '{needle}' not found in '{component.value}' " + f"component named '{metadata.name}'." + ) + + for needle in metadata.output_mapping.values(): + if needle not in comp_outputs: + raise ValueError( + f"Required output '{needle}' not found in '{component.value}' " + f"component named '{metadata.name}'." 
+ ) diff --git a/haystack_experimental/evaluation/harness/rag/parameters.py b/haystack_experimental/evaluation/harness/rag/parameters.py new file mode 100644 index 00000000..4ad49b8b --- /dev/null +++ b/haystack_experimental/evaluation/harness/rag/parameters.py @@ -0,0 +1,150 @@ +# SPDX-FileCopyrightText: 2022-present deepset GmbH +# +# SPDX-License-Identifier: Apache-2.0 + +from dataclasses import dataclass, field +from enum import Enum +from typing import Any, Dict, List, Optional + +from haystack import Document +from haystack.evaluation.eval_run_result import EvaluationRunResult + + +class RAGExpectedComponent(Enum): + """ + Represents the basic components in a RAG pipeline that needs to be present for evaluation. + + Each of these can be separate components in the pipeline or a single component that performs + multiple tasks. + """ + + #: The component in a RAG pipeline that accepts the user query. + #: Expected inputs: `query` - Name of input that contains the query string. + QUERY_PROCESSOR = "query_processor" + + #: The component in a RAG pipeline that retrieves documents based on the query. + #: Expected outputs: `retrieved_documents` - Name of output containing retrieved documents. + DOCUMENT_RETRIEVER = "document_retriever" + + #: The component in a RAG pipeline that generates responses based on the query and the retrieved documents. + #: Expected outputs: `replies` - Name of out containing the LLM responses. Only the first response is used. + RESPONSE_GENERATOR = "response_generator" + + +@dataclass(frozen=True) +class RAGExpectedComponentMetadata: + """ + Metadata for a `RAGExpectedComponent`. + + :param name: + Name of the component in the pipeline. + :param input_mapping: + Mapping of the expected inputs to + corresponding component input names. + :param output_mapping: + Mapping of the expected outputs to + corresponding component output names. + """ + + name: str + input_mapping: Dict[str, str] = field(default_factory=dict) + output_mapping: Dict[str, str] = field(default_factory=dict) + + +class RAGEvaluationMetric(Enum): + """ + Represents the metrics that can be used to evaluate a RAG pipeline. + """ + + #: Document Mean Average Precision. + DOCUMENT_MAP = "metric_doc_map" + + #: Document Mean Reciprocal Rank. + DOCUMENT_MRR = "metric_doc_mrr" + + #: Document Recall with a single hit. + DOCUMENT_RECALL_SINGLE_HIT = "metric_doc_recall_single" + + #: Document Recall with multiple hits. + DOCUMENT_RECALL_MULTI_HIT = "metric_doc_recall_multi" + + #: Semantic Answer Similarity. + SEMANTIC_ANSWER_SIMILARITY = "metric_sas" + + #: Answer Faithfulness. + ANSWER_FAITHFULNESS = "metric_answer_faithfulness" + + +@dataclass(frozen=True) +class RAGEvaluationInput: + """ + Input passed to the RAG evaluation harness. + + :param queries: + The queries passed to the RAG pipeline. + :param ground_truth_documents: + The ground truth documents passed to the + evaluation pipeline. Only required for metrics + that require them. + + Corresponds to the queries. + :param ground_truth_answers: + The ground truth answers passed to the + evaluation pipeline. Only required for metrics + that require them. + + Corresponds to the queries. + :param additional_rag_inputs: + Additional inputs to pass to the RAG pipeline. Each + key is the name of the component and its value a dictionary + with the input name and a list of values, each corresponding + to a query. 
+ """ + + queries: List[str] + ground_truth_documents: Optional[List[List[Document]]] = None + ground_truth_answers: Optional[List[str]] = None + additional_rag_inputs: Optional[Dict[str, Dict[str, List[Any]]]] = None + + +@dataclass(frozen=True) +class RAGEvaluationOverrides: + """ + Overrides for a RAG evaluation run. + + Used to override the init parameters of components in + either (or both) the evaluated and evaluation pipelines. + + :param rag: + Overrides for the RAG pipeline. Each + key is a component name and its value a dictionary + with init parameters to override. + :param eval: + Overrides for the evaluation pipeline. Each + key is a RAG metric and its value a dictionary + with init parameters to override. + """ + + rag_pipeline: Optional[Dict[str, Dict[str, Any]]] = None + eval_pipeline: Optional[Dict[RAGEvaluationMetric, Dict[str, Any]]] = None + + +@dataclass(frozen=True) +class RAGEvaluationOutput: + """ + Represents the output of a RAG evaluation run. + + :param evaluated_pipeline: + Serialized version of the evaluated pipeline, including overrides. + :param evaluation_pipeline: + Serialized version of the evaluation pipeline, including overrides. + :param input: + Input passed to the evaluation harness. + :param results: + Results of the evaluation run. + """ + + evaluated_pipeline: str + evaluation_pipeline: str + inputs: RAGEvaluationInput + results: EvaluationRunResult diff --git a/test/evaluation/harness/__init__.py b/test/evaluation/harness/__init__.py new file mode 100644 index 00000000..c1764a6e --- /dev/null +++ b/test/evaluation/harness/__init__.py @@ -0,0 +1,3 @@ +# SPDX-FileCopyrightText: 2022-present deepset GmbH +# +# SPDX-License-Identifier: Apache-2.0 diff --git a/test/evaluation/harness/rag/__init__.py b/test/evaluation/harness/rag/__init__.py new file mode 100644 index 00000000..c1764a6e --- /dev/null +++ b/test/evaluation/harness/rag/__init__.py @@ -0,0 +1,3 @@ +# SPDX-FileCopyrightText: 2022-present deepset GmbH +# +# SPDX-License-Identifier: Apache-2.0 diff --git a/test/evaluation/harness/rag/test_harness.py b/test/evaluation/harness/rag/test_harness.py new file mode 100644 index 00000000..1088bcbb --- /dev/null +++ b/test/evaluation/harness/rag/test_harness.py @@ -0,0 +1,560 @@ +# SPDX-FileCopyrightText: 2022-present deepset GmbH +# +# SPDX-License-Identifier: Apache-2.0 +from typing import Any, Dict, List, Optional +import pytest + +import random +from haystack_experimental.evaluation.harness.rag import ( + RAGEvaluationHarness, + RAGExpectedComponent, + RAGExpectedComponentMetadata, + RAGEvaluationMetric, + RAGEvaluationOverrides, + RAGEvaluationInput, +) +from haystack import Pipeline, component, Document, default_to_dict, default_from_dict +from haystack.document_stores.in_memory import InMemoryDocumentStore +from haystack.components.embedders import SentenceTransformersTextEmbedder +from haystack.components.builders import PromptBuilder +from haystack.components.retrievers.in_memory import ( + InMemoryEmbeddingRetriever, + InMemoryBM25Retriever, +) +from haystack.components.generators import OpenAIGenerator +from haystack.utils import Secret + + +@component +class NonConformantComponent: + def __init__(self, inputs, outputs) -> None: + component.set_input_types(self, **inputs) + component.set_output_types(self, **outputs) + + def run(self, **kwargs): + return {} + + +@component +class MockGenerator: + def __init__(self, arg: int) -> None: + self.arg = arg + + def to_dict(self): + return default_to_dict(self, arg=self.arg) + + @classmethod + def 
from_dict(cls, data): + return default_from_dict(cls, data) + + @component.output_types(replies=List[str]) + def run(self, prompt: str) -> Dict[str, Any]: + return {"replies": ["placeholder"]} + + +@component +class MockKeywordRetriever: + def __init__(self) -> None: + self.counter = 0 + + @component.output_types(documents=List[Document]) + def run(self, query: str) -> Dict[str, Any]: + samples = [ + [Document(content="France")], + [ + Document(content="9th century"), + Document(content="10th century"), + Document(content="9th"), + ], + [ + Document(content="classical"), + Document(content="rock music"), + Document(content="dubstep"), + ], + [ + Document(content="11th"), + Document(content="the 11th"), + Document(content="11th century"), + ], + [ + Document(content="Denmark"), + Document(content="Norway"), + Document(content="Iceland"), + ], + [ + Document(content="10th century"), + Document(content="the first half of the 10th century"), + Document(content="10th"), + Document(content="10th"), + ], + ] + + idx = self.counter % len(samples) + self.counter += 1 + + return {"documents": samples[idx]} + + +def build_rag_pipeline_with_query_embedder( + embedder_name: str = "text_embedder", + embedder_component: Optional[Any] = None, + generator_name: str = "llm", + generator_component: Optional[Any] = None, +): + document_store = InMemoryDocumentStore() + retriever = InMemoryEmbeddingRetriever(document_store) + + if embedder_component: + text_embedder = embedder_component + else: + text_embedder = SentenceTransformersTextEmbedder( + model="sentence-transformers/all-MiniLM-L6-v2" + ) + template = """ + Given the following information, answer the question. + + Context: + {% for document in documents %} + {{ document.content }} + {% endfor %} + + Question: {{question}} + Answer: + """ + + prompt_builder = PromptBuilder(template=template) + + if generator_component: + generator = generator_component + else: + generator = OpenAIGenerator( + model="gpt-3.5-turbo", api_key=Secret.from_token("test_key") + ) + + pipeline = Pipeline() + pipeline.add_component(embedder_name, text_embedder) + pipeline.add_component("retriever", retriever) + pipeline.add_component("prompt_builder", prompt_builder) + pipeline.add_component(generator_name, generator) + pipeline.connect(f"{embedder_name}.embedding", "retriever.query_embedding") + pipeline.connect("retriever", "prompt_builder.documents") + pipeline.connect("prompt_builder", generator_name) + return pipeline + + +def build_rag_pipeline_with_keyword_retriever( + retriever_name: str = "retriever", + retriever_component: Optional[Any] = None, + retriever_output_name: str = "documents", + generator_name: str = "llm", + generator_component: Optional[Any] = None, +): + document_store = InMemoryDocumentStore() + if retriever_component: + retriever = retriever_component + else: + retriever = InMemoryBM25Retriever(document_store) + template = """ + Given the following information, answer the question. 
+ + Context: + {% for document in documents %} + {{ document.content }} + {% endfor %} + + Question: {{question}} + Answer: + """ + + prompt_builder = PromptBuilder(template=template) + if generator_component: + generator = generator_component + else: + generator = OpenAIGenerator( + model="gpt-3.5-turbo", api_key=Secret.from_token("test_key") + ) + + pipeline = Pipeline() + pipeline.add_component(retriever_name, retriever) + pipeline.add_component("prompt_builder", prompt_builder) + pipeline.add_component(generator_name, generator) + pipeline.connect( + f"{retriever_name}.{retriever_output_name}", "prompt_builder.documents" + ) + pipeline.connect("prompt_builder", generator_name) + return pipeline + + +@pytest.fixture +def rag_pipeline(): + return build_rag_pipeline_with_query_embedder("text_embedder") + + +@pytest.fixture +def rag_pipeline_with_query_embedder(): + return build_rag_pipeline_with_query_embedder( + embedder_name="query_embedder", generator_name="generator" + ) + + +@pytest.fixture +def rag_pipeline_with_keyword_retriever(): + return build_rag_pipeline_with_keyword_retriever(generator_name="generator") + + +def test_rag_eval_harness_init(rag_pipeline): + harness = RAGEvaluationHarness( + rag_pipeline, + rag_components={ + RAGExpectedComponent.QUERY_PROCESSOR: RAGExpectedComponentMetadata( + name="text_embedder", input_mapping={"query": "text"} + ), + RAGExpectedComponent.DOCUMENT_RETRIEVER: RAGExpectedComponentMetadata( + name="retriever", output_mapping={"retrieved_documents": "documents"} + ), + RAGExpectedComponent.RESPONSE_GENERATOR: RAGExpectedComponentMetadata( + name="llm", output_mapping={"replies": "replies"} + ), + }, + metrics={RAGEvaluationMetric.DOCUMENT_MAP}, + ) + + +def test_rag_eval_harness_init_invalid_expected_component( + rag_pipeline, +): + with pytest.raises(ValueError, match="RAG evaluation harness requires metadata"): + _ = RAGEvaluationHarness( + rag_pipeline, + rag_components={}, + metrics={RAGEvaluationMetric.DOCUMENT_MAP}, + ) + + with pytest.raises(ValueError, match="RAG evaluation harness requires metadata"): + _ = RAGEvaluationHarness( + rag_pipeline, + rag_components={ + RAGExpectedComponent.QUERY_PROCESSOR: RAGExpectedComponentMetadata( + name="text_embedder", input_mapping={"query": "text"} + ), + }, + metrics={RAGEvaluationMetric.DOCUMENT_MAP}, + ) + + +def test_rag_eval_harness_init_invalid_missing_components( + rag_pipeline, +): + with pytest.raises(ValueError, match="named 'embedder' not found in pipeline"): + _ = RAGEvaluationHarness( + rag_pipeline, + rag_components={ + RAGExpectedComponent.QUERY_PROCESSOR: RAGExpectedComponentMetadata( + name="embedder", input_mapping={"query": "text"} + ), + RAGExpectedComponent.DOCUMENT_RETRIEVER: RAGExpectedComponentMetadata( + name="retriever", + output_mapping={"retrieved_documents": "documents"}, + ), + RAGExpectedComponent.RESPONSE_GENERATOR: RAGExpectedComponentMetadata( + name="llm", output_mapping={"replies": "replies"} + ), + }, + metrics={RAGEvaluationMetric.DOCUMENT_MAP}, + ) + + +def test_rag_eval_harness_init_invalid_missing_inputs(rag_pipeline): + with pytest.raises( + ValueError, + match="Required input 'rando_input' not found in 'query_processor' component named 'text_embedder'", + ): + _ = RAGEvaluationHarness( + rag_pipeline, + rag_components={ + RAGExpectedComponent.QUERY_PROCESSOR: RAGExpectedComponentMetadata( + name="text_embedder", input_mapping={"query": "rando_input"} + ), + RAGExpectedComponent.DOCUMENT_RETRIEVER: RAGExpectedComponentMetadata( + name="retriever", + 
output_mapping={"retrieved_documents": "documents"}, + ), + RAGExpectedComponent.RESPONSE_GENERATOR: RAGExpectedComponentMetadata( + name="llm", output_mapping={"replies": "replies"} + ), + }, + metrics={RAGEvaluationMetric.DOCUMENT_MAP}, + ) + + +def test_rag_eval_harness_init_invalid_missing_outputs( + rag_pipeline, +): + with pytest.raises( + ValueError, + match="Required output 'rando_output' not found in 'response_generator' component named 'llm'", + ): + _ = RAGEvaluationHarness( + rag_pipeline, + rag_components={ + RAGExpectedComponent.QUERY_PROCESSOR: RAGExpectedComponentMetadata( + name="text_embedder", input_mapping={"query": "text"} + ), + RAGExpectedComponent.DOCUMENT_RETRIEVER: RAGExpectedComponentMetadata( + name="retriever", + output_mapping={"retrieved_documents": "documents"}, + ), + RAGExpectedComponent.RESPONSE_GENERATOR: RAGExpectedComponentMetadata( + name="llm", output_mapping={"replies": "rando_output"} + ), + }, + metrics={RAGEvaluationMetric.DOCUMENT_MAP}, + ) + + +def test_rag_eval_harness_init_defaults( + rag_pipeline_with_query_embedder, rag_pipeline_with_keyword_retriever +): + harness = RAGEvaluationHarness.default_with_embedding_retriever( + rag_pipeline_with_query_embedder, metrics={RAGEvaluationMetric.DOCUMENT_MAP} + ) + + harness = RAGEvaluationHarness.default_with_keyword_retriever( + rag_pipeline_with_keyword_retriever, metrics={RAGEvaluationMetric.DOCUMENT_MAP} + ) + + +def test_rag_eval_harness_init_defaults_invalid_missing_inputs(): + with pytest.raises( + ValueError, + match="Required input 'text' not found in 'query_processor' component named 'query_embedder'", + ): + _ = RAGEvaluationHarness.default_with_embedding_retriever( + build_rag_pipeline_with_query_embedder( + embedder_name="llm", generator_name="query_embedder" + ), + metrics={RAGEvaluationMetric.DOCUMENT_MAP}, + ) + + with pytest.raises( + ValueError, + match="Required input 'query' not found in 'query_processor' component named 'retriever'", + ): + _ = RAGEvaluationHarness.default_with_keyword_retriever( + build_rag_pipeline_with_keyword_retriever( + retriever_name="llm", generator_name="retriever" + ), + metrics={RAGEvaluationMetric.DOCUMENT_MAP}, + ) + + +def test_rag_eval_harness_init_defaults_invalid_missing_outputs(): + non_conformant_query_embedder_pipeline = build_rag_pipeline_with_query_embedder( + embedder_name="query_embedder", + generator_name="generator", + generator_component=NonConformantComponent( + {"prompt": str}, {"responses": List[str]} + ), + ) + non_conformant_keyword_retriever_pipeline = ( + build_rag_pipeline_with_keyword_retriever( + retriever_component=NonConformantComponent( + {"query": str}, {"docs": List[Document]} + ), + retriever_output_name="docs", + ) + ) + + with pytest.raises( + ValueError, + match="Required output 'replies' not found in 'response_generator' component named 'generator'", + ): + _ = RAGEvaluationHarness.default_with_embedding_retriever( + non_conformant_query_embedder_pipeline, + metrics={RAGEvaluationMetric.DOCUMENT_MAP}, + ) + + with pytest.raises( + ValueError, + match="Required output 'documents' not found in 'document_retriever' component named 'retriever'", + ): + _ = RAGEvaluationHarness.default_with_keyword_retriever( + non_conformant_keyword_retriever_pipeline, + metrics={RAGEvaluationMetric.DOCUMENT_MAP}, + ) + + +def test_rag_eval_harness_run_invalid_ground_truths(rag_pipeline_with_query_embedder): + harness_map = RAGEvaluationHarness.default_with_embedding_retriever( + rag_pipeline_with_query_embedder, + metrics={ + 
RAGEvaluationMetric.DOCUMENT_MAP, + }, + ) + harness_sas = RAGEvaluationHarness.default_with_embedding_retriever( + rag_pipeline_with_query_embedder, + metrics={ + RAGEvaluationMetric.SEMANTIC_ANSWER_SIMILARITY, + }, + ) + + input_no_gt_docs = RAGEvaluationInput(queries=["What is the capital of France?"]) + input_mismatching_gt_docs = RAGEvaluationInput( + queries=["What is the capital of France?"], ground_truth_documents=[] + ) + input_no_gt_answers = RAGEvaluationInput( + queries=["What is the capital of France?"], + ground_truth_documents=[[Document(content="Paris is the capital of France.")]], + ) + input_mismatching_gt_answers = RAGEvaluationInput( + queries=["What is the capital of France?"], + ground_truth_documents=[[Document(content="Paris is the capital of France.")]], + ground_truth_answers=[], + ) + + with pytest.raises(ValueError, match="Ground truth documents required"): + _ = harness_map.run(input_no_gt_docs) + + with pytest.raises( + ValueError, + match="Length of ground truth documents should match the number of queries", + ): + _ = harness_map.run(input_mismatching_gt_docs) + + with pytest.raises(ValueError, match="Ground truth answers required"): + _ = harness_sas.run(input_no_gt_answers) + + with pytest.raises( + ValueError, + match="Length of ground truth answers should match the number of queries", + ): + _ = harness_sas.run(input_mismatching_gt_answers) + + +def test_rag_eval_harness_run_invalid_additional_input( + rag_pipeline_with_query_embedder, +): + harness = RAGEvaluationHarness.default_with_embedding_retriever( + rag_pipeline_with_query_embedder, + metrics={ + RAGEvaluationMetric.DOCUMENT_MAP, + }, + ) + + input = RAGEvaluationInput( + queries=["What is the capital of France?"], + ground_truth_documents=[[Document(content="Paris is the capital of France.")]], + additional_rag_inputs={"query_embedder": {"text": ["Some other question?"]}}, + ) + + with pytest.raises( + ValueError, + match="Query embedder input 'text' cannot be provided as additional input", + ): + _ = harness.run(input) + + +def test_rag_eval_harness_run_invalid_override( + rag_pipeline_with_query_embedder, +): + harness = RAGEvaluationHarness.default_with_embedding_retriever( + rag_pipeline_with_query_embedder, + metrics={ + RAGEvaluationMetric.DOCUMENT_MAP, + }, + ) + + input = RAGEvaluationInput( + queries=["What is the capital of France?"], + ground_truth_documents=[[Document(content="Paris is the capital of France.")]], + ) + + with pytest.raises( + ValueError, + match="Cannot override non-existent component 'rando_component'", + ): + _ = harness.run( + input, + overrides=RAGEvaluationOverrides( + rag_pipeline={"rando_component": {"Some": "thing"}} + ), + ) + + with pytest.raises( + ValueError, + match="Cannot override parameters of unused evaluation metric", + ): + _ = harness.run( + input, + overrides=RAGEvaluationOverrides( + eval_pipeline={ + RAGEvaluationMetric.SEMANTIC_ANSWER_SIMILARITY: { + "model": "rando_model" + } + } + ), + ) + + +def test_rag_eval_harness_run_statistical_metrics(): + harness = RAGEvaluationHarness.default_with_keyword_retriever( + build_rag_pipeline_with_keyword_retriever( + retriever_component=MockKeywordRetriever(), + generator_component=MockGenerator(arg=0), + generator_name="generator", + ), + metrics={ + RAGEvaluationMetric.DOCUMENT_MAP, + RAGEvaluationMetric.DOCUMENT_MRR, + RAGEvaluationMetric.DOCUMENT_RECALL_SINGLE_HIT, + RAGEvaluationMetric.DOCUMENT_RECALL_MULTI_HIT, + }, + ) + + inputs = RAGEvaluationInput( + queries=["What is the capital of France?"] * 
6, + ground_truth_documents=[ + [Document(content="France")], + [Document(content="9th century"), Document(content="9th")], + [Document(content="classical music"), Document(content="classical")], + [Document(content="11th century"), Document(content="the 11th")], + [Document(content="Denmark, Iceland and Norway")], + [Document(content="10th century"), Document(content="10th")], + ], + ) + + output = harness.run( + inputs, + overrides=RAGEvaluationOverrides( + rag_pipeline={ + "generator": {"arg": 100}, + } + ), + run_name="test_run", + ) + + assert output.inputs == inputs + assert output.results.run_name == "test_run" + assert output.results.results == { + "metric_doc_map": { + "score": 0.7222222222222222, + "individual_scores": [1.0, 0.8333333333333333, 1.0, 0.5, 0.0, 1.0], + }, + "metric_doc_recall_single": { + "score": 0.8333333333333334, + "individual_scores": [1.0, 1.0, 1.0, 1.0, 0.0, 1.0], + }, + "metric_doc_recall_multi": { + "score": 0.75, + "individual_scores": [1.0, 1.0, 0.5, 1.0, 0.0, 1.0], + }, + "metric_doc_mrr": { + "score": 0.75, + "individual_scores": [1.0, 1.0, 1.0, 0.5, 0.0, 1.0], + }, + } + overriden_pipeline_dict = Pipeline.loads(output.evaluated_pipeline).to_dict() + assert ( + overriden_pipeline_dict["components"]["generator"]["init_parameters"]["arg"] + == 100 + ) diff --git a/test/test_experimental.py b/test/test_experimental.py deleted file mode 100644 index f1748238..00000000 --- a/test/test_experimental.py +++ /dev/null @@ -1,2 +0,0 @@ -def test(): - pass
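
Usage sketch: the snippet below shows how the new harness is intended to be wired up end to end, assuming an embedding-based RAG pipeline whose component names ('query_embedder', 'retriever', 'generator') match the defaults expected by `default_with_embedding_retriever`. The embedding model, the OpenAI generator, and the sample documents/queries are illustrative placeholders, not something this patch provides; a local sentence-transformers install and a valid OPENAI_API_KEY are assumed for the embedding- and LLM-based pieces.

from haystack import Document, Pipeline
from haystack.components.builders import PromptBuilder
from haystack.components.embedders import (
    SentenceTransformersDocumentEmbedder,
    SentenceTransformersTextEmbedder,
)
from haystack.components.generators import OpenAIGenerator
from haystack.components.retrievers.in_memory import InMemoryEmbeddingRetriever
from haystack.document_stores.in_memory import InMemoryDocumentStore

from haystack_experimental.evaluation.harness.rag import (
    RAGEvaluationHarness,
    RAGEvaluationInput,
    RAGEvaluationMetric,
    RAGEvaluationOverrides,
)

# Index a couple of illustrative documents with embeddings so the embedding
# retriever has something to retrieve (model choice mirrors the tests).
document_store = InMemoryDocumentStore()
doc_embedder = SentenceTransformersDocumentEmbedder(model="sentence-transformers/all-MiniLM-L6-v2")
doc_embedder.warm_up()
document_store.write_documents(
    doc_embedder.run(documents=[Document(content="Paris is the capital of France.")])["documents"]
)

# A small RAG pipeline whose component names ('query_embedder', 'retriever',
# 'generator') follow the defaults assumed by `default_with_embedding_retriever`.
template = """Given the following information, answer the question.

Context:
{% for document in documents %}
{{ document.content }}
{% endfor %}

Question: {{ question }}
Answer:"""

rag_pipeline = Pipeline()
rag_pipeline.add_component(
    "query_embedder", SentenceTransformersTextEmbedder(model="sentence-transformers/all-MiniLM-L6-v2")
)
rag_pipeline.add_component("retriever", InMemoryEmbeddingRetriever(document_store))
rag_pipeline.add_component("prompt_builder", PromptBuilder(template=template))
rag_pipeline.add_component("generator", OpenAIGenerator(model="gpt-3.5-turbo"))
rag_pipeline.connect("query_embedder.embedding", "retriever.query_embedding")
rag_pipeline.connect("retriever", "prompt_builder.documents")
rag_pipeline.connect("prompt_builder", "generator")

harness = RAGEvaluationHarness.default_with_embedding_retriever(
    rag_pipeline,
    metrics={
        RAGEvaluationMetric.DOCUMENT_MAP,
        RAGEvaluationMetric.DOCUMENT_RECALL_SINGLE_HIT,
        RAGEvaluationMetric.SEMANTIC_ANSWER_SIMILARITY,
    },
)

eval_input = RAGEvaluationInput(
    queries=["What is the capital of France?"],
    ground_truth_documents=[[Document(content="Paris is the capital of France.")]],
    ground_truth_answers=["Paris"],
)

output = harness.run(
    eval_input,
    # Override component init parameters for this run only, e.g. swap the SAS
    # model used by the evaluation pipeline.
    overrides=RAGEvaluationOverrides(
        eval_pipeline={
            RAGEvaluationMetric.SEMANTIC_ANSWER_SIMILARITY: {"model": "sentence-transformers/all-mpnet-base-v2"}
        }
    ),
    run_name="rag_eval_demo",
)

print(output.results.run_name)
print(output.results.results)  # per-metric aggregate score and individual scores

Overrides are applied per run without modifying the pipelines passed to the harness; the overridden pipelines come back in serialized form via `output.evaluated_pipeline` and `output.evaluation_pipeline` and can be round-tripped with `Pipeline.loads`, as the tests do to verify that the overrides were applied.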