feat: Add ContextRelevanceEvaluator to RAG eval harness
shadeMe committed Jun 18, 2024
1 parent 3193cbd commit 6d76d21
Showing 4 changed files with 139 additions and 5 deletions.
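The new metric plugs into the existing harness API exercised by the test added in this commit. A minimal usage sketch based on that test, assuming the same imports the test file uses and a user-supplied RAG pipeline (build_rag_pipeline() is a placeholder, and running the LLM-based evaluator needs a configured LLM backend such as an OpenAI API key):

from haystack_experimental.evaluation.harness.rag import (
    RAGEvaluationHarness,
    RAGEvaluationInput,
    RAGEvaluationMetric,
)

# Placeholder: any RAG pipeline whose retriever and generator match the
# components the harness expects.
rag_pipeline = build_rag_pipeline()

harness = RAGEvaluationHarness.default_with_keyword_retriever(
    rag_pipeline,
    metrics={RAGEvaluationMetric.CONTEXT_RELEVANCE},
)

inputs = RAGEvaluationInput(queries=["What is the capital of France?"])
output = harness.run(inputs, run_name="context_relevance_demo")

# Results are keyed by the metric's enum value, as asserted in the new test below.
print(output.results.results["metric_context_relevance"])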
@@ -7,6 +7,7 @@

from haystack import Pipeline
from haystack.components.evaluators import (
ContextRelevanceEvaluator,
DocumentMAPEvaluator,
DocumentMRREvaluator,
DocumentRecallEvaluator,
@@ -34,12 +35,17 @@ def default_rag_evaluation_pipeline(
metric_ctors = {
RAGEvaluationMetric.DOCUMENT_MAP: DocumentMAPEvaluator,
RAGEvaluationMetric.DOCUMENT_MRR: DocumentMRREvaluator,
RAGEvaluationMetric.DOCUMENT_RECALL_SINGLE_HIT: partial(DocumentRecallEvaluator, mode=RecallMode.SINGLE_HIT),
RAGEvaluationMetric.DOCUMENT_RECALL_MULTI_HIT: partial(DocumentRecallEvaluator, mode=RecallMode.MULTI_HIT),
RAGEvaluationMetric.DOCUMENT_RECALL_SINGLE_HIT: partial(
DocumentRecallEvaluator, mode=RecallMode.SINGLE_HIT
),
RAGEvaluationMetric.DOCUMENT_RECALL_MULTI_HIT: partial(
DocumentRecallEvaluator, mode=RecallMode.MULTI_HIT
),
RAGEvaluationMetric.SEMANTIC_ANSWER_SIMILARITY: partial(
SASEvaluator, model="sentence-transformers/all-MiniLM-L6-v2"
),
RAGEvaluationMetric.ANSWER_FAITHFULNESS: FaithfulnessEvaluator,
RAGEvaluationMetric.CONTEXT_RELEVANCE: ContextRelevanceEvaluator,
}

for metric in metrics:
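The hunk above cuts off inside the builder, but the pattern is visible: metrics map to evaluator constructors, and functools.partial pre-binds constructor arguments so every selected metric can be instantiated with a bare call. A small illustrative sketch of that pattern in isolation (the names here are invented, not the file's actual continuation):

from functools import partial


class RecallEvaluator:
    def __init__(self, mode: str = "single_hit") -> None:
        self.mode = mode


# Plain constructors and pre-configured ones sit side by side in one map ...
ctors = {
    "recall_single_hit": partial(RecallEvaluator, mode="single_hit"),
    "recall_multi_hit": partial(RecallEvaluator, mode="multi_hit"),
}

# ... so the builder can instantiate whichever metrics were requested uniformly.
evaluators = {name: ctor() for name, ctor in ctors.items()}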
13 changes: 11 additions & 2 deletions haystack_experimental/evaluation/harness/rag/harness.py
@@ -266,6 +266,12 @@ def _map_rag_eval_pipeline_io(self) -> Dict[str, List[str]]:
"replies",
),
},
RAGEvaluationMetric.CONTEXT_RELEVANCE: {
"contexts": (
RAGExpectedComponent.DOCUMENT_RETRIEVER,
"retrieved_documents",
),
},
}

outputs_to_inputs: Dict[str, List[str]] = {}
@@ -344,6 +350,11 @@ def _prepare_eval_pipeline_additional_inputs(
eval_inputs[metric.value] = {
"ground_truth_documents": inputs.ground_truth_documents
}
elif metric in (
RAGEvaluationMetric.ANSWER_FAITHFULNESS,
RAGEvaluationMetric.CONTEXT_RELEVANCE,
):
eval_inputs[metric.value] = {"questions": inputs.queries}
elif metric == RAGEvaluationMetric.SEMANTIC_ANSWER_SIMILARITY:
if inputs.ground_truth_answers is None:
raise ValueError(
@@ -357,8 +368,6 @@
eval_inputs[metric.value] = {
"ground_truth_answers": inputs.ground_truth_answers
}
elif metric == RAGEvaluationMetric.ANSWER_FAITHFULNESS:
eval_inputs[metric.value] = {"questions": inputs.queries}

return eval_inputs

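The mapping above feeds the evaluator's "contexts" input from the retriever's "retrieved_documents" output, while "questions" comes from the harness inputs' queries (same as the faithfulness metric). For reference, this is roughly equivalent to calling the evaluator on its own; a sketch assuming an LLM backend (e.g. an OpenAI API key) is configured:

from haystack.components.evaluators import ContextRelevanceEvaluator

evaluator = ContextRelevanceEvaluator()  # LLM-backed under the hood

result = evaluator.run(
    questions=["What is the capital of France?"],
    contexts=[["Paris is the capital and largest city of France."]],
)

# The output shape is what the harness (and the mock evaluator in the tests
# below) expects: an aggregate score plus per-question results.
print(result["score"], result["individual_scores"])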
3 changes: 3 additions & 0 deletions haystack_experimental/evaluation/harness/rag/parameters.py
@@ -74,6 +74,9 @@ class RAGEvaluationMetric(Enum):
#: Answer Faithfulness.
ANSWER_FAITHFULNESS = "metric_answer_faithfulness"

#: Context Relevance.
CONTEXT_RELEVANCE = "metric_context_relevance"


@dataclass(frozen=True)
class RAGEvaluationInput:
118 changes: 117 additions & 1 deletion test/evaluation/harness/rag/test_harness.py
@@ -4,7 +4,6 @@
from typing import Any, Dict, List, Optional
import pytest

import random
from haystack_experimental.evaluation.harness.rag import (
RAGEvaluationHarness,
RAGExpectedComponent,
@@ -95,6 +94,71 @@ def run(self, query: str) -> Dict[str, Any]:
return {"documents": samples[idx]}


@component
class MockLLMEvaluator:
def __init__(self, metric: RAGEvaluationMetric) -> None:
self.metric = metric
if metric == RAGEvaluationMetric.ANSWER_FAITHFULNESS:
component.set_input_types(
self,
**dict(
[
("questions", List[str]),
("contexts", List[List[str]]),
("predicted_answers", List[str]),
]
),
)

component.set_output_types(
self,
**dict(
[
("individual_scores", List[int]),
("score", float),
("results", List[Dict[str, Any]]),
]
),
)
elif metric == RAGEvaluationMetric.CONTEXT_RELEVANCE:
component.set_input_types(
self,
**dict(
[
("questions", List[str]),
("contexts", List[List[str]]),
]
),
)

component.set_output_types(
self,
**dict(
[
("individual_scores", List[int]),
("score", float),
("results", List[Dict[str, Any]]),
]
),
)
else:
raise ValueError(f"Invalid metric: {metric}")

@staticmethod
def default_output() -> Dict[str, Any]:
return {
"individual_scores": [1] * 6,
"score": 1.0,
"results": [
{"statements": ["placeholder"], "statement_scores": [1.0], "score": 1.0}
]
* 6,
}

def run(self, **kwargs) -> Dict[str, Any]:
return self.default_output()


def build_rag_pipeline_with_query_embedder(
embedder_name: str = "text_embedder",
embedder_component: Optional[Any] = None,
@@ -558,3 +622,55 @@ def test_rag_eval_harness_run_statistical_metrics():
overriden_pipeline_dict["components"]["generator"]["init_parameters"]["arg"]
== 100
)


def test_rag_eval_harness_run_llm_metrics():
harness = RAGEvaluationHarness.default_with_keyword_retriever(
build_rag_pipeline_with_keyword_retriever(
retriever_component=MockKeywordRetriever(),
generator_component=MockGenerator(arg=0),
generator_name="generator",
),
metrics={
RAGEvaluationMetric.ANSWER_FAITHFULNESS,
RAGEvaluationMetric.CONTEXT_RELEVANCE,
},
)

mock_eval_pipeline = Pipeline()
for m in (
RAGEvaluationMetric.ANSWER_FAITHFULNESS,
RAGEvaluationMetric.CONTEXT_RELEVANCE,
):
mock_eval_pipeline.add_component(m.value, MockLLMEvaluator(metric=m))

harness.evaluation_pipeline = mock_eval_pipeline

inputs = RAGEvaluationInput(
queries=["What is the capital of France?"] * 6,
ground_truth_documents=[
[Document(content="France")],
[Document(content="9th century"), Document(content="9th")],
[Document(content="classical music"), Document(content="classical")],
[Document(content="11th century"), Document(content="the 11th")],
[Document(content="Denmark, Iceland and Norway")],
[Document(content="10th century"), Document(content="10th")],
],
)

output = harness.run(
inputs,
overrides=RAGEvaluationOverrides(
rag_pipeline={
"generator": {"arg": 100},
}
),
run_name="test_run",
)

assert output.inputs == inputs
assert output.results.run_name == "test_run"
assert output.results.results == {
"metric_answer_faithfulness": MockLLMEvaluator.default_output(),
"metric_context_relevance": MockLLMEvaluator.default_output(),
}
