feat: Add ContextRelevanceEvaluator to RAG eval harness
shadeMe committed Jun 18, 2024
Parent: 3193cbd. Commit: d4553a6.
Showing 4 changed files with 482 additions and 302 deletions.
@@ -7,6 +7,7 @@
 
 from haystack import Pipeline
 from haystack.components.evaluators import (
+    ContextRelevanceEvaluator,
     DocumentMAPEvaluator,
     DocumentMRREvaluator,
     DocumentRecallEvaluator,
@@ -34,12 +35,17 @@ def default_rag_evaluation_pipeline(
     metric_ctors = {
         RAGEvaluationMetric.DOCUMENT_MAP: DocumentMAPEvaluator,
         RAGEvaluationMetric.DOCUMENT_MRR: DocumentMRREvaluator,
-        RAGEvaluationMetric.DOCUMENT_RECALL_SINGLE_HIT: partial(DocumentRecallEvaluator, mode=RecallMode.SINGLE_HIT),
-        RAGEvaluationMetric.DOCUMENT_RECALL_MULTI_HIT: partial(DocumentRecallEvaluator, mode=RecallMode.MULTI_HIT),
+        RAGEvaluationMetric.DOCUMENT_RECALL_SINGLE_HIT: partial(
+            DocumentRecallEvaluator, mode=RecallMode.SINGLE_HIT
+        ),
+        RAGEvaluationMetric.DOCUMENT_RECALL_MULTI_HIT: partial(
+            DocumentRecallEvaluator, mode=RecallMode.MULTI_HIT
+        ),
         RAGEvaluationMetric.SEMANTIC_ANSWER_SIMILARITY: partial(
             SASEvaluator, model="sentence-transformers/all-MiniLM-L6-v2"
         ),
         RAGEvaluationMetric.ANSWER_FAITHFULNESS: FaithfulnessEvaluator,
+        RAGEvaluationMetric.CONTEXT_RELEVANCE: ContextRelevanceEvaluator,
     }
 
     for metric in metrics:
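For reference, the newly wired ContextRelevanceEvaluator can also be exercised on its own, outside the harness. The following is a minimal sketch of the stock Haystack component, assuming its LLM-backed default (which reads OPENAI_API_KEY from the environment); the question and context strings are invented for illustration:

import os

from haystack.components.evaluators import ContextRelevanceEvaluator

# Assumption: the default evaluator is LLM-backed and needs an OpenAI key.
os.environ.setdefault("OPENAI_API_KEY", "sk-...")  # placeholder, not a real key

evaluator = ContextRelevanceEvaluator()
# One entry in `questions` per query; one list of context strings per question.
result = evaluator.run(
    questions=["When was the Eiffel Tower built?"],
    contexts=[["The Eiffel Tower was completed in 1889.", "Paris is in France."]],
)
print(result["score"])  # aggregate relevance score over all questions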
13 changes: 11 additions & 2 deletions haystack_experimental/evaluation/harness/rag/harness.py
@@ -266,6 +266,12 @@ def _map_rag_eval_pipeline_io(self) -> Dict[str, List[str]]:
                     "replies",
                 ),
             },
+            RAGEvaluationMetric.CONTEXT_RELEVANCE: {
+                "contexts": (
+                    RAGExpectedComponent.DOCUMENT_RETRIEVER,
+                    "retrieved_documents",
+                ),
+            },
         }
 
         outputs_to_inputs: Dict[str, List[str]] = {}
@@ -344,6 +350,11 @@ def _prepare_eval_pipeline_additional_inputs(
                 eval_inputs[metric.value] = {
                     "ground_truth_documents": inputs.ground_truth_documents
                 }
+            elif metric in (
+                RAGEvaluationMetric.ANSWER_FAITHFULNESS,
+                RAGEvaluationMetric.CONTEXT_RELEVANCE,
+            ):
+                eval_inputs[metric.value] = {"questions": inputs.queries}
             elif metric == RAGEvaluationMetric.SEMANTIC_ANSWER_SIMILARITY:
                 if inputs.ground_truth_answers is None:
                     raise ValueError(
@@ -357,8 +368,6 @@
                 eval_inputs[metric.value] = {
                     "ground_truth_answers": inputs.ground_truth_answers
                 }
-            elif metric == RAGEvaluationMetric.ANSWER_FAITHFULNESS:
-                eval_inputs[metric.value] = {"questions": inputs.queries}
 
         return eval_inputs
 
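Taken together, these two hunks (a) feed the retriever's retrieved_documents output into the evaluator's contexts input, and (b) collapse ANSWER_FAITHFULNESS and CONTEXT_RELEVANCE into a single branch, since both take only the queries as their questions input. A toy sketch of what that branch produces, assuming the names below are importable from the parameters module shown in this commit and that the input dataclass's other fields default to None:

from haystack_experimental.evaluation.harness.rag.parameters import (
    RAGEvaluationInput,
    RAGEvaluationMetric,
)

# Assumption: only `queries` is required to construct the input dataclass.
inputs = RAGEvaluationInput(queries=["When was the Eiffel Tower built?"])

# Mirrors the new elif branch in _prepare_eval_pipeline_additional_inputs.
eval_inputs = {}
for metric in (
    RAGEvaluationMetric.ANSWER_FAITHFULNESS,
    RAGEvaluationMetric.CONTEXT_RELEVANCE,
):
    eval_inputs[metric.value] = {"questions": inputs.queries}

assert eval_inputs["metric_context_relevance"] == {
    "questions": ["When was the Eiffel Tower built?"]
}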
3 changes: 3 additions & 0 deletions haystack_experimental/evaluation/harness/rag/parameters.py
@@ -74,6 +74,9 @@ class RAGEvaluationMetric(Enum):
     #: Answer Faithfulness.
     ANSWER_FAITHFULNESS = "metric_answer_faithfulness"
 
+    #: Context Relevance.
+    CONTEXT_RELEVANCE = "metric_context_relevance"
+
 
 @dataclass(frozen=True)
 class RAGEvaluationInput:
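A quick look at the new member in use; the diff suggests its string value also names the evaluator's inputs in the generated eval pipeline (see eval_inputs[metric.value] above). The metric set here is an invented example:

from haystack_experimental.evaluation.harness.rag.parameters import RAGEvaluationMetric

print(RAGEvaluationMetric.CONTEXT_RELEVANCE.value)  # -> "metric_context_relevance"

# Hypothetical metric selection for a harness run:
metrics = {
    RAGEvaluationMetric.DOCUMENT_MAP,
    RAGEvaluationMetric.CONTEXT_RELEVANCE,
}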
(diff for the fourth changed file not shown)
