refactor: Rename identifiers for clarification (#30)
* refactor: Rename the faithfulness metric identifier to match the underlying evaluator component; rename `RAGEvaluationInput.additional_rag_inputs` to `rag_pipeline_inputs`

* build: Exclude unrelated GHA code from lints
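For reference, the renamed metric identifier in use — a minimal sketch assuming the harness API exercised in the tests below; the import path, the empty placeholder pipeline, and the `run()` call shape are assumptions, not taken verbatim from this commit:

```python
from haystack import Pipeline
# Import path assumed; the classes are defined under
# haystack_experimental/evaluation/harness/rag/ in this repository.
from haystack_experimental.evaluation.harness.rag import (
    RAGEvaluationHarness,
    RAGEvaluationInput,
    RAGEvaluationMetric,
)

# Placeholder: a real keyword-retriever RAG pipeline goes here.
rag_pipeline = Pipeline()

harness = RAGEvaluationHarness.default_with_keyword_retriever(
    rag_pipeline,
    metrics={
        RAGEvaluationMetric.FAITHFULNESS,       # previously RAGEvaluationMetric.ANSWER_FAITHFULNESS
        RAGEvaluationMetric.CONTEXT_RELEVANCE,
    },
)

inputs = RAGEvaluationInput(queries=["What is the capital of France?"])
# run() signature assumed from the tests (output.inputs / output.results.run_name).
output = harness.run(inputs=inputs, run_name="example_run")
```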
shadeMe authored Jul 4, 2024
1 parent 97b93bf commit 9973f3b
Showing 5 changed files with 58 additions and 58 deletions.
@@ -44,8 +44,12 @@ def default_rag_evaluation_pipeline(
RAGEvaluationMetric.SEMANTIC_ANSWER_SIMILARITY: partial(
SASEvaluator, model="sentence-transformers/all-MiniLM-L6-v2"
),
RAGEvaluationMetric.ANSWER_FAITHFULNESS: partial(FaithfulnessEvaluator, raise_on_failure=False),
RAGEvaluationMetric.CONTEXT_RELEVANCE: partial(ContextRelevanceEvaluator, raise_on_failure=False),
RAGEvaluationMetric.FAITHFULNESS: partial(
FaithfulnessEvaluator, raise_on_failure=False
),
RAGEvaluationMetric.CONTEXT_RELEVANCE: partial(
ContextRelevanceEvaluator, raise_on_failure=False
),
}

for metric in metrics:
10 changes: 5 additions & 5 deletions haystack_experimental/evaluation/harness/rag/harness.py
@@ -264,7 +264,7 @@ def _map_rag_eval_pipeline_io(self) -> Dict[str, List[str]]:
"replies",
)
},
RAGEvaluationMetric.ANSWER_FAITHFULNESS: {
RAGEvaluationMetric.FAITHFULNESS: {
"contexts": (
RAGExpectedComponent.DOCUMENT_RETRIEVER,
"retrieved_documents",
@@ -307,9 +307,9 @@ def _prepare_rag_pipeline_inputs(
RAGExpectedComponent.QUERY_PROCESSOR
].input_mapping["query"]

if inputs.additional_rag_inputs is not None:
if inputs.rag_pipeline_inputs is not None:
# Ensure that the query embedder input is not provided as additional input.
existing = inputs.additional_rag_inputs.get(query_embedder_name)
existing = inputs.rag_pipeline_inputs.get(query_embedder_name)
if existing is not None:
existing = existing.get(query_embedder_text_input) # type: ignore
if existing is not None:
@@ -318,7 +318,7 @@
)

# Add the queries as an aggregate input.
rag_inputs = deepcopy(inputs.additional_rag_inputs)
rag_inputs = deepcopy(inputs.rag_pipeline_inputs)
if query_embedder_name not in rag_inputs:
rag_inputs[query_embedder_name] = {}
rag_inputs[query_embedder_name][query_embedder_text_input] = deepcopy(
@@ -359,7 +359,7 @@ def _prepare_eval_pipeline_additional_inputs(
"ground_truth_documents": inputs.ground_truth_documents
}
elif metric in (
RAGEvaluationMetric.ANSWER_FAITHFULNESS,
RAGEvaluationMetric.FAITHFULNESS,
RAGEvaluationMetric.CONTEXT_RELEVANCE,
):
eval_inputs[metric.value] = {"questions": inputs.queries}
16 changes: 6 additions & 10 deletions haystack_experimental/evaluation/harness/rag/parameters.py
@@ -71,8 +71,8 @@ class RAGEvaluationMetric(Enum):
#: Semantic Answer Similarity.
SEMANTIC_ANSWER_SIMILARITY = "metric_sas"

#: Answer Faithfulness.
ANSWER_FAITHFULNESS = "metric_answer_faithfulness"
#: Faithfulness.
FAITHFULNESS = "metric_faithfulness"

#: Context Relevance.
CONTEXT_RELEVANCE = "metric_context_relevance"
@@ -88,16 +88,12 @@ class RAGEvaluationInput:
:param ground_truth_documents:
The ground truth documents passed to the
evaluation pipeline. Only required for metrics
that require them.
Corresponds to the queries.
that require them. Corresponds to the queries.
:param ground_truth_answers:
The ground truth answers passed to the
evaluation pipeline. Only required for metrics
that require them.
Corresponds to the queries.
:param additional_rag_inputs:
that require them. Corresponds to the queries.
:param rag_pipeline_inputs:
Additional inputs to pass to the RAG pipeline. Each
key is the name of the component and its value a dictionary
with the input name and a list of values, each corresponding
@@ -107,7 +103,7 @@
queries: List[str]
ground_truth_documents: Optional[List[List[Document]]] = None
ground_truth_answers: Optional[List[str]] = None
additional_rag_inputs: Optional[Dict[str, Dict[str, List[Any]]]] = None
rag_pipeline_inputs: Optional[Dict[str, Dict[str, List[Any]]]] = None


@dataclass(frozen=True)
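The docstring above describes the shape of the renamed field: each key is a RAG pipeline component name, mapped to a dictionary of that component's input names, each holding one value per query. A minimal illustrative sketch — the component and input names here are hypothetical, not from this commit:

```python
# previously `additional_rag_inputs`
rag_pipeline_inputs = {
    "prompt_builder": {  # hypothetical component name
        # one entry per query in RAGEvaluationInput.queries
        "template_variables": [{"language": "English"}, {"language": "French"}],
    },
}
```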
4 changes: 2 additions & 2 deletions pyproject.toml
@@ -87,7 +87,7 @@ tag-pattern = 'v(?P<version>.*)'
allow-direct-references = true

[tool.hatch.build.targets.sdist]
include = ["/haystack_experimental", "/VERSION.txt"]
include = ["/haystack_experimental"]

[tool.hatch.build.targets.wheel]
packages = ["haystack_experimental"]
@@ -166,7 +166,7 @@ ignore_missing_imports = true
[tool.ruff]
line-length = 120
target-version = "py38"
exclude = ["test"]
exclude = ["test", ".github"]

[tool.ruff.lint]
select = [
78 changes: 39 additions & 39 deletions test/evaluation/harness/rag/test_harness.py
@@ -18,9 +18,13 @@
from haystack.components.builders import PromptBuilder
from haystack.components.evaluators import (
ContextRelevanceEvaluator,
DocumentMAPEvaluator,
DocumentMRREvaluator,
DocumentRecallEvaluator,
FaithfulnessEvaluator,
SASEvaluator,
)
from haystack.components.evaluators.document_recall import RecallMode
from haystack.components.retrievers.in_memory import (
InMemoryEmbeddingRetriever,
InMemoryBM25Retriever,
Expand Down Expand Up @@ -100,15 +104,23 @@ def run(self, query: str) -> Dict[str, Any]:


@component
class MockModelBasedEvaluator:
class MockEvaluator:
def __init__(self, metric: RAGEvaluationMetric) -> None:
self.metric = metric

io_map = {
RAGEvaluationMetric.DOCUMENT_MAP: DocumentMAPEvaluator(),
RAGEvaluationMetric.DOCUMENT_MRR: DocumentMRREvaluator(),
RAGEvaluationMetric.DOCUMENT_RECALL_SINGLE_HIT: DocumentRecallEvaluator(
mode=RecallMode.SINGLE_HIT
),
RAGEvaluationMetric.DOCUMENT_RECALL_MULTI_HIT: DocumentRecallEvaluator(
mode=RecallMode.MULTI_HIT
),
RAGEvaluationMetric.SEMANTIC_ANSWER_SIMILARITY: SASEvaluator(
"sentence-transformers/all-MiniLM-L6-v2"
),
RAGEvaluationMetric.ANSWER_FAITHFULNESS: FaithfulnessEvaluator(
RAGEvaluationMetric.FAITHFULNESS: FaithfulnessEvaluator(
api_key=Secret.from_token("test_key")
),
RAGEvaluationMetric.CONTEXT_RELEVANCE: ContextRelevanceEvaluator(
@@ -121,7 +133,13 @@ def __init__(self, metric: RAGEvaluationMetric) -> None:

@staticmethod
def default_output(metric) -> Dict[str, Any]:
if metric == RAGEvaluationMetric.ANSWER_FAITHFULNESS:
if metric in (
RAGEvaluationMetric.FAITHFULNESS,
RAGEvaluationMetric.DOCUMENT_MAP,
RAGEvaluationMetric.DOCUMENT_MRR,
RAGEvaluationMetric.DOCUMENT_RECALL_SINGLE_HIT,
RAGEvaluationMetric.DOCUMENT_RECALL_MULTI_HIT,
):
return {
"individual_scores": [1] * 6,
"score": 1.0,
@@ -496,9 +514,7 @@ def test_run_invalid_additional_input(
ground_truth_documents=[
[Document(content="Paris is the capital of France.")]
],
additional_rag_inputs={
"query_embedder": {"text": ["Some other question?"]}
},
rag_pipeline_inputs={"query_embedder": {"text": ["Some other question?"]}},
)

with pytest.raises(
@@ -552,20 +568,27 @@ def test_run_invalid_override(
)

def test_run_statistical_metrics(self):
metrics = {
RAGEvaluationMetric.DOCUMENT_MAP,
RAGEvaluationMetric.DOCUMENT_MRR,
RAGEvaluationMetric.DOCUMENT_RECALL_SINGLE_HIT,
RAGEvaluationMetric.DOCUMENT_RECALL_MULTI_HIT,
}
harness = RAGEvaluationHarness.default_with_keyword_retriever(
build_rag_pipeline_with_keyword_retriever(
retriever_component=MockKeywordRetriever(),
generator_component=MockGenerator(arg=0),
generator_name="generator",
),
metrics={
RAGEvaluationMetric.DOCUMENT_MAP,
RAGEvaluationMetric.DOCUMENT_MRR,
RAGEvaluationMetric.DOCUMENT_RECALL_SINGLE_HIT,
RAGEvaluationMetric.DOCUMENT_RECALL_MULTI_HIT,
},
metrics=metrics,
)

mock_eval_pipeline = Pipeline()
for m in metrics:
mock_eval_pipeline.add_component(m.value, MockEvaluator(metric=m))

harness.evaluation_pipeline = mock_eval_pipeline

inputs = RAGEvaluationInput(
queries=["What is the capital of France?"] * 6,
ground_truth_documents=[
@@ -591,22 +614,7 @@
assert output.inputs == inputs
assert output.results.run_name == "test_run"
assert output.results.results == {
"metric_doc_map": {
"score": 0.7222222222222222,
"individual_scores": [1.0, 0.8333333333333333, 1.0, 0.5, 0.0, 1.0],
},
"metric_doc_recall_single": {
"score": 0.8333333333333334,
"individual_scores": [1.0, 1.0, 1.0, 1.0, 0.0, 1.0],
},
"metric_doc_recall_multi": {
"score": 0.75,
"individual_scores": [1.0, 1.0, 0.5, 1.0, 0.0, 1.0],
},
"metric_doc_mrr": {
"score": 0.75,
"individual_scores": [1.0, 1.0, 1.0, 0.5, 0.0, 1.0],
},
m.value: MockEvaluator.default_output(m) for m in metrics
}
overriden_pipeline_dict = Pipeline.loads(output.evaluated_pipeline).to_dict()
assert (
@@ -618,7 +626,7 @@ def test_run_model_based_metrics(self, monkeypatch):
monkeypatch.setenv("OPENAI_API_KEY", "test")

metrics = {
RAGEvaluationMetric.ANSWER_FAITHFULNESS,
RAGEvaluationMetric.FAITHFULNESS,
RAGEvaluationMetric.CONTEXT_RELEVANCE,
RAGEvaluationMetric.SEMANTIC_ANSWER_SIMILARITY,
}
@@ -633,7 +641,7 @@

mock_eval_pipeline = Pipeline()
for m in metrics:
mock_eval_pipeline.add_component(m.value, MockModelBasedEvaluator(metric=m))
mock_eval_pipeline.add_component(m.value, MockEvaluator(metric=m))

harness.evaluation_pipeline = mock_eval_pipeline

@@ -721,13 +729,5 @@ def test_run_model_based_metrics(self, monkeypatch):
],
}
assert output.results.results == {
"metric_answer_faithfulness": MockModelBasedEvaluator.default_output(
RAGEvaluationMetric.ANSWER_FAITHFULNESS
),
"metric_context_relevance": MockModelBasedEvaluator.default_output(
RAGEvaluationMetric.CONTEXT_RELEVANCE
),
"metric_sas": MockModelBasedEvaluator.default_output(
RAGEvaluationMetric.SEMANTIC_ANSWER_SIMILARITY
),
m.value: MockEvaluator.default_output(m) for m in metrics
}
