feat: Add ContextRelevanceEvaluator to RAG eval harness
shadeMe committed Jun 18, 2024
1 parent 3193cbd commit 6d76d21
Showing 4 changed files with 139 additions and 5 deletions.
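The new metric plugs into the existing harness API exercised by the test added in this commit. A minimal usage sketch based on that test, assuming the same imports the test file uses and a user-supplied RAG pipeline (build_rag_pipeline() is a placeholder, and running the LLM-based evaluator needs a configured LLM backend such as an OpenAI API key):

from haystack_experimental.evaluation.harness.rag import (
    RAGEvaluationHarness,
    RAGEvaluationInput,
    RAGEvaluationMetric,
)

# Placeholder: any RAG pipeline whose retriever and generator match the
# components the harness expects.
rag_pipeline = build_rag_pipeline()

harness = RAGEvaluationHarness.default_with_keyword_retriever(
    rag_pipeline,
    metrics={RAGEvaluationMetric.CONTEXT_RELEVANCE},
)

inputs = RAGEvaluationInput(queries=["What is the capital of France?"])
output = harness.run(inputs, run_name="context_relevance_demo")

# Results are keyed by the metric's enum value, as asserted in the new test below.
print(output.results.results["metric_context_relevance"])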
@@ -7,6 +7,7 @@

from haystack import Pipeline
from haystack.components.evaluators import (
ContextRelevanceEvaluator,
DocumentMAPEvaluator,
DocumentMRREvaluator,
DocumentRecallEvaluator,
@@ -34,12 +35,17 @@ def default_rag_evaluation_pipeline(
metric_ctors = {
RAGEvaluationMetric.DOCUMENT_MAP: DocumentMAPEvaluator,
RAGEvaluationMetric.DOCUMENT_MRR: DocumentMRREvaluator,
RAGEvaluationMetric.DOCUMENT_RECALL_SINGLE_HIT: partial(DocumentRecallEvaluator, mode=RecallMode.SINGLE_HIT),
RAGEvaluationMetric.DOCUMENT_RECALL_MULTI_HIT: partial(DocumentRecallEvaluator, mode=RecallMode.MULTI_HIT),
RAGEvaluationMetric.DOCUMENT_RECALL_SINGLE_HIT: partial(
DocumentRecallEvaluator, mode=RecallMode.SINGLE_HIT
),
RAGEvaluationMetric.DOCUMENT_RECALL_MULTI_HIT: partial(
DocumentRecallEvaluator, mode=RecallMode.MULTI_HIT
),
RAGEvaluationMetric.SEMANTIC_ANSWER_SIMILARITY: partial(
SASEvaluator, model="sentence-transformers/all-MiniLM-L6-v2"
),
RAGEvaluationMetric.ANSWER_FAITHFULNESS: FaithfulnessEvaluator,
RAGEvaluationMetric.CONTEXT_RELEVANCE: ContextRelevanceEvaluator,
}

for metric in metrics:
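The hunk above cuts off inside the builder, but the pattern is visible: metrics map to evaluator constructors, and functools.partial pre-binds constructor arguments so every selected metric can be instantiated with a bare call. A small illustrative sketch of that pattern in isolation (the names here are invented, not the file's actual continuation):

from functools import partial


class RecallEvaluator:
    def __init__(self, mode: str = "single_hit") -> None:
        self.mode = mode


# Plain constructors and pre-configured ones sit side by side in one map ...
ctors = {
    "recall_single_hit": partial(RecallEvaluator, mode="single_hit"),
    "recall_multi_hit": partial(RecallEvaluator, mode="multi_hit"),
}

# ... so the builder can instantiate whichever metrics were requested uniformly.
evaluators = {name: ctor() for name, ctor in ctors.items()}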
13 changes: 11 additions & 2 deletions haystack_experimental/evaluation/harness/rag/harness.py
@@ -266,6 +266,12 @@ def _map_rag_eval_pipeline_io(self) -> Dict[str, List[str]]:
"replies",
),
},
RAGEvaluationMetric.CONTEXT_RELEVANCE: {
"contexts": (
RAGExpectedComponent.DOCUMENT_RETRIEVER,
"retrieved_documents",
),
},
}

outputs_to_inputs: Dict[str, List[str]] = {}
@@ -344,6 +350,11 @@ def _prepare_eval_pipeline_additional_inputs(
eval_inputs[metric.value] = {
"ground_truth_documents": inputs.ground_truth_documents
}
elif metric in (
RAGEvaluationMetric.ANSWER_FAITHFULNESS,
RAGEvaluationMetric.CONTEXT_RELEVANCE,
):
eval_inputs[metric.value] = {"questions": inputs.queries}
elif metric == RAGEvaluationMetric.SEMANTIC_ANSWER_SIMILARITY:
if inputs.ground_truth_answers is None:
raise ValueError(
@@ -357,8 +368,6 @@
eval_inputs[metric.value] = {
"ground_truth_answers": inputs.ground_truth_answers
}
elif metric == RAGEvaluationMetric.ANSWER_FAITHFULNESS:
eval_inputs[metric.value] = {"questions": inputs.queries}

return eval_inputs

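The mapping above feeds the evaluator's "contexts" input from the retriever's "retrieved_documents" output, while "questions" comes from the harness inputs' queries (same as the faithfulness metric). For reference, this is roughly equivalent to calling the evaluator on its own; a sketch assuming an LLM backend (e.g. an OpenAI API key) is configured:

from haystack.components.evaluators import ContextRelevanceEvaluator

evaluator = ContextRelevanceEvaluator()  # LLM-backed under the hood

result = evaluator.run(
    questions=["What is the capital of France?"],
    contexts=[["Paris is the capital and largest city of France."]],
)

# The output shape is what the harness (and the mock evaluator in the tests
# below) expects: an aggregate score plus per-question results.
print(result["score"], result["individual_scores"])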
3 changes: 3 additions & 0 deletions haystack_experimental/evaluation/harness/rag/parameters.py
@@ -74,6 +74,9 @@ class RAGEvaluationMetric(Enum):
#: Answer Faithfulness.
ANSWER_FAITHFULNESS = "metric_answer_faithfulness"

#: Context Relevance.
CONTEXT_RELEVANCE = "metric_context_relevance"


@dataclass(frozen=True)
class RAGEvaluationInput:
118 changes: 117 additions & 1 deletion test/evaluation/harness/rag/test_harness.py
@@ -4,7 +4,6 @@
from typing import Any, Dict, List, Optional
import pytest

import random
from haystack_experimental.evaluation.harness.rag import (
RAGEvaluationHarness,
RAGExpectedComponent,
@@ -95,6 +94,71 @@ def run(self, query: str) -> Dict[str, Any]:
return {"documents": samples[idx]}


@component
class MockLLMEvaluator:
def __init__(self, metric: RAGEvaluationMetric) -> None:
self.metric = metric
if metric == RAGEvaluationMetric.ANSWER_FAITHFULNESS:
component.set_input_types(
self,
**dict(
[
("questions", List[str]),
("contexts", List[List[str]]),
("predicted_answers", List[str]),
]
),
)

component.set_output_types(
self,
**dict(
[
("individual_scores", List[int]),
("score", float),
("results", List[Dict[str, Any]]),
]
),
)
elif metric == RAGEvaluationMetric.CONTEXT_RELEVANCE:
component.set_input_types(
self,
**dict(
[
("questions", List[str]),
("contexts", List[List[str]]),
]
),
)

component.set_output_types(
self,
**dict(
[
("individual_scores", List[int]),
("score", float),
("results", List[Dict[str, Any]]),
]
),
)
else:
raise ValueError(f"Invalid metric: {metric}")

@staticmethod
def default_output() -> Dict[str, Any]:
return {
"individual_scores": [1] * 6,
"score": 1.0,
"results": [
{"statements": ["placeholder"], "statement_scores": [1.0], "score": 1.0}
]
* 6,
}

def run(self, **kwargs) -> Dict[str, Any]:
return self.default_output()


def build_rag_pipeline_with_query_embedder(
embedder_name: str = "text_embedder",
embedder_component: Optional[Any] = None,
@@ -558,3 +622,55 @@ def test_rag_eval_harness_run_statistical_metrics():
overriden_pipeline_dict["components"]["generator"]["init_parameters"]["arg"]
== 100
)


def test_rag_eval_harness_run_llm_metrics():
harness = RAGEvaluationHarness.default_with_keyword_retriever(
build_rag_pipeline_with_keyword_retriever(
retriever_component=MockKeywordRetriever(),
generator_component=MockGenerator(arg=0),
generator_name="generator",
),
metrics={
RAGEvaluationMetric.ANSWER_FAITHFULNESS,
RAGEvaluationMetric.CONTEXT_RELEVANCE,
},
)

mock_eval_pipeline = Pipeline()
for m in (
RAGEvaluationMetric.ANSWER_FAITHFULNESS,
RAGEvaluationMetric.CONTEXT_RELEVANCE,
):
mock_eval_pipeline.add_component(m.value, MockLLMEvaluator(metric=m))

harness.evaluation_pipeline = mock_eval_pipeline

inputs = RAGEvaluationInput(
queries=["What is the capital of France?"] * 6,
ground_truth_documents=[
[Document(content="France")],
[Document(content="9th century"), Document(content="9th")],
[Document(content="classical music"), Document(content="classical")],
[Document(content="11th century"), Document(content="the 11th")],
[Document(content="Denmark, Iceland and Norway")],
[Document(content="10th century"), Document(content="10th")],
],
)

output = harness.run(
inputs,
overrides=RAGEvaluationOverrides(
rag_pipeline={
"generator": {"arg": 100},
}
),
run_name="test_run",
)

assert output.inputs == inputs
assert output.results.run_name == "test_run"
assert output.results.results == {
"metric_answer_faithfulness": MockLLMEvaluator.default_output(),
"metric_context_relevance": MockLLMEvaluator.default_output(),
}
