refactor: Rename identifiers for clarification (#30)
* refactor: Rename the faithfulness metric identifier to match the underlying evaluator component; rename `RAGEvaluationInput.additional_rag_inputs` to `rag_pipeline_inputs`

* build: Exclude unrelated GHA code from lints
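For reference, the renamed metric identifier in use — a minimal sketch assuming the harness API exercised in the tests below; the import path, the empty placeholder pipeline, and the `run()` call shape are assumptions, not taken verbatim from this commit:

```python
from haystack import Pipeline
# Import path assumed; the classes are defined under
# haystack_experimental/evaluation/harness/rag/ in this repository.
from haystack_experimental.evaluation.harness.rag import (
    RAGEvaluationHarness,
    RAGEvaluationInput,
    RAGEvaluationMetric,
)

# Placeholder: a real keyword-retriever RAG pipeline goes here.
rag_pipeline = Pipeline()

harness = RAGEvaluationHarness.default_with_keyword_retriever(
    rag_pipeline,
    metrics={
        RAGEvaluationMetric.FAITHFULNESS,       # previously RAGEvaluationMetric.ANSWER_FAITHFULNESS
        RAGEvaluationMetric.CONTEXT_RELEVANCE,
    },
)

inputs = RAGEvaluationInput(queries=["What is the capital of France?"])
# run() signature assumed from the tests (output.inputs / output.results.run_name).
output = harness.run(inputs=inputs, run_name="example_run")
```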
shadeMe authored Jul 4, 2024
1 parent 97b93bf commit 9973f3b
Showing 5 changed files with 58 additions and 58 deletions.
@@ -44,8 +44,12 @@ def default_rag_evaluation_pipeline(
RAGEvaluationMetric.SEMANTIC_ANSWER_SIMILARITY: partial(
SASEvaluator, model="sentence-transformers/all-MiniLM-L6-v2"
),
RAGEvaluationMetric.ANSWER_FAITHFULNESS: partial(FaithfulnessEvaluator, raise_on_failure=False),
RAGEvaluationMetric.CONTEXT_RELEVANCE: partial(ContextRelevanceEvaluator, raise_on_failure=False),
RAGEvaluationMetric.FAITHFULNESS: partial(
FaithfulnessEvaluator, raise_on_failure=False
),
RAGEvaluationMetric.CONTEXT_RELEVANCE: partial(
ContextRelevanceEvaluator, raise_on_failure=False
),
}

for metric in metrics:
10 changes: 5 additions & 5 deletions haystack_experimental/evaluation/harness/rag/harness.py
@@ -264,7 +264,7 @@ def _map_rag_eval_pipeline_io(self) -> Dict[str, List[str]]:
"replies",
)
},
RAGEvaluationMetric.ANSWER_FAITHFULNESS: {
RAGEvaluationMetric.FAITHFULNESS: {
"contexts": (
RAGExpectedComponent.DOCUMENT_RETRIEVER,
"retrieved_documents",
@@ -307,9 +307,9 @@ def _prepare_rag_pipeline_inputs(
RAGExpectedComponent.QUERY_PROCESSOR
].input_mapping["query"]

if inputs.additional_rag_inputs is not None:
if inputs.rag_pipeline_inputs is not None:
# Ensure that the query embedder input is not provided as additional input.
existing = inputs.additional_rag_inputs.get(query_embedder_name)
existing = inputs.rag_pipeline_inputs.get(query_embedder_name)
if existing is not None:
existing = existing.get(query_embedder_text_input) # type: ignore
if existing is not None:
@@ -318,7 +318,7 @@
)

# Add the queries as an aggregate input.
rag_inputs = deepcopy(inputs.additional_rag_inputs)
rag_inputs = deepcopy(inputs.rag_pipeline_inputs)
if query_embedder_name not in rag_inputs:
rag_inputs[query_embedder_name] = {}
rag_inputs[query_embedder_name][query_embedder_text_input] = deepcopy(
@@ -359,7 +359,7 @@ def _prepare_eval_pipeline_additional_inputs(
"ground_truth_documents": inputs.ground_truth_documents
}
elif metric in (
RAGEvaluationMetric.ANSWER_FAITHFULNESS,
RAGEvaluationMetric.FAITHFULNESS,
RAGEvaluationMetric.CONTEXT_RELEVANCE,
):
eval_inputs[metric.value] = {"questions": inputs.queries}
16 changes: 6 additions & 10 deletions haystack_experimental/evaluation/harness/rag/parameters.py
@@ -71,8 +71,8 @@ class RAGEvaluationMetric(Enum):
#: Semantic Answer Similarity.
SEMANTIC_ANSWER_SIMILARITY = "metric_sas"

#: Answer Faithfulness.
ANSWER_FAITHFULNESS = "metric_answer_faithfulness"
#: Faithfulness.
FAITHFULNESS = "metric_faithfulness"

#: Context Relevance.
CONTEXT_RELEVANCE = "metric_context_relevance"
@@ -88,16 +88,12 @@ class RAGEvaluationInput:
:param ground_truth_documents:
The ground truth documents passed to the
evaluation pipeline. Only required for metrics
that require them.
Corresponds to the queries.
that require them. Corresponds to the queries.
:param ground_truth_answers:
The ground truth answers passed to the
evaluation pipeline. Only required for metrics
that require them.
Corresponds to the queries.
:param additional_rag_inputs:
that require them. Corresponds to the queries.
:param rag_pipeline_inputs:
Additional inputs to pass to the RAG pipeline. Each
key is the name of the component and its value a dictionary
with the input name and a list of values, each corresponding
@@ -107,7 +103,7 @@
queries: List[str]
ground_truth_documents: Optional[List[List[Document]]] = None
ground_truth_answers: Optional[List[str]] = None
additional_rag_inputs: Optional[Dict[str, Dict[str, List[Any]]]] = None
rag_pipeline_inputs: Optional[Dict[str, Dict[str, List[Any]]]] = None


@dataclass(frozen=True)
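The docstring above describes the shape of the renamed field: each key is a RAG pipeline component name, mapped to a dictionary of that component's input names, each holding one value per query. A minimal illustrative sketch — the component and input names here are hypothetical, not from this commit:

```python
# previously `additional_rag_inputs`
rag_pipeline_inputs = {
    "prompt_builder": {  # hypothetical component name
        # one entry per query in RAGEvaluationInput.queries
        "template_variables": [{"language": "English"}, {"language": "French"}],
    },
}
```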
4 changes: 2 additions & 2 deletions pyproject.toml
@@ -87,7 +87,7 @@ tag-pattern = 'v(?P<version>.*)'
allow-direct-references = true

[tool.hatch.build.targets.sdist]
include = ["/haystack_experimental", "/VERSION.txt"]
include = ["/haystack_experimental"]

[tool.hatch.build.targets.wheel]
packages = ["haystack_experimental"]
@@ -166,7 +166,7 @@ ignore_missing_imports = true
[tool.ruff]
line-length = 120
target-version = "py38"
exclude = ["test"]
exclude = ["test", ".github"]

[tool.ruff.lint]
select = [
78 changes: 39 additions & 39 deletions test/evaluation/harness/rag/test_harness.py
@@ -18,9 +18,13 @@
from haystack.components.builders import PromptBuilder
from haystack.components.evaluators import (
ContextRelevanceEvaluator,
DocumentMAPEvaluator,
DocumentMRREvaluator,
DocumentRecallEvaluator,
FaithfulnessEvaluator,
SASEvaluator,
)
from haystack.components.evaluators.document_recall import RecallMode
from haystack.components.retrievers.in_memory import (
InMemoryEmbeddingRetriever,
InMemoryBM25Retriever,
Expand Down Expand Up @@ -100,15 +104,23 @@ def run(self, query: str) -> Dict[str, Any]:


@component
class MockModelBasedEvaluator:
class MockEvaluator:
def __init__(self, metric: RAGEvaluationMetric) -> None:
self.metric = metric

io_map = {
RAGEvaluationMetric.DOCUMENT_MAP: DocumentMAPEvaluator(),
RAGEvaluationMetric.DOCUMENT_MRR: DocumentMRREvaluator(),
RAGEvaluationMetric.DOCUMENT_RECALL_SINGLE_HIT: DocumentRecallEvaluator(
mode=RecallMode.SINGLE_HIT
),
RAGEvaluationMetric.DOCUMENT_RECALL_MULTI_HIT: DocumentRecallEvaluator(
mode=RecallMode.MULTI_HIT
),
RAGEvaluationMetric.SEMANTIC_ANSWER_SIMILARITY: SASEvaluator(
"sentence-transformers/all-MiniLM-L6-v2"
),
RAGEvaluationMetric.ANSWER_FAITHFULNESS: FaithfulnessEvaluator(
RAGEvaluationMetric.FAITHFULNESS: FaithfulnessEvaluator(
api_key=Secret.from_token("test_key")
),
RAGEvaluationMetric.CONTEXT_RELEVANCE: ContextRelevanceEvaluator(
@@ -121,7 +133,13 @@ def __init__(self, metric: RAGEvaluationMetric) -> None:

@staticmethod
def default_output(metric) -> Dict[str, Any]:
if metric == RAGEvaluationMetric.ANSWER_FAITHFULNESS:
if metric in (
RAGEvaluationMetric.FAITHFULNESS,
RAGEvaluationMetric.DOCUMENT_MAP,
RAGEvaluationMetric.DOCUMENT_MRR,
RAGEvaluationMetric.DOCUMENT_RECALL_SINGLE_HIT,
RAGEvaluationMetric.DOCUMENT_RECALL_MULTI_HIT,
):
return {
"individual_scores": [1] * 6,
"score": 1.0,
@@ -496,9 +514,7 @@ def test_run_invalid_additional_input(
ground_truth_documents=[
[Document(content="Paris is the capital of France.")]
],
additional_rag_inputs={
"query_embedder": {"text": ["Some other question?"]}
},
rag_pipeline_inputs={"query_embedder": {"text": ["Some other question?"]}},
)

with pytest.raises(
@@ -552,20 +568,27 @@ def test_run_invalid_override(
)

def test_run_statistical_metrics(self):
metrics = {
RAGEvaluationMetric.DOCUMENT_MAP,
RAGEvaluationMetric.DOCUMENT_MRR,
RAGEvaluationMetric.DOCUMENT_RECALL_SINGLE_HIT,
RAGEvaluationMetric.DOCUMENT_RECALL_MULTI_HIT,
}
harness = RAGEvaluationHarness.default_with_keyword_retriever(
build_rag_pipeline_with_keyword_retriever(
retriever_component=MockKeywordRetriever(),
generator_component=MockGenerator(arg=0),
generator_name="generator",
),
metrics={
RAGEvaluationMetric.DOCUMENT_MAP,
RAGEvaluationMetric.DOCUMENT_MRR,
RAGEvaluationMetric.DOCUMENT_RECALL_SINGLE_HIT,
RAGEvaluationMetric.DOCUMENT_RECALL_MULTI_HIT,
},
metrics=metrics,
)

mock_eval_pipeline = Pipeline()
for m in metrics:
mock_eval_pipeline.add_component(m.value, MockEvaluator(metric=m))

harness.evaluation_pipeline = mock_eval_pipeline

inputs = RAGEvaluationInput(
queries=["What is the capital of France?"] * 6,
ground_truth_documents=[
@@ -591,22 +614,7 @@
assert output.inputs == inputs
assert output.results.run_name == "test_run"
assert output.results.results == {
"metric_doc_map": {
"score": 0.7222222222222222,
"individual_scores": [1.0, 0.8333333333333333, 1.0, 0.5, 0.0, 1.0],
},
"metric_doc_recall_single": {
"score": 0.8333333333333334,
"individual_scores": [1.0, 1.0, 1.0, 1.0, 0.0, 1.0],
},
"metric_doc_recall_multi": {
"score": 0.75,
"individual_scores": [1.0, 1.0, 0.5, 1.0, 0.0, 1.0],
},
"metric_doc_mrr": {
"score": 0.75,
"individual_scores": [1.0, 1.0, 1.0, 0.5, 0.0, 1.0],
},
m.value: MockEvaluator.default_output(m) for m in metrics
}
overriden_pipeline_dict = Pipeline.loads(output.evaluated_pipeline).to_dict()
assert (
@@ -618,7 +626,7 @@ def test_run_model_based_metrics(self, monkeypatch):
monkeypatch.setenv("OPENAI_API_KEY", "test")

metrics = {
RAGEvaluationMetric.ANSWER_FAITHFULNESS,
RAGEvaluationMetric.FAITHFULNESS,
RAGEvaluationMetric.CONTEXT_RELEVANCE,
RAGEvaluationMetric.SEMANTIC_ANSWER_SIMILARITY,
}
@@ -633,7 +641,7 @@

mock_eval_pipeline = Pipeline()
for m in metrics:
mock_eval_pipeline.add_component(m.value, MockModelBasedEvaluator(metric=m))
mock_eval_pipeline.add_component(m.value, MockEvaluator(metric=m))

harness.evaluation_pipeline = mock_eval_pipeline

@@ -721,13 +729,5 @@ def test_run_model_based_metrics(self, monkeypatch):
],
}
assert output.results.results == {
"metric_answer_faithfulness": MockModelBasedEvaluator.default_output(
RAGEvaluationMetric.ANSWER_FAITHFULNESS
),
"metric_context_relevance": MockModelBasedEvaluator.default_output(
RAGEvaluationMetric.CONTEXT_RELEVANCE
),
"metric_sas": MockModelBasedEvaluator.default_output(
RAGEvaluationMetric.SEMANTIC_ANSWER_SIMILARITY
),
m.value: MockEvaluator.default_output(m) for m in metrics
}
