Skip to content

Commit

Permalink
feat: Implement RAGEvaluationHarness and related classes
Browse files Browse the repository at this point in the history
  • Loading branch information
shadeMe committed May 29, 2024
1 parent 80d80cb commit 9d176a2
Show file tree
Hide file tree
Showing 11 changed files with 1,155 additions and 13 deletions.
4 changes: 2 additions & 2 deletions haystack_experimental/evaluation/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,6 @@
#
# SPDX-License-Identifier: Apache-2.0

from .harness import EvalRunOverrides, EvaluationHarness
from .harness import EvaluationHarness, EvaluationRunOverrides

_all_ = ["EvaluationHarness", "EvalRunOverrides"]
_all_ = ["EvaluationHarness", "EvaluationRunOverrides"]
4 changes: 2 additions & 2 deletions haystack_experimental/evaluation/harness/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,6 @@
#
# SPDX-License-Identifier: Apache-2.0

from .evalution_harness import EvalRunOverrides, EvaluationHarness
from .evalution_harness import EvaluationHarness, EvaluationRunOverrides

_all_ = ["EvaluationHarness", "EvalRunOverrides"]
_all_ = ["EvaluationHarness", "EvaluationRunOverrides"]
15 changes: 8 additions & 7 deletions haystack_experimental/evaluation/harness/evalution_harness.py
Original file line number Diff line number Diff line change
Expand Up @@ -8,11 +8,10 @@

from haystack import Pipeline
from haystack.core.serialization import DeserializationCallbacks
from haystack.evaluation.eval_run_result import BaseEvaluationRunResult


@dataclass
class EvalRunOverrides:
class EvaluationRunOverrides:
"""
Overrides for an evaluation run.
Expand All @@ -32,7 +31,7 @@ class EvalRunOverrides:


EvalRunInputT = TypeVar("EvalRunInputT")
EvalRunOutputT = TypeVar("EvalRunOutputT", bound=BaseEvaluationRunResult)
EvalRunOutputT = TypeVar("EvalRunOutputT")
EvalRunOverridesT = TypeVar("EvalRunOverridesT")


Expand All @@ -43,9 +42,7 @@ class EvaluationHarness(ABC, Generic[EvalRunInputT, EvalRunOverridesT, EvalRunOu

@staticmethod
def _override_pipeline(pipeline: Pipeline, parameter_overrides: Optional[Dict[str, Any]]) -> Pipeline:
def component_pre_init_callback(
name: str, cls: Type, init_params: Dict[str, Any]
): # pylint: disable=unused-argument
def component_pre_init_callback(name: str, cls: Type, init_params: Dict[str, Any]): # pylint: disable=unused-argument
assert parameter_overrides is not None
overrides = parameter_overrides.get(name)
if overrides:
Expand All @@ -70,7 +67,11 @@ def validate_overrides():

@abstractmethod
def run(
self, inputs: EvalRunInputT, *, overrides: Optional[EvalRunOverridesT] = None, run_name: Optional[str] = None
self,
inputs: EvalRunInputT,
*,
overrides: Optional[EvalRunOverridesT] = None,
run_name: Optional[str] = None,
) -> EvalRunOutputT:
"""
Launch a evaluation run.
Expand Down
23 changes: 23 additions & 0 deletions haystack_experimental/evaluation/harness/rag/__init__.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,23 @@
# SPDX-FileCopyrightText: 2022-present deepset GmbH <[email protected]>
#
# SPDX-License-Identifier: Apache-2.0

from .harness import RAGEvaluationHarness
from .parameters import (
    RAGEvaluationInput,
    RAGEvaluationMetric,
    RAGEvaluationOutput,
    RAGEvaluationOverrides,
    RAGExpectedComponent,
    RAGExpectedComponentMetadata,
)

# Fix: the original used `_all_`, which is just an ordinary (ignored) module
# attribute. Only the dunder `__all__` is recognized by Python as the module's
# public-API declaration and controls `from ... import *`.
__all__ = [
    "RAGEvaluationHarness",
    "RAGExpectedComponent",
    "RAGExpectedComponentMetadata",
    "RAGEvaluationMetric",
    "RAGEvaluationOutput",
    "RAGEvaluationOverrides",
    "RAGEvaluationInput",
]
Original file line number Diff line number Diff line change
@@ -0,0 +1,49 @@
# SPDX-FileCopyrightText: 2022-present deepset GmbH <[email protected]>
#
# SPDX-License-Identifier: Apache-2.0

from functools import partial
from typing import Set

from haystack import Pipeline
from haystack.components.evaluators import (
DocumentMAPEvaluator,
DocumentMRREvaluator,
DocumentRecallEvaluator,
FaithfulnessEvaluator,
SASEvaluator,
)
from haystack.components.evaluators.document_recall import RecallMode

from .parameters import RAGEvaluationMetric


def default_rag_evaluation_pipeline(
    metrics: Set[RAGEvaluationMetric],
) -> Pipeline:
    """
    Builds the default evaluation pipeline for RAG.

    :param metrics:
        The set of metrics to include in the pipeline.
    :returns:
        The evaluation pipeline, with one evaluator component
        per requested metric (named after the metric's value).
    """
    # Zero-argument factory for each supported metric's evaluator component.
    # `partial` pre-binds the evaluators that need non-default configuration.
    evaluator_factories = {
        RAGEvaluationMetric.DOCUMENT_MAP: DocumentMAPEvaluator,
        RAGEvaluationMetric.DOCUMENT_MRR: DocumentMRREvaluator,
        RAGEvaluationMetric.DOCUMENT_RECALL_SINGLE_HIT: partial(DocumentRecallEvaluator, mode=RecallMode.SINGLE_HIT),
        RAGEvaluationMetric.DOCUMENT_RECALL_MULTI_HIT: partial(DocumentRecallEvaluator, mode=RecallMode.MULTI_HIT),
        RAGEvaluationMetric.SEMANTIC_ANSWER_SIMILARITY: partial(
            SASEvaluator, model="sentence-transformers/all-MiniLM-L6-v2"
        ),
        RAGEvaluationMetric.ANSWER_FAITHFULNESS: FaithfulnessEvaluator,
    }

    eval_pipeline = Pipeline()
    for requested in metrics:
        # Component name is the metric's enum value; instantiate via its factory.
        eval_pipeline.add_component(requested.value, evaluator_factories[requested]())

    return eval_pipeline
Loading

0 comments on commit 9d176a2

Please sign in to comment.