feat: Add ground truth documents and answers to RAG eval run results as inputs #17

Merged
36 changes: 22 additions & 14 deletions haystack_experimental/evaluation/harness/rag/harness.py
@@ -131,23 +131,31 @@ def run(  # noqa: D102
             pipeline_outputs["second"],
         )
 
+        result_inputs = {
+            "questions": inputs.queries,
+            "contexts": [
+                [doc.content for doc in docs]
+                for docs in self._lookup_component_output(
+                    RAGExpectedComponent.DOCUMENT_RETRIEVER,
+                    rag_outputs,
+                    "retrieved_documents",
+                )
+            ],
+            "responses": self._lookup_component_output(
+                RAGExpectedComponent.RESPONSE_GENERATOR, rag_outputs, "replies"
+            ),
+        }
+        if inputs.ground_truth_answers is not None:
+            result_inputs["ground_truth_answers"] = inputs.ground_truth_answers
+        if inputs.ground_truth_documents is not None:
+            result_inputs["ground_truth_documents"] = [
+                [doc.content for doc in docs] for docs in inputs.ground_truth_documents
+            ]
+
         assert run_name is not None
         run_results = EvaluationRunResult(
             run_name,
-            inputs={
-                "questions": inputs.queries,
-                "contexts": [
-                    [doc.content for doc in docs]
-                    for docs in self._lookup_component_output(
-                        RAGExpectedComponent.DOCUMENT_RETRIEVER,
-                        rag_outputs,
-                        "retrieved_documents",
-                    )
-                ],
-                "responses": self._lookup_component_output(
-                    RAGExpectedComponent.RESPONSE_GENERATOR, rag_outputs, "replies"
-                ),
-            },
+            inputs=result_inputs,
             results=eval_outputs,
         )

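For reviewers, the net effect of this hunk: the run result's inputs always carry questions, contexts, and responses, and additionally carry ground-truth answers and documents whenever the caller supplied them, with documents flattened to their content strings. Below is a minimal, self-contained sketch of that conditional logic; the EvalInput and Doc dataclasses are hypothetical stand-ins that only mirror the attributes the diff relies on (queries, ground_truth_answers, ground_truth_documents, doc.content), not the real harness types.

from dataclasses import dataclass
from typing import List, Optional


@dataclass
class Doc:
    # Hypothetical stand-in for a retrieved/ground-truth document; only `content` is used.
    content: str


@dataclass
class EvalInput:
    # Hypothetical stand-in for the harness input object referenced in the diff.
    queries: List[str]
    ground_truth_answers: Optional[List[str]] = None
    ground_truth_documents: Optional[List[List[Doc]]] = None


def build_result_inputs(inputs: EvalInput, contexts: List[List[str]], responses: List[str]) -> dict:
    # Same shape as the dict passed to EvaluationRunResult in the diff above.
    result_inputs = {
        "questions": inputs.queries,
        "contexts": contexts,
        "responses": responses,
    }
    # Ground-truth fields are optional and only included when the caller provided them.
    if inputs.ground_truth_answers is not None:
        result_inputs["ground_truth_answers"] = inputs.ground_truth_answers
    if inputs.ground_truth_documents is not None:
        # Flatten documents to their text content, mirroring the harness.
        result_inputs["ground_truth_documents"] = [
            [doc.content for doc in docs] for docs in inputs.ground_truth_documents
        ]
    return result_inputs


example = EvalInput(
    queries=["What is the capital of France?"],
    ground_truth_answers=["Paris is the capital of France."],
    ground_truth_documents=[[Doc(content="France")]],
)
print(build_result_inputs(example, contexts=[["France"]], responses=["placeholder"]))

Running the sketch with one query prints a dict containing all five keys; dropping the two ground-truth arguments yields only questions, contexts, and responses.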
56 changes: 56 additions & 0 deletions test/evaluation/harness/rag/test_harness.py
@@ -664,6 +664,62 @@ def test_run_model_based_metrics(self, monkeypatch):
 
         assert output.inputs == inputs
         assert output.results.run_name == "test_run"
+        assert output.results.inputs == {
+            "questions": ["What is the capital of France?"] * 6,
+            "contexts": [
+                ["France"],
+                [
+                    "9th century",
+                    "10th century",
+                    "9th",
+                ],
+                [
+                    "classical",
+                    "rock music",
+                    "dubstep",
+                ],
+                [
+                    "11th",
+                    "the 11th",
+                    "11th century",
+                ],
+                [
+                    "Denmark",
+                    "Norway",
+                    "Iceland",
+                ],
+                [
+                    "10th century",
+                    "the first half of the 10th century",
+                    "10th",
+                    "10th",
+                ],
+            ],
+            "responses": [
+                "placeholder",
+                "placeholder",
+                "placeholder",
+                "placeholder",
+                "placeholder",
+                "placeholder",
+            ],
+            "ground_truth_documents": [
+                ["France"],
+                ["9th century", "9th"],
+                ["classical music", "classical"],
+                ["11th century", "the 11th"],
+                ["Denmark, Iceland and Norway"],
+                ["10th century", "10th"],
+            ],
+            "ground_truth_answers": [
+                "Paris is the capital of France.",
+                "9th century",
+                "classical music",
+                "11th century",
+                "Denmark, Iceland and Norway",
+                "10th century",
+            ],
+        }
         assert output.results.results == {
             "metric_answer_faithfulness": MockModelBasedEvaluator.default_output(
                 RAGEvaluationMetric.ANSWER_FAITHFULNESS
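The expected inputs dict above keeps every column aligned per query (six questions, six context lists, six responses, six ground-truth entries), so downstream code can zip predictions against references. A hypothetical inspection snippet, assuming only that a run result exposes the same inputs dict asserted in this test:

# Hypothetical post-run inspection; `run_inputs` stands in for `output.results.inputs`
# as asserted in the test above.
run_inputs = {
    "questions": ["What is the capital of France?"],
    "responses": ["placeholder"],
    "ground_truth_answers": ["Paris is the capital of France."],
}

for question, response, reference in zip(
    run_inputs["questions"],
    run_inputs["responses"],
    run_inputs.get("ground_truth_answers", []),
):
    print(f"Q: {question}\n  predicted: {response}\n  reference: {reference}")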