feat: Add DocumentNDCGEvaluator component #8419

Merged
9 commits merged on Oct 1, 2024
2 changes: 1 addition & 1 deletion docs/pydoc/config/evaluators_api.yml
@@ -7,7 +7,7 @@ loaders:
"context_relevance",
"document_map",
"document_mrr",
"document_recall",
"document_ndcg",
"document_recall",
"faithfulness",
"llm_evaluator",
2 changes: 2 additions & 0 deletions haystack/components/evaluators/__init__.py
@@ -6,6 +6,7 @@
from .context_relevance import ContextRelevanceEvaluator
from .document_map import DocumentMAPEvaluator
from .document_mrr import DocumentMRREvaluator
from .document_ndcg import DocumentNDCGEvaluator
from .document_recall import DocumentRecallEvaluator
from .faithfulness import FaithfulnessEvaluator
from .llm_evaluator import LLMEvaluator
@@ -16,6 +17,7 @@
"ContextRelevanceEvaluator",
"DocumentMAPEvaluator",
"DocumentMRREvaluator",
"DocumentNDCGEvaluator",
"DocumentRecallEvaluator",
"FaithfulnessEvaluator",
"LLMEvaluator",
94 changes: 94 additions & 0 deletions haystack/components/evaluators/document_ndcg.py
@@ -0,0 +1,94 @@
# SPDX-FileCopyrightText: 2022-present deepset GmbH <[email protected]>
#
# SPDX-License-Identifier: Apache-2.0

from math import log2
from typing import Any, Dict, List

from haystack import Document, component


@component
class DocumentNDCGEvaluator:
"""
Evaluator that calculates the normalized discounted cumulative gain (NDCG) of retrieved documents.

Each question can have multiple ground truth documents and multiple retrieved documents.
If the ground truth documents have relevance scores, the NDCG calculation uses these scores.
Otherwise, it uses the inverse of the document ranks as scores.

Usage example:
```python
from haystack import Document
from haystack.components.evaluators import DocumentNDCGEvaluator

evaluator = DocumentNDCGEvaluator()
result = evaluator.run(
ground_truth_documents=[[Document(content="France"), Document(content="Paris")]],
retrieved_documents=[[Document(content="France"), Document(content="Germany"), Document(content="Paris")]],
)
print(result["individual_scores"])
# [0.8869]
print(result["score"])
# 0.8869
```
"""

@component.output_types(score=float, individual_scores=List[float])
def run(
self, ground_truth_documents: List[List[Document]], retrieved_documents: List[List[Document]]
) -> Dict[str, Any]:
"""
Run the DocumentNDCGEvaluator on the given inputs.

`ground_truth_documents` and `retrieved_documents` must have the same length.

:param ground_truth_documents:
A list of expected documents for each question with relevance scores or sorted by relevance.
Contributor:
In light of above comments, maybe this can also be refined. Currently, it sounds like we are expecting a list of documents.

Member Author:
I updated the docstring. Please let me know if it was better before or not.

:param retrieved_documents:
A list of retrieved documents for each question.
:returns:
A dictionary with the following outputs:
- `score` - The average of calculated scores.
- `individual_scores` - A list of numbers from 0.0 to 1.0 that represents the NDCG for each question.
"""
if len(ground_truth_documents) != len(retrieved_documents):
msg = "The length of ground_truth_documents and retrieved_documents must be the same."
raise ValueError(msg)

for gt_docs in ground_truth_documents:
if any(doc.score is not None for doc in gt_docs) and any(doc.score is None for doc in gt_docs):
msg = "Either none or all documents in each list of ground_truth_documents must have a score."
raise ValueError(msg)

individual_scores = []

for gt_docs, ret_docs in zip(ground_truth_documents, retrieved_documents):
dcg = self._calculate_dcg(gt_docs, ret_docs)
idcg = self._calculate_idcg(gt_docs)
ndcg = dcg / idcg if idcg > 0 else 0
individual_scores.append(ndcg)

score = sum(individual_scores) / len(ground_truth_documents)

return {"score": score, "individual_scores": individual_scores}

def _calculate_dcg(self, gt_docs: List[Document], ret_docs: List[Document]) -> float:
dcg = 0.0
relevant_id_to_score = {doc.id: doc.score for doc in gt_docs}
for i, doc in enumerate(ret_docs):
if doc.id in relevant_id_to_score: # TODO Related to https://github.com/deepset-ai/haystack/issues/8412
# If the gt document has a float score, use it; otherwise, use the inverse of the rank
tstadel (Member), Oct 1, 2024:
Why are we using the inverse of the rank as fallback? Effectively this would double the "rank-discount" of the retrieved document: one by dividing by (i + 1) in line 85 and the other by dividing by log2(i + 2) in line 86.

I guess a better fallback would be to just use the value 1, which would translate into a simple binary relevance schema according to https://en.wikipedia.org/wiki/Discounted_cumulative_gain
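
A standalone sketch (not part of this PR's code) of the double discount described above, assuming three retrieved documents that all appear in the ground truth and carry no scores:

```python
from math import log2

# Three retrieved documents, all present in the ground truth, none with a score.
ranks = range(3)

# Current fallback: relevance = 1 / (rank + 1), discounted again by log2(rank + 2).
dcg_inverse_rank = sum((1 / (i + 1)) / log2(i + 2) for i in ranks)

# Binary-relevance fallback: relevance = 1, discounted once by log2(rank + 2).
dcg_binary = sum(1 / log2(i + 2) for i in ranks)

print(round(dcg_inverse_rank, 4), round(dcg_binary, 4))  # 1.4821 2.1309
```

With the inverse-rank fallback each hit is penalized twice for its position; with binary relevance it is penalized only by the standard log2 discount.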

Member Author (julian-risch):
My idea was that the user can provide the relevant documents as a sorted list without scores. With the current fallback, the retrieved documents get the highest NDCG score only if all relevant documents are retrieved in this particular order.
With a fallback to 1, the order of the relevant documents wouldn't matter anymore. I agree that's then simple binary relevance. Happy to change the fallback to that if users benefit more from that.
@bilgeyucel You wanted to pass a sorted list of documents without scores, right?

Contributor (bilgeyucel):
@julian-risch yes, I'm using the HotpotQA dataset from Hugging Face and it doesn't provide scores.

Member Author (julian-risch):
If we change the fallback to binary relevance, what you could do is calculate scores yourself before passing the documents to the DocumentNDCGEvaluator. For example:

```python
for i, doc in enumerate(docs, 1):
    doc.score = 1 / i
```

That would work for you too, right?

tstadel (Member), Oct 1, 2024:
@julian-risch I can understand the intuition now. Still, I'd probably not make this the default behavior: when supplying ground-truth docs, I wouldn't expect their order to make a difference, tbh.
And if you really need that, you could simply pass scores as you showed in the preceding comment.

Anyways, I think there is an error in the implementation of the intuition. If I got it right, then document relevance should be based on the order of the passed ground-truth docs. In the current implementation it's instead based on the order of the retrieved documents: relevance = 1 / (i + 1), where i is the index of the retrieved doc, not the ground-truth doc.
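
A minimal sketch of that intended behavior (a hypothetical helper, not the code in this PR), where the fallback relevance comes from a document's position in the ground-truth list while the log2 discount still uses its retrieved position:

```python
from math import log2
from typing import List

from haystack import Document


def dcg_with_gt_order_fallback(gt_docs: List[Document], ret_docs: List[Document]) -> float:
    # Hypothetical variant: if a ground-truth document has no score, derive its
    # relevance from its rank in the ground-truth list, not from the rank at
    # which it was retrieved.
    gt_score = {doc.id: doc.score for doc in gt_docs}
    gt_rank = {doc.id: rank for rank, doc in enumerate(gt_docs)}
    dcg = 0.0
    for i, doc in enumerate(ret_docs):
        if doc.id in gt_score:
            relevance = gt_score[doc.id]
            if relevance is None:
                relevance = 1 / (gt_rank[doc.id] + 1)  # ground-truth rank fallback
            dcg += relevance / log2(i + 2)  # discount by retrieved rank
    return dcg
```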

Member Author (julian-risch):
Yes, true. I will change the fallback to binary relevance. 👍

relevance = relevant_id_to_score[doc.id]
if relevance is None:
relevance = 1 / (i + 1)
dcg += relevance / log2(i + 2) # i + 2 because i is 0-indexed
return dcg

def _calculate_idcg(self, gt_docs: List[Document]) -> float:
idcg = 0.0
for i, doc in enumerate(sorted(gt_docs, key=lambda x: x.score if x.score is not None else 1, reverse=True)):
# If the document has a score, use it; otherwise, use the inverse of the rank
relevance = doc.score if doc.score is not None else 1 / (i + 1)
idcg += relevance / log2(i + 2)
return idcg
@@ -0,0 +1,4 @@
---
features:
- |
Added a new component DocumentNDCGEvaluator, which is similar to DocumentMRREvaluator and useful for retrieval evaluation. It calculates the normalized discounted cumulative gain (NDCG), an evaluation metric that matters when there are multiple ground truth relevant documents and the order in which they are retrieved is important.
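
For reference, a standalone sketch (not part of the diff) that reproduces the 0.8869 score from the class docstring by hand, using the inverse-rank fallback for unscored ground-truth documents:

```python
from math import log2

# Ground truth: [France, Paris] (no scores); retrieved: [France, Germany, Paris].
# A retrieved document found in the ground truth falls back to relevance
# 1 / (retrieved rank); documents not in the ground truth contribute nothing.
dcg = (1 / 1) / log2(2) + (1 / 3) / log2(4)   # France at rank 1, Paris at rank 3
idcg = (1 / 1) / log2(2) + (1 / 2) / log2(3)  # the two ground-truth docs in ideal order
print(round(dcg / idcg, 4))  # 0.8869
```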
147 changes: 147 additions & 0 deletions test/components/evaluators/test_document_ndcg.py
@@ -0,0 +1,147 @@
# SPDX-FileCopyrightText: 2022-present deepset GmbH <[email protected]>
#
# SPDX-License-Identifier: Apache-2.0
import pytest

from haystack import Document
from haystack.components.evaluators.document_ndcg import DocumentNDCGEvaluator


def test_run_with_scores():
evaluator = DocumentNDCGEvaluator()
result = evaluator.run(
ground_truth_documents=[
[
Document(content="doc1", score=3),
Document(content="doc2", score=2),
Document(content="doc3", score=3),
Document(content="doc6", score=2),
Document(content="doc7", score=3),
Document(content="doc8", score=2),
]
],
retrieved_documents=[
[
Document(content="doc1"),
Document(content="doc2"),
Document(content="doc3"),
Document(content="doc4"),
Document(content="doc5"),
]
],
)
assert result["individual_scores"][0] == pytest.approx(0.6592, abs=1e-4)
assert result["score"] == pytest.approx(0.6592, abs=1e-4)


def test_run_without_scores():
evaluator = DocumentNDCGEvaluator()
result = evaluator.run(
ground_truth_documents=[[Document(content="France"), Document(content="Paris")]],
retrieved_documents=[[Document(content="France"), Document(content="Germany"), Document(content="Paris")]],
)
assert result["individual_scores"][0] == pytest.approx(0.8869, abs=1e-4)
assert result["score"] == pytest.approx(0.8869, abs=1e-4)


def test_run_with_different_lengths():
evaluator = DocumentNDCGEvaluator()
with pytest.raises(ValueError):
evaluator.run(
ground_truth_documents=[[Document(content="Berlin")]],
retrieved_documents=[[Document(content="Berlin")], [Document(content="London")]],
)
with pytest.raises(ValueError):
evaluator.run(
ground_truth_documents=[[Document(content="Berlin")], [Document(content="Paris")]],
retrieved_documents=[[Document(content="Berlin")]],
)


def test_run_with_mixed_documents_with_and_without_scores():
evaluator = DocumentNDCGEvaluator()
with pytest.raises(ValueError):
evaluator.run(
ground_truth_documents=[[Document(content="France", score=3), Document(content="Paris")]],
retrieved_documents=[[Document(content="France"), Document(content="Germany"), Document(content="Paris")]],
)


def test_run_empty_retrieved():
evaluator = DocumentNDCGEvaluator()
result = evaluator.run(ground_truth_documents=[[Document(content="France")]], retrieved_documents=[[]])
assert result["individual_scores"] == [0.0]
assert result["score"] == 0.0


def test_run_empty_ground_truth():
evaluator = DocumentNDCGEvaluator()
result = evaluator.run(ground_truth_documents=[[]], retrieved_documents=[[Document(content="France")]])
assert result["individual_scores"] == [0.0]
assert result["score"] == 0.0


def test_calculate_dcg_with_scores():
evaluator = DocumentNDCGEvaluator()
gt_docs = [
Document(content="doc1", score=3),
Document(content="doc2", score=2),
Document(content="doc3", score=3),
Document(content="doc4", score=0),
Document(content="doc5", score=1),
Document(content="doc6", score=2),
]
ret_docs = [
Document(content="doc1"),
Document(content="doc2"),
Document(content="doc3"),
Document(content="doc4"),
Document(content="doc5"),
Document(content="doc6"),
]
dcg = evaluator._calculate_dcg(gt_docs, ret_docs)
assert dcg == pytest.approx(6.8611, abs=1e-4)


def test_calculate_dcg_without_scores():
evaluator = DocumentNDCGEvaluator()
gt_docs = [Document(content="doc1"), Document(content="doc2")]
ret_docs = [Document(content="doc2"), Document(content="doc3"), Document(content="doc1")]
dcg = evaluator._calculate_dcg(gt_docs, ret_docs)
assert dcg == pytest.approx(1.1667, abs=1e-4)


def test_calculate_dcg_empty():
evaluator = DocumentNDCGEvaluator()
gt_docs = [Document(content="doc1")]
ret_docs = []
dcg = evaluator._calculate_dcg(gt_docs, ret_docs)
assert dcg == 0


def test_calculate_idcg_with_scores():
evaluator = DocumentNDCGEvaluator()
gt_docs = [
Document(content="doc1", score=3),
Document(content="doc2", score=3),
Document(content="doc3", score=2),
Document(content="doc4", score=3),
Document(content="doc5", score=2),
Document(content="doc6", score=2),
]
idcg = evaluator._calculate_idcg(gt_docs)
assert idcg == pytest.approx(8.7403, abs=1e-4)


def test_calculate_idcg_without_scores():
evaluator = DocumentNDCGEvaluator()
gt_docs = [Document(content="doc1"), Document(content="doc2"), Document(content="doc3")]
idcg = evaluator._calculate_idcg(gt_docs)
assert idcg == pytest.approx(1.4821, abs=1e-4)


def test_calculate_idcg_empty():
evaluator = DocumentNDCGEvaluator()
gt_docs = []
idcg = evaluator._calculate_idcg(gt_docs)
assert idcg == 0