diff --git a/docs/pydoc/config/evaluators_api.yml b/docs/pydoc/config/evaluators_api.yml
index 8c6ffe1556..9a8460b94a 100644
--- a/docs/pydoc/config/evaluators_api.yml
+++ b/docs/pydoc/config/evaluators_api.yml
@@ -7,7 +7,7 @@ loaders:
         "context_relevance",
         "document_map",
         "document_mrr",
-        "document_recall",
+        "document_ndcg",
+        "document_recall",
         "faithfulness",
         "llm_evaluator",
diff --git a/haystack/components/evaluators/__init__.py b/haystack/components/evaluators/__init__.py
index 69983a1108..d57da5c1f2 100644
--- a/haystack/components/evaluators/__init__.py
+++ b/haystack/components/evaluators/__init__.py
@@ -6,6 +6,7 @@
 from .context_relevance import ContextRelevanceEvaluator
 from .document_map import DocumentMAPEvaluator
 from .document_mrr import DocumentMRREvaluator
+from .document_ndcg import DocumentNDCGEvaluator
 from .document_recall import DocumentRecallEvaluator
 from .faithfulness import FaithfulnessEvaluator
 from .llm_evaluator import LLMEvaluator
@@ -16,6 +17,7 @@
     "ContextRelevanceEvaluator",
     "DocumentMAPEvaluator",
     "DocumentMRREvaluator",
+    "DocumentNDCGEvaluator",
     "DocumentRecallEvaluator",
     "FaithfulnessEvaluator",
     "LLMEvaluator",
diff --git a/haystack/components/evaluators/document_ndcg.py b/haystack/components/evaluators/document_ndcg.py
new file mode 100644
index 0000000000..e3430f1db7
--- /dev/null
+++ b/haystack/components/evaluators/document_ndcg.py
@@ -0,0 +1,133 @@
+# SPDX-FileCopyrightText: 2022-present deepset GmbH
+#
+# SPDX-License-Identifier: Apache-2.0
+
+from math import log2
+from typing import Any, Dict, List
+
+from haystack import Document, component
+
+
+@component
+class DocumentNDCGEvaluator:
+    """
+    Evaluator that calculates the normalized discounted cumulative gain (NDCG) of retrieved documents.
+
+    Each question can have multiple ground truth documents and multiple retrieved documents.
+    If the ground truth documents have relevance scores, the NDCG calculation uses these scores.
+    Otherwise, it assumes binary relevance of all ground truth documents.
+
+    Usage example:
+    ```python
+    from haystack import Document
+    from haystack.components.evaluators import DocumentNDCGEvaluator
+
+    evaluator = DocumentNDCGEvaluator()
+    result = evaluator.run(
+        ground_truth_documents=[[Document(content="France", score=1.0), Document(content="Paris", score=0.5)]],
+        retrieved_documents=[[Document(content="France"), Document(content="Germany"), Document(content="Paris")]],
+    )
+    print(result["individual_scores"])
+    # [0.9502]
+    print(result["score"])
+    # 0.9502
+    ```
+    """
+
+    @component.output_types(score=float, individual_scores=List[float])
+    def run(
+        self, ground_truth_documents: List[List[Document]], retrieved_documents: List[List[Document]]
+    ) -> Dict[str, Any]:
+        """
+        Run the DocumentNDCGEvaluator on the given inputs.
+
+        `ground_truth_documents` and `retrieved_documents` must have the same length.
+        The list items within `ground_truth_documents` and `retrieved_documents` can differ in length.
+
+        :param ground_truth_documents:
+            Lists of expected documents, one list per question. Binary relevance is used if documents have no scores.
+        :param retrieved_documents:
+            Lists of retrieved documents, one list per question.
+        :returns:
+            A dictionary with the following outputs:
+            - `score` - The average of calculated scores.
+            - `individual_scores` - A list of numbers from 0.0 to 1.0 that represents the NDCG for each question.
+        """
+        self.validate_inputs(ground_truth_documents, retrieved_documents)
+
+        individual_scores = []
+
+        for gt_docs, ret_docs in zip(ground_truth_documents, retrieved_documents):
+            dcg = self.calculate_dcg(gt_docs, ret_docs)
+            idcg = self.calculate_idcg(gt_docs)
+            ndcg = dcg / idcg if idcg > 0 else 0
+            individual_scores.append(ndcg)
+
+        score = sum(individual_scores) / len(ground_truth_documents)
+
+        return {"score": score, "individual_scores": individual_scores}
+
+    @staticmethod
+    def validate_inputs(gt_docs: List[List[Document]], ret_docs: List[List[Document]]):
+        """
+        Validate the input parameters.
+
+        :param gt_docs:
+            The ground_truth_documents to validate.
+        :param ret_docs:
+            The retrieved_documents to validate.
+
+        :raises ValueError:
+            If ground_truth_documents or retrieved_documents is an empty list.
+            If the length of ground_truth_documents and retrieved_documents differs.
+            If any list of documents in ground_truth_documents contains a mix of documents with and without a score.
+        """
+        if len(gt_docs) == 0 or len(ret_docs) == 0:
+            msg = "ground_truth_documents and retrieved_documents must be provided."
+            raise ValueError(msg)
+
+        if len(gt_docs) != len(ret_docs):
+            msg = "The length of ground_truth_documents and retrieved_documents must be the same."
+            raise ValueError(msg)
+
+        for docs in gt_docs:
+            if any(doc.score is not None for doc in docs) and any(doc.score is None for doc in docs):
+                msg = "Either none or all documents in each list of ground_truth_documents must have a score."
+                raise ValueError(msg)
+
+    @staticmethod
+    def calculate_dcg(gt_docs: List[Document], ret_docs: List[Document]) -> float:
+        """
+        Calculate the discounted cumulative gain (DCG) of the retrieved documents.
+
+        :param gt_docs:
+            The ground truth documents.
+        :param ret_docs:
+            The retrieved documents.
+        :returns:
+            The discounted cumulative gain (DCG) of the retrieved
+            documents based on the ground truth documents.
+        """
+        dcg = 0.0
+        relevant_id_to_score = {doc.id: doc.score if doc.score is not None else 1 for doc in gt_docs}
+        for i, doc in enumerate(ret_docs):
+            if doc.id in relevant_id_to_score:  # TODO Related to https://github.com/deepset-ai/haystack/issues/8412
+                dcg += relevant_id_to_score[doc.id] / log2(i + 2)  # i + 2 because i is 0-indexed
+        return dcg
+
+    @staticmethod
+    def calculate_idcg(gt_docs: List[Document]) -> float:
+        """
+        Calculate the ideal discounted cumulative gain (IDCG) of the ground truth documents.
+
+        :param gt_docs:
+            The ground truth documents.
+        :returns:
+            The ideal discounted cumulative gain (IDCG) of the ground truth documents.
+        """
+        idcg = 0.0
+        for i, doc in enumerate(sorted(gt_docs, key=lambda x: x.score if x.score is not None else 1, reverse=True)):
+            # If the document has a score, use it; otherwise, use 1 for binary relevance.
+            relevance = doc.score if doc.score is not None else 1
+            idcg += relevance / log2(i + 2)  # i + 2 because i is 0-indexed
+        return idcg
diff --git a/releasenotes/notes/document-ndcg-evaluator-d579f51dd76ae76a.yaml b/releasenotes/notes/document-ndcg-evaluator-d579f51dd76ae76a.yaml
new file mode 100644
index 0000000000..eae5d32079
--- /dev/null
+++ b/releasenotes/notes/document-ndcg-evaluator-d579f51dd76ae76a.yaml
@@ -0,0 +1,4 @@
+---
+features:
+  - |
+    Added a new component, DocumentNDCGEvaluator, which is similar to DocumentMRREvaluator and useful for retrieval evaluation. It calculates the normalized discounted cumulative gain (NDCG), an evaluation metric for cases where there are multiple relevant ground truth documents and the order in which they are retrieved is important.
diff --git a/test/components/evaluators/test_document_ndcg.py b/test/components/evaluators/test_document_ndcg.py
new file mode 100644
index 0000000000..3924855f72
--- /dev/null
+++ b/test/components/evaluators/test_document_ndcg.py
@@ -0,0 +1,202 @@
+# SPDX-FileCopyrightText: 2022-present deepset GmbH
+#
+# SPDX-License-Identifier: Apache-2.0
+import pytest
+
+from haystack import Document
+from haystack.components.evaluators.document_ndcg import DocumentNDCGEvaluator
+
+
+def test_run_with_scores():
+    evaluator = DocumentNDCGEvaluator()
+    result = evaluator.run(
+        ground_truth_documents=[
+            [
+                Document(content="doc1", score=3),
+                Document(content="doc2", score=2),
+                Document(content="doc3", score=3),
+                Document(content="doc6", score=2),
+                Document(content="doc7", score=3),
+                Document(content="doc8", score=2),
+            ]
+        ],
+        retrieved_documents=[
+            [
+                Document(content="doc1"),
+                Document(content="doc2"),
+                Document(content="doc3"),
+                Document(content="doc4"),
+                Document(content="doc5"),
+            ]
+        ],
+    )
+    assert result["individual_scores"][0] == pytest.approx(0.6592, abs=1e-4)
+    assert result["score"] == pytest.approx(0.6592, abs=1e-4)
+
+
+def test_run_without_scores():
+    evaluator = DocumentNDCGEvaluator()
+    result = evaluator.run(
+        ground_truth_documents=[[Document(content="France"), Document(content="Paris")]],
+        retrieved_documents=[[Document(content="France"), Document(content="Germany"), Document(content="Paris")]],
+    )
+    assert result["individual_scores"][0] == pytest.approx(0.9197, abs=1e-4)
+    assert result["score"] == pytest.approx(0.9197, abs=1e-4)
+
+
+def test_run_with_multiple_lists_of_docs():
+    evaluator = DocumentNDCGEvaluator()
+    result = evaluator.run(
+        ground_truth_documents=[
+            [Document(content="France"), Document(content="Paris")],
+            [
+                Document(content="doc1", score=3),
+                Document(content="doc2", score=2),
+                Document(content="doc3", score=3),
+                Document(content="doc6", score=2),
+                Document(content="doc7", score=3),
+                Document(content="doc8", score=2),
+            ],
+        ],
+        retrieved_documents=[
+            [Document(content="France"), Document(content="Germany"), Document(content="Paris")],
+            [
+                Document(content="doc1"),
+                Document(content="doc2"),
+                Document(content="doc3"),
+                Document(content="doc4"),
+                Document(content="doc5"),
+            ],
+        ],
+    )
+    assert result["individual_scores"][0] == pytest.approx(0.9197, abs=1e-4)
+    assert result["individual_scores"][1] == pytest.approx(0.6592, abs=1e-4)
+    assert result["score"] == pytest.approx(0.7895, abs=1e-4)
+
+
+def test_run_with_different_lengths():
+    evaluator = DocumentNDCGEvaluator()
+    with pytest.raises(ValueError):
+        evaluator.run(
+            ground_truth_documents=[[Document(content="Berlin")]],
+            retrieved_documents=[[Document(content="Berlin")], [Document(content="London")]],
+        )
+    with pytest.raises(ValueError):
+        evaluator.run(
+            ground_truth_documents=[[Document(content="Berlin")], [Document(content="Paris")]],
+            retrieved_documents=[[Document(content="Berlin")]],
+        )
+
+
+def test_run_with_mixed_documents_with_and_without_scores():
+    evaluator = DocumentNDCGEvaluator()
+    with pytest.raises(ValueError):
+        evaluator.run(
+            ground_truth_documents=[[Document(content="France", score=3), Document(content="Paris")]],
+            retrieved_documents=[[Document(content="France"), Document(content="Germany"), Document(content="Paris")]],
+        )
+
+
+def test_run_empty_retrieved():
+    evaluator = DocumentNDCGEvaluator()
+    result = evaluator.run(ground_truth_documents=[[Document(content="France")]], retrieved_documents=[[]])
+    assert result["individual_scores"] == [0.0]
+    assert result["score"] == 0.0
+
+
+def test_run_empty_ground_truth():
+    evaluator = DocumentNDCGEvaluator()
+    result = evaluator.run(ground_truth_documents=[[]], retrieved_documents=[[Document(content="France")]])
+    assert result["individual_scores"] == [0.0]
+    assert result["score"] == 0.0
+
+
+def test_run_empty_retrieved_and_empty_ground_truth():
+    evaluator = DocumentNDCGEvaluator()
+    result = evaluator.run(ground_truth_documents=[[]], retrieved_documents=[[]])
+    assert result["individual_scores"] == [0.0]
+    assert result["score"] == 0.0
+
+
+def test_run_no_retrieved():
+    evaluator = DocumentNDCGEvaluator()
+    with pytest.raises(ValueError):
+        evaluator.run(ground_truth_documents=[[Document(content="France")]], retrieved_documents=[])
+
+
+def test_run_no_ground_truth():
+    evaluator = DocumentNDCGEvaluator()
+    with pytest.raises(ValueError):
+        evaluator.run(ground_truth_documents=[], retrieved_documents=[[Document(content="France")]])
+
+
+def test_run_no_retrieved_and_no_ground_truth():
+    evaluator = DocumentNDCGEvaluator()
+    with pytest.raises(ValueError):
+        evaluator.run(ground_truth_documents=[], retrieved_documents=[])
+
+
+def test_calculate_dcg_with_scores():
+    evaluator = DocumentNDCGEvaluator()
+    gt_docs = [
+        Document(content="doc1", score=3),
+        Document(content="doc2", score=2),
+        Document(content="doc3", score=3),
+        Document(content="doc4", score=0),
+        Document(content="doc5", score=1),
+        Document(content="doc6", score=2),
+    ]
+    ret_docs = [
+        Document(content="doc1"),
+        Document(content="doc2"),
+        Document(content="doc3"),
+        Document(content="doc4"),
+        Document(content="doc5"),
+        Document(content="doc6"),
+    ]
+    dcg = evaluator.calculate_dcg(gt_docs, ret_docs)
+    assert dcg == pytest.approx(6.8611, abs=1e-4)
+
+
+def test_calculate_dcg_without_scores():
+    evaluator = DocumentNDCGEvaluator()
+    gt_docs = [Document(content="doc1"), Document(content="doc2")]
+    ret_docs = [Document(content="doc2"), Document(content="doc3"), Document(content="doc1")]
+    dcg = evaluator.calculate_dcg(gt_docs, ret_docs)
+    assert dcg == pytest.approx(1.5, abs=1e-4)
+
+
+def test_calculate_dcg_empty():
+    evaluator = DocumentNDCGEvaluator()
+    gt_docs = [Document(content="doc1")]
+    ret_docs = []
+    dcg = evaluator.calculate_dcg(gt_docs, ret_docs)
+    assert dcg == 0
+
+
+def test_calculate_idcg_with_scores():
+    evaluator = DocumentNDCGEvaluator()
+    gt_docs = [
+        Document(content="doc1", score=3),
+        Document(content="doc2", score=3),
+        Document(content="doc3", score=2),
+        Document(content="doc4", score=3),
+        Document(content="doc5", score=2),
+        Document(content="doc6", score=2),
+    ]
+    idcg = evaluator.calculate_idcg(gt_docs)
+    assert idcg == pytest.approx(8.7403, abs=1e-4)
+
+
+def test_calculate_idcg_without_scores():
+    evaluator = DocumentNDCGEvaluator()
+    gt_docs = [Document(content="doc1"), Document(content="doc2"), Document(content="doc3")]
+    idcg = evaluator.calculate_idcg(gt_docs)
+    assert idcg == pytest.approx(2.1309, abs=1e-4)
+
+
+def test_calculate_idcg_empty():
+    evaluator = DocumentNDCGEvaluator()
+    gt_docs = []
+    idcg = evaluator.calculate_idcg(gt_docs)
+    assert idcg == 0
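
For reference, the expected NDCG values in the usage example and tests can be reproduced by hand. The snippet below is a minimal standalone sketch of the DCG/IDCG arithmetic the component implements (relevance defaults to 1 when ground truth documents carry no score); the `ndcg` helper and its dictionary-based inputs are illustrative only and not part of this PR.

```python
from math import log2


def ndcg(ground_truth: dict, retrieved: list) -> float:
    """Minimal NDCG sketch: keys are relevant doc contents, values are relevance scores."""
    # DCG: relevance of each retrieved doc, discounted by log2(rank + 1) with 1-based ranks.
    dcg = sum(ground_truth.get(doc, 0) / log2(i + 2) for i, doc in enumerate(retrieved))
    # IDCG: best possible DCG, i.e. all relevant docs ranked by descending relevance.
    idcg = sum(rel / log2(i + 2) for i, rel in enumerate(sorted(ground_truth.values(), reverse=True)))
    return dcg / idcg if idcg > 0 else 0.0


# Docstring example: France (1.0) at rank 1, Paris (0.5) at rank 3 -> ~0.9502
print(ndcg({"France": 1.0, "Paris": 0.5}, ["France", "Germany", "Paris"]))

# test_run_without_scores: binary relevance -> ~0.9197
print(ndcg({"France": 1, "Paris": 1}, ["France", "Germany", "Paris"]))

# test_run_with_scores: graded relevance, doc6-doc8 never retrieved -> ~0.6592
print(ndcg({"doc1": 3, "doc2": 2, "doc3": 3, "doc6": 2, "doc7": 3, "doc8": 2}, ["doc1", "doc2", "doc3", "doc4", "doc5"]))
```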