From 20f04f605486e4bcca105f86e1216a75447a3f38 Mon Sep 17 00:00:00 2001 From: Sebastian Husch Lee Date: Tue, 16 Jan 2024 08:52:58 +0100 Subject: [PATCH] feat: MetaFieldRanker update (#6742) * Add weight and ranking_mode as params to run for easier experimentation * renaming of metadata to meta * User logger.warning instead of warnings * Add another unit test * Add support for sort_order and fix formatting of error messages * Make MetaFieldRanker more robust. Doesn't crash pipeline if some Documents are missing keys. * Don't print same warning message twice * Add another test * Making MetaFieldRanker more robust * Move up if return statement to earlier in the function * Setting up infer_type * Remove infer_type for now * Release notes * Add init file * Update releasenotes/notes/metafieldranker_sort-order_refactor-2000d89dc40dc15a.yaml Co-authored-by: Stefano Fiorucci --------- Co-authored-by: Stefano Fiorucci --- haystack/components/rankers/meta_field.py | 201 ++++++++++++------ ..._sort-order_refactor-2000d89dc40dc15a.yaml | 6 + test/components/rankers/__init__.py | 0 test/components/rankers/test_metafield.py | 123 ++++++++--- 4 files changed, 237 insertions(+), 93 deletions(-) create mode 100644 releasenotes/notes/metafieldranker_sort-order_refactor-2000d89dc40dc15a.yaml create mode 100644 test/components/rankers/__init__.py diff --git a/haystack/components/rankers/meta_field.py b/haystack/components/rankers/meta_field.py index 6980d3af1d..57e48995e2 100644 --- a/haystack/components/rankers/meta_field.py +++ b/haystack/components/rankers/meta_field.py @@ -1,9 +1,8 @@ import logging -import warnings from collections import defaultdict from typing import List, Dict, Any, Optional, Literal -from haystack import ComponentError, Document, component, default_to_dict +from haystack import Document, component, default_to_dict logger = logging.getLogger(__name__) @@ -11,7 +10,8 @@ @component class MetaFieldRanker: """ - Ranks Documents based on the value of their specific metadata field. The ranking is done in a descending order. + Ranks Documents based on the value of their specific meta field. + The ranking can be performed in descending order or ascending order. Usage example: ``` @@ -36,45 +36,64 @@ def __init__( weight: float = 1.0, top_k: Optional[int] = None, ranking_mode: Literal["reciprocal_rank_fusion", "linear_score"] = "reciprocal_rank_fusion", + sort_order: Literal["ascending", "descending"] = "descending", ): """ Creates an instance of MetaFieldRanker. - :param meta_field: The name of the metadata field to rank by. + :param meta_field: The name of the meta field to rank by. :param weight: In range [0,1]. - 0 disables ranking by a metadata field. - 0.5 content and metadata fields have the same impact for the ranking. - 1 means ranking by a metadata field only. The highest value comes first. - :param top_k: The maximum number of Documents you want the Ranker to return per query. + 0 disables ranking by a meta field. + 0.5 content and meta fields have the same impact for the ranking. + 1 means ranking by a meta field only. The highest value comes first. + :param top_k: The maximum number of Documents you want the Ranker to return per query. If not provided, the + Ranker returns all documents it receives in the new ranking order. :param ranking_mode: The mode used to combine the Retriever's and Ranker's scores. Possible values are 'reciprocal_rank_fusion' (default) and 'linear_score'. Use the 'score' mode only with Retrievers or Rankers that return a score in range [0,1]. + :param sort_order: Whether to sort the meta field by ascending or descending order. + Possible values are `descending` (default) and `ascending`. """ self.meta_field = meta_field self.weight = weight self.top_k = top_k self.ranking_mode = ranking_mode + self.sort_order = sort_order + self._validate_params( + weight=self.weight, top_k=self.top_k, ranking_mode=self.ranking_mode, sort_order=self.sort_order + ) + + def _validate_params( + self, + weight: float, + top_k: Optional[int], + ranking_mode: Literal["reciprocal_rank_fusion", "linear_score"], + sort_order: Literal["ascending", "descending"], + ): + if top_k is not None and top_k <= 0: + raise ValueError("top_k must be > 0, but got %s" % top_k) + + if weight < 0 or weight > 1: + raise ValueError( + "Parameter must be in range [0,1] but is currently set to '%s'.\n'0' disables sorting by a " + "meta field, '0.5' assigns equal weight to the previous relevance scores and the meta field, and " + "'1' ranks by the meta field only.\nChange the parameter to a value in range 0 to 1 when " + "initializing the MetaFieldRanker." % self.weight + ) - if self.weight < 0 or self.weight > 1: + if ranking_mode not in ["reciprocal_rank_fusion", "linear_score"]: raise ValueError( - """ - Parameter must be in range [0,1] but is currently set to '{}'.\n - '0' disables sorting by a metadata field, '0.5' assigns equal weight to the previous relevance scores and the metadata field, and '1' ranks by the metadata field only.\n - Change the parameter to a value in range 0 to 1 when initializing the MetaFieldRanker. - """.format( - self.weight - ) + "The value of parameter must be 'reciprocal_rank_fusion' or 'linear_score', but is " + "currently set to '%s'.\nChange the value to 'reciprocal_rank_fusion' or " + "'linear_score' when initializing the MetaFieldRanker." % ranking_mode ) - if self.ranking_mode not in ["reciprocal_rank_fusion", "linear_score"]: + if sort_order not in ["ascending", "descending"]: raise ValueError( - """ - The value of parameter must be 'reciprocal_rank_fusion' or 'linear_score', but is currently set to '{}'. \n - Change the value to 'reciprocal_rank_fusion' or 'linear_score' when initializing the MetaFieldRanker. - """.format( - self.ranking_mode - ) + "The value of parameter must be 'ascending' or 'descending', but is currently set to '%s'.\n" + "Change the value to 'ascending' or 'descending' when initializing the " + "MetaFieldRanker." % sort_order ) def to_dict(self) -> Dict[str, Any]: @@ -82,76 +101,128 @@ def to_dict(self) -> Dict[str, Any]: Serialize object to a dictionary. """ return default_to_dict( - self, meta_field=self.meta_field, weight=self.weight, top_k=self.top_k, ranking_mode=self.ranking_mode + self, + meta_field=self.meta_field, + weight=self.weight, + top_k=self.top_k, + ranking_mode=self.ranking_mode, + sort_order=self.sort_order, ) @component.output_types(documents=List[Document]) - def run(self, documents: List[Document], top_k: Optional[int] = None): + def run( + self, + documents: List[Document], + top_k: Optional[int] = None, + weight: Optional[float] = None, + ranking_mode: Optional[Literal["reciprocal_rank_fusion", "linear_score"]] = None, + sort_order: Optional[Literal["ascending", "descending"]] = None, + ): """ - Use this method to rank a list of Documents based on the selected metadata field by: - 1. Sorting the Documents by the metadata field in descending order. - 2. Merging the scores from the metadata field with the scores from the previous component according to the strategy and weight provided. + Use this method to rank a list of Documents based on the selected meta field by: + 1. Sorting the Documents by the meta field in descending or ascending order. + 2. Merging the scores from the meta field with the scores from the previous component according to the strategy and weight provided. 3. Returning the top-k documents. :param documents: Documents to be ranked. - :param top_k: (optional) The number of Documents you want the Ranker to return. If not provided, the Ranker returns all Documents it received. + :param top_k: (optional) The number of Documents you want the Ranker to return. + If not provided, the top_k provided at initialization time is used. + :param weight: (optional) In range [0,1]. + 0 disables ranking by a meta field. + 0.5 content and meta fields have the same impact for the ranking. + 1 means ranking by a meta field only. The highest value comes first. + If not provided, the weight provided at initialization time is used. + :param ranking_mode: (optional) The mode used to combine the Retriever's and Ranker's scores. + Possible values are 'reciprocal_rank_fusion' (default) and 'linear_score'. + Use the 'score' mode only with Retrievers or Rankers that return a score in range [0,1]. + If not provided, the ranking_mode provided at initialization time is used. + :param sort_order: Whether to sort the meta field by ascending or descending order. + Possible values are `descending` (default) and `ascending`. + If not provided, the sort_order provided at initialization time is used. """ if not documents: return {"documents": []} - if top_k is None: - top_k = self.top_k - elif top_k <= 0: - raise ValueError(f"top_k must be > 0, but got {top_k}") + top_k = top_k or self.top_k + weight = weight or self.weight + ranking_mode = ranking_mode or self.ranking_mode + sort_order = sort_order or self.sort_order + self._validate_params(weight=weight, top_k=top_k, ranking_mode=ranking_mode, sort_order=sort_order) + + # If the weight is 0 then ranking by meta field is disabled and the original documents should be returned + if weight == 0: + return {"documents": documents[:top_k]} + + docs_with_meta_field = [doc for doc in documents if self.meta_field in doc.meta] + docs_missing_meta_field = [doc for doc in documents if self.meta_field not in doc.meta] + + # If all docs are missing self.meta_field return original documents + if len(docs_with_meta_field) == 0: + logger.warning( + "The parameter is currently set to '%s', but none of the provided Documents with IDs %s have this meta key.\n" + "Set to the name of a field that is present within the provided Documents.\n" + "Returning the of the original Documents since there are no values to rank.", + self.meta_field, + ",".join([doc.id for doc in documents]), + ) + return {"documents": documents[:top_k]} + + if len(docs_missing_meta_field) > 0: + logger.warning( + "The parameter is currently set to '%s' but the Documents with IDs %s don't have this meta key.\n" + "These Documents will be placed at the end of the sorting order.", + self.meta_field, + ",".join([doc.id for doc in docs_missing_meta_field]), + ) + # Sort the documents by self.meta_field + reverse = sort_order == "descending" try: - sorted_by_metadata = sorted(documents, key=lambda doc: doc.meta[self.meta_field], reverse=True) - except KeyError: - raise ComponentError( - """ - The parameter is currently set to '{}' but the Documents {} don't have this metadata key.\n - Double-check the names of the metadata fields in your documents \n - and set to the name of the field that contains the metadata you want to use for ranking. - """.format( - self.meta_field, ",".join([doc.id for doc in documents if self.meta_field not in doc.meta]) - ) + sorted_by_meta = sorted(docs_with_meta_field, key=lambda doc: doc.meta[self.meta_field], reverse=reverse) + except TypeError as error: + # Return original documents if mixed types that are not comparable are returned (e.g. int and list) + logger.warning( + "Tried to sort Documents with IDs %s, but got TypeError with the message: %s\n" + "Returning the of the original Documents since meta field ranking is not possible.", + ",".join([doc.id for doc in docs_with_meta_field]), + error, ) + return {"documents": documents[:top_k]} - if self.weight > 0: - sorted_documents = self._merge_scores(documents, sorted_by_metadata) - return {"documents": sorted_documents[:top_k]} - else: - return {"documents": sorted_by_metadata[:top_k]} + # Add the docs missing the meta_field back on the end + sorted_documents = sorted_by_meta + docs_missing_meta_field + sorted_documents = self._merge_rankings(documents, sorted_documents) + return {"documents": sorted_documents[:top_k]} - def _merge_scores(self, documents: List[Document], sorted_documents: List[Document]) -> List[Document]: + def _merge_rankings(self, documents: List[Document], sorted_documents: List[Document]) -> List[Document]: """ - Merge scores for Documents sorted both by their content and by their metadata field. + Merge the two different rankings for Documents sorted both by their content and by their meta field. """ scores_map: Dict = defaultdict(int) if self.ranking_mode == "reciprocal_rank_fusion": - for i, (doc, sorted_doc) in enumerate(zip(documents, sorted_documents)): - scores_map[doc.id] += self._calculate_rrf(rank=i) * (1 - self.weight) + for i, (document, sorted_doc) in enumerate(zip(documents, sorted_documents)): + scores_map[document.id] += self._calculate_rrf(rank=i) * (1 - self.weight) scores_map[sorted_doc.id] += self._calculate_rrf(rank=i) * self.weight elif self.ranking_mode == "linear_score": - for i, (doc, sorted_doc) in enumerate(zip(documents, sorted_documents)): + for i, (document, sorted_doc) in enumerate(zip(documents, sorted_documents)): score = float(0) - if doc.score is None: - warnings.warn("The score wasn't provided; defaulting to 0.") - elif doc.score < 0 or doc.score > 1: - warnings.warn( - "The score {} for Document {} is outside the [0,1] range; defaulting to 0".format( - doc.score, doc.id - ) + if document.score is None: + logger.warning("The score wasn't provided; defaulting to 0.") + elif document.score < 0 or document.score > 1: + logger.warning( + "The score %s for Document %s is outside the [0,1] range; defaulting to 0", + document.score, + document.id, ) else: - score = doc.score + score = document.score - scores_map[doc.id] += score * (1 - self.weight) + scores_map[document.id] += score * (1 - self.weight) scores_map[sorted_doc.id] += self._calc_linear_score(rank=i, amount=len(sorted_documents)) * self.weight - for doc in documents: - doc.score = scores_map[doc.id] + for document in documents: + document.score = scores_map[document.id] new_sorted_documents = sorted(documents, key=lambda doc: doc.score if doc.score else -1, reverse=True) return new_sorted_documents @@ -167,7 +238,7 @@ def _calculate_rrf(rank: int, k: int = 61) -> float: @staticmethod def _calc_linear_score(rank: int, amount: int) -> float: """ - Calculate the metadata field score as a linear score between the greatest and the lowest score in the list. + Calculate the meta field score as a linear score between the greatest and the lowest score in the list. This linear scaling is useful for: - Reducing the effect of outliers - Creating scores that are meaningfully distributed in the range [0,1], diff --git a/releasenotes/notes/metafieldranker_sort-order_refactor-2000d89dc40dc15a.yaml b/releasenotes/notes/metafieldranker_sort-order_refactor-2000d89dc40dc15a.yaml new file mode 100644 index 0000000000..18a792c232 --- /dev/null +++ b/releasenotes/notes/metafieldranker_sort-order_refactor-2000d89dc40dc15a.yaml @@ -0,0 +1,6 @@ +--- +enhancements: + - | + Prevent the `MetaFieldRanker` from throwing an error if one or more of the documents doesn't contain the specific meta data field. Now those documents will be ignored for ranking purposes and placed at the end of the ranked list so we don't completely throw them away. + Adding a sort_order that can have values of descending or ascending. + Added more runtime parameters. diff --git a/test/components/rankers/__init__.py b/test/components/rankers/__init__.py new file mode 100644 index 0000000000..e69de29bb2 diff --git a/test/components/rankers/test_metafield.py b/test/components/rankers/test_metafield.py index 1c085ad446..1269d3ca83 100644 --- a/test/components/rankers/test_metafield.py +++ b/test/components/rankers/test_metafield.py @@ -1,6 +1,7 @@ import pytest +import logging -from haystack import Document, ComponentError +from haystack import Document from haystack.components.rankers.meta_field import MetaFieldRanker @@ -15,25 +16,33 @@ def test_to_dict(self): "weight": 1.0, "top_k": None, "ranking_mode": "reciprocal_rank_fusion", + "sort_order": "descending", }, } def test_to_dict_with_custom_init_parameters(self): - component = MetaFieldRanker(meta_field="rating", weight=0.5, top_k=5, ranking_mode="linear_score") + component = MetaFieldRanker( + meta_field="rating", weight=0.5, top_k=5, ranking_mode="linear_score", sort_order="ascending" + ) data = component.to_dict() assert data == { "type": "haystack.components.rankers.meta_field.MetaFieldRanker", - "init_parameters": {"meta_field": "rating", "weight": 0.5, "top_k": 5, "ranking_mode": "linear_score"}, + "init_parameters": { + "meta_field": "rating", + "weight": 0.5, + "top_k": 5, + "ranking_mode": "linear_score", + "sort_order": "ascending", + }, } - @pytest.mark.integration - @pytest.mark.parametrize("metafield_values, expected_first_value", [([1.3, 0.7, 2.1], 2.1), ([1, 5, 8], 8)]) - def test_run(self, metafield_values, expected_first_value): + @pytest.mark.parametrize("meta_field_values, expected_first_value", [([1.3, 0.7, 2.1], 2.1), ([1, 5, 8], 8)]) + def test_run(self, meta_field_values, expected_first_value): """ Test if the component ranks documents correctly. """ ranker = MetaFieldRanker(meta_field="rating") - docs_before = [Document(content="abc", meta={"rating": value}) for value in metafield_values] + docs_before = [Document(content="abc", meta={"rating": value}) for value in meta_field_values] output = ranker.run(documents=docs_before) docs_after = output["documents"] @@ -44,32 +53,93 @@ def test_run(self, metafield_values, expected_first_value): sorted_scores = sorted([doc.meta["rating"] for doc in docs_after], reverse=True) assert [doc.meta["rating"] for doc in docs_after] == sorted_scores - @pytest.mark.integration + def test_run_with_weight_equal_to_0(self): + ranker = MetaFieldRanker(meta_field="rating", weight=0.0) + docs_before = [Document(content="abc", meta={"rating": value}) for value in [1.1, 0.5, 2.3]] + output = ranker.run(documents=docs_before) + docs_after = output["documents"] + + assert len(docs_after) == 3 + assert [doc.meta["rating"] for doc in docs_after] == [1.1, 0.5, 2.3] + + def test_run_with_weight_equal_to_1(self): + ranker = MetaFieldRanker(meta_field="rating", weight=1.0) + docs_before = [Document(content="abc", meta={"rating": value}) for value in [1.1, 0.5, 2.3]] + output = ranker.run(documents=docs_before) + docs_after = output["documents"] + + assert len(docs_after) == 3 + sorted_scores = sorted([doc.meta["rating"] for doc in docs_after], reverse=True) + assert [doc.meta["rating"] for doc in docs_after] == sorted_scores + + def test_sort_order_ascending(self): + ranker = MetaFieldRanker(meta_field="rating", weight=1.0, sort_order="ascending") + docs_before = [Document(content="abc", meta={"rating": value}) for value in [1.1, 0.5, 2.3]] + output = ranker.run(documents=docs_before) + docs_after = output["documents"] + + assert len(docs_after) == 3 + sorted_scores = sorted([doc.meta["rating"] for doc in docs_after]) + assert [doc.meta["rating"] for doc in docs_after] == sorted_scores + def test_returns_empty_list_if_no_documents_are_provided(self): ranker = MetaFieldRanker(meta_field="rating") output = ranker.run(documents=[]) docs_after = output["documents"] assert docs_after == [] - @pytest.mark.integration - def test_raises_component_error_if_metadata_not_found(self): + def test_warning_if_meta_not_found(self, caplog): ranker = MetaFieldRanker(meta_field="rating") - docs_before = [Document(content="abc", meta={"wrong_field": 1.3})] - with pytest.raises(ComponentError): + docs_before = [Document(id="1", content="abc", meta={"wrong_field": 1.3})] + with caplog.at_level(logging.WARNING): + ranker.run(documents=docs_before) + assert ( + "The parameter is currently set to 'rating', but none of the provided Documents with IDs 1 have this meta key." + in caplog.text + ) + + def test_warning_if_some_meta_not_found(self, caplog): + ranker = MetaFieldRanker(meta_field="rating") + docs_before = [ + Document(id="1", content="abc", meta={"wrong_field": 1.3}), + Document(id="2", content="def", meta={"rating": 1.3}), + ] + with caplog.at_level(logging.WARNING): ranker.run(documents=docs_before) + assert ( + "The parameter is currently set to 'rating' but the Documents with IDs 1 don't have this meta key." + in caplog.text + ) - @pytest.mark.integration - def test_raises_component_error_if_wrong_ranking_mode(self): + def test_warning_if_unsortable_values(self, caplog): + ranker = MetaFieldRanker(meta_field="rating") + docs_before = [ + Document(id="1", content="abc", meta={"rating": 1.3}), + Document(id="2", content="abc", meta={"rating": "1.2"}), + Document(id="3", content="abc", meta={"rating": 2.1}), + ] + with caplog.at_level(logging.WARNING): + output = ranker.run(documents=docs_before) + assert len(output["documents"]) == 3 + assert "Tried to sort Documents with IDs 1,2,3, but got TypeError with the message:" in caplog.text + + def test_raises_value_error_if_wrong_ranking_mode(self): with pytest.raises(ValueError): MetaFieldRanker(meta_field="rating", ranking_mode="wrong_mode") - @pytest.mark.integration + def test_raises_value_error_if_wrong_top_k(self): + with pytest.raises(ValueError): + MetaFieldRanker(meta_field="rating", top_k=-1) + @pytest.mark.parametrize("score", [-1, 2, 1.3, 2.1]) def test_raises_component_error_if_wrong_weight(self, score): with pytest.raises(ValueError): MetaFieldRanker(meta_field="rating", weight=score) - @pytest.mark.integration + def test_raises_value_error_if_wrong_sort_order(self): + with pytest.raises(ValueError): + MetaFieldRanker(meta_field="rating", sort_order="wrong_order") + def test_linear_score(self): ranker = MetaFieldRanker(meta_field="rating", ranking_mode="linear_score", weight=0.5) docs_before = [ @@ -81,7 +151,6 @@ def test_linear_score(self): docs_after = output["documents"] assert docs_after[0].score == 0.8 - @pytest.mark.integration def test_reciprocal_rank_fusion(self): ranker = MetaFieldRanker(meta_field="rating", ranking_mode="reciprocal_rank_fusion", weight=0.5) docs_before = [ @@ -93,22 +162,19 @@ def test_reciprocal_rank_fusion(self): docs_after = output["documents"] assert docs_after[0].score == 0.01626123744050767 - @pytest.mark.integration @pytest.mark.parametrize("score", [-1, 2, 1.3, 2.1]) - def test_linear_score_raises_warning_if_doc_wrong_score(self, score): + def test_linear_score_raises_warning_if_doc_wrong_score(self, score, caplog): ranker = MetaFieldRanker(meta_field="rating", ranking_mode="linear_score", weight=0.5) docs_before = [ - Document(id=1, content="abc", meta={"rating": 1.3}, score=score), - Document(id=2, content="abc", meta={"rating": 0.7}, score=0.4), - Document(id=3, content="abc", meta={"rating": 2.1}, score=0.6), + Document(id="1", content="abc", meta={"rating": 1.3}, score=score), + Document(id="2", content="abc", meta={"rating": 0.7}, score=0.4), + Document(id="3", content="abc", meta={"rating": 2.1}, score=0.6), ] - with pytest.warns( - UserWarning, match=rf"The score {score} for Document 1 is outside the \[0,1\] range; defaulting to 0" - ): + with caplog.at_level(logging.WARNING): ranker.run(documents=docs_before) + assert f"The score {score} for Document 1 is outside the [0,1] range; defaulting to 0" in caplog.text - @pytest.mark.integration - def test_linear_score_raises_raises_warning_if_doc_without_score(self): + def test_linear_score_raises_raises_warning_if_doc_without_score(self, caplog): ranker = MetaFieldRanker(meta_field="rating", ranking_mode="linear_score", weight=0.5) docs_before = [ Document(content="abc", meta={"rating": 1.3}), @@ -116,5 +182,6 @@ def test_linear_score_raises_raises_warning_if_doc_without_score(self): Document(content="abc", meta={"rating": 2.1}), ] - with pytest.warns(UserWarning, match="The score wasn't provided; defaulting to 0."): + with caplog.at_level(logging.WARNING): ranker.run(documents=docs_before) + assert "The score wasn't provided; defaulting to 0." in caplog.text