From e5a80722c22c59eb99416bf0cd712f6de7cd581a Mon Sep 17 00:00:00 2001 From: "David S. Batista" Date: Tue, 12 Nov 2024 16:01:53 +0100 Subject: [PATCH] feat: adding metadata grouper component (#8512) * initial import * making tests more readable; adding docstring * adding release notes * adding LICENSE header * Update test/components/rankers/test_metadata_grouper.py Co-authored-by: Stefano Fiorucci * refactoring * fixing docstring * fixing types * test docstrings * renaming test * handling too-many-arguments * linting * Update haystack/components/rankers/metadata_grouper.py Co-authored-by: Stefano Fiorucci * changing name * Update haystack/components/rankers/metadata_grouper.py Co-authored-by: Daria Fokina * Update haystack/components/rankers/metadata_grouper.py Co-authored-by: Daria Fokina * assigning value inside function for re-use * improving docstring * updating name to MetaFieldGroupingRanker * adding to pydocs * fixing imports * adding output docstring * Update haystack/components/rankers/meta_field_grouper_ranker.py Co-authored-by: Stefano Fiorucci * Update haystack/components/rankers/__init__.py Co-authored-by: Stefano Fiorucci * Update releasenotes/notes/add-metadata-grouper-21ec05fd4a307425.yaml Co-authored-by: Stefano Fiorucci * Update test/components/rankers/test_metadata_grouper.py Co-authored-by: Stefano Fiorucci * update docstring tests * fixing imports * rename modules for consistency * fix pydocs * simplification + more tests --------- Co-authored-by: Stefano Fiorucci Co-authored-by: Daria Fokina --- docs/pydoc/config/rankers_api.yml | 2 +- haystack/components/rankers/__init__.py | 2 + .../rankers/meta_field_grouping_ranker.py | 118 ++++++++++++ ...add-metadata-grouper-21ec05fd4a307425.yaml | 4 + test/components/generators/test_openai.py | 2 +- .../test_meta_field_grouping_ranker.py | 181 ++++++++++++++++++ 6 files changed, 307 insertions(+), 2 deletions(-) create mode 100644 haystack/components/rankers/meta_field_grouping_ranker.py create mode 
100644 releasenotes/notes/add-metadata-grouper-21ec05fd4a307425.yaml create mode 100644 test/components/rankers/test_meta_field_grouping_ranker.py diff --git a/docs/pydoc/config/rankers_api.yml b/docs/pydoc/config/rankers_api.yml index 31dc468ad8..09ea957e78 100644 --- a/docs/pydoc/config/rankers_api.yml +++ b/docs/pydoc/config/rankers_api.yml @@ -1,7 +1,7 @@ loaders: - type: haystack_pydoc_tools.loaders.CustomPythonLoader search_path: [../../../haystack/components/rankers] - modules: ["lost_in_the_middle", "meta_field", "transformers_similarity", "sentence_transformers_diversity"] + modules: ["lost_in_the_middle", "meta_field", "meta_field_grouping_ranker", "transformers_similarity", "sentence_transformers_diversity"] ignore_when_discovered: ["__init__"] processors: - type: filter diff --git a/haystack/components/rankers/__init__.py b/haystack/components/rankers/__init__.py index eb2728a344..e76fa68c69 100644 --- a/haystack/components/rankers/__init__.py +++ b/haystack/components/rankers/__init__.py @@ -4,12 +4,14 @@ from haystack.components.rankers.lost_in_the_middle import LostInTheMiddleRanker from haystack.components.rankers.meta_field import MetaFieldRanker +from haystack.components.rankers.meta_field_grouping_ranker import MetaFieldGroupingRanker from haystack.components.rankers.sentence_transformers_diversity import SentenceTransformersDiversityRanker from haystack.components.rankers.transformers_similarity import TransformersSimilarityRanker __all__ = [ "LostInTheMiddleRanker", "MetaFieldRanker", + "MetaFieldGroupingRanker", "SentenceTransformersDiversityRanker", "TransformersSimilarityRanker", ] diff --git a/haystack/components/rankers/meta_field_grouping_ranker.py b/haystack/components/rankers/meta_field_grouping_ranker.py new file mode 100644 index 0000000000..293b7dbc1f --- /dev/null +++ b/haystack/components/rankers/meta_field_grouping_ranker.py @@ -0,0 +1,118 @@ +# SPDX-FileCopyrightText: 2022-present deepset GmbH +# +# SPDX-License-Identifier: 
# SPDX-FileCopyrightText: 2022-present deepset GmbH
#
# SPDX-License-Identifier: Apache-2.0

from collections import defaultdict
from typing import Any, Dict, List, Optional, Tuple, cast

from haystack import Document, component, logging

logger = logging.getLogger(__name__)


@component
class MetaFieldGroupingRanker:
    """
    Reorders the documents by grouping them based on metadata keys.

    The MetaFieldGroupingRanker can group documents by a primary metadata key `group_by`, and subgroup them with an
    optional secondary key, `subgroup_by`.
    Within each group or subgroup, it can also sort documents by a metadata key `sort_docs_by`.

    The output is a flat list of documents in which all members of a group (and, inside it, of a subgroup) are
    adjacent. Groups and subgroups appear in the order in which their values are first encountered in the input.
    Any documents without a group are placed at the end of the list.

    Group and subgroup values are compared by their string representation, so they may be of any type,
    including unhashable ones such as lists.

    The proper organization of documents helps improve the efficiency and performance of subsequent processing by an LLM.

    ### Usage example

    ```python
    from haystack.components.rankers import MetaFieldGroupingRanker
    from haystack.dataclasses import Document


    docs = [
        Document(content="Javascript is a popular programming language", meta={"group": "42", "split_id": 7, "subgroup": "subB"}),
        Document(content="Python is a popular programming language", meta={"group": "42", "split_id": 4, "subgroup": "subB"}),
        Document(content="A chromosome is a package of DNA", meta={"group": "314", "split_id": 2, "subgroup": "subC"}),
        Document(content="An octopus has three hearts", meta={"group": "11", "split_id": 2, "subgroup": "subD"}),
        Document(content="Java is a popular programming language", meta={"group": "42", "split_id": 3, "subgroup": "subB"})
    ]

    ranker = MetaFieldGroupingRanker(group_by="group", subgroup_by="subgroup", sort_docs_by="split_id")
    result = ranker.run(documents=docs)
    print(result["documents"])

    # [
    #     Document(id=d665bbc83e52c08c3d8275bccf4f22bf2bfee21c6e77d78794627637355b8ebc,
    #             content: 'Java is a popular programming language', meta: {'group': '42', 'split_id': 3, 'subgroup': 'subB'}),
    #     Document(id=a20b326f07382b3cbf2ce156092f7c93e8788df5d48f2986957dce2adb5fe3c2,
    #             content: 'Python is a popular programming language', meta: {'group': '42', 'split_id': 4, 'subgroup': 'subB'}),
    #     Document(id=ce12919795d22f6ca214d0f161cf870993889dcb146f3bb1b3e1ffdc95be960f,
    #             content: 'Javascript is a popular programming language', meta: {'group': '42', 'split_id': 7, 'subgroup': 'subB'}),
    #     Document(id=d9fc857046c904e5cf790b3969b971b1bbdb1b3037d50a20728fdbf82991aa94,
    #             content: 'A chromosome is a package of DNA', meta: {'group': '314', 'split_id': 2, 'subgroup': 'subC'}),
    #     Document(id=6d3b7bdc13d09aa01216471eb5fb0bfdc53c5f2f3e98ad125ff6b85d3106c9a3,
    #             content: 'An octopus has three hearts', meta: {'group': '11', 'split_id': 2, 'subgroup': 'subD'})
    # ]
    ```
    """  # noqa: E501

    def __init__(self, group_by: str, subgroup_by: Optional[str] = None, sort_docs_by: Optional[str] = None):
        """
        Creates an instance of MetaFieldGroupingRanker.

        :param group_by: The metadata key to aggregate the documents by.
        :param subgroup_by: The metadata key to aggregate the documents within a group that was created by the
                            `group_by` key.
        :param sort_docs_by: Determines which metadata key is used to sort the documents. If not provided, the
                             documents within the groups or subgroups are not sorted and are kept in the same order as
                             they were inserted in the subgroups.
        """
        self.group_by = group_by
        self.sort_docs_by = sort_docs_by
        self.subgroup_by = subgroup_by

    def _sort_key(self, doc: Document) -> Tuple[bool, Any]:
        """
        Builds the sort key of a document for the `sort_docs_by` metadata field.

        The leading flag sorts documents that lack the field after all documents that have it. Because the flag
        already separates the two kinds, a real value is never compared with the placeholder, so the field may hold
        any mutually comparable type (numbers, strings, ...).

        :param doc: The document to build the key for.
        :returns: A `(field_is_missing, value)` tuple.
        """
        # Only called from `run` when `sort_docs_by` is set; the cast merely narrows Optional[str] for type checkers.
        field = cast(str, self.sort_docs_by)
        if field in doc.meta:
            return False, doc.meta[field]
        return True, None

    @component.output_types(documents=List[Document])
    def run(self, documents: List[Document]) -> Dict[str, Any]:
        """
        Groups the provided list of documents based on the `group_by` parameter and optionally the `subgroup_by`.

        The output is a list of documents reordered based on how they were grouped. Groups and subgroups keep the
        order in which they first appear in `documents`. Documents whose `group_by` value is missing or has an
        empty string representation are appended at the end, in their original order. When `sort_docs_by` is set,
        documents that lack that field are placed after the sorted ones inside their (sub)group.

        :param documents: The list of documents to group.
        :returns:
            A dictionary with the following keys:
            - documents: The list of documents reordered by their `group_by` and `subgroup_by` metadata values.
        """
        if not documents:
            return {"documents": []}

        # Dicts preserve insertion order, which is what yields the first-seen ordering of groups and subgroups.
        document_groups: Dict[str, Dict[str, List[Document]]] = defaultdict(lambda: defaultdict(list))
        no_group_docs: List[Document] = []

        for doc in documents:
            group_value = str(doc.meta.get(self.group_by, ""))
            if not group_value:
                no_group_docs.append(doc)
                continue

            subgroup_value = "no_subgroup"
            if self.subgroup_by and self.subgroup_by in doc.meta:
                # Stringified like the group value: raw metadata may be unhashable (e.g. a list) and would
                # otherwise raise a TypeError when used as a dictionary key.
                subgroup_value = str(doc.meta[self.subgroup_by])

            document_groups[group_value][subgroup_value].append(doc)

        ordered_docs: List[Document] = []
        for subgroups in document_groups.values():
            for docs in subgroups.values():
                if self.sort_docs_by:
                    # list.sort is stable, so ties (and documents lacking the field) keep their input order.
                    docs.sort(key=self._sort_key)
                ordered_docs.extend(docs)

        ordered_docs.extend(no_group_docs)

        return {"documents": ordered_docs}


# Release note shipped together with this component
# (releasenotes/notes/add-metadata-grouper-21ec05fd4a307425.yaml):
#
#   features:
#     - |
#       We have added a new MetaFieldGroupingRanker component that reorders documents by grouping them based on
#       metadata keys. This can be useful for pre-processing Documents before feeding them to an LLM.
# SPDX-FileCopyrightText: 2022-present deepset GmbH
#
# SPDX-License-Identifier: Apache-2.0

# NOTE: the change set this test module belongs to also simplifies the last assertion of
# `test_run_with_system_prompt` in test/components/generators/test_openai.py, replacing
# `"pythagoras".lower()` with the plain literal `"pythagoras"`.

from typing import Any, Dict

from haystack import Pipeline
from haystack.dataclasses import Document

from haystack.components.rankers.meta_field_grouping_ranker import MetaFieldGroupingRanker

# Shared fixture: documents with every combination of present/missing grouping, subgrouping and sorting keys.
DOC_LIST = [
    # regular
    Document(content="Javascript is a popular language", meta={"group": "42", "split_id": 7, "subgroup": "subB"}),
    Document(content="A chromosome is a package of DNA", meta={"group": "314", "split_id": 2, "subgroup": "subC"}),
    Document(content="DNA carries genetic information", meta={"group": "314", "split_id": 1, "subgroup": "subE"}),
    Document(content="Blue whales have a big heart", meta={"group": "11", "split_id": 8, "subgroup": "subF"}),
    Document(content="Python is a popular language", meta={"group": "42", "split_id": 4, "subgroup": "subB"}),
    Document(content="bla bla bla bla", meta={"split_id": 8, "subgroup": "subG"}),
    Document(content="Java is a popular programming language", meta={"group": "42", "split_id": 3, "subgroup": "subB"}),
    Document(content="An octopus has three hearts", meta={"group": "11", "split_id": 2, "subgroup": "subD"}),
    # without split id
    Document(content="without split id", meta={"group": "11"}),
    Document(content="without split id2", meta={"group": "22", "subgroup": "subI"}),
    Document(content="without split id3", meta={"group": "11"}),
    # with list values in the metadata
    Document(content="list values", meta={"value_list": ["11"], "split_id": 8, "sub_value_list": ["subF"]}),
    Document(content="list values2", meta={"value_list": ["12"], "split_id": 3, "sub_value_list": ["subX"]}),
    Document(content="list values3", meta={"value_list": ["12"], "split_id": 8, "sub_value_list": ["subX"]}),
]


class TestMetaFieldGroupingRanker:
    def test_init_default(self) -> None:
        """
        Test the default initialization of the MetaFieldGroupingRanker component.
        """
        sample_ranker = MetaFieldGroupingRanker(group_by="group", sort_docs_by=None)

        # The constructor stores its arguments unchanged; the optional keys default to None.
        assert sample_ranker.group_by == "group"
        assert sample_ranker.subgroup_by is None
        assert sample_ranker.sort_docs_by is None

        result = sample_ranker.run(documents=[])
        assert "documents" in result
        assert result["documents"] == []

    def test_run_group_by_only(self) -> None:
        """
        Test the MetaFieldGroupingRanker component with only the 'group_by' parameter. No subgroup or sorting is done.
        """
        sample_ranker = MetaFieldGroupingRanker(group_by="group")
        result = sample_ranker.run(documents=DOC_LIST)

        assert "documents" in result
        docs = result["documents"]
        assert len(DOC_LIST) == len(docs)

        # Groups come out in first-seen order ("42", "314", "11", "22"); members keep their input order.
        assert [(d.meta["group"], d.meta["split_id"]) for d in docs[:7]] == [
            ("42", 7),
            ("42", 4),
            ("42", 3),
            ("314", 2),
            ("314", 1),
            ("11", 8),
            ("11", 2),
        ]
        assert [(d.content, d.meta["group"]) for d in docs[7:10]] == [
            ("without split id", "11"),
            ("without split id3", "11"),
            ("without split id2", "22"),
        ]
        # Documents without a group are appended at the end.
        assert docs[10].content == "bla bla bla bla"

    def test_with_group_subgroup_and_sorting(self) -> None:
        """
        Test the MetaFieldGroupingRanker component with all parameters set, i.e.: grouping by 'group', subgrouping by
        'subgroup', and sorting by 'split_id'.
        """
        ranker = MetaFieldGroupingRanker(group_by="group", subgroup_by="subgroup", sort_docs_by="split_id")
        result = ranker.run(documents=DOC_LIST)

        assert "documents" in result
        docs = result["documents"]
        assert len(DOC_LIST) == len(docs)

        assert [(d.meta["group"], d.meta["subgroup"], d.meta["split_id"]) for d in docs[:3]] == [
            ("42", "subB", 3),
            ("42", "subB", 4),
            ("42", "subB", 7),
        ]
        # Group and subgroup are checked on the same document for every position.
        assert [(d.meta["group"], d.meta["subgroup"]) for d in docs[3:7]] == [
            ("314", "subC"),
            ("314", "subE"),
            ("11", "subF"),
            ("11", "subD"),
        ]
        assert [(d.content, d.meta["group"]) for d in docs[7:10]] == [
            ("without split id", "11"),
            ("without split id3", "11"),
            ("without split id2", "22"),
        ]
        assert docs[10].content == "bla bla bla bla"

    def test_run_with_lists(self) -> None:
        """
        Test if the MetaFieldGroupingRanker component can handle list values in the metadata.
        """
        # NOTE(review): "subvaluelist" matches no key in DOC_LIST (the fixture uses "sub_value_list"), so this test
        # exercises list-valued *groups* only. Point it at "sub_value_list" once the ranker is confirmed to accept
        # unhashable subgroup values.
        ranker = MetaFieldGroupingRanker(group_by="value_list", subgroup_by="subvaluelist", sort_docs_by="split_id")
        result = ranker.run(documents=DOC_LIST)

        assert "documents" in result
        docs = result["documents"]
        assert len(DOC_LIST) == len(docs)
        assert [(d.content, d.meta["value_list"]) for d in docs[:3]] == [
            ("list values", ["11"]),
            ("list values2", ["12"]),
            ("list values3", ["12"]),
        ]

    def test_run_empty_input(self) -> None:
        """
        Test the behavior of the MetaFieldGroupingRanker component with an empty list of documents.
        """
        sample_ranker = MetaFieldGroupingRanker(group_by="group")
        result = sample_ranker.run(documents=[])
        assert "documents" in result
        assert result["documents"] == []

    def test_run_missing_metadata_keys(self) -> None:
        """
        Test the behavior of the MetaFieldGroupingRanker component when some documents are missing the required
        metadata keys.
        """
        docs_with_missing_keys = [
            Document(content="Document without group", meta={"split_id": 1, "subgroup": "subA"}),
            Document(content="Document without subgroup", meta={"group": "42", "split_id": 2}),
            Document(content="Document with all keys", meta={"group": "42", "split_id": 3, "subgroup": "subB"}),
        ]
        sample_ranker = MetaFieldGroupingRanker(group_by="group", subgroup_by="subgroup", sort_docs_by="split_id")
        result = sample_ranker.run(documents=docs_with_missing_keys)

        assert "documents" in result
        docs = result["documents"]
        assert len(docs) == 3
        assert docs[0].meta["group"] == "42"
        assert docs[1].meta["group"] == "42"
        assert docs[2].content == "Document without group"

    def test_run_metadata_with_different_data_types(self) -> None:
        """
        Test the behavior of the MetaFieldGroupingRanker component when the metadata values have different data types.
        """
        docs_with_mixed_data_types = [
            Document(content="Document with string group", meta={"group": "42", "split_id": 1, "subgroup": "subA"}),
            Document(content="Document with number group", meta={"group": 42, "split_id": 2, "subgroup": "subB"}),
            Document(content="Document with boolean group", meta={"group": True, "split_id": 3, "subgroup": "subC"}),
        ]
        sample_ranker = MetaFieldGroupingRanker(group_by="group", subgroup_by="subgroup", sort_docs_by="split_id")
        result = sample_ranker.run(documents=docs_with_mixed_data_types)

        assert "documents" in result
        docs = result["documents"]
        assert len(docs) == 3
        # The original metadata values are left untouched on the returned documents.
        assert docs[0].meta["group"] == "42"
        assert docs[1].meta["group"] == 42
        assert docs[2].meta["group"] is True

    def test_run_duplicate_documents(self) -> None:
        """
        Test the behavior of the MetaFieldGroupingRanker component when the input contains duplicate documents.
        """
        docs_with_duplicates = [
            Document(content="Duplicate 1", meta={"group": "42", "split_id": 1, "subgroup": "subA"}),
            Document(content="Duplicate 1", meta={"group": "42", "split_id": 1, "subgroup": "subA"}),
            Document(content="Unique document", meta={"group": "42", "split_id": 2, "subgroup": "subB"}),
        ]
        sample_ranker = MetaFieldGroupingRanker(group_by="group", subgroup_by="subgroup", sort_docs_by="split_id")
        result = sample_ranker.run(documents=docs_with_duplicates)

        assert "documents" in result
        docs = result["documents"]
        assert len(docs) == 3
        assert [d.content for d in docs] == ["Duplicate 1", "Duplicate 1", "Unique document"]

    def test_run_in_pipeline_dumps_and_loads(self) -> None:
        """
        Test if the MetaFieldGroupingRanker component can be dumped to a YAML string and reloaded from it.
        """
        ranker = MetaFieldGroupingRanker(group_by="group", sort_docs_by="split_id")
        result_single = ranker.run(documents=DOC_LIST)

        pipeline = Pipeline()
        pipeline.add_component("ranker", ranker)
        pipeline_yaml_str = pipeline.dumps()

        # Called on the class rather than on a throwaway Pipeline() instance.
        pipeline_reloaded = Pipeline.loads(pipeline_yaml_str)
        pipeline_output: Dict[str, Any] = pipeline_reloaded.run(data={"documents": DOC_LIST})

        assert result_single == pipeline_output["ranker"]