-
Notifications
You must be signed in to change notification settings - Fork 2k
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
feat: adding metadata grouper component (#8512)
* initial import * making tests more readable; adding docstring * adding release notes * adding LICENSE header * Update test/components/rankers/test_metadata_grouper.py Co-authored-by: Stefano Fiorucci <[email protected]> * refactoring * fixing docstring * fixing types * test docstrings * renaming test * handling too-many-arguments * liting * Update haystack/components/rankers/metadata_grouper.py Co-authored-by: Stefano Fiorucci <[email protected]> * changing name * Update haystack/components/rankers/metadata_grouper.py Co-authored-by: Daria Fokina <[email protected]> * Update haystack/components/rankers/metadata_grouper.py Co-authored-by: Daria Fokina <[email protected]> * assiging value inside function for re-use * improving docstring * updating name to MetaFieldGroupingRanker * adding to pydocs * fixing imports * adding output docstring * Update haystack/components/rankers/meta_field_grouper_ranker.py Co-authored-by: Stefano Fiorucci <[email protected]> * Update haystack/components/rankers/__init__.py Co-authored-by: Stefano Fiorucci <[email protected]> * Update releasenotes/notes/add-metadata-grouper-21ec05fd4a307425.yaml Co-authored-by: Stefano Fiorucci <[email protected]> * Update test/components/rankers/test_metadata_grouper.py Co-authored-by: Stefano Fiorucci <[email protected]> * update docstring tests * fixing imports * rename modules for consistency * fix pydocs * simplification + more tests --------- Co-authored-by: Stefano Fiorucci <[email protected]> Co-authored-by: Daria Fokina <[email protected]>
- Loading branch information
1 parent
fcdf392
commit e5a8072
Showing
6 changed files
with
307 additions
and
2 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
118 changes: 118 additions & 0 deletions
118
haystack/components/rankers/meta_field_grouping_ranker.py
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,118 @@ | ||
# SPDX-FileCopyrightText: 2022-present deepset GmbH <[email protected]> | ||
# | ||
# SPDX-License-Identifier: Apache-2.0 | ||
|
||
from collections import defaultdict | ||
from typing import Any, Dict, List, Optional, cast | ||
|
||
from haystack import Document, component, logging | ||
|
||
logger = logging.getLogger(__name__) | ||
|
||
|
||
@component
class MetaFieldGroupingRanker:
    """
    Reorders the documents by grouping them based on metadata keys.

    The MetaFieldGroupingRanker can group documents by a primary metadata key `group_by`, and subgroup them with an optional
    secondary key, `subgroup_by`.
    Within each group or subgroup, it can also sort documents by a metadata key `sort_docs_by`.

    The output is a flat list of documents ordered by `group_by` and `subgroup_by` values.
    Groups and subgroups appear in the order in which their first document was encountered in the input.
    Group and subgroup values are compared by their string representation, so `42` and `"42"` fall into the same group.
    Any documents without a group are placed at the end of the list.

    The proper organization of documents helps improve the efficiency and performance of subsequent processing by an LLM.

    ### Usage example

    ```python
    from haystack.components.rankers import MetaFieldGroupingRanker
    from haystack.dataclasses import Document

    docs = [
        Document(content="Javascript is a popular programming language", meta={"group": "42", "split_id": 7, "subgroup": "subB"}),
        Document(content="Python is a popular programming language",meta={"group": "42", "split_id": 4, "subgroup": "subB"}),
        Document(content="A chromosome is a package of DNA", meta={"group": "314", "split_id": 2, "subgroup": "subC"}),
        Document(content="An octopus has three hearts", meta={"group": "11", "split_id": 2, "subgroup": "subD"}),
        Document(content="Java is a popular programming language", meta={"group": "42", "split_id": 3, "subgroup": "subB"})
    ]

    ranker = MetaFieldGroupingRanker(group_by="group",subgroup_by="subgroup", sort_docs_by="split_id")
    result = ranker.run(documents=docs)
    print(result["documents"])

    # [
    #     Document(id=d665bbc83e52c08c3d8275bccf4f22bf2bfee21c6e77d78794627637355b8ebc,
    #             content: 'Java is a popular programming language', meta: {'group': '42', 'split_id': 3, 'subgroup': 'subB'}),
    #     Document(id=a20b326f07382b3cbf2ce156092f7c93e8788df5d48f2986957dce2adb5fe3c2,
    #             content: 'Python is a popular programming language', meta: {'group': '42', 'split_id': 4, 'subgroup': 'subB'}),
    #     Document(id=ce12919795d22f6ca214d0f161cf870993889dcb146f3bb1b3e1ffdc95be960f,
    #             content: 'Javascript is a popular programming language', meta: {'group': '42', 'split_id': 7, 'subgroup': 'subB'}),
    #     Document(id=d9fc857046c904e5cf790b3969b971b1bbdb1b3037d50a20728fdbf82991aa94,
    #             content: 'A chromosome is a package of DNA', meta: {'group': '314', 'split_id': 2, 'subgroup': 'subC'}),
    #     Document(id=6d3b7bdc13d09aa01216471eb5fb0bfdc53c5f2f3e98ad125ff6b85d3106c9a3,
    #             content: 'An octopus has three hearts', meta: {'group': '11', 'split_id': 2, 'subgroup': 'subD'})
    # ]
    ```
    """  # noqa: E501

    def __init__(self, group_by: str, subgroup_by: Optional[str] = None, sort_docs_by: Optional[str] = None):
        """
        Creates an instance of MetaFieldGroupingRanker.

        :param group_by: The metadata key to aggregate the documents by.
        :param subgroup_by: The metadata key to aggregate the documents within a group that was created by the
                            `group_by` key.
        :param sort_docs_by: Determines which metadata key is used to sort the documents. If not provided, the
                             documents within the groups or subgroups are not sorted and are kept in the same order as
                             they were inserted in the subgroups.
        """
        self.group_by = group_by
        self.sort_docs_by = sort_docs_by
        self.subgroup_by = subgroup_by

    def _sort_key(self, doc: Document) -> tuple:
        """
        Builds the sort key of a document for the `sort_docs_by` metadata field.

        Documents whose sort field is missing (or `None`) get a key that orders after every document that has a
        value. The key never compares the placeholder with a real value, so sorting works for any mutually
        comparable value type (numbers, strings, ...), not only for numbers.

        :param doc: The document to compute the key for.
        :returns: A `(is_missing, value)` tuple usable as a `list.sort` key.
        """
        value = doc.meta.get(cast(str, self.sort_docs_by))
        if value is None:
            return (True, 0)
        return (False, value)

    @component.output_types(documents=List[Document])
    def run(self, documents: List[Document]) -> Dict[str, Any]:
        """
        Groups the provided list of documents based on the `group_by` parameter and optionally the `subgroup_by`.

        The output is a list of documents reordered based on how they were grouped.

        :param documents: The list of documents to group.
        :returns:
            A dictionary with the following keys:
            - documents: The list of documents ordered by the `group_by` and `subgroup_by` metadata values.
        """
        if not documents:
            return {"documents": []}

        # group value -> subgroup value -> documents; plain dict ordering keeps first-seen order of (sub)groups.
        document_groups: Dict[str, Dict[str, List[Document]]] = defaultdict(lambda: defaultdict(list))
        no_group_docs: List[Document] = []

        for doc in documents:
            group_value = str(doc.meta.get(self.group_by, ""))
            if not group_value:
                # Missing (or empty-string) group: these documents are appended after all groups.
                no_group_docs.append(doc)
                continue

            subgroup_value = "no_subgroup"
            if self.subgroup_by and self.subgroup_by in doc.meta:
                # Stringify like the group value so unhashable metadata (e.g. lists) can be used as a key.
                subgroup_value = str(doc.meta[self.subgroup_by])

            document_groups[group_value][subgroup_value].append(doc)

        ordered_docs: List[Document] = []
        for subgroups in document_groups.values():
            for docs in subgroups.values():
                if self.sort_docs_by:
                    # list.sort is stable, so documents with equal or missing sort values keep their input order.
                    docs.sort(key=self._sort_key)
                ordered_docs.extend(docs)

        ordered_docs.extend(no_group_docs)

        return {"documents": ordered_docs}
4 changes: 4 additions & 0 deletions
4
releasenotes/notes/add-metadata-grouper-21ec05fd4a307425.yaml
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,4 @@ | ||
--- | ||
features: | ||
- | | ||
We have added a new MetaFieldGroupingRanker component that reorders documents by grouping them based on metadata keys. This can be useful for pre-processing Documents before feeding them to an LLM. |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
181 changes: 181 additions & 0 deletions
181
test/components/rankers/test_meta_field_grouping_ranker.py
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,181 @@ | ||
# SPDX-FileCopyrightText: 2022-present deepset GmbH <[email protected]> | ||
# | ||
# SPDX-License-Identifier: Apache-2.0 | ||
|
||
from typing import Any, Dict | ||
|
||
from haystack import Pipeline | ||
from haystack.dataclasses import Document | ||
|
||
from haystack.components.rankers.meta_field_grouping_ranker import MetaFieldGroupingRanker | ||
|
||
# Shared fixture for the ranker tests. The order of the entries matters: the tests assert on the
# position of each document in the ranker output, which depends on the input order.
DOC_LIST = [
    # Documents carrying "group", "subgroup" and "split_id" (one of them has no "group").
    Document(content="Javascript is a popular language", meta={"group": "42", "split_id": 7, "subgroup": "subB"}),
    Document(content="A chromosome is a package of DNA", meta={"group": "314", "split_id": 2, "subgroup": "subC"}),
    Document(content="DNA carries genetic information", meta={"group": "314", "split_id": 1, "subgroup": "subE"}),
    Document(content="Blue whales have a big heart", meta={"group": "11", "split_id": 8, "subgroup": "subF"}),
    Document(content="Python is a popular language", meta={"group": "42", "split_id": 4, "subgroup": "subB"}),
    Document(content="bla bla bla bla", meta={"split_id": 8, "subgroup": "subG"}),
    Document(content="Java is a popular programming language", meta={"group": "42", "split_id": 3, "subgroup": "subB"}),
    Document(content="An octopus has three hearts", meta={"group": "11", "split_id": 2, "subgroup": "subD"}),
    # Documents that have no "split_id" to sort by.
    Document(content="without split id", meta={"group": "11"}),
    Document(content="without split id2", meta={"group": "22", "subgroup": "subI"}),
    Document(content="without split id3", meta={"group": "11"}),
    # Documents whose grouping metadata values are lists instead of scalars.
    Document(content="list values", meta={"value_list": ["11"], "split_id": 8, "sub_value_list": ["subF"]}),
    Document(content="list values2", meta={"value_list": ["12"], "split_id": 3, "sub_value_list": ["subX"]}),
    Document(content="list values3", meta={"value_list": ["12"], "split_id": 8, "sub_value_list": ["subX"]}),
]
|
||
|
||
class TestMetaFieldGroupingRanker:
    """
    Tests for the MetaFieldGroupingRanker component.

    Most tests run the ranker on the module-level `DOC_LIST` fixture and assert on the position of
    individual documents in the returned list.
    """

    def test_init_default(self) -> None:
        """
        Test the default initialization of the MetaFieldGroupingRanker component.
        """
        sample_ranker = MetaFieldGroupingRanker(group_by="group", sort_docs_by=None)
        # The constructor stores its arguments unchanged; optional keys default to None.
        assert sample_ranker.group_by == "group"
        assert sample_ranker.subgroup_by is None
        assert sample_ranker.sort_docs_by is None
        result = sample_ranker.run(documents=[])
        assert "documents" in result
        assert result["documents"] == []

    def test_run_group_by_only(self) -> None:
        """
        Test the MetaFieldGroupingRanker component with only the 'group_by' parameter. No subgroup or sorting is done.
        """
        sample_ranker = MetaFieldGroupingRanker(group_by="group")
        result = sample_ranker.run(documents=DOC_LIST)
        assert "documents" in result
        assert len(DOC_LIST) == len(result["documents"])
        assert result["documents"][0].meta["split_id"] == 7 and result["documents"][0].meta["group"] == "42"
        assert result["documents"][1].meta["split_id"] == 4 and result["documents"][1].meta["group"] == "42"
        assert result["documents"][2].meta["split_id"] == 3 and result["documents"][2].meta["group"] == "42"
        assert result["documents"][3].meta["split_id"] == 2 and result["documents"][3].meta["group"] == "314"
        assert result["documents"][4].meta["split_id"] == 1 and result["documents"][4].meta["group"] == "314"
        assert result["documents"][5].meta["split_id"] == 8 and result["documents"][5].meta["group"] == "11"
        assert result["documents"][6].meta["split_id"] == 2 and result["documents"][6].meta["group"] == "11"
        assert result["documents"][7].content == "without split id" and result["documents"][7].meta["group"] == "11"
        assert result["documents"][8].content == "without split id3" and result["documents"][8].meta["group"] == "11"
        assert result["documents"][9].content == "without split id2" and result["documents"][9].meta["group"] == "22"
        # The first document without a "group" key follows all grouped documents.
        assert result["documents"][10].content == "bla bla bla bla"

    def test_with_group_subgroup_and_sorting(self) -> None:
        """
        Test the MetaFieldGroupingRanker component with all parameters set, i.e.: grouping by 'group', subgrouping by 'subgroup',
        and sorting by 'split_id'.
        """
        ranker = MetaFieldGroupingRanker(group_by="group", subgroup_by="subgroup", sort_docs_by="split_id")
        result = ranker.run(documents=DOC_LIST)

        assert "documents" in result
        assert len(DOC_LIST) == len(result["documents"])
        assert (
            result["documents"][0].meta["subgroup"] == "subB"
            and result["documents"][0].meta["group"] == "42"
            and result["documents"][0].meta["split_id"] == 3
        )
        assert (
            result["documents"][1].meta["subgroup"] == "subB"
            and result["documents"][1].meta["group"] == "42"
            and result["documents"][1].meta["split_id"] == 4
        )
        assert (
            result["documents"][2].meta["subgroup"] == "subB"
            and result["documents"][2].meta["group"] == "42"
            and result["documents"][2].meta["split_id"] == 7
        )
        assert result["documents"][3].meta["subgroup"] == "subC" and result["documents"][3].meta["group"] == "314"
        assert result["documents"][4].meta["subgroup"] == "subE" and result["documents"][4].meta["group"] == "314"
        # Each assertion checks subgroup and group of the SAME document (indices were previously crossed).
        assert result["documents"][5].meta["subgroup"] == "subF" and result["documents"][5].meta["group"] == "11"
        assert result["documents"][6].meta["subgroup"] == "subD" and result["documents"][6].meta["group"] == "11"
        assert result["documents"][7].content == "without split id" and result["documents"][7].meta["group"] == "11"
        assert result["documents"][8].content == "without split id3" and result["documents"][8].meta["group"] == "11"
        assert result["documents"][9].content == "without split id2" and result["documents"][9].meta["group"] == "22"
        assert result["documents"][10].content == "bla bla bla bla"

    def test_run_with_lists(self) -> None:
        """
        Test if the MetaFieldGroupingRanker component can handle list values in the metadata.
        """
        # NOTE(review): "subvaluelist" does not match the "sub_value_list" key used in DOC_LIST, so no document
        # has a subgroup in this test and only list-valued *group* metadata is exercised - confirm intent.
        ranker = MetaFieldGroupingRanker(group_by="value_list", subgroup_by="subvaluelist", sort_docs_by="split_id")
        result = ranker.run(documents=DOC_LIST)
        assert "documents" in result
        assert len(DOC_LIST) == len(result["documents"])
        assert result["documents"][0].content == "list values" and result["documents"][0].meta["value_list"] == ["11"]
        assert result["documents"][1].content == "list values2" and result["documents"][1].meta["value_list"] == ["12"]
        assert result["documents"][2].content == "list values3" and result["documents"][2].meta["value_list"] == ["12"]

    def test_run_empty_input(self) -> None:
        """
        Test the behavior of the MetaFieldGroupingRanker component with an empty list of documents.
        """
        sample_ranker = MetaFieldGroupingRanker(group_by="group")
        result = sample_ranker.run(documents=[])
        assert "documents" in result
        assert result["documents"] == []

    def test_run_missing_metadata_keys(self) -> None:
        """
        Test the behavior of the MetaFieldGroupingRanker component when some documents are missing the required metadata keys.
        """
        docs_with_missing_keys = [
            Document(content="Document without group", meta={"split_id": 1, "subgroup": "subA"}),
            Document(content="Document without subgroup", meta={"group": "42", "split_id": 2}),
            Document(content="Document with all keys", meta={"group": "42", "split_id": 3, "subgroup": "subB"}),
        ]
        sample_ranker = MetaFieldGroupingRanker(group_by="group", subgroup_by="subgroup", sort_docs_by="split_id")
        result = sample_ranker.run(documents=docs_with_missing_keys)
        assert "documents" in result
        assert len(result["documents"]) == 3
        assert result["documents"][0].meta["group"] == "42"
        assert result["documents"][1].meta["group"] == "42"
        # The document without a group is moved behind the grouped ones.
        assert result["documents"][2].content == "Document without group"

    def test_run_metadata_with_different_data_types(self) -> None:
        """
        Test the behavior of the MetaFieldGroupingRanker component when the metadata values have different data types.
        """
        docs_with_mixed_data_types = [
            Document(content="Document with string group", meta={"group": "42", "split_id": 1, "subgroup": "subA"}),
            Document(content="Document with number group", meta={"group": 42, "split_id": 2, "subgroup": "subB"}),
            Document(content="Document with boolean group", meta={"group": True, "split_id": 3, "subgroup": "subC"}),
        ]
        sample_ranker = MetaFieldGroupingRanker(group_by="group", subgroup_by="subgroup", sort_docs_by="split_id")
        result = sample_ranker.run(documents=docs_with_mixed_data_types)
        assert "documents" in result
        assert len(result["documents"]) == 3
        # The original metadata values are left untouched on the returned documents.
        assert result["documents"][0].meta["group"] == "42"
        assert result["documents"][1].meta["group"] == 42
        assert result["documents"][2].meta["group"] is True

    def test_run_duplicate_documents(self) -> None:
        """
        Test the behavior of the MetaFieldGroupingRanker component when the input contains duplicate documents.
        """
        docs_with_duplicates = [
            Document(content="Duplicate 1", meta={"group": "42", "split_id": 1, "subgroup": "subA"}),
            Document(content="Duplicate 1", meta={"group": "42", "split_id": 1, "subgroup": "subA"}),
            Document(content="Unique document", meta={"group": "42", "split_id": 2, "subgroup": "subB"}),
        ]
        sample_ranker = MetaFieldGroupingRanker(group_by="group", subgroup_by="subgroup", sort_docs_by="split_id")
        result = sample_ranker.run(documents=docs_with_duplicates)
        assert "documents" in result
        # Duplicates are kept, not de-duplicated.
        assert len(result["documents"]) == 3
        assert result["documents"][0].content == "Duplicate 1"
        assert result["documents"][1].content == "Duplicate 1"
        assert result["documents"][2].content == "Unique document"

    def test_run_in_pipeline_dumps_and_loads(self) -> None:
        """
        Test if the MetaFieldGroupingRanker component can be dumped to a YAML string and reloaded from it.
        """
        ranker = MetaFieldGroupingRanker(group_by="group", sort_docs_by="split_id")
        result_single = ranker.run(documents=DOC_LIST)
        pipeline = Pipeline()
        pipeline.add_component("ranker", ranker)
        pipeline_yaml_str = pipeline.dumps()
        pipeline_reloaded = Pipeline().loads(pipeline_yaml_str)
        result: Dict[str, Any] = pipeline_reloaded.run(data={"documents": DOC_LIST})
        # Pipeline output is keyed by component name; compare only the ranker's own output.
        result = result["ranker"]
        assert result_single == result