Skip to content

Commit

Permalink
feat: adding metadata grouper component (#8512)
Browse files Browse the repository at this point in the history
* initial import

* making tests more readable; adding docstring

* adding release notes

* adding LICENSE header

* Update test/components/rankers/test_metadata_grouper.py

Co-authored-by: Stefano Fiorucci <[email protected]>

* refactoring

* fixing docstring

* fixing types

* test docstrings

* renaming test

* handling too-many-arguments

* linting

* Update haystack/components/rankers/metadata_grouper.py

Co-authored-by: Stefano Fiorucci <[email protected]>

* changing name

* Update haystack/components/rankers/metadata_grouper.py

Co-authored-by: Daria Fokina <[email protected]>

* Update haystack/components/rankers/metadata_grouper.py

Co-authored-by: Daria Fokina <[email protected]>

* assigning value inside function for re-use

* improving docstring

* updating name to MetaFieldGroupingRanker

* adding to pydocs

* fixing imports

* adding output docstring

* Update haystack/components/rankers/meta_field_grouper_ranker.py

Co-authored-by: Stefano Fiorucci <[email protected]>

* Update haystack/components/rankers/__init__.py

Co-authored-by: Stefano Fiorucci <[email protected]>

* Update releasenotes/notes/add-metadata-grouper-21ec05fd4a307425.yaml

Co-authored-by: Stefano Fiorucci <[email protected]>

* Update test/components/rankers/test_metadata_grouper.py

Co-authored-by: Stefano Fiorucci <[email protected]>

* update docstring tests

* fixing imports

* rename modules for consistency

* fix pydocs

* simplification + more tests

---------

Co-authored-by: Stefano Fiorucci <[email protected]>
Co-authored-by: Daria Fokina <[email protected]>
  • Loading branch information
3 people authored Nov 12, 2024
1 parent fcdf392 commit e5a8072
Show file tree
Hide file tree
Showing 6 changed files with 307 additions and 2 deletions.
2 changes: 1 addition & 1 deletion docs/pydoc/config/rankers_api.yml
Original file line number Diff line number Diff line change
@@ -1,7 +1,7 @@
loaders:
- type: haystack_pydoc_tools.loaders.CustomPythonLoader
search_path: [../../../haystack/components/rankers]
modules: ["lost_in_the_middle", "meta_field", "transformers_similarity", "sentence_transformers_diversity"]
modules: ["lost_in_the_middle", "meta_field", "meta_field_grouping_ranker", "transformers_similarity", "sentence_transformers_diversity"]
ignore_when_discovered: ["__init__"]
processors:
- type: filter
Expand Down
2 changes: 2 additions & 0 deletions haystack/components/rankers/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,12 +4,14 @@

from haystack.components.rankers.lost_in_the_middle import LostInTheMiddleRanker
from haystack.components.rankers.meta_field import MetaFieldRanker
from haystack.components.rankers.meta_field_grouping_ranker import MetaFieldGroupingRanker
from haystack.components.rankers.sentence_transformers_diversity import SentenceTransformersDiversityRanker
from haystack.components.rankers.transformers_similarity import TransformersSimilarityRanker

# Names exported by `from haystack.components.rankers import *`; one entry per ranker imported above.
__all__ = [
    "LostInTheMiddleRanker",
    "MetaFieldRanker",
    "MetaFieldGroupingRanker",
    "SentenceTransformersDiversityRanker",
    "TransformersSimilarityRanker",
]
118 changes: 118 additions & 0 deletions haystack/components/rankers/meta_field_grouping_ranker.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,118 @@
# SPDX-FileCopyrightText: 2022-present deepset GmbH <[email protected]>
#
# SPDX-License-Identifier: Apache-2.0

from collections import defaultdict
from typing import Any, Dict, List, Optional, cast

from haystack import Document, component, logging

logger = logging.getLogger(__name__)


@component
class MetaFieldGroupingRanker:
    """
    Reorders the documents by grouping them based on metadata keys.

    The MetaFieldGroupingRanker can group documents by a primary metadata key `group_by`, and subgroup them with an optional
    secondary key, `subgroup_by`.
    Within each group or subgroup, it can also sort documents by a metadata key `sort_docs_by`.

    The output is a flat list of documents ordered by `group_by` and `subgroup_by` values.
    Groups and subgroups appear in the order in which their first document was encountered in the input.
    Group and subgroup values are compared by their string representation, so `42` and `"42"` fall into the same group.
    Any documents without a group are placed at the end of the list.

    The proper organization of documents helps improve the efficiency and performance of subsequent processing by an LLM.

    ### Usage example

    ```python
    from haystack.components.rankers import MetaFieldGroupingRanker
    from haystack.dataclasses import Document


    docs = [
        Document(content="Javascript is a popular programming language", meta={"group": "42", "split_id": 7, "subgroup": "subB"}),
        Document(content="Python is a popular programming language",meta={"group": "42", "split_id": 4, "subgroup": "subB"}),
        Document(content="A chromosome is a package of DNA", meta={"group": "314", "split_id": 2, "subgroup": "subC"}),
        Document(content="An octopus has three hearts", meta={"group": "11", "split_id": 2, "subgroup": "subD"}),
        Document(content="Java is a popular programming language", meta={"group": "42", "split_id": 3, "subgroup": "subB"})
    ]

    ranker = MetaFieldGroupingRanker(group_by="group",subgroup_by="subgroup", sort_docs_by="split_id")
    result = ranker.run(documents=docs)
    print(result["documents"])

    # [
    #     Document(id=d665bbc83e52c08c3d8275bccf4f22bf2bfee21c6e77d78794627637355b8ebc,
    #             content: 'Java is a popular programming language', meta: {'group': '42', 'split_id': 3, 'subgroup': 'subB'}),
    #     Document(id=a20b326f07382b3cbf2ce156092f7c93e8788df5d48f2986957dce2adb5fe3c2,
    #             content: 'Python is a popular programming language', meta: {'group': '42', 'split_id': 4, 'subgroup': 'subB'}),
    #     Document(id=ce12919795d22f6ca214d0f161cf870993889dcb146f3bb1b3e1ffdc95be960f,
    #             content: 'Javascript is a popular programming language', meta: {'group': '42', 'split_id': 7, 'subgroup': 'subB'}),
    #     Document(id=d9fc857046c904e5cf790b3969b971b1bbdb1b3037d50a20728fdbf82991aa94,
    #             content: 'A chromosome is a package of DNA', meta: {'group': '314', 'split_id': 2, 'subgroup': 'subC'}),
    #     Document(id=6d3b7bdc13d09aa01216471eb5fb0bfdc53c5f2f3e98ad125ff6b85d3106c9a3,
    #             content: 'An octopus has three hearts', meta: {'group': '11', 'split_id': 2, 'subgroup': 'subD'})
    # ]
    ```
    """  # noqa: E501

    def __init__(self, group_by: str, subgroup_by: Optional[str] = None, sort_docs_by: Optional[str] = None):
        """
        Creates an instance of MetaFieldGroupingRanker.

        :param group_by: The metadata key to aggregate the documents by.
        :param subgroup_by: The metadata key to aggregate the documents within a group that was created by the
                            `group_by` key.
        :param sort_docs_by: Determines which metadata key is used to sort the documents. If not provided, the
                             documents within the groups or subgroups are not sorted and are kept in the same order as
                             they were inserted in the subgroups.
        """
        self.group_by = group_by
        self.sort_docs_by = sort_docs_by
        self.subgroup_by = subgroup_by

    def _sort_key(self, doc: Document) -> Any:
        """
        Builds the sort key of a document for the `sort_docs_by` metadata field.

        The key is a `(rank, value)` pair: documents that have a value get rank 0 and are ordered by that value,
        documents whose value is missing or `None` get rank 1 and therefore end up last. Using a rank instead of a
        numeric sentinel keeps the sort working when the values are not numbers (for example strings).

        :param doc: The document to compute the key for.
        :returns:
            A tuple that can be passed as `key` to `list.sort`.
        """
        value = doc.meta.get(cast(str, self.sort_docs_by))
        if value is None:
            return (1, 0)
        return (0, value)

    @component.output_types(documents=List[Document])
    def run(self, documents: List[Document]) -> Dict[str, Any]:
        """
        Groups the provided list of documents based on the `group_by` parameter and optionally the `subgroup_by`.

        The output is a list of documents reordered based on how they were grouped.

        :param documents: The list of documents to group.
        :returns:
            A dictionary with the following keys:
            - documents: The list of documents ordered by the `group_by` and `subgroup_by` metadata values.
        :raises TypeError: If `sort_docs_by` is set and the values of that metadata field cannot be compared with
            each other inside a subgroup (for example a mix of strings and numbers).
        """

        if not documents:
            return {"documents": []}

        # Plain dicts keep insertion order, so groups and subgroups are emitted in order of first appearance.
        document_groups: Dict[str, Dict[str, List[Document]]] = defaultdict(lambda: defaultdict(list))
        no_group_docs = []

        for doc in documents:
            group_value = str(doc.meta.get(self.group_by, ""))

            # A missing key or an empty-string value means "no group"; such documents are appended at the very end.
            if not group_value:
                no_group_docs.append(doc)
                continue

            subgroup_value = "no_subgroup"
            if self.subgroup_by and self.subgroup_by in doc.meta:
                # Stringify like the group value: metadata may hold unhashable values (lists, dicts) that cannot be
                # used as dictionary keys directly.
                subgroup_value = str(doc.meta[self.subgroup_by])

            document_groups[group_value][subgroup_value].append(doc)

        ordered_docs: List[Document] = []
        for subgroups in document_groups.values():
            for docs in subgroups.values():
                if self.sort_docs_by:
                    # list.sort is stable, so documents with equal (or missing) sort values keep their input order.
                    docs.sort(key=self._sort_key)
                ordered_docs.extend(docs)

        ordered_docs.extend(no_group_docs)

        return {"documents": ordered_docs}
4 changes: 4 additions & 0 deletions releasenotes/notes/add-metadata-grouper-21ec05fd4a307425.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,4 @@
---
features:
- |
We have added a new MetaFieldGroupingRanker component that reorders documents by grouping them based on metadata keys. This can be useful for pre-processing Documents before feeding them to an LLM.
2 changes: 1 addition & 1 deletion test/components/generators/test_openai.py
Original file line number Diff line number Diff line change
Expand Up @@ -332,4 +332,4 @@ def test_run_with_system_prompt(self):
"Can you explain the Pitagoras therom?",
system_prompt="You answer in German, regardless of the language on which a question is asked.",
)
assert "pythagoras".lower() in result["replies"][0].lower()
assert "pythagoras" in result["replies"][0].lower()
181 changes: 181 additions & 0 deletions test/components/rankers/test_meta_field_grouping_ranker.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,181 @@
# SPDX-FileCopyrightText: 2022-present deepset GmbH <[email protected]>
#
# SPDX-License-Identifier: Apache-2.0

from typing import Any, Dict

from haystack import Pipeline
from haystack.dataclasses import Document

from haystack.components.rankers.meta_field_grouping_ranker import MetaFieldGroupingRanker

# Shared fixture for the ranker tests. The order of the entries matters: several tests assert on the position of
# documents in the output, which follows the order in which groups and subgroups first appear in this list.
DOC_LIST = [
    # regular documents carrying "group", "split_id" and "subgroup"
    # (except "bla bla bla bla", which has no "group" key)
    Document(content="Javascript is a popular language", meta={"group": "42", "split_id": 7, "subgroup": "subB"}),
    Document(content="A chromosome is a package of DNA", meta={"group": "314", "split_id": 2, "subgroup": "subC"}),
    Document(content="DNA carries genetic information", meta={"group": "314", "split_id": 1, "subgroup": "subE"}),
    Document(content="Blue whales have a big heart", meta={"group": "11", "split_id": 8, "subgroup": "subF"}),
    Document(content="Python is a popular language", meta={"group": "42", "split_id": 4, "subgroup": "subB"}),
    Document(content="bla bla bla bla", meta={"split_id": 8, "subgroup": "subG"}),
    Document(content="Java is a popular programming language", meta={"group": "42", "split_id": 3, "subgroup": "subB"}),
    Document(content="An octopus has three hearts", meta={"group": "11", "split_id": 2, "subgroup": "subD"}),
    # without split id (two of them also lack "subgroup")
    Document(content="without split id", meta={"group": "11"}),
    Document(content="without split id2", meta={"group": "22", "subgroup": "subI"}),
    Document(content="without split id3", meta={"group": "11"}),
    # with list values in the metadata; these have no "group"/"subgroup" keys, only "value_list"/"sub_value_list"
    Document(content="list values", meta={"value_list": ["11"], "split_id": 8, "sub_value_list": ["subF"]}),
    Document(content="list values2", meta={"value_list": ["12"], "split_id": 3, "sub_value_list": ["subX"]}),
    Document(content="list values3", meta={"value_list": ["12"], "split_id": 8, "sub_value_list": ["subX"]}),
]


class TestMetaFieldGroupingRanker:
    """Unit tests for the MetaFieldGroupingRanker component."""

    def test_init_default(self) -> None:
        """
        Test the default initialization of the MetaFieldGroupingRanker component.
        """
        sample_ranker = MetaFieldGroupingRanker(group_by="group", sort_docs_by=None)
        # The constructor stores its arguments as-is; optional keys default to None.
        assert sample_ranker.group_by == "group"
        assert sample_ranker.subgroup_by is None
        assert sample_ranker.sort_docs_by is None
        result = sample_ranker.run(documents=[])
        assert "documents" in result
        assert result["documents"] == []

    def test_run_group_by_only(self) -> None:
        """
        Test the MetaFieldGroupingRanker component with only the 'group_by' parameter. No subgroup or sorting is done.
        """
        sample_ranker = MetaFieldGroupingRanker(group_by="group")
        result = sample_ranker.run(documents=DOC_LIST)
        assert "documents" in result
        assert len(DOC_LIST) == len(result["documents"])
        docs = result["documents"]
        # Groups come out in order of first appearance ("42", "314", "11", "22"); input order is kept inside a group.
        assert docs[0].meta["split_id"] == 7 and docs[0].meta["group"] == "42"
        assert docs[1].meta["split_id"] == 4 and docs[1].meta["group"] == "42"
        assert docs[2].meta["split_id"] == 3 and docs[2].meta["group"] == "42"
        assert docs[3].meta["split_id"] == 2 and docs[3].meta["group"] == "314"
        assert docs[4].meta["split_id"] == 1 and docs[4].meta["group"] == "314"
        assert docs[5].meta["split_id"] == 8 and docs[5].meta["group"] == "11"
        assert docs[6].meta["split_id"] == 2 and docs[6].meta["group"] == "11"
        assert docs[7].content == "without split id" and docs[7].meta["group"] == "11"
        assert docs[8].content == "without split id3" and docs[8].meta["group"] == "11"
        assert docs[9].content == "without split id2" and docs[9].meta["group"] == "22"
        # Documents without the group key are placed after all groups.
        assert docs[10].content == "bla bla bla bla"

    def test_with_group_subgroup_and_sorting(self) -> None:
        """
        Test the MetaFieldGroupingRanker component with all parameters set, i.e.: grouping by 'group', subgrouping by
        'subgroup', and sorting by 'split_id'.
        """
        ranker = MetaFieldGroupingRanker(group_by="group", subgroup_by="subgroup", sort_docs_by="split_id")
        result = ranker.run(documents=DOC_LIST)

        assert "documents" in result
        assert len(DOC_LIST) == len(result["documents"])
        docs = result["documents"]
        # Group "42" has a single subgroup, so its documents are sorted by split_id.
        assert (
            docs[0].meta["subgroup"] == "subB" and docs[0].meta["group"] == "42" and docs[0].meta["split_id"] == 3
        )
        assert (
            docs[1].meta["subgroup"] == "subB" and docs[1].meta["group"] == "42" and docs[1].meta["split_id"] == 4
        )
        assert (
            docs[2].meta["subgroup"] == "subB" and docs[2].meta["group"] == "42" and docs[2].meta["split_id"] == 7
        )
        assert docs[3].meta["subgroup"] == "subC" and docs[3].meta["group"] == "314"
        assert docs[4].meta["subgroup"] == "subE" and docs[4].meta["group"] == "314"
        # Each assertion checks subgroup and group of the SAME document.
        assert docs[5].meta["subgroup"] == "subF" and docs[5].meta["group"] == "11"
        assert docs[6].meta["subgroup"] == "subD" and docs[6].meta["group"] == "11"
        assert docs[7].content == "without split id" and docs[7].meta["group"] == "11"
        assert docs[8].content == "without split id3" and docs[8].meta["group"] == "11"
        assert docs[9].content == "without split id2" and docs[9].meta["group"] == "22"
        assert docs[10].content == "bla bla bla bla"

    def test_run_with_lists(self) -> None:
        """
        Test if the MetaFieldGroupingRanker component can handle list values in the metadata.
        """
        # NOTE(review): "subvaluelist" does not match the "sub_value_list" key used in DOC_LIST, so no document has
        # this key and no subgrouping takes place here. Before switching to the real key, confirm that the ranker
        # accepts list-valued (unhashable) subgroup values.
        ranker = MetaFieldGroupingRanker(group_by="value_list", subgroup_by="subvaluelist", sort_docs_by="split_id")
        result = ranker.run(documents=DOC_LIST)
        assert "documents" in result
        assert len(DOC_LIST) == len(result["documents"])
        docs = result["documents"]
        assert docs[0].content == "list values" and docs[0].meta["value_list"] == ["11"]
        assert docs[1].content == "list values2" and docs[1].meta["value_list"] == ["12"]
        assert docs[2].content == "list values3" and docs[2].meta["value_list"] == ["12"]

    def test_run_empty_input(self) -> None:
        """
        Test the behavior of the MetaFieldGroupingRanker component with an empty list of documents.
        """
        sample_ranker = MetaFieldGroupingRanker(group_by="group")
        result = sample_ranker.run(documents=[])
        assert "documents" in result
        assert result["documents"] == []

    def test_run_missing_metadata_keys(self) -> None:
        """
        Test the behavior of the MetaFieldGroupingRanker component when some documents are missing the required
        metadata keys.
        """
        docs_with_missing_keys = [
            Document(content="Document without group", meta={"split_id": 1, "subgroup": "subA"}),
            Document(content="Document without subgroup", meta={"group": "42", "split_id": 2}),
            Document(content="Document with all keys", meta={"group": "42", "split_id": 3, "subgroup": "subB"}),
        ]
        sample_ranker = MetaFieldGroupingRanker(group_by="group", subgroup_by="subgroup", sort_docs_by="split_id")
        result = sample_ranker.run(documents=docs_with_missing_keys)
        assert "documents" in result
        assert len(result["documents"]) == 3
        assert result["documents"][0].meta["group"] == "42"
        assert result["documents"][1].meta["group"] == "42"
        assert result["documents"][2].content == "Document without group"

    def test_run_metadata_with_different_data_types(self) -> None:
        """
        Test the behavior of the MetaFieldGroupingRanker component when the metadata values have different data types.
        """
        docs_with_mixed_data_types = [
            Document(content="Document with string group", meta={"group": "42", "split_id": 1, "subgroup": "subA"}),
            Document(content="Document with number group", meta={"group": 42, "split_id": 2, "subgroup": "subB"}),
            Document(content="Document with boolean group", meta={"group": True, "split_id": 3, "subgroup": "subC"}),
        ]
        sample_ranker = MetaFieldGroupingRanker(group_by="group", subgroup_by="subgroup", sort_docs_by="split_id")
        result = sample_ranker.run(documents=docs_with_mixed_data_types)
        assert "documents" in result
        assert len(result["documents"]) == 3
        # The original metadata values are left untouched on the returned documents.
        assert result["documents"][0].meta["group"] == "42"
        assert result["documents"][1].meta["group"] == 42
        assert result["documents"][2].meta["group"] is True

    def test_run_duplicate_documents(self) -> None:
        """
        Test the behavior of the MetaFieldGroupingRanker component when the input contains duplicate documents.
        """
        docs_with_duplicates = [
            Document(content="Duplicate 1", meta={"group": "42", "split_id": 1, "subgroup": "subA"}),
            Document(content="Duplicate 1", meta={"group": "42", "split_id": 1, "subgroup": "subA"}),
            Document(content="Unique document", meta={"group": "42", "split_id": 2, "subgroup": "subB"}),
        ]
        sample_ranker = MetaFieldGroupingRanker(group_by="group", subgroup_by="subgroup", sort_docs_by="split_id")
        result = sample_ranker.run(documents=docs_with_duplicates)
        assert "documents" in result
        # Duplicates are kept, not de-duplicated.
        assert len(result["documents"]) == 3
        assert result["documents"][0].content == "Duplicate 1"
        assert result["documents"][1].content == "Duplicate 1"
        assert result["documents"][2].content == "Unique document"

    def test_run_in_pipeline_dumps_and_loads(self) -> None:
        """
        Test if the MetaFieldGroupingRanker component can be dumped to a YAML string and reloaded from it.
        """
        ranker = MetaFieldGroupingRanker(group_by="group", sort_docs_by="split_id")
        result_single = ranker.run(documents=DOC_LIST)
        pipeline = Pipeline()
        pipeline.add_component("ranker", ranker)
        pipeline_yaml_str = pipeline.dumps()
        pipeline_reloaded = Pipeline().loads(pipeline_yaml_str)
        result: Dict[str, Any] = pipeline_reloaded.run(data={"documents": DOC_LIST})
        result = result["ranker"]
        # The reloaded component must produce the same output as the original instance.
        assert result_single == result

0 comments on commit e5a8072

Please sign in to comment.