deepset-ai · davidsbatista · Aug 26, 2024 · Aug 12, 2024 · Aug 12, 2024 · Aug 12, 2024
@@ -2,6 +2,7 @@
 #
 # SPDX-License-Identifier: Apache-2.0
 
+from haystack_experimental.components.retrievers.auto_merging_retriever import AutoMergingRetriever
 from haystack_experimental.components.retrievers.chat_message_retriever import ChatMessageRetriever
 
-_all_ = ["ChatMessageRetriever"]
+__all__ = ["AutoMergingRetriever", "ChatMessageRetriever"]
@@ -0,0 +1,151 @@
+# SPDX-FileCopyrightText: 2022-present deepset GmbH <info@deepset.ai>
+#
+# SPDX-License-Identifier: Apache-2.0
+
+from collections import defaultdict
+from typing import Any, Dict, List
+
+from haystack import DeserializationError, Document, component, default_to_dict
+from haystack.core.serialization import default_from_dict, import_class_by_name
+from haystack.document_stores.types import DocumentStore
+
+
+@component
+class AutoMergingRetriever:
+    """
+    A retriever which returns parent documents of the matched leaf documents, based on a threshold setting.
+
+    The AutoMergingRetriever assumes you have a hierarchical tree structure of documents, where the leaf nodes
+    are indexed in a document store. See the HierarchicalDocumentSplitter for more information on how to create
+    such a structure. During retrieval, if the fraction of a parent's children found among the matched leaf
+    documents reaches the defined threshold, the retriever returns the parent document instead of the individual
+    leaf documents.
+
+    The rationale is, given that a paragraph is split into multiple chunks represented as leaf documents, and if for
+    a given query, multiple chunks are matched, the whole paragraph might be more informative than the individual
+    chunks alone.
+
+    ```python
+    from haystack import Document
+    from haystack.document_stores.in_memory import InMemoryDocumentStore
+    from haystack_experimental.components.retrievers.auto_merging_retriever import AutoMergingRetriever
+    from haystack_experimental.components.splitters import HierarchicalDocumentSplitter
+
+    text = "The sun rose early in the morning. It cast a warm glow over the trees. Birds began to sing."
+
+    docs = [Document(content=text)]
+    splitter = HierarchicalDocumentSplitter(block_sizes=[10, 3], split_overlap=0, split_by="word")
+    docs = splitter.run(docs)
+
+    # store level-1 parent documents and initialize the retriever
+    doc_store_parents = InMemoryDocumentStore()
+    for doc in docs["documents"]:
+        if doc.meta["__children_ids"] and doc.meta["__level"] == 1:
+            doc_store_parents.write_documents([doc])
+    retriever = AutoMergingRetriever(doc_store_parents, threshold=0.5)
+
+    # assume we retrieved 2 leaf docs from the same parent: the parent has 3 children, so the matched
+    # fraction is 2/3 (~0.67), which is >= threshold=0.5, and the parent document is returned
+    leaf_docs = [doc for doc in docs["documents"] if not doc.meta["__children_ids"]]
+    docs = retriever.run(leaf_docs[4:6])
+    >> {'documents': [Document(id=..., content: 'warm glow over the trees. Birds began to sing.',
+    >> meta: {'__block_size': 10, '__parent_id': '...', '__children_ids': ['...', '...', '...'],
+    >> '__level': 1, 'source_id': '...', 'page_number': 1, 'split_id': 1, 'split_idx_start': 45})]}
+    ```
+    """
+
+    def __init__(self, document_store: DocumentStore, threshold: float = 0.5):
+        """
+        Initialize the AutoMergingRetriever.
+
+        :param document_store: DocumentStore from which to retrieve the parent documents
+        :param threshold: Fraction (between 0 and 1) of a parent's children that must be matched for the parent
+            to be returned instead of the individual leaf documents
+        :raises ValueError: If the threshold is not between 0 and 1
+        """
+
+        # the chained comparison also rejects NaN, which would otherwise silently disable merging
+        if not 0 <= threshold <= 1:
+            raise ValueError("The threshold parameter must be between 0 and 1.")
+
+        self.document_store = document_store
+        self.threshold = threshold
+
+    def to_dict(self) -> Dict[str, Any]:
+        """
+        Serializes the component to a dictionary.
+
+        :returns:
+            Dictionary with serialized data.
+        """
+        docstore = self.document_store.to_dict()
+        return default_to_dict(self, document_store=docstore, threshold=self.threshold)
+
+    @classmethod
+    def from_dict(cls, data: Dict[str, Any]) -> "AutoMergingRetriever":
+        """
+        Deserializes the component from a dictionary.
+
+        Note that the serialized document store inside `data` is replaced in place by the deserialized instance.
+
+        :param data:
+            Dictionary with serialized data.
+        :returns:
+            An instance of the component.
+        :raises DeserializationError:
+            If the document store data or its type is missing, or if the document store class cannot be imported.
+        """
+        init_params = data.get("init_parameters", {})
+
+        if "document_store" not in init_params:
+            raise DeserializationError("Missing 'document_store' in serialization data")
+        if "type" not in init_params["document_store"]:
+            raise DeserializationError("Missing 'type' in document store's serialization data")
+
+        # deserialize the document store
+        doc_store_data = init_params["document_store"]
+        try:
+            doc_store_class = import_class_by_name(doc_store_data["type"])
+        except ImportError as e:
+            raise DeserializationError(f"Class '{doc_store_data['type']}' not correctly imported") from e
+
+        # prefer the store's own deserializer, falling back to the generic one
+        if hasattr(doc_store_class, "from_dict"):
+            init_params["document_store"] = doc_store_class.from_dict(doc_store_data)
+        else:
+            init_params["document_store"] = default_from_dict(doc_store_class, doc_store_data)
+
+        # deserialize the component
+        return default_from_dict(cls, data)
+
+    @component.output_types(documents=List[Document])
+    def run(self, matched_leaf_documents: List[Document]):
+        """
+        Run the AutoMergingRetriever.
+
+        Groups the matched leaf documents by their parent documents. For each parent, if the fraction of its
+        children present among the matched leaf documents is greater than or equal to the threshold, the parent
+        document is returned; otherwise the matched leaf documents of that parent are returned unchanged.
+
+        :param matched_leaf_documents: List of leaf documents that were matched by a retriever
+        :returns:
+            Dictionary with the key "documents" holding the list of parent and/or leaf documents.
+        :raises KeyError: If a matched document has no "__parent_id" meta field
+        :raises ValueError: If a parent document is not found in the document store or lists no children
+        """
+
+        # group the matched leaf documents by their parent documents
+        children_by_parent: Dict[str, List[Document]] = defaultdict(list)
+        for doc in matched_leaf_documents:
+            children_by_parent[doc.meta["__parent_id"]].append(doc)
+
+        docs_to_return: List[Document] = []
+        for parent_id, matched_children in children_by_parent.items():
+            parents = self.document_store.filter_documents({"field": "id", "operator": "==", "value": parent_id})
+            if not parents:
+                raise ValueError(f"Parent document with id '{parent_id}' not found in the document store.")
+            parent_doc = parents[0]
+
+            # the total number of children is needed to compute the matched fraction
+            children_ids = parent_doc.meta.get("__children_ids")
+            if not children_ids:
+                raise ValueError(f"Parent document with id '{parent_id}' has no children in '__children_ids'.")
+
+            # return either the parent document or the matched leaf documents based on the threshold value
+            score = len(matched_children) / len(children_ids)
+            if score >= self.threshold:
+                docs_to_return.append(parent_doc)
+            else:
+                # the group already holds exactly the matched leaves of this parent, in input order
+                docs_to_return.extend(matched_children)
+
+        return {"documents": docs_to_return}
@@ -0,0 +1,7 @@
+# SPDX-FileCopyrightText: 2022-present deepset GmbH <info@deepset.ai>
+#
+# SPDX-License-Identifier: Apache-2.0
+
+from haystack_experimental.components.splitters.hierarchical_doc_splitter import HierarchicalDocumentSplitter
+
+__all__ = ["HierarchicalDocumentSplitter"]
@@ -0,0 +1,141 @@
+# SPDX-FileCopyrightText: 2022-present deepset GmbH <info@deepset.ai>
+#
+# SPDX-License-Identifier: Apache-2.0
+
+from typing import Any, Dict, List, Literal
+
+from haystack import Document, component, default_from_dict, default_to_dict
+from haystack.components.preprocessors import DocumentSplitter
+
+
+@component
+class HierarchicalDocumentSplitter:
+    """
+    Splits a document into different block sizes building a hierarchical tree structure of blocks of different sizes.
+
+    The root node of the tree is the original document, the leaf nodes are the smallest blocks. The blocks in between
+    are connected such that the smaller blocks are children of the parent-larger blocks.
+
+    Every document in the tree carries the meta fields `__block_size`, `__parent_id`, `__children_ids` and
+    `__level`. Note that these fields are also written to the meta of the input documents, which become the
+    root nodes.
+
+    ## Usage example
+    ```python
+    from haystack import Document
+    from haystack_experimental.components.splitters import HierarchicalDocumentSplitter
+
+    doc = Document(content="This is a simple test document")
+    splitter = HierarchicalDocumentSplitter(block_sizes=[3, 2], split_overlap=0, split_by="word")
+    splitter.run([doc])
+    >> {'documents': [
+    >>   Document(id=..., content: 'This is a simple test document',
+    >>     meta: {'__block_size': 0, '__parent_id': None, '__children_ids': ['...', '...'], '__level': 0}),
+    >>   Document(id=..., content: 'This is a ',
+    >>     meta: {'__block_size': 3, '__parent_id': '...', '__children_ids': ['...', '...'], '__level': 1,
+    >>            'source_id': '...', 'page_number': 1, 'split_id': 0, 'split_idx_start': 0}),
+    >>   Document(id=..., content: 'simple test document',
+    >>     meta: {'__block_size': 3, '__parent_id': '...', '__children_ids': ['...', '...'], '__level': 1,
+    >>            'source_id': '...', 'page_number': 1, 'split_id': 1, 'split_idx_start': 10}),
+    >>   Document(id=..., content: 'This is ',
+    >>     meta: {'__block_size': 2, '__parent_id': '...', '__children_ids': [], '__level': 2,
+    >>            'source_id': '...', 'page_number': 1, 'split_id': 0, 'split_idx_start': 0}),
+    >>   Document(id=..., content: 'a ',
+    >>     meta: {'__block_size': 2, '__parent_id': '...', '__children_ids': [], '__level': 2,
+    >>            'source_id': '...', 'page_number': 1, 'split_id': 1, 'split_idx_start': 8}),
+    >>   Document(id=..., content: 'simple test ',
+    >>     meta: {'__block_size': 2, '__parent_id': '...', '__children_ids': [], '__level': 2,
+    >>            'source_id': '...', 'page_number': 1, 'split_id': 0, 'split_idx_start': 0}),
+    >>   Document(id=..., content: 'document',
+    >>     meta: {'__block_size': 2, '__parent_id': '...', '__children_ids': [], '__level': 2,
+    >>            'source_id': '...', 'page_number': 1, 'split_id': 1, 'split_idx_start': 12})]}
+    ```
+    """
+
+    def __init__(
+        self,
+        block_sizes: List[int],
+        split_overlap: int = 0,
+        split_by: Literal["word", "sentence", "page", "passage"] = "word",
+    ):
+        """
+        Initialize HierarchicalDocumentSplitter.
+
+        :param block_sizes: List of block sizes to split the document into. The blocks are split in descending order.
+        :param split_overlap: The number of overlapping units for each split.
+        :param split_by: The unit for splitting your documents.
+        :raises ValueError: If `block_sizes` contains duplicates.
+        """
+
+        if len(set(block_sizes)) != len(block_sizes):
+            raise ValueError("block_sizes must not contain duplicates")
+        # largest blocks first, so that each level of the tree is split into smaller blocks than the previous one
+        self.block_sizes = sorted(block_sizes, reverse=True)
+        self.split_overlap = split_overlap
+        self.split_by = split_by
+
+    @component.output_types(documents=List[Document])
+    def run(self, documents: List[Document]):
+        """
+        Builds a hierarchical document structure for each document in a list of documents.
+
+        :param documents: List of Documents to split into hierarchical blocks.
+        :returns:
+            Dictionary with the key "documents" holding, for every input document, the root document followed
+            by all the blocks derived from it.
+        """
+        hierarchical_docs: List[Document] = []
+        for doc in documents:
+            hierarchical_docs.extend(self.build_hierarchy_from_doc(doc))
+        return {"documents": hierarchical_docs}
+
+    def _split_doc(self, doc: Document, block_size: int) -> List[Document]:
+        # delegate the actual splitting of a single document to DocumentSplitter
+        splitter = DocumentSplitter(split_length=block_size, split_overlap=self.split_overlap, split_by=self.split_by)
+        split_docs = splitter.run([doc])
+        return split_docs["documents"]
+
+    @staticmethod
+    def _add_meta_data(document: Document):
+        # (re)set the hierarchy meta fields to root-node defaults; modifies the document in place
+        document.meta["__block_size"] = 0
+        document.meta["__parent_id"] = None
+        document.meta["__children_ids"] = []
+        document.meta["__level"] = 0
+        return document
+
+    def build_hierarchy_from_doc(self, document: Document) -> List[Document]:
+        """
+        Build a hierarchical tree document structure from a single document.
+
+        Given a document, this function splits the document into hierarchical blocks of different sizes, linked
+        to each other through the `__parent_id` and `__children_ids` meta fields. The hierarchy meta fields are
+        written to the given document in place, which becomes the root of the tree.
+
+        :param document: Document to split into hierarchical blocks.
+        :returns:
+            List of Documents: the root document followed by all the blocks, level by level.
+        """
+
+        root = self._add_meta_data(document)
+        current_level_nodes = [root]
+        all_docs = []
+
+        for block_size in self.block_sizes:
+            next_level_nodes = []
+            for parent in current_level_nodes:
+                child_docs = self._split_doc(parent, block_size)
+                # a single split means the node already fits in this block size: do not add a level,
+                # but keep the node as a candidate for splitting with the next (smaller) block size
+                if len(child_docs) == 1:
+                    next_level_nodes.append(parent)
+                    continue
+                for child_doc in child_docs:
+                    # reset the hierarchy meta fields before linking the child to its parent
+                    child_doc = self._add_meta_data(child_doc)
+                    child_doc.meta["__level"] = parent.meta["__level"] + 1
+                    child_doc.meta["__block_size"] = block_size
+                    child_doc.meta["__parent_id"] = parent.id
+                    all_docs.append(child_doc)
+                    parent.meta["__children_ids"].append(child_doc.id)
+                    next_level_nodes.append(child_doc)
+            current_level_nodes = next_level_nodes
+
+        return [root] + all_docs
+
+    def to_dict(self) -> Dict[str, Any]:
+        """
+        Returns a dictionary representation of the component.
+
+        :returns:
+                Serialized dictionary representation of the component.
+        """
+        return default_to_dict(
+            self, block_sizes=self.block_sizes, split_overlap=self.split_overlap, split_by=self.split_by
+        )
+
+    @classmethod
+    def from_dict(cls, data: Dict[str, Any]) -> "HierarchicalDocumentSplitter":
+        """
+        Deserialize this component from a dictionary.
+
+        :param data:
+            The dictionary to deserialize and create the component.
+
+        :returns:
+            The deserialized component.
+        """
+        return default_from_dict(cls, data)
@@ -0,0 +1,82 @@
+import pytest
+
+from haystack import Document
+from haystack_experimental.components.splitters import HierarchicalDocumentSplitter
+from haystack_experimental.components.retrievers.auto_merging_retriever import AutoMergingRetriever
+from haystack.document_stores.in_memory import InMemoryDocumentStore
+
+
+class TestAutoMergingRetriever:
+    def test_init_default(self):
+        retriever = AutoMergingRetriever(InMemoryDocumentStore())
+        assert retriever.threshold == 0.5
+
+    def test_init_with_parameters(self):
+        retriever = AutoMergingRetriever(InMemoryDocumentStore(), threshold=0.7)
+        assert retriever.threshold == 0.7
+
+    def test_init_with_invalid_threshold(self):
+        # values on both sides of the valid [0, 1] range must be rejected
+        with pytest.raises(ValueError):
+            AutoMergingRetriever(InMemoryDocumentStore(), threshold=-2)
+        with pytest.raises(ValueError):
+            AutoMergingRetriever(InMemoryDocumentStore(), threshold=1.5)
+
+    def test_to_dict(self):
+        retriever = AutoMergingRetriever(InMemoryDocumentStore(), threshold=0.7)
+        data = retriever.to_dict()
+        assert data['type'] == 'haystack_experimental.components.retrievers.auto_merging_retriever.AutoMergingRetriever'
+        assert data['init_parameters']['threshold'] == 0.7
+        assert data['init_parameters']['document_store']['type'] == 'haystack.document_stores.in_memory.document_store.InMemoryDocumentStore'
+
+    def test_from_dict(self):
+        data = {
+            'type': 'haystack_experimental.components.retrievers.auto_merging_retriever.AutoMergingRetriever',
+            'init_parameters': {
+                'document_store': {
+                    'type': 'haystack.document_stores.in_memory.document_store.InMemoryDocumentStore',
+                    'init_parameters': {
+                        'bm25_tokenization_regex': '(?u)\\b\\w\\w+\\b',
+                        'bm25_algorithm': 'BM25L',
+                        'bm25_parameters': {},
+                        'embedding_similarity_function': 'dot_product',
+                        'index': '6b122bb4-211b-465e-804d-77c5857bf4c5'}},
+                'threshold': 0.7}}
+        retriever = AutoMergingRetriever.from_dict(data)
+        assert retriever.threshold == 0.7
+        assert isinstance(retriever.document_store, InMemoryDocumentStore)
+
+    def test_run_return_parent_document(self):
+        text = "The sun rose early in the morning. It cast a warm glow over the trees. Birds began to sing."
+
+        docs = [Document(content=text)]
+        builder = HierarchicalDocumentSplitter(block_sizes=[10, 3], split_overlap=0, split_by="word")
+        docs = builder.run(docs)
+
+        # store level-1 parent documents and initialize the retriever
+        doc_store_parents = InMemoryDocumentStore()
+        for doc in docs["documents"]:
+            if doc.meta["__children_ids"] and doc.meta["__level"] == 1:
+                doc_store_parents.write_documents([doc])
+        retriever = AutoMergingRetriever(doc_store_parents, threshold=0.5)
+
+        # assume we retrieved 2 leaf docs from the same parent: the parent has 3 children, so the matched
+        # fraction is 2/3 (~0.67), which is >= threshold=0.5, and the parent document should be returned
+        leaf_docs = [doc for doc in docs["documents"] if not doc.meta["__children_ids"]]
+        docs = retriever.run(leaf_docs[4:6])
+        assert len(docs["documents"]) == 1
+        assert docs["documents"][0].content == "warm glow over the trees. Birds began to sing."
+        assert len(docs["documents"][0].meta["__children_ids"]) == 3
+
+    def test_run_return_leafs_document(self):
+        docs = [Document(content="The monarch of the wild blue yonder rises from the eastern side of the horizon.")]
+        builder = HierarchicalDocumentSplitter(block_sizes=[10, 5, 2], split_overlap=0, split_by="word")
+        docs = builder.run(docs)
+
+        doc_store_parents = InMemoryDocumentStore()
+        for doc in docs["documents"]:
+            if doc.meta["__children_ids"]:
+                doc_store_parents.write_documents([doc])
+
+        leaf_docs = [doc for doc in docs["documents"] if not doc.meta["__children_ids"]]
+        retriever = AutoMergingRetriever(doc_store_parents, threshold=0.5)
+
+        # a single matched leaf out of its parent's 3 children (1/3 < 0.5) must be returned as-is
+        result = retriever.run(leaf_docs[3:4])
+        assert len(result["documents"]) == 1
+        assert result["documents"][0].id == leaf_docs[3].id
+        assert result["documents"][0].meta["__children_ids"] == []
+
+    def test_run_return_leafs_document_different_parents(self):
+        docs = [Document(content="The monarch of the wild blue yonder rises from the eastern side of the horizon.")]
+        builder = HierarchicalDocumentSplitter(block_sizes=[10, 5, 2], split_overlap=0, split_by="word")
+        docs = builder.run(docs)
+
+        doc_store_parents = InMemoryDocumentStore()
+        for doc in docs["documents"]:
+            if doc.meta["__children_ids"]:
+                doc_store_parents.write_documents([doc])
+
+        # pick exactly one leaf below each distinct parent, so no parent reaches the threshold
+        leaf_docs = [doc for doc in docs["documents"] if not doc.meta["__children_ids"]]
+        first_leaf_per_parent = {}
+        for doc in leaf_docs:
+            first_leaf_per_parent.setdefault(doc.meta["__parent_id"], doc)
+        matched_leaves = list(first_leaf_per_parent.values())
+        assert len(matched_leaves) > 1
+
+        retriever = AutoMergingRetriever(doc_store_parents, threshold=0.5)
+        result = retriever.run(matched_leaves)
+
+        # every leaf is returned unchanged, none is replaced by its parent
+        assert [doc.id for doc in result["documents"]] == [doc.id for doc in matched_leaves]
+        assert all(not doc.meta["__children_ids"] for doc in result["documents"])