feat: adding a hierarchical document builder and an auto-merge retriever #56

Merged 54 commits on Aug 26, 2024
Changes from 22 commits
Commits
1f45d17
initial import
davidsbatista Aug 12, 2024
a0cc8f4
Merge branch 'main' into auto-merging-retriever
davidsbatista Aug 12, 2024
19ea638
adding LICENSE header
davidsbatista Aug 12, 2024
a2fc459
fixing tests for experimental package
davidsbatista Aug 12, 2024
c290ac7
fixing tests for experimental package
davidsbatista Aug 12, 2024
1d2a1f6
fixing tests for experimental package
davidsbatista Aug 12, 2024
48e51b9
linting issues
davidsbatista Aug 12, 2024
97ac58f
attending PR comments
davidsbatista Aug 13, 2024
63e958d
adding serialization and deserialization
davidsbatista Aug 13, 2024
7d8a9c3
adding serialization and deserialization
davidsbatista Aug 14, 2024
00cadc7
fixing tests
davidsbatista Aug 14, 2024
640ede9
Update haystack_experimental/components/retrievers/auto_merging_retri…
davidsbatista Aug 16, 2024
67194ad
Update test/components/splitters/test_hierarchical_doc_splitter.py
davidsbatista Aug 16, 2024
a4c1150
attending PR comments
davidsbatista Aug 16, 2024
ef2c831
attending PR comments
davidsbatista Aug 16, 2024
1401771
fixing tests
davidsbatista Aug 16, 2024
8704982
improving test coverage
davidsbatista Aug 16, 2024
0c09ee2
improving test coverage
davidsbatista Aug 16, 2024
e2eac5f
warning for unsupported doc stores
davidsbatista Aug 16, 2024
9a81879
fix for unsupported doc stores
davidsbatista Aug 16, 2024
cb2f3da
adding dependencies for tests
davidsbatista Aug 16, 2024
d3a3da5
trying another approach: adding dependencies for tests
davidsbatista Aug 16, 2024
a8ed241
missed using fixture
davidsbatista Aug 16, 2024
dc30211
moving all inside the test
davidsbatista Aug 16, 2024
73e9c56
fixing unsupported DBs test
davidsbatista Aug 16, 2024
aeafaa6
Merge branch 'main' into auto-merging-retriever
davidsbatista Aug 19, 2024
31a621c
updating tests
davidsbatista Aug 19, 2024
55c7fa4
removing unused imports
davidsbatista Aug 19, 2024
58f5bd4
Merge branch 'main' into auto-merging-retriever
davidsbatista Aug 19, 2024
da2fcd6
remove import patching
davidsbatista Aug 19, 2024
f1eaed4
nit
davidsbatista Aug 19, 2024
dd52276
Update haystack_experimental/components/retrievers/auto_merging_retri…
davidsbatista Aug 20, 2024
8c705d7
Update haystack_experimental/components/splitters/hierarchical_doc_sp…
davidsbatista Aug 20, 2024
a69afe5
docstrings
davidsbatista Aug 20, 2024
d2a13f3
docstrings
davidsbatista Aug 20, 2024
e654c85
fixing tests
davidsbatista Aug 20, 2024
d1fb780
docstrings
davidsbatista Aug 20, 2024
9bb9b4f
improving tests
davidsbatista Aug 21, 2024
7747f79
adding checks/safeguards
davidsbatista Aug 21, 2024
5cce342
adding checks/safeguards for not supported DocumentStores
davidsbatista Aug 21, 2024
2343d7c
cleaning
davidsbatista Aug 21, 2024
2ff06ba
removing old checks
davidsbatista Aug 21, 2024
23783b2
fixing docstrings
davidsbatista Aug 21, 2024
69683c8
fixing linting and tests
davidsbatista Aug 21, 2024
8ec18d2
cleaning tests
davidsbatista Aug 21, 2024
58ca101
cleaning
davidsbatista Aug 22, 2024
0dcb841
adding pydoc
davidsbatista Aug 22, 2024
eb006b4
fix pydocs
davidsbatista Aug 22, 2024
6b7a2a5
fix pydocs
davidsbatista Aug 22, 2024
44fc74c
Merge branch 'main' into auto-merging-retriever
davidsbatista Aug 23, 2024
89d3fc5
DocumentSplitter part of the object, created only once
davidsbatista Aug 23, 2024
1485b73
Merge branch 'main' into auto-merging-retriever
davidsbatista Aug 23, 2024
4505a33
Merge branch 'main' into auto-merging-retriever
davidsbatista Aug 23, 2024
1c91cb5
adding serdes to the retriever
davidsbatista Aug 26, 2024
32 changes: 19 additions & 13 deletions README.md
Original file line number Diff line number Diff line change
@@ -36,19 +36,25 @@ that includes it. Once it reaches the end of its lifespan, the experiment will b

The latest version of the package contains the following experiments:

  | Name                        | Type                       | Expected experiment end date | Dependencies |
- |-----------------------------|----------------------------|------------------------------| ------------ |
- | [`Auto-Merge Retriever`][1] | Retrieval Technique | November 2024 | None |
- | [`EvaluationHarness`][2] | Evaluation orchestrator | October 2024 | None |
- | [`OpenAIFunctionCaller`][3] | Function Calling Component | October 2024 | None |
- | [`OpenAPITool`][4] | OpenAPITool component | October 2024 | jsonref |
- | [`Tool`][5] | Tool dataclass | November 2024 | jsonschema |
-
- [1]: https://github.com/deepset-ai/haystack-experimental/tree/main/haystack_experimental/components/retrievers/auto_merge_retriever.py
- [2]: https://github.com/deepset-ai/haystack-experimental/tree/main/haystack_experimental/evaluation/harness
- [3]: https://github.com/deepset-ai/haystack-experimental/tree/main/haystack_experimental/components/tools/openai
- [4]: https://github.com/deepset-ai/haystack-experimental/tree/main/haystack_experimental/components/tools/openapi
- [5]: https://github.com/deepset-ai/haystack-experimental/tree/main/haystack_experimental/dataclasses/tool.py
+ | --------------------------- | -------------------------- | ---------------------------- | ------------ |
+ | [`EvaluationHarness`][1] | Evaluation orchestrator | October 2024 | None |
+ | [`OpenAIFunctionCaller`][2] | Function Calling Component | October 2024 | None |
+ | [`OpenAPITool`][3] | OpenAPITool component | October 2024 | jsonref |
+ | [`Tool`][4] | Tool dataclass | November 2024 | jsonschema |
+ | [`ChatMessageWriter`][5] | Memory Component | November 2024 | None |
+ | [`ChatMessageRetriever`][6] | Memory Component | November 2024 | None |
+ | [`InMemoryChatMessageStore`][7] | Memory Store | November 2024 | None |
+ | [`Auto-Merge Retriever`][8] | Retrieval Technique | November 2024 | None |
+
+ [1]: https://github.com/deepset-ai/haystack-experimental/tree/main/haystack_experimental/evaluation/harness
+ [2]: https://github.com/deepset-ai/haystack-experimental/tree/main/haystack_experimental/components/tools/openai
+ [3]: https://github.com/deepset-ai/haystack-experimental/tree/main/haystack_experimental/components/tools/openapi
+ [4]: https://github.com/deepset-ai/haystack-experimental/tree/main/haystack_experimental/dataclasses/tool.py
+ [5]: https://github.com/deepset-ai/haystack-experimental/blob/main/haystack_experimental/components/writers/chat_message_writer.py
+ [6]: https://github.com/deepset-ai/haystack-experimental/blob/main/haystack_experimental/components/retrievers/chat_message_retriever.py
+ [7]: https://github.com/deepset-ai/haystack-experimental/blob/main/haystack_experimental/chat_message_stores/in_memory.py
+ [8]: https://github.com/deepset-ai/haystack-experimental/tree/main/haystack_experimental/components/retrievers/auto_merge_retriever.py


## Usage

27 changes: 27 additions & 0 deletions docs/pydoc/config/auto_merging_retriever.yml
@@ -0,0 +1,27 @@
loaders:
  - type: haystack_pydoc_tools.loaders.CustomPythonLoader
    search_path: [../../../]
    modules: ["haystack_experimental.components.retrievers.auto_merging_retriever"]
    ignore_when_discovered: ["__init__"]
processors:
  - type: filter
    expression:
    documented_only: true
    do_not_filter_modules: false
    skip_empty_modules: true
  - type: smart
  - type: crossref
renderer:
  type: haystack_pydoc_tools.renderers.ReadmeCoreRenderer
  excerpt: Auto Merging Retriever for Haystack.
  category_slug: experiments-api
  title: Auto Merge Retriever
  slug: auto-merge-retriever
  order: 10
  markdown:
    descriptive_class_title: false
    classdef_code_block: false
    descriptive_module_title: true
    add_method_class_prefix: true
    add_member_class_prefix: false
  filename: auto_merging_retriever.md
27 changes: 27 additions & 0 deletions docs/pydoc/config/hierarchical_document_splitter.yml
@@ -0,0 +1,27 @@
loaders:
  - type: haystack_pydoc_tools.loaders.CustomPythonLoader
    search_path: [../../../]
    modules: ["haystack_experimental.components.splitters.hierarchical_doc_splitter"]
    ignore_when_discovered: ["__init__"]
processors:
  - type: filter
    expression:
    documented_only: true
    do_not_filter_modules: false
    skip_empty_modules: true
  - type: smart
  - type: crossref
renderer:
  type: haystack_pydoc_tools.renderers.ReadmeCoreRenderer
  excerpt: Hierarchical Document Splitter for Haystack.
  category_slug: experiments-api
  title: Split documents into hierarchical chunks.
  slug: hierarchical-document-splitter
  order: 70
  markdown:
    descriptive_class_title: false
    classdef_code_block: false
    descriptive_module_title: true
    add_method_class_prefix: true
    add_member_class_prefix: false
  filename: hierarchical_document_splitter.md
haystack_experimental/components/retrievers/auto_merging_retriever.py
@@ -10,8 +10,6 @@
from haystack.document_stores.types import DocumentStore
from haystack.utils import deserialize_document_store_in_init_parameters

- UNSUPPORTED_DOCUMENT_STORES = ("ChromaDocumentStore", "PineconeDocumentStore")


@component
class AutoMergingRetriever:
@@ -28,6 +26,12 @@ class AutoMergingRetriever:
a given query, multiple chunks are matched, the whole paragraph might be more informative than the individual
chunks alone.

Currently, the AutoMergingRetriever can only be used with the following DocumentStores:
- [ElasticSearch](https://haystack.deepset.ai/docs/latest/documentstore/elasticsearch)
- [OpenSearch](https://haystack.deepset.ai/docs/latest/documentstore/opensearch)
- [PGVector](https://haystack.deepset.ai/docs/latest/documentstore/pgvector)
- [Qdrant](https://haystack.deepset.ai/docs/latest/documentstore/qdrant)

```python
from haystack import Document
from haystack_experimental.components.splitters import HierarchicalDocumentSplitter
@@ -51,11 +55,9 @@ class AutoMergingRetriever:
# since it has 3 children and the threshold=0.5, and we retrieved 2 children (2/3 > 0.5)
leaf_docs = [doc for doc in docs["documents"] if not doc.meta["children_ids"]]
docs = retriever.run(leaf_docs[4:6])
- >> {'documents': [Document(id=5384f4d58e13beb40ce80ab324a1da24f70ed69c2ec4c4f2a6f64abbc846a794,
+ >> {'documents': [Document(id=538..),
  >> content: 'warm glow over the trees. Birds began to sing.',
- >> meta: {'block_size': 10, 'parent_id': '835b610ae31936739a47ce504674d3e86756688728b8c2b83f83484f3e1e4697',
- >> 'children_ids': ['c17e28e4b4577f892aba181a3aaa2880ef7531c8fbc5d267bda709198b3fec0b', '3ffd48a3a273ed72c83240d3f74e40cdebfb5dbc706b198d3be86ce45086593d', '3520de2d4a0c107bce7c84c181663b93b13e1a0cc0e4ea1bcafd0f9b5761ef42'],
- >> 'level': 1, 'source_id': '835b610ae31936739a47ce504674d3e86756688728b8c2b83f83484f3e1e4697',
+ >> meta: {'block_size': 10, 'parent_id': '835..', 'children_ids': ['c17...', '3ff...', '352...'], 'level': 1, 'source_id': '835...',
  >> 'page_number': 1, 'split_id': 1, 'split_idx_start': 45})]}
```
""" # noqa: E501
@@ -68,14 +70,9 @@ def __init__(self, document_store: DocumentStore, threshold: float = 0.5):
:param threshold: Threshold to decide whether to return the parent document instead of the individual matched leaf documents
"""

- if threshold > 1 or threshold < 0:
+ if not 0 < threshold < 1:
raise ValueError("The threshold parameter must be between 0 and 1.")

- if document_store.__class__.__name__ in UNSUPPORTED_DOCUMENT_STORES:
-     msg = (f"The document store type {document_store.__class__.__name__} "
-            f"is not supported by the AutoMergingRetriever.")
-     raise ValueError(msg)

self.document_store = document_store
self.threshold = threshold

@@ -102,6 +99,18 @@ def from_dict(cls, data: Dict[str, Any]) -> "AutoMergingRetriever":
data = deserialize_document_store_in_init_parameters(data)
return default_from_dict(cls, data)

@staticmethod
def _check_valid_documents(matched_leaf_documents: List[Document]):
# check if the matched leaf documents have the required meta fields
if not all(doc.meta.get("__parent_id") for doc in matched_leaf_documents):
raise ValueError("The matched leaf documents do not have the required meta field '__parent_id'")

if not all(doc.meta.get("__level") for doc in matched_leaf_documents):
raise ValueError("The matched leaf documents do not have the required meta field '__level'")

if not all(doc.meta.get("__block_size") for doc in matched_leaf_documents):
raise ValueError("The matched leaf documents do not have the required meta field '__block_size'")

@component.output_types(documents=List[Document])
def run(self, matched_leaf_documents: List[Document]):
"""
@@ -112,6 +121,8 @@ def run(self, matched_leaf_documents: List[Document]):
matched leaf documents.

:param matched_leaf_documents: List of leaf documents that were matched by a retriever
:returns:
List of parent documents or matched leaf documents based on the threshold value
"""

docs_to_return = []
@@ -124,6 +135,12 @@ def run(self, matched_leaf_documents: List[Document]):
# find total number of children for each parent document
for doc_id, retrieved_child_docs in parent_documents.items():
parent_doc = self.document_store.filter_documents({"field": "id", "operator": "==", "value": doc_id})
if len(parent_doc) == 0:
raise ValueError(f"Parent document with id {doc_id} not found in the document store.")
if len(parent_doc) > 1:
raise ValueError(f"Multiple parent documents found with id {doc_id} in the document store.")
if not parent_doc[0].meta.get("__children_ids"):
raise ValueError(f"Parent document with id {doc_id} does not have any children.")
parent_children_count = len(parent_doc[0].meta["__children_ids"])

# return either the parent document or the matched leaf documents based on the threshold value
@@ -133,7 +150,7 @@ def run(self, matched_leaf_documents: List[Document]):
docs_to_return.append(parent_doc[0])
else:
# return all the matched leaf documents which are child of this parent document
- leafs_ids = [doc.id for doc in retrieved_child_docs]
+ leafs_ids = {doc.id for doc in retrieved_child_docs}
docs_to_return.extend([doc for doc in matched_leaf_documents if doc.id in leafs_ids])

return {"documents": docs_to_return}
haystack_experimental/components/splitters/hierarchical_doc_splitter.py
@@ -2,7 +2,7 @@
#
# SPDX-License-Identifier: Apache-2.0

- from typing import Any, Dict, List, Literal
+ from typing import Any, Dict, List, Literal, Set

from haystack import Document, component, default_from_dict, default_to_dict
from haystack.components.preprocessors import DocumentSplitter
@@ -24,53 +24,54 @@ class HierarchicalDocumentSplitter:
doc = Document(content="This is a simple test document")
builder = HierarchicalDocumentSplitter(block_sizes={3, 2}, split_overlap=0, split_by="word")
builder.run([doc])
- >> {'documents': [Document(id=3f7e91e9a775ed0815606a0bc2f732b38b7682a84d5b23c06997a8dcfa849a0d, content: 'This is a simple test document', meta: {'block_size': 0, 'parent_id': None, 'children_ids': ['5ff4a36b5371580ac5f2ea21a5690dcc18802c7e5e187d57c5e2d312eee22dfd', '8dc5707ebe647dedab97db15d0fc82a5e551bbe54a9fb82f79b68b3e3046a3a2'], 'level': 0}),
- >> Document(id=5ff4a36b5371580ac5f2ea21a5690dcc18802c7e5e187d57c5e2d312eee22dfd, content: 'This is a ', meta: {'block_size': 3, 'parent_id': '3f7e91e9a775ed0815606a0bc2f732b38b7682a84d5b23c06997a8dcfa849a0d', 'children_ids': ['f196b211ebadd5f47afedff14284759b4654f0722c38976760b88d675e7dc8f6', '52c7e9fc53ae9aa734cc15d8624ae63468b423a7032077ee4cdcf524569274d3'], 'level': 1, 'source_id': '3f7e91e9a775ed0815606a0bc2f732b38b7682a84d5b23c06997a8dcfa849a0d', 'page_number': 1, 'split_id': 0, 'split_idx_start': 0}),
- >> Document(id=8dc5707ebe647dedab97db15d0fc82a5e551bbe54a9fb82f79b68b3e3046a3a2, content: 'simple test document', meta: {'block_size': 3, 'parent_id': '3f7e91e9a775ed0815606a0bc2f732b38b7682a84d5b23c06997a8dcfa849a0d', 'children_ids': ['39d299629a35051fc9ebb62a0594626546d6ec1b1de7cfcdfb03be58b769478e', 'e23ceb261772f539830384097e4f6c513205019cf422378078ff66dd4870f91a'], 'level': 1, 'source_id': '3f7e91e9a775ed0815606a0bc2f732b38b7682a84d5b23c06997a8dcfa849a0d', 'page_number': 1, 'split_id': 1, 'split_idx_start': 10}),
- >> Document(id=f196b211ebadd5f47afedff14284759b4654f0722c38976760b88d675e7dc8f6, content: 'This is ', meta: {'block_size': 2, 'parent_id': '5ff4a36b5371580ac5f2ea21a5690dcc18802c7e5e187d57c5e2d312eee22dfd', 'children_ids': [], 'level': 2, 'source_id': '5ff4a36b5371580ac5f2ea21a5690dcc18802c7e5e187d57c5e2d312eee22dfd', 'page_number': 1, 'split_id': 0, 'split_idx_start': 0}),
- >> Document(id=52c7e9fc53ae9aa734cc15d8624ae63468b423a7032077ee4cdcf524569274d3, content: 'a ', meta: {'block_size': 2, 'parent_id': '5ff4a36b5371580ac5f2ea21a5690dcc18802c7e5e187d57c5e2d312eee22dfd', 'children_ids': [], 'level': 2, 'source_id': '5ff4a36b5371580ac5f2ea21a5690dcc18802c7e5e187d57c5e2d312eee22dfd', 'page_number': 1, 'split_id': 1, 'split_idx_start': 8}),
- >> Document(id=39d299629a35051fc9ebb62a0594626546d6ec1b1de7cfcdfb03be58b769478e, content: 'simple test ', meta: {'block_size': 2, 'parent_id': '8dc5707ebe647dedab97db15d0fc82a5e551bbe54a9fb82f79b68b3e3046a3a2', 'children_ids': [], 'level': 2, 'source_id': '8dc5707ebe647dedab97db15d0fc82a5e551bbe54a9fb82f79b68b3e3046a3a2', 'page_number': 1, 'split_id': 0, 'split_idx_start': 0}),
- >> Document(id=e23ceb261772f539830384097e4f6c513205019cf422378078ff66dd4870f91a, content: 'document', meta: {'block_size': 2, 'parent_id': '8dc5707ebe647dedab97db15d0fc82a5e551bbe54a9fb82f79b68b3e3046a3a2', 'children_ids': [], 'level': 2, 'source_id': '8dc5707ebe647dedab97db15d0fc82a5e551bbe54a9fb82f79b68b3e3046a3a2', 'page_number': 1, 'split_id': 1, 'split_idx_start': 12})]}
+ >> {'documents': [Document(id=3f7..., content: 'This is a simple test document', meta: {'block_size': 0, 'parent_id': None, 'children_ids': ['5ff..', '8dc..'], 'level': 0}),
+ >> Document(id=5ff.., content: 'This is a ', meta: {'block_size': 3, 'parent_id': '3f7..', 'children_ids': ['f19..', '52c..'], 'level': 1, 'source_id': '3f7..', 'page_number': 1, 'split_id': 0, 'split_idx_start': 0}),
+ >> Document(id=8dc.., content: 'simple test document', meta: {'block_size': 3, 'parent_id': '3f7..', 'children_ids': ['39d..', 'e23..'], 'level': 1, 'source_id': '3f7..', 'page_number': 1, 'split_id': 1, 'split_idx_start': 10}),
+ >> Document(id=f19.., content: 'This is ', meta: {'block_size': 2, 'parent_id': '5ff..', 'children_ids': [], 'level': 2, 'source_id': '5ff..', 'page_number': 1, 'split_id': 0, 'split_idx_start': 0}),
+ >> Document(id=52c.., content: 'a ', meta: {'block_size': 2, 'parent_id': '5ff..', 'children_ids': [], 'level': 2, 'source_id': '5ff..', 'page_number': 1, 'split_id': 1, 'split_idx_start': 8}),
+ >> Document(id=39d.., content: 'simple test ', meta: {'block_size': 2, 'parent_id': '8dc..', 'children_ids': [], 'level': 2, 'source_id': '8dc..', 'page_number': 1, 'split_id': 0, 'split_idx_start': 0}),
+ >> Document(id=e23.., content: 'document', meta: {'block_size': 2, 'parent_id': '8dc..', 'children_ids': [], 'level': 2, 'source_id': '8dc..', 'page_number': 1, 'split_id': 1, 'split_idx_start': 12})]}
```
""" # noqa: E501

def __init__(
self,
- block_sizes: List[int],
+ block_sizes: Set[int],
split_overlap: int = 0,
split_by: Literal["word", "sentence", "page", "passage"] = "word",
):
"""
Initialize HierarchicalDocumentSplitter.

- :param block_sizes: List of block sizes to split the document into. The blocks are split in descending order.
+ :param block_sizes: Set of block sizes to split the document into. The blocks are split in descending order.
:param split_overlap: The number of overlapping units for each split.
:param split_by: The unit for splitting your documents.
"""

- if len(set(block_sizes)) != len(block_sizes):
-     raise ValueError("block_sizes must not contain duplicates")
self.block_sizes = sorted(set(block_sizes), reverse=True)
self.splitters: Dict[int, DocumentSplitter] = {}
self.split_overlap = split_overlap
self.split_by = split_by
self._build_block_sizes()

@component.output_types(documents=List[Document])
def run(self, documents: List[Document]):
"""
Builds a hierarchical document structure for each document in a list of documents.

:param documents: List of Documents to split into hierarchical blocks.
- :return: List of HierarchicalDocument
+ :returns: List of HierarchicalDocument
"""
hierarchical_docs = []
for doc in documents:
hierarchical_docs.extend(self.build_hierarchy_from_doc(doc))
return {"documents": hierarchical_docs}

- def _split_doc(self, doc: Document, block_size: int) -> List[Document]:
-     splitter = DocumentSplitter(split_length=block_size, split_overlap=self.split_overlap, split_by=self.split_by)
-     split_docs = splitter.run([doc])
-     return split_docs["documents"]
+ def _build_block_sizes(self):
+     for block_size in self.block_sizes:
+         self.splitters[block_size] = DocumentSplitter(
+             split_length=block_size, split_overlap=self.split_overlap, split_by=self.split_by
+         )

@staticmethod
def _add_meta_data(document: Document):
@@ -88,7 +89,7 @@ def build_hierarchy_from_doc(self, document: Document) -> List[Document]:
as HierarchicalDocument objects.

:param document: Document to split into hierarchical blocks.
- :return:
+ :returns:
List of HierarchicalDocument
"""

@@ -99,7 +100,8 @@ def build_hierarchy_from_doc(self, document: Document) -> List[Document]:
for block in self.block_sizes:
next_level_nodes = []
for doc in current_level_nodes:
- child_docs = self._split_doc(doc, block)
+ splitted_docs = self.splitters[block].run([doc])
+ child_docs = splitted_docs["documents"]
# if it's only one document, skip
if len(child_docs) == 1:
next_level_nodes.append(doc)