-
Notifications
You must be signed in to change notification settings - Fork 13
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
feat: adding an hierarchical document builder and a auto-merge retrie…
…ver (#56) * initial import * adding LICENSE header * fixing tests for experimental package * fixing tests for experimental package * fixing tests for experimental package * linting issues * attending PR comments * adding serialization and deserialization * adding serialization and deserialization * fixing tests * Update haystack_experimental/components/retrievers/auto_merging_retriever.py Co-authored-by: Stefano Fiorucci <[email protected]> * Update test/components/splitters/test_hierarchical_doc_splitter.py Co-authored-by: Stefano Fiorucci <[email protected]> * attending PR comments * attending PR comments * fixing tests * improving test coverage * improving test coverage * warning for unsupported doc stores * fix for unsupported doc stores * adding dependencies for tests * trying another approach: adding dependencies for tests * missed using fixture * moving all inside the test * fixing unsupported DBs test * updating tests * removing unused imports * remove import patching * nit * Update haystack_experimental/components/retrievers/auto_merging_retriever.py Co-authored-by: Madeesh Kannan <[email protected]> * Update haystack_experimental/components/splitters/hierarchical_doc_splitter.py Co-authored-by: Madeesh Kannan <[email protected]> * docstrings * docstrings * fixing tests * docstrings * improving tests * adding checks/safeguards * adding checks/safeguards for not supported DocumentStores * cleaning * removing old checks * fixing docstrings * fixing linting and tests * cleaning tests * cleaning * adding pydoc * fix pydocs * fix pydocs * DocumentSplitter part of the object, created only once * adding serdes to the retriever --------- Co-authored-by: Stefano Fiorucci <[email protected]> Co-authored-by: Madeesh Kannan <[email protected]>
- Loading branch information
1 parent
c4ea059
commit f7eeb91
Showing
9 changed files
with
655 additions
and
7 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,27 @@ | ||
loaders: | ||
- type: haystack_pydoc_tools.loaders.CustomPythonLoader | ||
search_path: [../../../] | ||
modules: ["haystack_experimental.components.retrievers.auto_merging_retriever"] | ||
ignore_when_discovered: ["__init__"] | ||
processors: | ||
- type: filter | ||
expression: | ||
documented_only: true | ||
do_not_filter_modules: false | ||
skip_empty_modules: true | ||
- type: smart | ||
- type: crossref | ||
renderer: | ||
type: haystack_pydoc_tools.renderers.ReadmeCoreRenderer | ||
excerpt: Auto Merging Retriever for Haystack. | ||
category_slug: experiments-api | ||
title: Auto Merge Retriever | ||
slug: auto-merge-retriever | ||
order: 10 | ||
markdown: | ||
descriptive_class_title: false | ||
classdef_code_block: false | ||
descriptive_module_title: true | ||
add_method_class_prefix: true | ||
add_member_class_prefix: false | ||
filename: auto_merging_retriever.md |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,27 @@ | ||
loaders: | ||
- type: haystack_pydoc_tools.loaders.CustomPythonLoader | ||
search_path: [../../../] | ||
modules: ["haystack_experimental.components.splitters.hierarchical_doc_splitter"] | ||
ignore_when_discovered: ["__init__"] | ||
processors: | ||
- type: filter | ||
expression: | ||
documented_only: true | ||
do_not_filter_modules: false | ||
skip_empty_modules: true | ||
- type: smart | ||
- type: crossref | ||
renderer: | ||
type: haystack_pydoc_tools.renderers.ReadmeCoreRenderer | ||
excerpt: Hierarchical Document Splitter for Haystack. | ||
category_slug: experiments-api | ||
title: Hierarchical Document Splitter | ||
slug: hierarchical-document-splitter | ||
order: 70 | ||
markdown: | ||
descriptive_class_title: false | ||
classdef_code_block: false | ||
descriptive_module_title: true | ||
add_method_class_prefix: true | ||
add_member_class_prefix: false | ||
filename: hierarchical_document_splitter.md |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
156 changes: 156 additions & 0 deletions
156
haystack_experimental/components/retrievers/auto_merging_retriever.py
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,156 @@ | ||
# SPDX-FileCopyrightText: 2022-present deepset GmbH <[email protected]> | ||
# | ||
# SPDX-License-Identifier: Apache-2.0 | ||
|
||
from collections import defaultdict | ||
from typing import Any, Dict, List | ||
|
||
from haystack import Document, component, default_to_dict | ||
from haystack.core.serialization import default_from_dict | ||
from haystack.document_stores.types import DocumentStore | ||
from haystack.utils import deserialize_document_store_in_init_parameters | ||
|
||
|
||
@component
class AutoMergingRetriever:
    """
    A retriever which returns parent documents of the matched leaf nodes documents, based on a threshold setting.

    The AutoMergingRetriever assumes you have a hierarchical tree structure of documents, where the leaf nodes
    are indexed in a document store. See the HierarchicalDocumentSplitter for more information on how to create
    such a structure. During retrieval, if the fraction of matched leaf documents below the same parent reaches
    the defined threshold (``matched / total_children >= threshold``), the retriever will return the parent
    document instead of the individual leaf documents.

    The rationale is, given that a paragraph is split into multiple chunks represented as leaf documents, and if for
    a given query, multiple chunks are matched, the whole paragraph might be more informative than the individual
    chunks alone.

    Currently the AutoMergingRetriever can only be used by the following DocumentStores:
    - [ElasticSearch](https://haystack.deepset.ai/docs/latest/documentstore/elasticsearch)
    - [OpenSearch](https://haystack.deepset.ai/docs/latest/documentstore/opensearch)
    - [PGVector](https://haystack.deepset.ai/docs/latest/documentstore/pgvector)
    - [Qdrant](https://haystack.deepset.ai/docs/latest/documentstore/qdrant)

    ```python
    from haystack import Document
    from haystack_experimental.components.splitters import HierarchicalDocumentSplitter
    from haystack_experimental.components.retrievers.auto_merging_retriever import AutoMergingRetriever
    from haystack.document_stores.in_memory import InMemoryDocumentStore

    # create a hierarchical document structure with 2 levels, where the parent document has 3 children
    text = "The sun rose early in the morning. It cast a warm glow over the trees. Birds began to sing."
    original_document = Document(content=text)
    builder = HierarchicalDocumentSplitter(block_sizes=[10, 3], split_overlap=0, split_by="word")
    docs = builder.run([original_document])["documents"]

    # store level-1 parent documents and initialize the retriever
    doc_store_parents = InMemoryDocumentStore()
    for doc in docs:
        if doc.meta["__children_ids"] and doc.meta["__level"] == 1:
            doc_store_parents.write_documents([doc])
    retriever = AutoMergingRetriever(doc_store_parents, threshold=0.5)

    # assume we retrieved 2 leaf docs from the same parent, the parent document should be returned,
    # since it has 3 children and the threshold=0.5, and we retrieved 2 children (2/3 ~ 0.67 >= 0.5)
    leaf_docs = [doc for doc in docs if not doc.meta["__children_ids"]]
    retrieved = retriever.run(leaf_docs[4:6])
    >> {'documents': [Document(id=538..),
    >> content: 'warm glow over the trees. Birds began to sing.',
    >> meta: {'__block_size': 10, '__parent_id': '835..', '__children_ids': ['c17...', '3ff...', '352...'], '__level': 1, 'source_id': '835...',
    >> 'page_number': 1, 'split_id': 1, 'split_idx_start': 45})]}
    ```
    """  # noqa: E501

    def __init__(self, document_store: DocumentStore, threshold: float = 0.5):
        """
        Initialize the AutoMergingRetriever.

        :param document_store: DocumentStore from which to retrieve the parent documents
        :param threshold: Fraction of a parent's children that must be matched for the parent to be returned
            instead of the individual documents. Must lie strictly between 0 and 1.
        :raises ValueError: If ``threshold`` is not strictly between 0 and 1.
        """
        if not 0 < threshold < 1:
            raise ValueError("The threshold parameter must be between 0 and 1.")

        self.document_store = document_store
        self.threshold = threshold

    def to_dict(self) -> Dict[str, Any]:
        """
        Serializes the component to a dictionary.

        :returns:
            Dictionary with serialized data.
        """
        docstore = self.document_store.to_dict()
        return default_to_dict(self, document_store=docstore, threshold=self.threshold)

    @classmethod
    def from_dict(cls, data: Dict[str, Any]) -> "AutoMergingRetriever":
        """
        Deserializes the component from a dictionary.

        :param data:
            Dictionary with serialized data.
        :returns:
            An instance of the component.
        """
        data = deserialize_document_store_in_init_parameters(data)
        return default_from_dict(cls, data)

    @staticmethod
    def _check_valid_documents(matched_leaf_documents: List[Document]):
        """
        Verify that every matched leaf document carries the hierarchy meta fields.

        A field counts as present only when its value is truthy, so ``None``, ``0`` and ``""`` are rejected.

        :param matched_leaf_documents: List of leaf documents that were matched by a retriever
        :raises ValueError: If any document lacks ``__parent_id``, ``__level`` or ``__block_size``.
        """
        # Fields are checked in this fixed order so the first missing one is the one reported.
        for field in ("__parent_id", "__level", "__block_size"):
            if not all(doc.meta.get(field) for doc in matched_leaf_documents):
                raise ValueError(f"The matched leaf documents do not have the required meta field '{field}'")

    @component.output_types(documents=List[Document])
    def run(self, matched_leaf_documents: List[Document]) -> Dict[str, List[Document]]:
        """
        Run the AutoMergingRetriever.

        Groups the matched leaf documents by their parent documents and returns the parent document whenever the
        fraction of its children that were matched is greater than or equal to the defined threshold. Otherwise,
        returns the matched leaf documents of that parent.

        :param matched_leaf_documents: List of leaf documents that were matched by a retriever
        :returns:
            Dictionary with key ``documents`` holding the parent documents or matched leaf documents,
            depending on the threshold value
        :raises ValueError: If a leaf document lacks the required meta fields, or if a parent document is
            missing from the document store, is not unique, or has no ``__children_ids``.
        """
        # Fail early with a descriptive error instead of a bare KeyError on "__parent_id" below.
        AutoMergingRetriever._check_valid_documents(matched_leaf_documents)

        docs_to_return: List[Document] = []

        # group the matched leaf documents by their parent documents
        children_by_parent: Dict[str, List[Document]] = defaultdict(list)
        for doc in matched_leaf_documents:
            children_by_parent[doc.meta["__parent_id"]].append(doc)

        for parent_id, retrieved_child_docs in children_by_parent.items():
            # look up the parent to find out how many children it has in total
            parents = self.document_store.filter_documents({"field": "id", "operator": "==", "value": parent_id})
            if len(parents) == 0:
                raise ValueError(f"Parent document with id {parent_id} not found in the document store.")
            if len(parents) > 1:
                raise ValueError(f"Multiple parent documents found with id {parent_id} in the document store.")

            parent_doc = parents[0]
            # The guard also keeps the division below from dividing by zero.
            if not parent_doc.meta.get("__children_ids"):
                raise ValueError(f"Parent document with id {parent_id} does not have any children.")
            parent_children_count = len(parent_doc.meta["__children_ids"])

            # return either the parent document or the matched leaf documents based on the threshold value
            score = len(retrieved_child_docs) / parent_children_count
            if score >= self.threshold:
                docs_to_return.append(parent_doc)
            else:
                # the group already holds exactly the matched leaves of this parent, in retrieval order
                docs_to_return.extend(retrieved_child_docs)

        return {"documents": docs_to_return}
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,7 @@ | ||
# SPDX-FileCopyrightText: 2022-present deepset GmbH <[email protected]> | ||
# | ||
# SPDX-License-Identifier: Apache-2.0 | ||
|
||
from haystack_experimental.components.splitters.hierarchical_doc_splitter import HierarchicalDocumentSplitter | ||
|
||
# Public API of the splitters package; the name must be the dunder `__all__` to take effect.
__all__ = ["HierarchicalDocumentSplitter"]
Oops, something went wrong.