Add a Neo4j Chunk reader (#135)

* Adds a neo4j chunk reader One e2e test is failing, that's normal for now * Update * Add example * Merge and fix * Add more end-to-end examples * Fix tests * Update changelog and doc * Merge * Use constants everywhere for consistency * Use the dynamic properties from the LexicalGraphConfig everywhere * Cleaning * Minor fixes in tests * Improve description * ruff
neo4j · Oct 25, 2024 · 158c624 · 158c624
1 parent 99bf50e
commit 158c624
Show file tree

Hide file tree

Showing 14 changed files with 821 additions and 6 deletions.
diff --git a/CHANGELOG.md b/CHANGELOG.md
@@ -6,6 +6,7 @@
 - Made `relations` and `potential_schema` optional in `SchemaBuilder`.
 - Added a check to prevent the use of deprecated Cypher syntax for Neo4j versions 5.23.0 and above.
 - Added a `LexicalGraphBuilder` component to enable the import of the lexical graph (document, chunks) without performing entity and relation extraction.
+- Added a `Neo4jChunkReader` component to be able to read chunk text from the database.
 
 ### Changed
 - Vector and Hybrid retrievers used with `return_properties` now also return the node labels (`nodeLabels`) and the node's element ID (`id`).

diff --git a/docs/source/api.rst b/docs/source/api.rst
@@ -58,6 +58,14 @@ LexicalGraphBuilder
     :members:
     :exclude-members: component_inputs, component_outputs
 
+
+Neo4jChunkReader
+================
+
+.. autoclass:: neo4j_graphrag.experimental.components.neo4j_reader.Neo4jChunkReader
+    :members:
+    :exclude-members: component_inputs, component_outputs
+
 SchemaBuilder
 =============
 

diff --git a/docs/source/user_guide_kg_builder.rst b/docs/source/user_guide_kg_builder.rst
@@ -16,7 +16,7 @@ unstructured data.
 Pipeline Structure
 ******************
 
-A Knowledge Graph (KG) construction pipeline requires a few components:
+A Knowledge Graph (KG) construction pipeline requires a few components (some of the components below are optional):
 
 - **Document parser**: extract text from files (PDFs, ...).
 - **Document chunker**: split the text into smaller pieces of text, manageable by the LLM context window (token limit).
@@ -205,6 +205,47 @@ Example usage:
 See :ref:`kg-writer-section` to learn how to write the resulting nodes and relationships to Neo4j.
 
 
+Neo4j Chunk Reader
+==================
+
+The Neo4j chunk reader component is used to read text chunks from Neo4j. Text chunks can be created
+by the lexical graph builder or another process.
+
+.. code:: python
+
+    import neo4j
+    from neo4j_graphrag.experimental.components.neo4j_reader import Neo4jChunkReader
+    from neo4j_graphrag.experimental.components.types import LexicalGraphConfig
+
+    reader = Neo4jChunkReader(driver)
+    result = await reader.run()
+
+
+Configure node labels and relationship types
+---------------------------------------------
+
+Optionally, the document and chunk node labels can be configured using a `LexicalGraphConfig` object:
+
+.. code:: python
+
+    from neo4j_graphrag.experimental.components.neo4j_reader import Neo4jChunkReader
+    from neo4j_graphrag.experimental.components.types import LexicalGraphConfig, TextChunks
+
+    # optionally, define a LexicalGraphConfig object
+    # shown below with the default values
+    config = LexicalGraphConfig(
+        id_prefix="",  # used to prefix the chunk and document IDs
+        chunk_node_label="Chunk",
+        document_node_label="Document",
+        chunk_to_document_relationship_type="PART_OF_DOCUMENT",
+        next_chunk_relationship_type="NEXT_CHUNK",
+        node_to_chunk_relationship_type="PART_OF_CHUNK",
+        chunk_embedding_property="embeddings",
+    )
+    reader = Neo4jChunkReader(driver)
+    result = await reader.run(lexical_graph_config=config)
+
+
 Schema Builder
 ==============
 

diff --git a/examples/README.md b/examples/README.md
@@ -92,6 +92,8 @@ are listed in [the last section of this file](#customize).
 - [End to end example with explicit components and text input](./customize/build_graph/pipeline/kg_builder_from_text.py)
 - [End to end example with explicit components and PDF input](./customize/build_graph/pipeline/kg_builder_from_pdf.py)
 - [Process multiple documents](./customize/build_graph/pipeline/kg_builder_two_documents_entity_resolution.py)
+- [Export lexical graph creation into another pipeline](./customize/build_graph/pipeline/text_to_lexical_graph_to_entity_graph_two_pipelines.py)
+
 
 #### Components
 

diff --git a/examples/customize/build_graph/components/chunk_reader/neo4j_chunk_reader.py b/examples/customize/build_graph/components/chunk_reader/neo4j_chunk_reader.py
@@ -0,0 +1,21 @@
+import asyncio
+
+import neo4j
+from neo4j_graphrag.experimental.components.neo4j_reader import Neo4jChunkReader
+from neo4j_graphrag.experimental.components.types import LexicalGraphConfig, TextChunks
+
+
+async def main(driver: neo4j.Driver) -> TextChunks:
+    config = LexicalGraphConfig(  # only needed to overwrite the default values
+        chunk_node_label="TextPart",
+    )
+    reader = Neo4jChunkReader(driver)
+    result = await reader.run(lexical_graph_config=config)
+    return result
+
+
+if __name__ == "__main__":
+    with neo4j.GraphDatabase.driver(
+        "bolt://localhost:7687", auth=("neo4j", "password")
+    ) as driver:
+        print(asyncio.run(main(driver)))
diff --git a/examples/customize/build_graph/components/lexical_graph_builder/lexical_graph_builder.py b/examples/customize/build_graph/components/lexical_graph_builder/lexical_graph_builder.py
@@ -10,7 +10,6 @@
 
 
 async def main() -> GraphResult:
-    """ """
     # optionally, define a LexicalGraphConfig object
     # shown below with default values
     config = LexicalGraphConfig(

diff --git a/...s/customize/build_graph/pipeline/text_to_lexical_graph_to_entity_graph_single_pipeline.py b/...s/customize/build_graph/pipeline/text_to_lexical_graph_to_entity_graph_single_pipeline.py
@@ -0,0 +1,196 @@
+"""In this example, we set up a single pipeline with two Neo4j writers:
+one for creating the lexical graph (Document and Chunks)
+and another for creating the entity graph (entities and relations derived from the text).
+"""
+
+from __future__ import annotations
+
+import asyncio
+
+import neo4j
+from neo4j_graphrag.embeddings.openai import OpenAIEmbeddings
+from neo4j_graphrag.experimental.components.embedder import TextChunkEmbedder
+from neo4j_graphrag.experimental.components.entity_relation_extractor import (
+    LLMEntityRelationExtractor,
+)
+from neo4j_graphrag.experimental.components.kg_writer import Neo4jWriter
+from neo4j_graphrag.experimental.components.lexical_graph import LexicalGraphBuilder
+from neo4j_graphrag.experimental.components.schema import (
+    SchemaBuilder,
+    SchemaEntity,
+    SchemaProperty,
+    SchemaRelation,
+)
+from neo4j_graphrag.experimental.components.text_splitters.fixed_size_splitter import (
+    FixedSizeSplitter,
+)
+from neo4j_graphrag.experimental.components.types import LexicalGraphConfig
+from neo4j_graphrag.experimental.pipeline import Pipeline
+from neo4j_graphrag.experimental.pipeline.pipeline import PipelineResult
+from neo4j_graphrag.llm import LLMInterface, OpenAILLM
+
+
+async def define_and_run_pipeline(
+    neo4j_driver: neo4j.Driver,
+    llm: LLMInterface,
+    lexical_graph_config: LexicalGraphConfig,
+    text: str,
+) -> PipelineResult:
+    """Define and run the pipeline with the following components:
+
+    - Text Splitter: to split the text into manageable chunks of fixed size
+    - Chunk Embedder: to embed the chunks' text
+    - Lexical Graph Builder: to build the lexical graph, i.e. creating the chunk nodes and relationships between them
+    - LG KG writer: save the lexical graph to Neo4j
+
+    - Schema Builder: this component takes a list of entities, relationships and
+        possible triplets as inputs, validates them and returns a schema ready to use
+        for the rest of the pipeline
+    - LLM Entity Relation Extractor is an LLM-based entity and relation extractor:
+        based on the provided schema, the LLM will do its best to identify these
+        entities and their relations within the provided text
+    - EG KG writer: once entities and relations are extracted, they can be written
+        to a Neo4j database
+
+    """
+    pipe = Pipeline()
+    # define the components
+    pipe.add_component(
+        FixedSizeSplitter(chunk_size=200, chunk_overlap=50),
+        "splitter",
+    )
+    pipe.add_component(TextChunkEmbedder(embedder=OpenAIEmbeddings()), "chunk_embedder")
+    pipe.add_component(
+        LexicalGraphBuilder(lexical_graph_config),
+        "lexical_graph_builder",
+    )
+    pipe.add_component(Neo4jWriter(neo4j_driver), "lg_writer")
+    pipe.add_component(SchemaBuilder(), "schema")
+    pipe.add_component(
+        LLMEntityRelationExtractor(
+            llm=llm,
+            create_lexical_graph=False,
+        ),
+        "extractor",
+    )
+    pipe.add_component(Neo4jWriter(neo4j_driver), "eg_writer")
+    # define the execution order of the components
+    # and how the output of previous components must be used
+    pipe.connect("splitter", "chunk_embedder", input_config={"text_chunks": "splitter"})
+    pipe.connect(
+        "chunk_embedder",
+        "lexical_graph_builder",
+        input_config={"text_chunks": "chunk_embedder"},
+    )
+    pipe.connect(
+        "lexical_graph_builder",
+        "lg_writer",
+        input_config={
+            "graph": "lexical_graph_builder.graph",
+            "lexical_graph_config": "lexical_graph_builder.config",
+        },
+    )
+    # define the execution order of the components
+    # and how the output of previous components must be used
+    pipe.connect(
+        "chunk_embedder", "extractor", input_config={"chunks": "chunk_embedder"}
+    )
+    pipe.connect("schema", "extractor", input_config={"schema": "schema"})
+    pipe.connect(
+        "extractor",
+        "eg_writer",
+        input_config={"graph": "extractor"},
+    )
+    # make sure the lexical graph is created before creating the entity graph:
+    pipe.connect("lg_writer", "eg_writer", {})
+    # user input:
+    # the initial text
+    # and the list of entities and relations we are looking for
+    pipe_inputs = {
+        "splitter": {
+            "text": text,
+        },
+        "lexical_graph_builder": {
+            "document_info": {
+                # 'path' can be anything
+                "path": "example/lexical_graph_from_text.py"
+            },
+        },
+        "schema": {
+            "entities": [
+                SchemaEntity(
+                    label="Person",
+                    properties=[
+                        SchemaProperty(name="name", type="STRING"),
+                        SchemaProperty(name="place_of_birth", type="STRING"),
+                        SchemaProperty(name="date_of_birth", type="DATE"),
+                    ],
+                ),
+                SchemaEntity(
+                    label="Organization",
+                    properties=[
+                        SchemaProperty(name="name", type="STRING"),
+                        SchemaProperty(name="country", type="STRING"),
+                    ],
+                ),
+                SchemaEntity(
+                    label="Field",
+                    properties=[
+                        SchemaProperty(name="name", type="STRING"),
+                    ],
+                ),
+            ],
+            "relations": [
+                SchemaRelation(
+                    label="WORKED_ON",
+                ),
+                SchemaRelation(
+                    label="WORKED_FOR",
+                ),
+            ],
+            "potential_schema": [
+                ("Person", "WORKED_ON", "Field"),
+                ("Person", "WORKED_FOR", "Organization"),
+            ],
+        },
+        "extractor": {
+            "lexical_graph_config": lexical_graph_config,
+        },
+    }
+    # run the pipeline
+    return await pipe.run(pipe_inputs)
+
+
+async def main(driver: neo4j.Driver) -> PipelineResult:
+    # optional: define some custom node labels for the lexical graph:
+    lexical_graph_config = LexicalGraphConfig(
+        id_prefix="example",
+        chunk_node_label="TextPart",
+        document_node_label="Text",
+    )
+    text = """Albert Einstein was a German physicist born in 1879 who
+            wrote many groundbreaking papers especially about general relativity
+            and quantum mechanics. He worked for many different institutions, including
+            the University of Bern in Switzerland and the University of Oxford."""
+    llm = OpenAILLM(
+        model_name="gpt-4o",
+        model_params={
+            "max_tokens": 1000,
+            "response_format": {"type": "json_object"},
+        },
+    )
+    res = await define_and_run_pipeline(
+        driver,
+        llm,
+        lexical_graph_config,
+        text,
+    )
+    await llm.async_client.close()
+    return res
+
+
+if __name__ == "__main__":
+    with neo4j.GraphDatabase.driver(
+        "bolt://localhost:7687", auth=("neo4j", "password")
+    ) as driver:
+        print(asyncio.run(main(driver)))