Save Document node in lexical graph (#116)
* WIP

* Add document info model - move chunk index to TextChunk model

* Update tests to test DocumentInfo

* Update examples

* Update e2e tests, documentation, CHANGELOG

* Remove print

* Add docstrings, remove another print

* Fix tests after merge
stellasia authored Sep 3, 2024
1 parent 53fbc1a commit 378c98d
Showing 14 changed files with 292 additions and 129 deletions.
CHANGELOG.md (3 additions, 0 deletions)
@@ -2,6 +2,9 @@
 
 ## Next
 
+### Changed
+- When saving the lexical graph in a KG creation pipeline, the document is also saved as a specific node, together with a relationship between each chunk and the document it was created from.
+
 ## 0.5.0
 
 ### Fixed
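For readers trying out this change: once a KG creation pipeline has run, the new document node and its chunk relationships can be inspected with a quick query. A minimal sketch, not part of this commit, assuming a local Neo4j instance (the URI and credentials are placeholders) and the node and relationship labels documented in the next file below:

import neo4j

# Hypothetical smoke test: list each Document node written by the lexical
# graph and count the chunks attached to it via FROM_DOCUMENT.
driver = neo4j.GraphDatabase.driver(
    "neo4j://localhost:7687", auth=("neo4j", "password")  # placeholders
)
with driver.session() as session:
    for record in session.run(
        "MATCH (d:Document)<-[:FROM_DOCUMENT]-(c:Chunk) "
        "RETURN d.path AS path, count(c) AS chunks"
    ):
        print(record["path"], record["chunks"])
driver.close()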
docs/source/user_guide_kg_builder.rst (2 additions, 0 deletions)
@@ -289,9 +289,11 @@ Lexical Graph
 
 By default, the `LLMEntityRelationExtractor` adds some extra nodes and relationships to the extracted graph:
 
+- `Document` node: represents the processed document and has a `path` property.
 - `Chunk` nodes: represent the text chunks. They have a `text` property and, if computed, an `embedding` property.
 - `NEXT_CHUNK` relationships between one chunk node and the next one in the document. They can be used to enhance the context in a RAG application.
 - `FROM_CHUNK` relationship between any extracted entity and the chunk in which it was identified.
+- `FROM_DOCUMENT` relationship between each chunk and the document it was built from.
 
 If this 'lexical graph' is not desired, set `create_lexical_graph` to `False` in the extractor constructor:
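The code block that follows that sentence in the guide is truncated in this diff view. A minimal sketch of what disabling the lexical graph could look like, assuming the import paths used in this repository's examples:

from neo4j_genai.experimental.components.entity_relation_extractor import (
    LLMEntityRelationExtractor,
)
from neo4j_genai.llm import OpenAILLM

# Keep only the extracted entities and relations, skipping the Document/Chunk
# nodes and the relationships listed above.
extractor = LLMEntityRelationExtractor(
    llm=OpenAILLM(model_name="gpt-4o"),
    create_lexical_graph=False,
)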
examples/pipeline/kg_builder_from_pdf.py (13 additions, 9 deletions)
@@ -16,7 +16,7 @@
 
 import asyncio
 import logging
-from typing import Any
+from typing import Any, Dict, List
 
 import neo4j
 from langchain_text_splitters import CharacterTextSplitter
@@ -62,13 +62,13 @@ class Neo4jGraph(DataModel):
 
 
 class ERExtractor(Component):
-    async def _process_chunk(self, chunk: str, schema: str) -> dict[str, Any]:
+    async def _process_chunk(self, chunk: str, schema: str) -> Dict[str, Any]:
         return {
             "entities": [{"label": "Person", "properties": {"name": "John Doe"}}],
             "relations": [],
         }
 
-    async def run(self, chunks: list[str], schema: str) -> Neo4jGraph:
+    async def run(self, chunks: List[str], schema: str) -> Neo4jGraph:
         tasks = [self._process_chunk(chunk, schema) for chunk in chunks]
         result = await asyncio.gather(*tasks)
         merged_result: dict[str, Any] = {"entities": [], "relations": []}
@@ -141,10 +141,7 @@ async def main(neo4j_driver: neo4j.Driver) -> dict[str, Any]:
     pipe = Pipeline()
     pipe.add_component(PdfLoader(), "pdf_loader")
     pipe.add_component(
-        LangChainTextSplitterAdapter(
-            # chunk_size=50 for the sake of this demo
-            CharacterTextSplitter(chunk_size=50, chunk_overlap=10, separator=".")
-        ),
+        LangChainTextSplitterAdapter(CharacterTextSplitter(separator=". \n")),
         "splitter",
     )
     pipe.add_component(SchemaBuilder(), "schema")
@@ -153,7 +150,7 @@ async def main(neo4j_driver: neo4j.Driver) -> dict[str, Any]:
         llm=OpenAILLM(
             model_name="gpt-4o",
             model_params={
-                "max_tokens": 1000,
+                "max_tokens": 2000,
                 "response_format": {"type": "json_object"},
             },
         ),
@@ -164,7 +161,14 @@ async def main(neo4j_driver: neo4j.Driver) -> dict[str, Any]:
     pipe.add_component(Neo4jWriter(neo4j_driver), "writer")
     pipe.connect("pdf_loader", "splitter", input_config={"text": "pdf_loader.text"})
     pipe.connect("splitter", "extractor", input_config={"chunks": "splitter"})
-    pipe.connect("schema", "extractor", input_config={"schema": "schema"})
+    pipe.connect(
+        "schema",
+        "extractor",
+        input_config={
+            "schema": "schema",
+            "document_info": "pdf_loader.document_info",
+        },
+    )
     pipe.connect(
         "extractor",
         "writer",
examples/pipeline/kg_builder_from_text.py (5 additions, 0 deletions)
@@ -154,6 +154,11 @@ async def main(neo4j_driver: neo4j.Driver) -> dict[str, Any]:
                 ("Person", "WORKED_FOR", "Organization"),
             ],
         },
+        "extractor": {
+            "document_info": {
+                "path": "my text",
+            }
+        },
     }
     # run the pipeline
     return await pipe.run(pipe_inputs)
src/neo4j_genai/experimental/components/embedder.py (3 additions, 1 deletion)
@@ -56,7 +56,9 @@ def _embed_chunk(self, text_chunk: TextChunk) -> TextChunk:
         embedding = self._embedder.embed_query(text_chunk.text)
         metadata = text_chunk.metadata if text_chunk.metadata else {}
         metadata["embedding"] = embedding
-        return TextChunk(text=text_chunk.text, metadata=metadata)
+        return TextChunk(
+            text=text_chunk.text, index=text_chunk.index, metadata=metadata
+        )
 
     @validate_call
     async def run(self, text_chunks: TextChunks) -> TextChunks:
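The "document info model" from the commit message and the `index` field used above are defined in files not loaded in this view. A hedged sketch of what the two models plausibly look like, with field names inferred from the changes in this commit (the `metadata` fields and their defaults are assumptions, not confirmed by this diff):

from typing import Any, Dict, Optional

from pydantic import BaseModel


class DocumentInfo(BaseModel):
    # Inferred from the examples above, where document_info carries a "path".
    path: str
    metadata: Optional[Dict[str, str]] = None  # assumption


class TextChunk(BaseModel):
    text: str
    index: int  # moved onto the chunk itself by this commit
    metadata: Optional[Dict[str, Any]] = None  # holds the "embedding" key above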
