From 239283787f4280435d59c7b7148b2149ad42cf90 Mon Sep 17 00:00:00 2001
From: Tomaz Bratanic <bratanic.tomaz@gmail.com>
Date: Mon, 9 Dec 2024 15:19:38 +0100
Subject: [PATCH] Add effective_search_ratio to vectorstore (#18)

* Add effective_search_ratio to vectorstore

* Format

* Switch to query attribute

* add changelog

* fix test

---------

Co-authored-by: Alex Thomas <alexthomas93@users.noreply.github.com>
---
 CHANGELOG.md                                  |  1 +
 .../vectorstores/neo4j_vector.py              | 36 ++++++++++++++++---
 .../vectorstores/test_neo4jvector.py          | 23 ++++++++++++
 .../unit_tests/vectorstores/test_neo4j.py     |  6 ++--
 4 files changed, 59 insertions(+), 7 deletions(-)

diff --git a/CHANGELOG.md b/CHANGELOG.md
index 80e649a..f5affc5 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -6,6 +6,7 @@
 
 - Enhanced Neo4j driver connection management with more robust error handling.
 - Simplified connection state checking in Neo4jGraph.
+- Introduced `effective_search_ratio` parameter in Neo4jVector to enhance query accuracy by adjusting the candidate pool size during similarity searches.
 
 ### Fixed
 
diff --git a/libs/neo4j/langchain_neo4j/vectorstores/neo4j_vector.py b/libs/neo4j/langchain_neo4j/vectorstores/neo4j_vector.py
index e2beaef..452ffc1 100644
--- a/libs/neo4j/langchain_neo4j/vectorstores/neo4j_vector.py
+++ b/libs/neo4j/langchain_neo4j/vectorstores/neo4j_vector.py
@@ -89,15 +89,17 @@ def _get_search_index_query(
     if index_type == IndexType.NODE:
         if search_type == SearchType.VECTOR:
             return (
-                "CALL db.index.vector.queryNodes($index, $k, $embedding) "
+                "CALL db.index.vector.queryNodes($index, $k * $ef, $embedding) "
                 "YIELD node, score "
+                "WITH node, score LIMIT $k "
             )
         elif search_type == SearchType.HYBRID:
             call_prefix = "CALL () { " if neo4j_version_is_5_23_or_above else "CALL { "
 
             query_body = (
-                "CALL db.index.vector.queryNodes($index, $k, $embedding) "
+                "CALL db.index.vector.queryNodes($index, $k * $ef, $embedding) "
                 "YIELD node, score "
+                "WITH node, score LIMIT $k "
                 "WITH collect({node:node, score:score}) AS nodes, max(score) AS max "
                 "UNWIND nodes AS n "
                 "RETURN n.node AS node, (n.score / max) AS score UNION "
@@ -117,8 +119,9 @@ def _get_search_index_query(
             raise ValueError(f"Unsupported SearchType: {search_type}")
     else:
         return (
-            "CALL db.index.vector.queryRelationships($index, $k, $embedding) "
+            "CALL db.index.vector.queryRelationships($index, $k * $ef, $embedding) "
             "YIELD relationship, score "
+            "WITH relationship, score LIMIT $k "
         )
 
 
@@ -461,6 +464,8 @@ class Neo4jVector(VectorStore):
             'NODE' or 'RELATIONSHIP'
         pre_delete_collection: If True, will delete existing data if it exists.
             (default: False). Useful for testing.
+        effective_search_ratio: Search-time multiplier on k that sizes the candidate
+            pool (k * ratio rows queried, first k kept), balancing accuracy and speed.
 
     Example:
         .. code-block:: python
@@ -587,6 +592,7 @@ def __init__(
         self.retrieval_query = retrieval_query
         self.search_type = search_type
         self._index_type = index_type
+
         # Calculate embedding dimension
         self.embedding_dimension = len(embedding.embed_query("foo"))
 
@@ -984,6 +990,7 @@ def similarity_search(
         k: int = 4,
         params: Dict[str, Any] = {},
         filter: Optional[Dict[str, Any]] = None,
+        effective_search_ratio: int = 1,
         **kwargs: Any,
     ) -> List[Document]:
         """Run similarity search with Neo4jVector.
@@ -996,7 +1003,9 @@ def similarity_search(
             filter (Optional[Dict[str, Any]]): Dictionary of argument(s) to
                     filter on metadata.
                 Defaults to None.
-
+            effective_search_ratio (int): Multiplier on k that sizes the candidate
+               pool: the vector index is queried for k * effective_search_ratio
+               rows and the first k are kept (accuracy vs. speed). Defaults to 1.
         Returns:
             List of Documents most similar to the query.
         """
@@ -1007,6 +1016,7 @@ def similarity_search(
             query=query,
             params=params,
             filter=filter,
+            effective_search_ratio=effective_search_ratio,
             **kwargs,
         )
 
@@ -1016,6 +1026,7 @@ def similarity_search_with_score(
         k: int = 4,
         params: Dict[str, Any] = {},
         filter: Optional[Dict[str, Any]] = None,
+        effective_search_ratio: int = 1,
         **kwargs: Any,
     ) -> List[Tuple[Document, float]]:
         """Return docs most similar to query.
@@ -1028,6 +1039,9 @@ def similarity_search_with_score(
             filter (Optional[Dict[str, Any]]): Dictionary of argument(s) to
                     filter on metadata.
                 Defaults to None.
+            effective_search_ratio (int): Multiplier on k that sizes the candidate
+               pool: the vector index is queried for k * effective_search_ratio
+               rows and the first k are kept (accuracy vs. speed). Defaults to 1.
 
         Returns:
             List of Documents most similar to the query and score for each
@@ -1039,6 +1053,7 @@ def similarity_search_with_score(
             query=query,
             params=params,
             filter=filter,
+            effective_search_ratio=effective_search_ratio,
             **kwargs,
         )
         return docs
@@ -1049,6 +1064,7 @@ def similarity_search_with_score_by_vector(
         k: int = 4,
         filter: Optional[Dict[str, Any]] = None,
         params: Dict[str, Any] = {},
+        effective_search_ratio: int = 1,
         **kwargs: Any,
     ) -> List[Tuple[Document, float]]:
         """
@@ -1069,6 +1085,9 @@ def similarity_search_with_score_by_vector(
                 Defaults to None.
             params (Dict[str, Any]): The search params for the index type.
                 Defaults to empty dict.
+            effective_search_ratio (int): Multiplier on k that sizes the candidate
+               pool: the vector index is queried for k * effective_search_ratio
+               rows and the first k are kept (accuracy vs. speed). Defaults to 1.
 
         Returns:
             List[Tuple[Document, float]]: A list of tuples, each containing
@@ -1154,6 +1173,7 @@ def similarity_search_with_score_by_vector(
             "embedding": embedding,
             "keyword_index": self.keyword_index_name,
             "query": remove_lucene_chars(kwargs["query"]),
+            "ef": effective_search_ratio,
             **params,
             **filter_params,
         }
@@ -1209,6 +1229,7 @@ def similarity_search_by_vector(
         k: int = 4,
         filter: Optional[Dict[str, Any]] = None,
         params: Dict[str, Any] = {},
+        effective_search_ratio: int = 1,
         **kwargs: Any,
     ) -> List[Document]:
         """Return docs most similar to embedding vector.
@@ -1226,7 +1247,12 @@ def similarity_search_by_vector(
             List of Documents most similar to the query vector.
         """
         docs_and_scores = self.similarity_search_with_score_by_vector(
-            embedding=embedding, k=k, filter=filter, params=params, **kwargs
+            embedding=embedding,
+            k=k,
+            filter=filter,
+            params=params,
+            effective_search_ratio=effective_search_ratio,
+            **kwargs,
         )
         return [doc for doc, _ in docs_and_scores]
 
diff --git a/libs/neo4j/tests/integration_tests/vectorstores/test_neo4jvector.py b/libs/neo4j/tests/integration_tests/vectorstores/test_neo4jvector.py
index 6007d0c..9e2d722 100644
--- a/libs/neo4j/tests/integration_tests/vectorstores/test_neo4jvector.py
+++ b/libs/neo4j/tests/integration_tests/vectorstores/test_neo4jvector.py
@@ -701,6 +701,7 @@ def test_hybrid_score_normalization() -> None:
             "index": "vector",
             "k": 1,
             "embedding": FakeEmbeddingsWithOsDimension().embed_query("foo"),
+            "ef": 1,
             "query": "foo",
             "keyword_index": "keyword",
         },
@@ -993,6 +994,28 @@ def test_neo4j_max_marginal_relevance_search() -> None:
     drop_vector_indexes(docsearch)
 
 
+def test_neo4jvector_effective_search_ratio() -> None:
+    """Test the effective_search_ratio search parameter."""
+    docsearch = Neo4jVector.from_texts(
+        texts=texts,
+        embedding=FakeEmbeddingsWithOsDimension(),
+        url=url,
+        username=username,
+        password=password,
+        pre_delete_collection=True,
+    )
+    output = docsearch.similarity_search("foo", k=2, effective_search_ratio=2)
+    assert len(output) == 2
+
+    output1 = docsearch.similarity_search_with_score(
+        "foo", k=2, effective_search_ratio=2
+    )
+    assert len(output1) == 2
+    # Assert ordered by score
+    assert output1[0][1] > output1[1][1]
+    drop_vector_indexes(docsearch)
+
+
 def test_neo4jvector_passing_graph_object() -> None:
     """Test end to end construction and search with passing graph object."""
     graph = Neo4jGraph(url=url, username=username, password=password)
diff --git a/libs/neo4j/tests/unit_tests/vectorstores/test_neo4j.py b/libs/neo4j/tests/unit_tests/vectorstores/test_neo4j.py
index 82b0153..837cb79 100644
--- a/libs/neo4j/tests/unit_tests/vectorstores/test_neo4j.py
+++ b/libs/neo4j/tests/unit_tests/vectorstores/test_neo4j.py
@@ -203,8 +203,9 @@ def test_converting_to_yaml() -> None:
 def test_get_search_index_query_hybrid_node_neo4j_5_23_above() -> None:
     expected_query = (
         "CALL () { "
-        "CALL db.index.vector.queryNodes($index, $k, $embedding) "
+        "CALL db.index.vector.queryNodes($index, $k * $ef, $embedding) "
         "YIELD node, score "
+        "WITH node, score LIMIT $k "
         "WITH collect({node:node, score:score}) AS nodes, max(score) AS max "
         "UNWIND nodes AS n "
         "RETURN n.node AS node, (n.score / max) AS score UNION "
@@ -225,8 +226,9 @@ def test_get_search_index_query_hybrid_node_neo4j_5_23_above() -> None:
 def test_get_search_index_query_hybrid_node_neo4j_5_23_below() -> None:
     expected_query = (
         "CALL { "
-        "CALL db.index.vector.queryNodes($index, $k, $embedding) "
+        "CALL db.index.vector.queryNodes($index, $k * $ef, $embedding) "
         "YIELD node, score "
+        "WITH node, score LIMIT $k "
         "WITH collect({node:node, score:score}) AS nodes, max(score) AS max "
         "UNWIND nodes AS n "
         "RETURN n.node AS node, (n.score / max) AS score UNION "