From 239283787f4280435d59c7b7148b2149ad42cf90 Mon Sep 17 00:00:00 2001 From: Tomaz Bratanic Date: Mon, 9 Dec 2024 15:19:38 +0100 Subject: [PATCH] Add effective_search_ratio to vectorstore (#18) * Add effective_search_ratio to vectorstore * Format * Switch to query attribute * add changelog * fix test --------- Co-authored-by: Alex Thomas --- CHANGELOG.md | 1 + .../vectorstores/neo4j_vector.py | 36 ++++++++++++++++--- .../vectorstores/test_neo4jvector.py | 23 ++++++++++++ .../unit_tests/vectorstores/test_neo4j.py | 6 ++-- 4 files changed, 59 insertions(+), 7 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index 80e649a..f5affc5 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -6,6 +6,7 @@ - Enhanced Neo4j driver connection management with more robust error handling. - Simplified connection state checking in Neo4jGraph. +- Introduced `effective_search_ratio` parameter in Neo4jVector to enhance query accuracy by adjusting the candidate pool size during similarity searches. ### Fixed diff --git a/libs/neo4j/langchain_neo4j/vectorstores/neo4j_vector.py b/libs/neo4j/langchain_neo4j/vectorstores/neo4j_vector.py index e2beaef..452ffc1 100644 --- a/libs/neo4j/langchain_neo4j/vectorstores/neo4j_vector.py +++ b/libs/neo4j/langchain_neo4j/vectorstores/neo4j_vector.py @@ -89,15 +89,17 @@ def _get_search_index_query( if index_type == IndexType.NODE: if search_type == SearchType.VECTOR: return ( - "CALL db.index.vector.queryNodes($index, $k, $embedding) " + "CALL db.index.vector.queryNodes($index, $k * $ef, $embedding) " "YIELD node, score " + "WITH node, score LIMIT $k " ) elif search_type == SearchType.HYBRID: call_prefix = "CALL () { " if neo4j_version_is_5_23_or_above else "CALL { " query_body = ( - "CALL db.index.vector.queryNodes($index, $k, $embedding) " + "CALL db.index.vector.queryNodes($index, $k * $ef, $embedding) " "YIELD node, score " + "WITH node, score LIMIT $k " "WITH collect({node:node, score:score}) AS nodes, max(score) AS max " "UNWIND nodes AS n " "RETURN n.node AS node, (n.score / max) AS score UNION " @@ -117,8 +119,9 @@ def _get_search_index_query( raise ValueError(f"Unsupported SearchType: {search_type}") else: return ( - "CALL db.index.vector.queryRelationships($index, $k, $embedding) " + "CALL db.index.vector.queryRelationships($index, $k * $ef, $embedding) " "YIELD relationship, score " + "WITH relationship, score LIMIT $k " ) @@ -461,6 +464,8 @@ class Neo4jVector(VectorStore): 'NODE' or 'RELATIONSHIP' pre_delete_collection: If True, will delete existing data if it exists. (default: False). Useful for testing. + effective_search_ratio: Controls the candidate pool size by multiplying $k + to balance query accuracy and performance. Example: .. code-block:: python @@ -587,6 +592,7 @@ def __init__( self.retrieval_query = retrieval_query self.search_type = search_type self._index_type = index_type + # Calculate embedding dimension self.embedding_dimension = len(embedding.embed_query("foo")) @@ -984,6 +990,7 @@ def similarity_search( k: int = 4, params: Dict[str, Any] = {}, filter: Optional[Dict[str, Any]] = None, + effective_search_ratio: int = 1, **kwargs: Any, ) -> List[Document]: """Run similarity search with Neo4jVector. @@ -996,7 +1003,9 @@ def similarity_search( filter (Optional[Dict[str, Any]]): Dictionary of argument(s) to filter on metadata. Defaults to None. - + effective_search_ratio (int): Controls the candidate pool size + by multiplying $k to balance query accuracy and performance. + Defaults to 1. Returns: List of Documents most similar to the query. """ @@ -1007,6 +1016,7 @@ def similarity_search( query=query, params=params, filter=filter, + effective_search_ratio=effective_search_ratio, **kwargs, ) @@ -1016,6 +1026,7 @@ def similarity_search_with_score( k: int = 4, params: Dict[str, Any] = {}, filter: Optional[Dict[str, Any]] = None, + effective_search_ratio: int = 1, **kwargs: Any, ) -> List[Tuple[Document, float]]: """Return docs most similar to query. @@ -1028,6 +1039,9 @@ def similarity_search_with_score( filter (Optional[Dict[str, Any]]): Dictionary of argument(s) to filter on metadata. Defaults to None. + effective_search_ratio (int): Controls the candidate pool size + by multiplying $k to balance query accuracy and performance. + Defaults to 1. Returns: List of Documents most similar to the query and score for each @@ -1039,6 +1053,7 @@ def similarity_search_with_score( query=query, params=params, filter=filter, + effective_search_ratio=effective_search_ratio, **kwargs, ) return docs @@ -1049,6 +1064,7 @@ def similarity_search_with_score_by_vector( k: int = 4, filter: Optional[Dict[str, Any]] = None, params: Dict[str, Any] = {}, + effective_search_ratio: int = 1, **kwargs: Any, ) -> List[Tuple[Document, float]]: """ @@ -1069,6 +1085,9 @@ def similarity_search_with_score_by_vector( Defaults to None. params (Dict[str, Any]): The search params for the index type. Defaults to empty dict. + effective_search_ratio (int): Controls the candidate pool size + by multiplying $k to balance query accuracy and performance. + Defaults to 1. Returns: List[Tuple[Document, float]]: A list of tuples, each containing @@ -1154,6 +1173,7 @@ def similarity_search_with_score_by_vector( "embedding": embedding, "keyword_index": self.keyword_index_name, "query": remove_lucene_chars(kwargs["query"]), + "ef": effective_search_ratio, **params, **filter_params, } @@ -1209,6 +1229,7 @@ def similarity_search_by_vector( k: int = 4, filter: Optional[Dict[str, Any]] = None, params: Dict[str, Any] = {}, + effective_search_ratio: int = 1, **kwargs: Any, ) -> List[Document]: """Return docs most similar to embedding vector. @@ -1226,7 +1247,12 @@ def similarity_search_by_vector( List of Documents most similar to the query vector. """ docs_and_scores = self.similarity_search_with_score_by_vector( - embedding=embedding, k=k, filter=filter, params=params, **kwargs + embedding=embedding, + k=k, + filter=filter, + params=params, + effective_search_ratio=effective_search_ratio, + **kwargs, ) return [doc for doc, _ in docs_and_scores] diff --git a/libs/neo4j/tests/integration_tests/vectorstores/test_neo4jvector.py b/libs/neo4j/tests/integration_tests/vectorstores/test_neo4jvector.py index 6007d0c..9e2d722 100644 --- a/libs/neo4j/tests/integration_tests/vectorstores/test_neo4jvector.py +++ b/libs/neo4j/tests/integration_tests/vectorstores/test_neo4jvector.py @@ -701,6 +701,7 @@ def test_hybrid_score_normalization() -> None: "index": "vector", "k": 1, "embedding": FakeEmbeddingsWithOsDimension().embed_query("foo"), + "ef": 1, "query": "foo", "keyword_index": "keyword", }, @@ -993,6 +994,28 @@ def test_neo4j_max_marginal_relevance_search() -> None: drop_vector_indexes(docsearch) +def test_neo4jvector_effective_search_ratio() -> None: + """Test effective search parameter.""" + docsearch = Neo4jVector.from_texts( + texts=texts, + embedding=FakeEmbeddingsWithOsDimension(), + url=url, + username=username, + password=password, + pre_delete_collection=True, + ) + output = docsearch.similarity_search("foo", k=2, effective_search_ratio=2) + assert len(output) == 2 + + output1 = docsearch.similarity_search_with_score( + "foo", k=2, effective_search_ratio=2 + ) + assert len(output1) == 2 + # Assert ordered by score + assert output1[0][1] > output1[1][1] + drop_vector_indexes(docsearch) + + def test_neo4jvector_passing_graph_object() -> None: """Test end to end construction and search with passing graph object.""" graph = Neo4jGraph(url=url, username=username, password=password) diff --git a/libs/neo4j/tests/unit_tests/vectorstores/test_neo4j.py b/libs/neo4j/tests/unit_tests/vectorstores/test_neo4j.py index 82b0153..837cb79 100644 --- a/libs/neo4j/tests/unit_tests/vectorstores/test_neo4j.py +++ b/libs/neo4j/tests/unit_tests/vectorstores/test_neo4j.py @@ -203,8 +203,9 @@ def test_converting_to_yaml() -> None: def test_get_search_index_query_hybrid_node_neo4j_5_23_above() -> None: expected_query = ( "CALL () { " - "CALL db.index.vector.queryNodes($index, $k, $embedding) " + "CALL db.index.vector.queryNodes($index, $k * $ef, $embedding) " "YIELD node, score " + "WITH node, score LIMIT $k " "WITH collect({node:node, score:score}) AS nodes, max(score) AS max " "UNWIND nodes AS n " "RETURN n.node AS node, (n.score / max) AS score UNION " @@ -225,8 +226,9 @@ def test_get_search_index_query_hybrid_node_neo4j_5_23_above() -> None: def test_get_search_index_query_hybrid_node_neo4j_5_23_below() -> None: expected_query = ( "CALL { " - "CALL db.index.vector.queryNodes($index, $k, $embedding) " + "CALL db.index.vector.queryNodes($index, $k * $ef, $embedding) " "YIELD node, score " + "WITH node, score LIMIT $k " "WITH collect({node:node, score:score}) AS nodes, max(score) AS max " "UNWIND nodes AS n " "RETURN n.node AS node, (n.score / max) AS score UNION "