From 02096a6aa5e3b78b237b51bcba923bb33f348f09 Mon Sep 17 00:00:00 2001 From: Stefano Fiorucci Date: Wed, 31 Jan 2024 17:02:56 +0100 Subject: [PATCH 1/5] Update integrations/pgvector/src/haystack_integrations/document_stores/pgvector/document_store.py Co-authored-by: Massimiliano Pippi --- .../document_stores/pgvector/document_store.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/integrations/pgvector/src/haystack_integrations/document_stores/pgvector/document_store.py b/integrations/pgvector/src/haystack_integrations/document_stores/pgvector/document_store.py index 73da14bdc..40612a135 100644 --- a/integrations/pgvector/src/haystack_integrations/document_stores/pgvector/document_store.py +++ b/integrations/pgvector/src/haystack_integrations/document_stores/pgvector/document_store.py @@ -89,9 +89,9 @@ def __init__( :param table_name: The name of the table to use to store Haystack documents. Defaults to "haystack_documents". :param embedding_dimension: The dimension of the embedding. Defaults to 768. :param vector_function: The similarity function to use when searching for similar embeddings. - Defaults to "cosine_similarity". "cosine_similarity" and "inner_product" are similarity functions, - so the most similar documents are the ones with the lowest score. - "l2_distance" is a distance function, so the most similar documents are the ones with the smallest score. + Defaults to "cosine_similarity". "cosine_similarity" and "inner_product" are similarity functions and + higher scores indicate greater similarity between the documents. + "l2_distance" returns the straight-line distance between vectors, and the most similar documents are the ones with the smallest score. When using the "hnsw" search strategy, the vector_function value is used to build an appropriate index. :type vector_function: Literal["cosine_similarity", "inner_product", "l2_distance"] :param recreate_table: Whether to recreate the table if it already exists. Defaults to False. From 7dd59799beaa661b91ab08225fdd88da2d06c6e2 Mon Sep 17 00:00:00 2001 From: Stefano Fiorucci Date: Wed, 31 Jan 2024 17:03:14 +0100 Subject: [PATCH 2/5] Update integrations/pgvector/src/haystack_integrations/document_stores/pgvector/document_store.py Co-authored-by: Massimiliano Pippi --- .../document_stores/pgvector/document_store.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/integrations/pgvector/src/haystack_integrations/document_stores/pgvector/document_store.py b/integrations/pgvector/src/haystack_integrations/document_stores/pgvector/document_store.py index 40612a135..5e24168ea 100644 --- a/integrations/pgvector/src/haystack_integrations/document_stores/pgvector/document_store.py +++ b/integrations/pgvector/src/haystack_integrations/document_stores/pgvector/document_store.py @@ -92,7 +92,8 @@ def __init__( Defaults to "cosine_similarity". "cosine_similarity" and "inner_product" are similarity functions and higher scores indicate greater similarity between the documents. "l2_distance" returns the straight-line distance between vectors, and the most similar documents are the ones with the smallest score. - When using the "hnsw" search strategy, the vector_function value is used to build an appropriate index. + + Important: when using the "hnsw" search strategy, an index will be created that depends on the `vector_function` passed here. Make sure subsequent queries will keep using the same vector similarity function in order to take advantage of the index. :type vector_function: Literal["cosine_similarity", "inner_product", "l2_distance"] :param recreate_table: Whether to recreate the table if it already exists. Defaults to False. :param search_strategy: The search strategy to use when searching for similar embeddings. From 9b7ee61cc4aade84abb837ac9d790d4a0bf015d0 Mon Sep 17 00:00:00 2001 From: Stefano Fiorucci Date: Wed, 31 Jan 2024 17:03:22 +0100 Subject: [PATCH 3/5] Update integrations/pgvector/src/haystack_integrations/document_stores/pgvector/document_store.py Co-authored-by: Massimiliano Pippi --- .../document_stores/pgvector/document_store.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/integrations/pgvector/src/haystack_integrations/document_stores/pgvector/document_store.py b/integrations/pgvector/src/haystack_integrations/document_stores/pgvector/document_store.py index 5e24168ea..033f867a2 100644 --- a/integrations/pgvector/src/haystack_integrations/document_stores/pgvector/document_store.py +++ b/integrations/pgvector/src/haystack_integrations/document_stores/pgvector/document_store.py @@ -99,7 +99,7 @@ def __init__( :param search_strategy: The search strategy to use when searching for similar embeddings. Defaults to "exact_nearest_neighbor". "hnsw" is an approximate nearest neighbor search strategy, which trades off some accuracy for speed; it is recommended for large numbers of documents. - When using the "hnsw" search strategy, the vector_function value is used to build an appropriate index. + Important: when using the "hnsw" search strategy, an index will be created that depends on the `vector_function` passed here. Make sure subsequent queries will keep using the same vector similarity function in order to take advantage of the index. :type search_strategy: Literal["exact_nearest_neighbor", "hnsw"] :param hnsw_recreate_index_if_exists: Whether to recreate the HNSW index if it already exists. Defaults to False. Only used if search_strategy is set to "hnsw". From eacb97f90a7754b06f1401087a6607d08c6247a0 Mon Sep 17 00:00:00 2001 From: Stefano Fiorucci Date: Wed, 31 Jan 2024 17:03:54 +0100 Subject: [PATCH 4/5] Update integrations/pgvector/src/haystack_integrations/document_stores/pgvector/document_store.py Co-authored-by: Massimiliano Pippi --- .../pgvector/document_store.py | 21 ++----------------- 1 file changed, 2 insertions(+), 19 deletions(-) diff --git a/integrations/pgvector/src/haystack_integrations/document_stores/pgvector/document_store.py b/integrations/pgvector/src/haystack_integrations/document_stores/pgvector/document_store.py index 033f867a2..eba2f5c21 100644 --- a/integrations/pgvector/src/haystack_integrations/document_stores/pgvector/document_store.py +++ b/integrations/pgvector/src/haystack_integrations/document_stores/pgvector/document_store.py @@ -445,26 +445,9 @@ def _embedding_retrieval( """ Retrieves documents that are most similar to the query embedding using a vector similarity metric. - This method is not mean to be part of the public interface of - `PgvectorDocumentStore` nor called directly. + This method is not meant to be part of the public interface of + `PgvectorDocumentStore` and it should not be called directly. `PgvectorEmbeddingRetriever` uses this method directly and is the public interface for it. - - :param query_embedding: Embedding of the query. - :param filters: Filters applied to the retrieved Documents. Defaults to None. - When using the "hnsw" search strategy, filters are applied after the most similar Documents are retrieved, - so the number of results may be less than `top_k`. - To better understand HNSW index creation and configuration, refer to the pgvector documentation: - https://github.com/pgvector/pgvector?tab=readme-ov-file#hnsw - :param top_k: Maximum number of Documents to return, defaults to 10 - :param vector_function: The similarity function to use when searching for similar embeddings. - Defaults to the PgvectorDocumentStore's vector_function. - Since vector_function is used to build the HNSW index (when using the "hnsw" search strategy), - if a vector_function other than the one used to build the index is chosen, - the index will not be used and the search will be slower. - "cosine_similarity" and "inner_product" are similarity functions, - so the most similar documents are the ones with the lowest score. - "l2_distance" is a distance function, so the most similar documents are the ones with the smallest score. - :type vector_function: Literal["cosine_similarity", "inner_product", "l2_distance"] :raises ValueError :return: List of Documents that are most similar to `query_embedding` """ From 68b7a11985fa0268df635c93d02175347affad95 Mon Sep 17 00:00:00 2001 From: anakin87 Date: Wed, 31 Jan 2024 17:07:44 +0100 Subject: [PATCH 5/5] fix fmt --- .../document_stores/pgvector/document_store.py | 14 ++++++++++---- 1 file changed, 10 insertions(+), 4 deletions(-) diff --git a/integrations/pgvector/src/haystack_integrations/document_stores/pgvector/document_store.py b/integrations/pgvector/src/haystack_integrations/document_stores/pgvector/document_store.py index eba2f5c21..0abaaecce 100644 --- a/integrations/pgvector/src/haystack_integrations/document_stores/pgvector/document_store.py +++ b/integrations/pgvector/src/haystack_integrations/document_stores/pgvector/document_store.py @@ -91,15 +91,21 @@ def __init__( :param vector_function: The similarity function to use when searching for similar embeddings. Defaults to "cosine_similarity". "cosine_similarity" and "inner_product" are similarity functions and higher scores indicate greater similarity between the documents. - "l2_distance" returns the straight-line distance between vectors, and the most similar documents are the ones with the smallest score. - - Important: when using the "hnsw" search strategy, an index will be created that depends on the `vector_function` passed here. Make sure subsequent queries will keep using the same vector similarity function in order to take advantage of the index. + "l2_distance" returns the straight-line distance between vectors, + and the most similar documents are the ones with the smallest score. + + Important: when using the "hnsw" search strategy, an index will be created that depends on the + `vector_function` passed here. Make sure subsequent queries will keep using the same + vector similarity function in order to take advantage of the index. :type vector_function: Literal["cosine_similarity", "inner_product", "l2_distance"] :param recreate_table: Whether to recreate the table if it already exists. Defaults to False. :param search_strategy: The search strategy to use when searching for similar embeddings. Defaults to "exact_nearest_neighbor". "hnsw" is an approximate nearest neighbor search strategy, which trades off some accuracy for speed; it is recommended for large numbers of documents. - Important: when using the "hnsw" search strategy, an index will be created that depends on the `vector_function` passed here. Make sure subsequent queries will keep using the same vector similarity function in order to take advantage of the index. + + Important: when using the "hnsw" search strategy, an index will be created that depends on the + `vector_function` passed here. Make sure subsequent queries will keep using the same + vector similarity function in order to take advantage of the index. :type search_strategy: Literal["exact_nearest_neighbor", "hnsw"] :param hnsw_recreate_index_if_exists: Whether to recreate the HNSW index if it already exists. Defaults to False. Only used if search_strategy is set to "hnsw".