From 205300a0bd3b69b07fa227d02de8085fdaf26c9d Mon Sep 17 00:00:00 2001 From: wassim-mechergui-shift Date: Tue, 3 Dec 2024 16:48:43 +0100 Subject: [PATCH 01/11] add fix to mmr search --- .../vectorstores/azure_cosmos_db_no_sql.py | 30 +++++++++++++++++-- 1 file changed, 27 insertions(+), 3 deletions(-) diff --git a/libs/community/langchain_community/vectorstores/azure_cosmos_db_no_sql.py b/libs/community/langchain_community/vectorstores/azure_cosmos_db_no_sql.py index 2317af9da0250..d0866ebde6249 100644 --- a/libs/community/langchain_community/vectorstores/azure_cosmos_db_no_sql.py +++ b/libs/community/langchain_community/vectorstores/azure_cosmos_db_no_sql.py @@ -2,7 +2,7 @@ import uuid import warnings -from typing import TYPE_CHECKING, Any, Dict, Iterable, List, Optional, Tuple +from typing import TYPE_CHECKING, Any, Dict, Iterable, List, Optional, Tuple, Callable import numpy as np from langchain_core.documents import Document @@ -121,6 +121,8 @@ def __init__( self._embedding_key = self._vector_embedding_policy["vectorEmbeddings"][0][ "path" ][1:] + self._distance_strategy = self._vector_embedding_policy[ + 'vectorEmbeddings'][0]['distanceFunction'] def add_texts( self, @@ -260,6 +262,28 @@ def delete_document_by_id(self, document_id: Optional[str] = None) -> None: raise ValueError("No document ids provided to delete.") self._container.delete_item(document_id, partition_key=document_id) + def _select_relevance_score_fn(self) -> Callable[[float], float]: + """ + The 'correct' relevance function + may differ depending on a few things, including: + - the distance / similarity metric used by the VectorStore + - the scale of your embeddings (OpenAI's are unit normed. Many others are not!) + - embedding dimensionality + - etc. + """ + if self._distance_strategy == 'cosine': + return self._cosine_relevance_score_fn + elif self._distance_strategy == "euclidean": + # Default behavior is to use euclidean distance relevancy + return self._euclidean_relevance_score_fn + elif self._distance_strategy == "dot product": + return self._max_inner_product_relevance_score_fn + else: + raise ValueError( + "Unknown distance strategy, must be cosine, max_inner_product," + " or euclidean" + ) + def _similarity_search_with_score( self, embeddings: List[float], @@ -274,7 +298,7 @@ def _similarity_search_with_score( query += "TOP @limit " query += ( - "c.id, c[@embeddingKey], c.text, c.metadata, " + f"c.id, c[@embeddingKey] as embeddingKey, c.text, c.metadata, " "VectorDistance(c[@embeddingKey], @embeddings) AS SimilarityScore FROM c" ) @@ -305,7 +329,7 @@ def _similarity_search_with_score( metadata = item["metadata"] score = item["SimilarityScore"] if with_embedding: - metadata[self._embedding_key] = item[self._embedding_key] + metadata[self._embedding_key] = item["embeddingKey"] docs_and_scores.append( (Document(page_content=text, metadata=metadata), score) ) From 296f1c3bb700c0afda2e538947f4a12c00f9e5a3 Mon Sep 17 00:00:00 2001 From: wassim-mechergui-shift Date: Tue, 3 Dec 2024 17:09:49 +0100 Subject: [PATCH 02/11] cleanup trailing character --- .../langchain_community/vectorstores/azure_cosmos_db_no_sql.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/libs/community/langchain_community/vectorstores/azure_cosmos_db_no_sql.py b/libs/community/langchain_community/vectorstores/azure_cosmos_db_no_sql.py index d0866ebde6249..33ed4d736e636 100644 --- a/libs/community/langchain_community/vectorstores/azure_cosmos_db_no_sql.py +++ b/libs/community/langchain_community/vectorstores/azure_cosmos_db_no_sql.py @@ -298,7 +298,7 @@ def _similarity_search_with_score( query += "TOP @limit " query += ( - f"c.id, c[@embeddingKey] as embeddingKey, c.text, c.metadata, " + "c.id, c[@embeddingKey] as embeddingKey, c.text, c.metadata, " "VectorDistance(c[@embeddingKey], @embeddings) AS SimilarityScore FROM c" ) From f00171783835b0ef177aab43a24192bfd8eecddd Mon Sep 17 00:00:00 2001 From: wassim-mechergui-shift Date: Tue, 3 Dec 2024 17:21:35 +0100 Subject: [PATCH 03/11] fix linting --- .../langchain_community/vectorstores/azure_cosmos_db_no_sql.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/libs/community/langchain_community/vectorstores/azure_cosmos_db_no_sql.py b/libs/community/langchain_community/vectorstores/azure_cosmos_db_no_sql.py index 33ed4d736e636..844368a7355cc 100644 --- a/libs/community/langchain_community/vectorstores/azure_cosmos_db_no_sql.py +++ b/libs/community/langchain_community/vectorstores/azure_cosmos_db_no_sql.py @@ -2,7 +2,8 @@ import uuid import warnings -from typing import TYPE_CHECKING, Any, Dict, Iterable, List, Optional, Tuple, Callable +from typing import (TYPE_CHECKING, Any, Callable, Dict, + Iterable, List, Optional, Tuple) import numpy as np from langchain_core.documents import Document From 24f39c538ab3c3c1c3a7e692c4fde098ab4dd71d Mon Sep 17 00:00:00 2001 From: wassim-mechergui-shift Date: Tue, 3 Dec 2024 17:25:57 +0100 Subject: [PATCH 04/11] fix linting --- .../vectorstores/azure_cosmos_db_no_sql.py | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/libs/community/langchain_community/vectorstores/azure_cosmos_db_no_sql.py b/libs/community/langchain_community/vectorstores/azure_cosmos_db_no_sql.py index 844368a7355cc..44ee4fd3a7ddc 100644 --- a/libs/community/langchain_community/vectorstores/azure_cosmos_db_no_sql.py +++ b/libs/community/langchain_community/vectorstores/azure_cosmos_db_no_sql.py @@ -2,8 +2,7 @@ import uuid import warnings -from typing import (TYPE_CHECKING, Any, Callable, Dict, - Iterable, List, Optional, Tuple) +from typing import TYPE_CHECKING, Any, Callable, Dict, Iterable, List, Optional, Tuple import numpy as np from langchain_core.documents import Document @@ -122,8 +121,9 @@ def __init__( self._embedding_key = self._vector_embedding_policy["vectorEmbeddings"][0][ "path" ][1:] - self._distance_strategy = self._vector_embedding_policy[ - 'vectorEmbeddings'][0]['distanceFunction'] + self._distance_strategy = self._vector_embedding_policy["vectorEmbeddings"][0][ + "distanceFunction" + ] def add_texts( self, @@ -272,7 +272,7 @@ def _select_relevance_score_fn(self) -> Callable[[float], float]: - embedding dimensionality - etc. """ - if self._distance_strategy == 'cosine': + if self._distance_strategy == "cosine": return self._cosine_relevance_score_fn elif self._distance_strategy == "euclidean": # Default behavior is to use euclidean distance relevancy From 477a79e1c9d168d708c1bb14064e9c94d9b566c8 Mon Sep 17 00:00:00 2001 From: wassim-mechergui-shift Date: Fri, 6 Dec 2024 11:10:30 +0100 Subject: [PATCH 05/11] change used naming --- .../langchain_community/vectorstores/azure_cosmos_db_no_sql.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/libs/community/langchain_community/vectorstores/azure_cosmos_db_no_sql.py b/libs/community/langchain_community/vectorstores/azure_cosmos_db_no_sql.py index 44ee4fd3a7ddc..3320d38497cfd 100644 --- a/libs/community/langchain_community/vectorstores/azure_cosmos_db_no_sql.py +++ b/libs/community/langchain_community/vectorstores/azure_cosmos_db_no_sql.py @@ -281,7 +281,7 @@ def _select_relevance_score_fn(self) -> Callable[[float], float]: return self._max_inner_product_relevance_score_fn else: raise ValueError( - "Unknown distance strategy, must be cosine, max_inner_product," + "Unknown distance strategy, must be cosine, dot product," " or euclidean" ) From d1e32cbfe367300cb2134556c41b5660a034d149 Mon Sep 17 00:00:00 2001 From: wassim-mechergui-shift Date: Mon, 9 Dec 2024 12:21:12 +0100 Subject: [PATCH 06/11] enforce correct property management --- .../vectorstores/azure_cosmos_db_no_sql.py | 33 ++++++++++++++++--- 1 file changed, 29 insertions(+), 4 deletions(-) diff --git a/libs/community/langchain_community/vectorstores/azure_cosmos_db_no_sql.py b/libs/community/langchain_community/vectorstores/azure_cosmos_db_no_sql.py index 3320d38497cfd..b1f0fae3ac5cb 100644 --- a/libs/community/langchain_community/vectorstores/azure_cosmos_db_no_sql.py +++ b/libs/community/langchain_community/vectorstores/azure_cosmos_db_no_sql.py @@ -1,5 +1,6 @@ from __future__ import annotations +import logging import uuid import warnings from typing import TYPE_CHECKING, Any, Callable, Dict, Iterable, List, Optional, Tuple @@ -14,6 +15,8 @@ if TYPE_CHECKING: from azure.cosmos.cosmos_client import CosmosClient +logger = logging.getLogger(__name__) + class AzureCosmosDBNoSqlVectorSearch(VectorStore): """`Azure Cosmos DB for NoSQL` vector store. @@ -30,7 +33,7 @@ def __init__( *, cosmos_client: CosmosClient, embedding: Embeddings, - vector_embedding_policy: Dict[str, Any], + vector_embedding_policy: Optional[Dict[str, Any]], indexing_policy: Dict[str, Any], cosmos_container_properties: Dict[str, Any], cosmos_database_properties: Dict[str, Any], @@ -50,12 +53,12 @@ def __init__( indexing_policy: Indexing Policy for the container. cosmos_container_properties: Container Properties for the container. cosmos_database_properties: Database Properties for the container. + create_container: If True validates Properties for container creation. """ self._cosmos_client = cosmos_client self._database_name = database_name self._container_name = container_name self._embedding = embedding - self._vector_embedding_policy = vector_embedding_policy self._indexing_policy = indexing_policy self._cosmos_container_properties = cosmos_container_properties self._cosmos_database_properties = cosmos_database_properties @@ -115,9 +118,27 @@ def __init__( match_condition=self._cosmos_container_properties.get("match_condition"), session_token=self._cosmos_container_properties.get("session_token"), initial_headers=self._cosmos_container_properties.get("initial_headers"), - vector_embedding_policy=self._vector_embedding_policy, + vector_embedding_policy=vector_embedding_policy, ) + # Validate that the created container has the correct vector embedding policy properties + properties = self._container.read() + container_vector_embedding_policy = properties.get("vector_embedding_policy") + if container_vector_embedding_policy is None: + raise ValueError( + "The created container does not have vector search enabled." + ) + if vector_embedding_policy is not None and not all( + key in container_vector_embedding_policy + and container_vector_embedding_policy[key] == vector_embedding_policy[key] + for key in vector_embedding_policy + ): + logger.warning( + "The created container's vector embedding policy does not match the specified configuration." + ) + + # Set vector embedding policy fields + self._vector_embedding_policy = container_vector_embedding_policy self._embedding_key = self._vector_embedding_policy["vectorEmbeddings"][0][ "path" ][1:] @@ -298,8 +319,12 @@ def _similarity_search_with_score( if pre_filter is None or pre_filter.get("limit_offset_clause") is None: query += "TOP @limit " + embedding_field = "" + if with_embedding: + embedding_field = "c[@embeddingKey] as embeddingKey, " + query += ( - "c.id, c[@embeddingKey] as embeddingKey, c.text, c.metadata, " + f"c.id, {embedding_field}c.text, c.metadata, " "VectorDistance(c[@embeddingKey], @embeddings) AS SimilarityScore FROM c" ) From d6179dd96f90503f72fe115b90b854b13d912746 Mon Sep 17 00:00:00 2001 From: wassim-mechergui-shift Date: Mon, 9 Dec 2024 13:09:45 +0100 Subject: [PATCH 07/11] reflect vector policy specifications --- .../vectorstores/azure_cosmos_db_no_sql.py | 51 +++++++++++-------- 1 file changed, 31 insertions(+), 20 deletions(-) diff --git a/libs/community/langchain_community/vectorstores/azure_cosmos_db_no_sql.py b/libs/community/langchain_community/vectorstores/azure_cosmos_db_no_sql.py index b1f0fae3ac5cb..5cd1607dd587e 100644 --- a/libs/community/langchain_community/vectorstores/azure_cosmos_db_no_sql.py +++ b/libs/community/langchain_community/vectorstores/azure_cosmos_db_no_sql.py @@ -64,6 +64,16 @@ def __init__( self._cosmos_database_properties = cosmos_database_properties self._create_container = create_container + # validate vector_embedding_policy if specified + if (vector_embedding_policy is not None) and ( + "vectorEmbeddings" not in vector_embedding_policy + or len(vector_embedding_policy["vectorEmbeddings"]) == 0 + ): + raise ValueError( + "vectorEmbeddings must be present and cannot be null or empty" + " in the vector_embedding_policy if specified." + ) + if self._create_container: if ( indexing_policy["vectorIndexes"] is None @@ -72,13 +82,9 @@ def __init__( raise ValueError( "vectorIndexes cannot be null or empty in the indexing_policy." ) - if ( - vector_embedding_policy is None - or len(vector_embedding_policy["vectorEmbeddings"]) == 0 - ): + if vector_embedding_policy is None: raise ValueError( - "vectorEmbeddings cannot be null " - "or empty in the vector_embedding_policy." + "vector_embedding_policy cannot be null when creating a container." ) if self._cosmos_container_properties["partition_key"] is None: raise ValueError( @@ -122,20 +128,25 @@ def __init__( ) # Validate that the created container has the correct vector embedding policy properties - properties = self._container.read() - container_vector_embedding_policy = properties.get("vector_embedding_policy") - if container_vector_embedding_policy is None: - raise ValueError( - "The created container does not have vector search enabled." - ) - if vector_embedding_policy is not None and not all( - key in container_vector_embedding_policy - and container_vector_embedding_policy[key] == vector_embedding_policy[key] - for key in vector_embedding_policy - ): - logger.warning( - "The created container's vector embedding policy does not match the specified configuration." - ) + container_vector_embedding_policy = self._container.read().get( + "vector_embedding_policy" + ) + if container_vector_embedding_policy is not None: + # Container already has vector search exposed, verify it matches if specified + if ( + vector_embedding_policy is not None + and container_vector_embedding_policy != vector_embedding_policy + ): + logger.warning( + "The created container's vector embedding policy does not match the specified configuration." + ) + else: + # Container doesn't have vector search exposed, assume specified policy + if vector_embedding_policy is None: + raise ValueError( + "The created container does not have vector search enabled and no vector_embedding_policy was specified." + ) + container_vector_embedding_policy = vector_embedding_policy # Set vector embedding policy fields self._vector_embedding_policy = container_vector_embedding_policy From 6ac62bc19f2cea1bb55fc68f0805a431a2e294c6 Mon Sep 17 00:00:00 2001 From: wassim-mechergui-shift Date: Mon, 9 Dec 2024 13:14:55 +0100 Subject: [PATCH 08/11] update warnings to reflect situation --- .../vectorstores/azure_cosmos_db_no_sql.py | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/libs/community/langchain_community/vectorstores/azure_cosmos_db_no_sql.py b/libs/community/langchain_community/vectorstores/azure_cosmos_db_no_sql.py index 5cd1607dd587e..8d50812b1e6ea 100644 --- a/libs/community/langchain_community/vectorstores/azure_cosmos_db_no_sql.py +++ b/libs/community/langchain_community/vectorstores/azure_cosmos_db_no_sql.py @@ -141,10 +141,12 @@ def __init__( "The created container's vector embedding policy does not match the specified configuration." ) else: - # Container doesn't have vector search exposed, assume specified policy + # Container doesn't have vector search exposed + # (may be available but not exposed), use specified policy if vector_embedding_policy is None: raise ValueError( - "The created container does not have vector search enabled and no vector_embedding_policy was specified." + "The created container does not have vector search exposed" + " and no vector_embedding_policy was specified." ) container_vector_embedding_policy = vector_embedding_policy From 9bbbd79eb7b8bcfbe8041a415781894a2f708c75 Mon Sep 17 00:00:00 2001 From: wassim-mechergui-shift Date: Mon, 9 Dec 2024 13:17:46 +0100 Subject: [PATCH 09/11] specify container name in warning --- .../langchain_community/vectorstores/azure_cosmos_db_no_sql.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/libs/community/langchain_community/vectorstores/azure_cosmos_db_no_sql.py b/libs/community/langchain_community/vectorstores/azure_cosmos_db_no_sql.py index 8d50812b1e6ea..920d7affc967a 100644 --- a/libs/community/langchain_community/vectorstores/azure_cosmos_db_no_sql.py +++ b/libs/community/langchain_community/vectorstores/azure_cosmos_db_no_sql.py @@ -138,7 +138,8 @@ def __init__( and container_vector_embedding_policy != vector_embedding_policy ): logger.warning( - "The created container's vector embedding policy does not match the specified configuration." + f"The specified container's vector embedding policy '{self._container_name}'" + " does not match the specified configuration." ) else: # Container doesn't have vector search exposed From a417523ae3160f2076b8b23f9e71cc7249084284 Mon Sep 17 00:00:00 2001 From: wassim-mechergui-shift Date: Mon, 9 Dec 2024 13:23:11 +0100 Subject: [PATCH 10/11] fix linting --- .../vectorstores/azure_cosmos_db_no_sql.py | 9 ++++++--- 1 file changed, 6 insertions(+), 3 deletions(-) diff --git a/libs/community/langchain_community/vectorstores/azure_cosmos_db_no_sql.py b/libs/community/langchain_community/vectorstores/azure_cosmos_db_no_sql.py index 920d7affc967a..798937c0887f6 100644 --- a/libs/community/langchain_community/vectorstores/azure_cosmos_db_no_sql.py +++ b/libs/community/langchain_community/vectorstores/azure_cosmos_db_no_sql.py @@ -127,18 +127,21 @@ def __init__( vector_embedding_policy=vector_embedding_policy, ) - # Validate that the created container has the correct vector embedding policy properties + # Validate that the created container has the correct vector embedding policy + # properties container_vector_embedding_policy = self._container.read().get( "vector_embedding_policy" ) if container_vector_embedding_policy is not None: - # Container already has vector search exposed, verify it matches if specified + # Container already has vector search exposed, verify it matches if + # specified if ( vector_embedding_policy is not None and container_vector_embedding_policy != vector_embedding_policy ): logger.warning( - f"The specified container's vector embedding policy '{self._container_name}'" + "The specified container's vector embedding policy" + f" '{self._container_name}'" " does not match the specified configuration." ) else: From 2aa26eebd8744bf26bdf97d3b6eb00e12bb4d5f1 Mon Sep 17 00:00:00 2001 From: wassim-mechergui-shift Date: Mon, 9 Dec 2024 13:27:35 +0100 Subject: [PATCH 11/11] lint --- .../vectorstores/azure_cosmos_db_no_sql.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/libs/community/langchain_community/vectorstores/azure_cosmos_db_no_sql.py b/libs/community/langchain_community/vectorstores/azure_cosmos_db_no_sql.py index 798937c0887f6..fe2091f7f2976 100644 --- a/libs/community/langchain_community/vectorstores/azure_cosmos_db_no_sql.py +++ b/libs/community/langchain_community/vectorstores/azure_cosmos_db_no_sql.py @@ -71,7 +71,7 @@ def __init__( ): raise ValueError( "vectorEmbeddings must be present and cannot be null or empty" - " in the vector_embedding_policy if specified." + " in the vector_embedding_policy if specified." ) if self._create_container: @@ -140,12 +140,12 @@ def __init__( and container_vector_embedding_policy != vector_embedding_policy ): logger.warning( - "The specified container's vector embedding policy" + "The specified container's vector embedding policy" f" '{self._container_name}'" " does not match the specified configuration." ) else: - # Container doesn't have vector search exposed + # Container doesn't have vector search exposed # (may be available but not exposed), use specified policy if vector_embedding_policy is None: raise ValueError(