Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

[Community][fix] Fix Azure cosmos db no SQL similarity search with score and mmr #28479

Original file line number Diff line number Diff line change
@@ -1,8 +1,9 @@
from __future__ import annotations

import logging
import uuid
import warnings
from typing import TYPE_CHECKING, Any, Dict, Iterable, List, Optional, Tuple
from typing import TYPE_CHECKING, Any, Callable, Dict, Iterable, List, Optional, Tuple

import numpy as np
from langchain_core.documents import Document
Expand All @@ -14,6 +15,8 @@
if TYPE_CHECKING:
from azure.cosmos.cosmos_client import CosmosClient

logger = logging.getLogger(__name__)


class AzureCosmosDBNoSqlVectorSearch(VectorStore):
"""`Azure Cosmos DB for NoSQL` vector store.
Expand All @@ -30,7 +33,7 @@ def __init__(
*,
cosmos_client: CosmosClient,
embedding: Embeddings,
vector_embedding_policy: Dict[str, Any],
vector_embedding_policy: Optional[Dict[str, Any]],
indexing_policy: Dict[str, Any],
cosmos_container_properties: Dict[str, Any],
cosmos_database_properties: Dict[str, Any],
Expand All @@ -50,17 +53,27 @@ def __init__(
indexing_policy: Indexing Policy for the container.
cosmos_container_properties: Container Properties for the container.
cosmos_database_properties: Database Properties for the container.
create_container: If True validates Properties for container creation.
"""
self._cosmos_client = cosmos_client
self._database_name = database_name
self._container_name = container_name
self._embedding = embedding
self._vector_embedding_policy = vector_embedding_policy
self._indexing_policy = indexing_policy
self._cosmos_container_properties = cosmos_container_properties
self._cosmos_database_properties = cosmos_database_properties
self._create_container = create_container

# validate vector_embedding_policy if specified
if (vector_embedding_policy is not None) and (
"vectorEmbeddings" not in vector_embedding_policy
or len(vector_embedding_policy["vectorEmbeddings"]) == 0
):
raise ValueError(
"vectorEmbeddings must be present and cannot be null or empty"
" in the vector_embedding_policy if specified."
)

if self._create_container:
if (
indexing_policy["vectorIndexes"] is None
Expand All @@ -69,13 +82,9 @@ def __init__(
raise ValueError(
"vectorIndexes cannot be null or empty in the indexing_policy."
)
if (
vector_embedding_policy is None
or len(vector_embedding_policy["vectorEmbeddings"]) == 0
):
if vector_embedding_policy is None:
raise ValueError(
"vectorEmbeddings cannot be null "
"or empty in the vector_embedding_policy."
"vector_embedding_policy cannot be null when creating a container."
)
if self._cosmos_container_properties["partition_key"] is None:
raise ValueError(
Expand Down Expand Up @@ -115,12 +124,44 @@ def __init__(
match_condition=self._cosmos_container_properties.get("match_condition"),
session_token=self._cosmos_container_properties.get("session_token"),
initial_headers=self._cosmos_container_properties.get("initial_headers"),
vector_embedding_policy=self._vector_embedding_policy,
vector_embedding_policy=vector_embedding_policy,
)

# Validate that the created container has the correct vector embedding policy
# properties
container_vector_embedding_policy = self._container.read().get(
"vector_embedding_policy"
)
if container_vector_embedding_policy is not None:
# Container already has vector search exposed, verify it matches if
# specified
if (
vector_embedding_policy is not None
and container_vector_embedding_policy != vector_embedding_policy
):
logger.warning(
"The specified container's vector embedding policy"
f" '{self._container_name}'"
" does not match the specified configuration."
)
else:
# Container doesn't have vector search exposed
# (may be available but not exposed), use specified policy
if vector_embedding_policy is None:
raise ValueError(
"The created container does not have vector search exposed"
" and no vector_embedding_policy was specified."
)
container_vector_embedding_policy = vector_embedding_policy

# Set vector embedding policy fields
self._vector_embedding_policy = container_vector_embedding_policy
self._embedding_key = self._vector_embedding_policy["vectorEmbeddings"][0][
"path"
][1:]
self._distance_strategy = self._vector_embedding_policy["vectorEmbeddings"][0][
"distanceFunction"
Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

is this guaranteed to always be specified? seems like we only check that self._vector_embedding_policy isn't empty if self._create_container is True

Copy link
Author

@wassim-mechergui-shift wassim-mechergui-shift Dec 9, 2024

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I'm not really sure on what's the intended logic of this code, when I wrote it I followed the embedding_key up above.
But you are right. vector_embedding_policy in this case is a dict. Nothing enforces what this dict needs to be if create_container is not specified. I think there's a minor issue in the code here that can be addressed.

In essence, if we want to use an azure cosmos db no sql vector store, it must have the vector search feature enabled.
To enable and specify it, when we create the container, we must specify (among other things) a Vector Policy (the dict in question). microsoft link
Once created, it's possible to query for these info in the properties of the container. (through container.read()) (vector search may be activated but not exposed, see comment down below)

Additionally, once created, _database.create_container_if_not_exists (the method we use here to create our connection to the container) will not be overriden by the new specified policy (and it's totally valid also in this case to give it None policy).

So here are my suggested changes:

  • make vector_embedding_policy optional since we only force the checks if create container is true
  • self._vector_embedding_policy shouldn't be acquired from vector_embedding_policy but from the container itself self._vector_embedding_policy should be acquired from container if it is exposed otherwise from user specified options
  • optionally add a warning if there's a mismatch between container exposed vector_embedding_policy and used specified vector_embedding_policy

LMK if this is okay with you ! (I added these changes)

Copy link
Author

@wassim-mechergui-shift wassim-mechergui-shift Dec 9, 2024

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Actually I was testing and this assumption is not always valid

Once created, it's possible to query for these info in the properties of the container. (through container.read())

It seems there is a bug in azure cosmos db no sql but if the container was the result of a partition_key migration then it seems the information on the vector policy is lost (so not possible to read through container.read()). However, the container still fully supports vector search !

I'll readapt the implementation to account for this case. It seems no longer possible to use container.read() as the one source of truth.

I'm sorry for the long post I wanted to be thorooughly descriptive and argumentative in my implementation choices!

]

def add_texts(
self,
Expand Down Expand Up @@ -260,6 +301,28 @@ def delete_document_by_id(self, document_id: Optional[str] = None) -> None:
raise ValueError("No document ids provided to delete.")
self._container.delete_item(document_id, partition_key=document_id)

def _select_relevance_score_fn(self) -> Callable[[float], float]:
"""
The 'correct' relevance function

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

may differ depending on a few things, including:
- the distance / similarity metric used by the VectorStore
- the scale of your embeddings (OpenAI's are unit normed. Many others are not!)
- embedding dimensionality
- etc.
"""
if self._distance_strategy == "cosine":
return self._cosine_relevance_score_fn
elif self._distance_strategy == "euclidean":
# Default behavior is to use euclidean distance relevancy
return self._euclidean_relevance_score_fn
elif self._distance_strategy == "dot product":
return self._max_inner_product_relevance_score_fn
else:
raise ValueError(
"Unknown distance strategy, must be cosine, dot product,"
" or euclidean"
)

def _similarity_search_with_score(
self,
embeddings: List[float],
Expand All @@ -273,8 +336,12 @@ def _similarity_search_with_score(
if pre_filter is None or pre_filter.get("limit_offset_clause") is None:
query += "TOP @limit "

embedding_field = ""
if with_embedding:
embedding_field = "c[@embeddingKey] as embeddingKey, "
Copy link
Author

@wassim-mechergui-shift wassim-mechergui-shift Dec 9, 2024

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

this is a minor optimization but its impact can be substantial: embeddings needed only when the with_embedding argument is specified as True
Removing it from default because it adds a heavier weight on the query

When we ran our tests, even with chunk of size 4K tokens, the retrieval is still so much faster when we don't query for the embeddings (we go from 0.3s/chunk on a test to retrieve 100 chunks to ~0.02s/chunk)


query += (
"c.id, c[@embeddingKey], c.text, c.metadata, "
f"c.id, {embedding_field}c.text, c.metadata, "
"VectorDistance(c[@embeddingKey], @embeddings) AS SimilarityScore FROM c"
)

Expand Down Expand Up @@ -305,7 +372,7 @@ def _similarity_search_with_score(
metadata = item["metadata"]
score = item["SimilarityScore"]
if with_embedding:
metadata[self._embedding_key] = item[self._embedding_key]
metadata[self._embedding_key] = item["embeddingKey"]
docs_and_scores.append(
(Document(page_content=text, metadata=metadata), score)
)
Expand Down
Loading