From 868f2f6932691957b1f4bd2f373122d67ca9a011 Mon Sep 17 00:00:00 2001 From: Eric Pinzur Date: Thu, 7 Nov 2024 11:46:30 -0600 Subject: [PATCH 1/2] fixed integration tests --- .../integration_tests/test_vectorstores.py | 37 +++++++++++++------ 1 file changed, 25 insertions(+), 12 deletions(-) diff --git a/libs/partners/chroma/tests/integration_tests/test_vectorstores.py b/libs/partners/chroma/tests/integration_tests/test_vectorstores.py index 382b24cb54b47..4393d5f339b09 100644 --- a/libs/partners/chroma/tests/integration_tests/test_vectorstores.py +++ b/libs/partners/chroma/tests/integration_tests/test_vectorstores.py @@ -1,12 +1,16 @@ """Test Chroma functionality.""" import uuid -from typing import Generator +from typing import ( + Generator, + cast, +) import chromadb import pytest # type: ignore[import-not-found] import requests from chromadb.api.client import SharedSystemClient +from chromadb.api.types import Embeddable from langchain_core.documents import Document from langchain_core.embeddings.fake import FakeEmbeddings as Fak @@ -17,6 +21,15 @@ ) +class MyEmbeddingFunction: + def __init__(self, fak: Fak): + self.fak = fak + + def __call__(self, input: Embeddable) -> list[list[float]]: + texts = cast(list[str], input) + return self.fak.embed_documents(texts=texts) + + @pytest.fixture() def client() -> Generator[chromadb.ClientAPI, None, None]: SharedSystemClient.clear_system_cache() @@ -254,8 +267,8 @@ def test_chroma_update_document() -> None: # Assert that the updated document is returned by the search assert output == [Document(page_content=updated_content, metadata={"page": "0"})] - assert new_embedding == embedding.embed_documents([updated_content])[0] - assert new_embedding != old_embedding + assert list(new_embedding) == list(embedding.embed_documents([updated_content])[0]) + assert list(new_embedding) != list(old_embedding) # TODO: RELEVANCE SCORE IS BROKEN. 
FIX TEST @@ -341,17 +354,17 @@ def batch_support_chroma_version() -> bool: ) def test_chroma_large_batch() -> None: client = chromadb.HttpClient() - embedding_function = Fak(size=255) + embedding_function = MyEmbeddingFunction(fak=Fak(size=255)) col = client.get_or_create_collection( "my_collection", - embedding_function=embedding_function.embed_documents, # type: ignore + embedding_function=embedding_function, # type: ignore ) - docs = ["This is a test document"] * (client.max_batch_size + 100) # type: ignore + docs = ["This is a test document"] * (client.get_max_batch_size() + 100) # type: ignore db = Chroma.from_texts( client=client, collection_name=col.name, texts=docs, - embedding=embedding_function, + embedding=embedding_function.fak, ids=[str(uuid.uuid4()) for _ in range(len(docs))], ) @@ -369,18 +382,18 @@ def test_chroma_large_batch() -> None: ) def test_chroma_large_batch_update() -> None: client = chromadb.HttpClient() - embedding_function = Fak(size=255) + embedding_function = MyEmbeddingFunction(fak=Fak(size=255)) col = client.get_or_create_collection( "my_collection", - embedding_function=embedding_function.embed_documents, # type: ignore + embedding_function=embedding_function, # type: ignore ) - docs = ["This is a test document"] * (client.max_batch_size + 100) # type: ignore + docs = ["This is a test document"] * (client.get_max_batch_size() + 100) # type: ignore ids = [str(uuid.uuid4()) for _ in range(len(docs))] db = Chroma.from_texts( client=client, collection_name=col.name, texts=docs, - embedding=embedding_function, + embedding=embedding_function.fak, ids=ids, ) new_docs = [ @@ -408,7 +421,7 @@ def test_chroma_legacy_batching() -> None: embedding_function = Fak(size=255) col = client.get_or_create_collection( "my_collection", - embedding_function=embedding_function.embed_documents, # type: ignore + embedding_function=MyEmbeddingFunction(fak=embedding_function), # type: ignore ) docs = ["This is a test document"] * 100 db = Chroma.from_texts( From 
637cf792ab767c09454cea522de20c5be9f32f82 Mon Sep 17 00:00:00 2001 From: Eric Pinzur Date: Fri, 8 Nov 2024 12:09:21 -0600 Subject: [PATCH 2/2] added document.id support --- .../chroma/langchain_chroma/vectorstores.py | 8 +- .../integration_tests/test_vectorstores.py | 192 ++++++++++++++++-- 2 files changed, 178 insertions(+), 22 deletions(-) diff --git a/libs/partners/chroma/langchain_chroma/vectorstores.py b/libs/partners/chroma/langchain_chroma/vectorstores.py index 35146fdcc7603..5405e7074b9de 100644 --- a/libs/partners/chroma/langchain_chroma/vectorstores.py +++ b/libs/partners/chroma/langchain_chroma/vectorstores.py @@ -44,10 +44,14 @@ def _results_to_docs_and_scores(results: Any) -> List[Tuple[Document, float]]: return [ # TODO: Chroma can do batch querying, # we shouldn't hard code to the 1st result - (Document(page_content=result[0], metadata=result[1] or {}), result[2]) + ( + Document(page_content=result[0], metadata=result[1] or {}, id=result[2]), + result[3], + ) for result in zip( results["documents"][0], results["metadatas"][0], + results["ids"][0], results["distances"][0], ) ] @@ -1129,6 +1133,8 @@ def from_documents( """ texts = [doc.page_content for doc in documents] metadatas = [doc.metadata for doc in documents] + if ids is None: + ids = [doc.id if doc.id else "" for doc in documents] return cls.from_texts( texts=texts, embedding=embedding, diff --git a/libs/partners/chroma/tests/integration_tests/test_vectorstores.py b/libs/partners/chroma/tests/integration_tests/test_vectorstores.py index 4393d5f339b09..2cb1b31ebf602 100644 --- a/libs/partners/chroma/tests/integration_tests/test_vectorstores.py +++ b/libs/partners/chroma/tests/integration_tests/test_vectorstores.py @@ -46,8 +46,27 @@ def test_chroma() -> None: output = docsearch.similarity_search("foo", k=1) docsearch.delete_collection() + assert len(output) == 1 + assert output[0].page_content == "foo" + assert output[0].id is not None - assert output == [Document(page_content="foo")] + +def 
test_chroma_with_ids() -> None: + """Test end to end construction and search.""" + texts = ["foo", "bar", "baz"] + ids = [f"id_{i}" for i in range(len(texts))] + docsearch = Chroma.from_texts( + collection_name="test_collection", + texts=texts, + embedding=FakeEmbeddings(), + ids=ids, + ) + output = docsearch.similarity_search("foo", k=1) + + docsearch.delete_collection() + assert len(output) == 1 + assert output[0].page_content == "foo" + assert output[0].id == "id_0" async def test_chroma_async() -> None: @@ -59,7 +78,27 @@ async def test_chroma_async() -> None: output = await docsearch.asimilarity_search("foo", k=1) docsearch.delete_collection() - assert output == [Document(page_content="foo")] + assert len(output) == 1 + assert output[0].page_content == "foo" + assert output[0].id is not None + + +async def test_chroma_async_with_ids() -> None: + """Test end to end construction and search.""" + texts = ["foo", "bar", "baz"] + ids = [f"id_{i}" for i in range(len(texts))] + docsearch = Chroma.from_texts( + collection_name="test_collection", + texts=texts, + embedding=FakeEmbeddings(), + ids=ids, + ) + output = await docsearch.asimilarity_search("foo", k=1) + + docsearch.delete_collection() + assert len(output) == 1 + assert output[0].page_content == "foo" + assert output[0].id == "id_0" def test_chroma_with_metadatas() -> None: @@ -74,28 +113,56 @@ def test_chroma_with_metadatas() -> None: ) output = docsearch.similarity_search("foo", k=1) docsearch.delete_collection() - assert output == [Document(page_content="foo", metadata={"page": "0"})] + assert len(output) == 1 + assert output[0].page_content == "foo" + assert output[0].metadata == {"page": "0"} + assert output[0].id is not None -def test_chroma_with_metadatas_with_scores() -> None: +def test_chroma_with_metadatas_and_ids() -> None: + """Test end to end construction and search.""" + texts = ["foo", "bar", "baz"] + metadatas = [{"page": str(i)} for i in range(len(texts))] + ids = [f"id_{i}" for i in 
range(len(texts))] + docsearch = Chroma.from_texts( + collection_name="test_collection", + texts=texts, + embedding=FakeEmbeddings(), + metadatas=metadatas, + ids=ids, + ) + output = docsearch.similarity_search("foo", k=1) + docsearch.delete_collection() + assert len(output) == 1 + assert output[0].page_content == "foo" + assert output[0].metadata == {"page": "0"} + assert output[0].id == "id_0" + + +def test_chroma_with_metadatas_with_scores_and_ids() -> None: """Test end to end construction and scored search.""" texts = ["foo", "bar", "baz"] metadatas = [{"page": str(i)} for i in range(len(texts))] + ids = [f"id_{i}" for i in range(len(texts))] docsearch = Chroma.from_texts( collection_name="test_collection", texts=texts, embedding=FakeEmbeddings(), metadatas=metadatas, + ids=ids, ) output = docsearch.similarity_search_with_score("foo", k=1) docsearch.delete_collection() - assert output == [(Document(page_content="foo", metadata={"page": "0"}), 0.0)] + assert output == [ + (Document(page_content="foo", metadata={"page": "0"}, id="id_0"), 0.0) + ] def test_chroma_with_metadatas_with_scores_using_vector() -> None: """Test end to end construction and scored search, using embedding vector.""" texts = ["foo", "bar", "baz"] metadatas = [{"page": str(i)} for i in range(len(texts))] + ids = [f"id_{i}" for i in range(len(texts))] embeddings = FakeEmbeddings() docsearch = Chroma.from_texts( @@ -103,41 +170,52 @@ def test_chroma_with_metadatas_with_scores_using_vector() -> None: texts=texts, embedding=embeddings, metadatas=metadatas, + ids=ids, ) embedded_query = embeddings.embed_query("foo") output = docsearch.similarity_search_by_vector_with_relevance_scores( embedding=embedded_query, k=1 ) docsearch.delete_collection() - assert output == [(Document(page_content="foo", metadata={"page": "0"}), 0.0)] + assert output == [ + (Document(page_content="foo", metadata={"page": "0"}, id="id_0"), 0.0) + ] def test_chroma_search_filter() -> None: """Test end to end construction and 
search with metadata filtering.""" texts = ["far", "bar", "baz"] metadatas = [{"first_letter": "{}".format(text[0])} for text in texts] + ids = [f"id_{i}" for i in range(len(texts))] docsearch = Chroma.from_texts( collection_name="test_collection", texts=texts, embedding=FakeEmbeddings(), metadatas=metadatas, + ids=ids, ) output1 = docsearch.similarity_search("far", k=1, filter={"first_letter": "f"}) output2 = docsearch.similarity_search("far", k=1, filter={"first_letter": "b"}) docsearch.delete_collection() - assert output1 == [Document(page_content="far", metadata={"first_letter": "f"})] - assert output2 == [Document(page_content="bar", metadata={"first_letter": "b"})] + assert output1 == [ + Document(page_content="far", metadata={"first_letter": "f"}, id="id_0") + ] + assert output2 == [ + Document(page_content="bar", metadata={"first_letter": "b"}, id="id_1") + ] def test_chroma_search_filter_with_scores() -> None: """Test end to end construction and scored search with metadata filtering.""" texts = ["far", "bar", "baz"] metadatas = [{"first_letter": "{}".format(text[0])} for text in texts] + ids = [f"id_{i}" for i in range(len(texts))] docsearch = Chroma.from_texts( collection_name="test_collection", texts=texts, embedding=FakeEmbeddings(), metadatas=metadatas, + ids=ids, ) output1 = docsearch.similarity_search_with_score( "far", k=1, filter={"first_letter": "f"} @@ -147,10 +225,10 @@ def test_chroma_search_filter_with_scores() -> None: ) docsearch.delete_collection() assert output1 == [ - (Document(page_content="far", metadata={"first_letter": "f"}), 0.0) + (Document(page_content="far", metadata={"first_letter": "f"}, id="id_0"), 0.0) ] assert output2 == [ - (Document(page_content="bar", metadata={"first_letter": "b"}), 1.0) + (Document(page_content="bar", metadata={"first_letter": "b"}, id="id_1"), 1.0) ] @@ -159,15 +237,18 @@ def test_chroma_with_persistence() -> None: chroma_persist_dir = "./tests/persist_dir" collection_name = "test_collection" texts = 
["foo", "bar", "baz"] + ids = [f"id_{i}" for i in range(len(texts))] + docsearch = Chroma.from_texts( collection_name=collection_name, texts=texts, embedding=FakeEmbeddings(), persist_directory=chroma_persist_dir, + ids=ids, ) output = docsearch.similarity_search("foo", k=1) - assert output == [Document(page_content="foo")] + assert output == [Document(page_content="foo", id="id_0")] # Get a new VectorStore from the persisted directory docsearch = Chroma( @@ -176,6 +257,7 @@ def test_chroma_with_persistence() -> None: persist_directory=chroma_persist_dir, ) output = docsearch.similarity_search("foo", k=1) + assert output == [Document(page_content="foo", id="id_0")] # Clean up docsearch.delete_collection() @@ -193,7 +275,9 @@ def test_chroma_mmr() -> None: ) output = docsearch.max_marginal_relevance_search("foo", k=1) docsearch.delete_collection() - assert output == [Document(page_content="foo")] + assert len(output) == 1 + assert output[0].page_content == "foo" + assert output[0].id is not None def test_chroma_mmr_by_vector() -> None: @@ -206,7 +290,9 @@ def test_chroma_mmr_by_vector() -> None: embedded_query = embeddings.embed_query("foo") output = docsearch.max_marginal_relevance_search_by_vector(embedded_query, k=1) docsearch.delete_collection() - assert output == [Document(page_content="foo")] + assert len(output) == 1 + assert output[0].page_content == "foo" + assert output[0].id is not None def test_chroma_with_include_parameter() -> None: @@ -223,7 +309,10 @@ def test_chroma_with_include_parameter() -> None: def test_chroma_update_document() -> None: - """Test the update_document function in the Chroma class.""" + """Test the update_document function in the Chroma class. + + Uses an external document id. 
+ """ # Make a consistent embedding embedding = ConsistentFakeEmbeddings() @@ -265,7 +354,66 @@ def test_chroma_update_document() -> None: docsearch.delete_collection() # Assert that the updated document is returned by the search - assert output == [Document(page_content=updated_content, metadata={"page": "0"})] + assert output == [ + Document(page_content=updated_content, metadata={"page": "0"}, id=document_id) + ] + + assert list(new_embedding) == list(embedding.embed_documents([updated_content])[0]) + assert list(new_embedding) != list(old_embedding) + + +def test_chroma_update_document_with_id() -> None: + """Test the update_document function in the Chroma class. + + Uses an internal document id. + """ + # Make a consistent embedding + embedding = ConsistentFakeEmbeddings() + + # Initial document content and id + initial_content = "foo" + document_id = "doc1" + + # Create an instance of Document with initial content and metadata + original_doc = Document( + page_content=initial_content, metadata={"page": "0"}, id=document_id + ) + + # Initialize a Chroma instance with the original document + docsearch = Chroma.from_documents( + collection_name="test_collection", + documents=[original_doc], + embedding=embedding, + ) + old_embedding = docsearch._collection.peek()["embeddings"][ # type: ignore + docsearch._collection.peek()["ids"].index(document_id) + ] + + # Define updated content for the document + updated_content = "updated foo" + + # Create a new Document instance with the updated content and the same id + updated_doc = Document( + page_content=updated_content, metadata={"page": "0"}, id=document_id + ) + + # Update the document in the Chroma instance + docsearch.update_document(document_id=document_id, document=updated_doc) + + # Perform a similarity search with the updated content + output = docsearch.similarity_search(updated_content, k=1) + + # Assert that the new embedding is correct + new_embedding = docsearch._collection.peek()["embeddings"][ # type: 
ignore + docsearch._collection.peek()["ids"].index(document_id) + ] + + docsearch.delete_collection() + + # Assert that the updated document is returned by the search + assert output == [ + Document(page_content=updated_content, metadata={"page": "0"}, id=document_id) + ] assert list(new_embedding) == list(embedding.embed_documents([updated_content])[0]) assert list(new_embedding) != list(old_embedding) @@ -276,20 +424,22 @@ def test_chroma_with_relevance_score_custom_normalization_fn() -> None: """Test searching with relevance score and custom normalization function.""" texts = ["foo", "bar", "baz"] metadatas = [{"page": str(i)} for i in range(len(texts))] + ids = [f"id_{i}" for i in range(len(texts))] docsearch = Chroma.from_texts( collection_name="test1_collection", texts=texts, embedding=FakeEmbeddings(), metadatas=metadatas, + ids=ids, relevance_score_fn=lambda d: d * 0, collection_metadata={"hnsw:space": "l2"}, ) output = docsearch.similarity_search_with_relevance_scores("foo", k=3) docsearch.delete_collection() assert output == [ - (Document(page_content="foo", metadata={"page": "0"}), 0.0), - (Document(page_content="bar", metadata={"page": "1"}), 0.0), - (Document(page_content="baz", metadata={"page": "2"}), 0.0), + (Document(page_content="foo", metadata={"page": "0"}, id="id_0"), 0.0), + (Document(page_content="bar", metadata={"page": "1"}, id="id_1"), 0.0), + (Document(page_content="baz", metadata={"page": "2"}, id="id_2"), 0.0), ] @@ -314,11 +464,11 @@ def test_chroma_add_documents_no_metadata() -> None: def test_chroma_add_documents_mixed_metadata() -> None: db = Chroma(embedding_function=FakeEmbeddings()) docs = [ - Document(page_content="foo"), - Document(page_content="bar", metadata={"baz": 1}), + Document(page_content="foo", id="0"), + Document(page_content="bar", metadata={"baz": 1}, id="1"), ] ids = ["0", "1"] - actual_ids = db.add_documents(docs, ids=ids) + actual_ids = db.add_documents(docs) search = db.similarity_search("foo bar") 
db.delete_collection()