From 868f2f6932691957b1f4bd2f373122d67ca9a011 Mon Sep 17 00:00:00 2001 From: Eric Pinzur Date: Thu, 7 Nov 2024 11:46:30 -0600 Subject: [PATCH 1/2] fixed integration tests --- .../integration_tests/test_vectorstores.py | 37 +++++++++++++------ 1 file changed, 25 insertions(+), 12 deletions(-) diff --git a/libs/partners/chroma/tests/integration_tests/test_vectorstores.py b/libs/partners/chroma/tests/integration_tests/test_vectorstores.py index 382b24cb54b47..4393d5f339b09 100644 --- a/libs/partners/chroma/tests/integration_tests/test_vectorstores.py +++ b/libs/partners/chroma/tests/integration_tests/test_vectorstores.py @@ -1,12 +1,16 @@ """Test Chroma functionality.""" import uuid -from typing import Generator +from typing import ( + Generator, + cast, +) import chromadb import pytest # type: ignore[import-not-found] import requests from chromadb.api.client import SharedSystemClient +from chromadb.api.types import Embeddable from langchain_core.documents import Document from langchain_core.embeddings.fake import FakeEmbeddings as Fak @@ -17,6 +21,15 @@ ) +class MyEmbeddingFunction: + def __init__(self, fak: Fak): + self.fak = fak + + def __call__(self, input: Embeddable) -> list[list[float]]: + texts = cast(list[str], input) + return self.fak.embed_documents(texts=texts) + + @pytest.fixture() def client() -> Generator[chromadb.ClientAPI, None, None]: SharedSystemClient.clear_system_cache() @@ -254,8 +267,8 @@ def test_chroma_update_document() -> None: # Assert that the updated document is returned by the search assert output == [Document(page_content=updated_content, metadata={"page": "0"})] - assert new_embedding == embedding.embed_documents([updated_content])[0] - assert new_embedding != old_embedding + assert list(new_embedding) == list(embedding.embed_documents([updated_content])[0]) + assert list(new_embedding) != list(old_embedding) # TODO: RELEVANCE SCORE IS BROKEN. 
FIX TEST @@ -341,17 +354,17 @@ def batch_support_chroma_version() -> bool: ) def test_chroma_large_batch() -> None: client = chromadb.HttpClient() - embedding_function = Fak(size=255) + embedding_function = MyEmbeddingFunction(fak=Fak(size=255)) col = client.get_or_create_collection( "my_collection", - embedding_function=embedding_function.embed_documents, # type: ignore + embedding_function=embedding_function, # type: ignore ) - docs = ["This is a test document"] * (client.max_batch_size + 100) # type: ignore + docs = ["This is a test document"] * (client.get_max_batch_size() + 100) # type: ignore db = Chroma.from_texts( client=client, collection_name=col.name, texts=docs, - embedding=embedding_function, + embedding=embedding_function.fak, ids=[str(uuid.uuid4()) for _ in range(len(docs))], ) @@ -369,18 +382,18 @@ def test_chroma_large_batch() -> None: ) def test_chroma_large_batch_update() -> None: client = chromadb.HttpClient() - embedding_function = Fak(size=255) + embedding_function = MyEmbeddingFunction(fak=Fak(size=255)) col = client.get_or_create_collection( "my_collection", - embedding_function=embedding_function.embed_documents, # type: ignore + embedding_function=embedding_function, # type: ignore ) - docs = ["This is a test document"] * (client.max_batch_size + 100) # type: ignore + docs = ["This is a test document"] * (client.get_max_batch_size() + 100) # type: ignore ids = [str(uuid.uuid4()) for _ in range(len(docs))] db = Chroma.from_texts( client=client, collection_name=col.name, texts=docs, - embedding=embedding_function, + embedding=embedding_function.fak, ids=ids, ) new_docs = [ @@ -408,7 +421,7 @@ def test_chroma_legacy_batching() -> None: embedding_function = Fak(size=255) col = client.get_or_create_collection( "my_collection", - embedding_function=embedding_function.embed_documents, # type: ignore + embedding_function=MyEmbeddingFunction(fak=embedding_function), # type: ignore ) docs = ["This is a test document"] * 100 db = Chroma.from_texts( From 
637cf792ab767c09454cea522de20c5be9f32f82 Mon Sep 17 00:00:00 2001 From: Eric Pinzur Date: Fri, 8 Nov 2024 12:09:21 -0600 Subject: [PATCH 2/2] added document.id support --- .../chroma/langchain_chroma/vectorstores.py | 8 +- .../integration_tests/test_vectorstores.py | 192 ++++++++++++++++-- 2 files changed, 178 insertions(+), 22 deletions(-) diff --git a/libs/partners/chroma/langchain_chroma/vectorstores.py b/libs/partners/chroma/langchain_chroma/vectorstores.py index 35146fdcc7603..5405e7074b9de 100644 --- a/libs/partners/chroma/langchain_chroma/vectorstores.py +++ b/libs/partners/chroma/langchain_chroma/vectorstores.py @@ -44,10 +44,14 @@ def _results_to_docs_and_scores(results: Any) -> List[Tuple[Document, float]]: return [ # TODO: Chroma can do batch querying, # we shouldn't hard code to the 1st result - (Document(page_content=result[0], metadata=result[1] or {}), result[2]) + ( + Document(page_content=result[0], metadata=result[1] or {}, id=result[2]), + result[3], + ) for result in zip( results["documents"][0], results["metadatas"][0], + results["ids"][0], results["distances"][0], ) ] @@ -1129,6 +1133,8 @@ def from_documents( """ texts = [doc.page_content for doc in documents] metadatas = [doc.metadata for doc in documents] + if ids is None: + ids = [doc.id if doc.id else "" for doc in documents] return cls.from_texts( texts=texts, embedding=embedding, diff --git a/libs/partners/chroma/tests/integration_tests/test_vectorstores.py b/libs/partners/chroma/tests/integration_tests/test_vectorstores.py index 4393d5f339b09..2cb1b31ebf602 100644 --- a/libs/partners/chroma/tests/integration_tests/test_vectorstores.py +++ b/libs/partners/chroma/tests/integration_tests/test_vectorstores.py @@ -46,8 +46,27 @@ def test_chroma() -> None: output = docsearch.similarity_search("foo", k=1) docsearch.delete_collection() + assert len(output) == 1 + assert output[0].page_content == "foo" + assert output[0].id is not None - assert output == [Document(page_content="foo")] + +def 
test_chroma_with_ids() -> None: + """Test end to end construction and search.""" + texts = ["foo", "bar", "baz"] + ids = [f"id_{i}" for i in range(len(texts))] + docsearch = Chroma.from_texts( + collection_name="test_collection", + texts=texts, + embedding=FakeEmbeddings(), + ids=ids, + ) + output = docsearch.similarity_search("foo", k=1) + + docsearch.delete_collection() + assert len(output) == 1 + assert output[0].page_content == "foo" + assert output[0].id == "id_0" async def test_chroma_async() -> None: @@ -59,7 +78,27 @@ async def test_chroma_async() -> None: output = await docsearch.asimilarity_search("foo", k=1) docsearch.delete_collection() - assert output == [Document(page_content="foo")] + assert len(output) == 1 + assert output[0].page_content == "foo" + assert output[0].id is not None + + +async def test_chroma_async_with_ids() -> None: + """Test end to end construction and search.""" + texts = ["foo", "bar", "baz"] + ids = [f"id_{i}" for i in range(len(texts))] + docsearch = Chroma.from_texts( + collection_name="test_collection", + texts=texts, + embedding=FakeEmbeddings(), + ids=ids, + ) + output = await docsearch.asimilarity_search("foo", k=1) + + docsearch.delete_collection() + assert len(output) == 1 + assert output[0].page_content == "foo" + assert output[0].id == "id_0" def test_chroma_with_metadatas() -> None: @@ -74,28 +113,56 @@ def test_chroma_with_metadatas() -> None: ) output = docsearch.similarity_search("foo", k=1) docsearch.delete_collection() - assert output == [Document(page_content="foo", metadata={"page": "0"})] + assert len(output) == 1 + assert output[0].page_content == "foo" + assert output[0].metadata == {"page": "0"} + assert output[0].id is not None -def test_chroma_with_metadatas_with_scores() -> None: +def test_chroma_with_metadatas_and_ids() -> None: + """Test end to end construction and search.""" + texts = ["foo", "bar", "baz"] + metadatas = [{"page": str(i)} for i in range(len(texts))] + ids = [f"id_{i}" for i in 
range(len(texts))] + docsearch = Chroma.from_texts( + collection_name="test_collection", + texts=texts, + embedding=FakeEmbeddings(), + metadatas=metadatas, + ids=ids, + ) + output = docsearch.similarity_search("foo", k=1) + docsearch.delete_collection() + assert len(output) == 1 + assert output[0].page_content == "foo" + assert output[0].metadata == {"page": "0"} + assert output[0].id == "id_0" + + +def test_chroma_with_metadatas_with_scores_and_ids() -> None: """Test end to end construction and scored search.""" texts = ["foo", "bar", "baz"] metadatas = [{"page": str(i)} for i in range(len(texts))] + ids = [f"id_{i}" for i in range(len(texts))] docsearch = Chroma.from_texts( collection_name="test_collection", texts=texts, embedding=FakeEmbeddings(), metadatas=metadatas, + ids=ids, ) output = docsearch.similarity_search_with_score("foo", k=1) docsearch.delete_collection() - assert output == [(Document(page_content="foo", metadata={"page": "0"}), 0.0)] + assert output == [ + (Document(page_content="foo", metadata={"page": "0"}, id="id_0"), 0.0) + ] def test_chroma_with_metadatas_with_scores_using_vector() -> None: """Test end to end construction and scored search, using embedding vector.""" texts = ["foo", "bar", "baz"] metadatas = [{"page": str(i)} for i in range(len(texts))] + ids = [f"id_{i}" for i in range(len(texts))] embeddings = FakeEmbeddings() docsearch = Chroma.from_texts( @@ -103,41 +170,52 @@ def test_chroma_with_metadatas_with_scores_using_vector() -> None: texts=texts, embedding=embeddings, metadatas=metadatas, + ids=ids, ) embedded_query = embeddings.embed_query("foo") output = docsearch.similarity_search_by_vector_with_relevance_scores( embedding=embedded_query, k=1 ) docsearch.delete_collection() - assert output == [(Document(page_content="foo", metadata={"page": "0"}), 0.0)] + assert output == [ + (Document(page_content="foo", metadata={"page": "0"}, id="id_0"), 0.0) + ] def test_chroma_search_filter() -> None: """Test end to end construction and 
search with metadata filtering.""" texts = ["far", "bar", "baz"] metadatas = [{"first_letter": "{}".format(text[0])} for text in texts] + ids = [f"id_{i}" for i in range(len(texts))] docsearch = Chroma.from_texts( collection_name="test_collection", texts=texts, embedding=FakeEmbeddings(), metadatas=metadatas, + ids=ids, ) output1 = docsearch.similarity_search("far", k=1, filter={"first_letter": "f"}) output2 = docsearch.similarity_search("far", k=1, filter={"first_letter": "b"}) docsearch.delete_collection() - assert output1 == [Document(page_content="far", metadata={"first_letter": "f"})] - assert output2 == [Document(page_content="bar", metadata={"first_letter": "b"})] + assert output1 == [ + Document(page_content="far", metadata={"first_letter": "f"}, id="id_0") + ] + assert output2 == [ + Document(page_content="bar", metadata={"first_letter": "b"}, id="id_1") + ] def test_chroma_search_filter_with_scores() -> None: """Test end to end construction and scored search with metadata filtering.""" texts = ["far", "bar", "baz"] metadatas = [{"first_letter": "{}".format(text[0])} for text in texts] + ids = [f"id_{i}" for i in range(len(texts))] docsearch = Chroma.from_texts( collection_name="test_collection", texts=texts, embedding=FakeEmbeddings(), metadatas=metadatas, + ids=ids, ) output1 = docsearch.similarity_search_with_score( "far", k=1, filter={"first_letter": "f"} @@ -147,10 +225,10 @@ def test_chroma_search_filter_with_scores() -> None: ) docsearch.delete_collection() assert output1 == [ - (Document(page_content="far", metadata={"first_letter": "f"}), 0.0) + (Document(page_content="far", metadata={"first_letter": "f"}, id="id_0"), 0.0) ] assert output2 == [ - (Document(page_content="bar", metadata={"first_letter": "b"}), 1.0) + (Document(page_content="bar", metadata={"first_letter": "b"}, id="id_1"), 1.0) ] @@ -159,15 +237,18 @@ def test_chroma_with_persistence() -> None: chroma_persist_dir = "./tests/persist_dir" collection_name = "test_collection" texts = 
["foo", "bar", "baz"] + ids = [f"id_{i}" for i in range(len(texts))] + docsearch = Chroma.from_texts( collection_name=collection_name, texts=texts, embedding=FakeEmbeddings(), persist_directory=chroma_persist_dir, + ids=ids, ) output = docsearch.similarity_search("foo", k=1) - assert output == [Document(page_content="foo")] + assert output == [Document(page_content="foo", id="id_0")] # Get a new VectorStore from the persisted directory docsearch = Chroma( @@ -176,6 +257,7 @@ def test_chroma_with_persistence() -> None: persist_directory=chroma_persist_dir, ) output = docsearch.similarity_search("foo", k=1) + assert output == [Document(page_content="foo", id="id_0")] # Clean up docsearch.delete_collection() @@ -193,7 +275,9 @@ def test_chroma_mmr() -> None: ) output = docsearch.max_marginal_relevance_search("foo", k=1) docsearch.delete_collection() - assert output == [Document(page_content="foo")] + assert len(output) == 1 + assert output[0].page_content == "foo" + assert output[0].id is not None def test_chroma_mmr_by_vector() -> None: @@ -206,7 +290,9 @@ def test_chroma_mmr_by_vector() -> None: embedded_query = embeddings.embed_query("foo") output = docsearch.max_marginal_relevance_search_by_vector(embedded_query, k=1) docsearch.delete_collection() - assert output == [Document(page_content="foo")] + assert len(output) == 1 + assert output[0].page_content == "foo" + assert output[0].id is not None def test_chroma_with_include_parameter() -> None: @@ -223,7 +309,10 @@ def test_chroma_with_include_parameter() -> None: def test_chroma_update_document() -> None: - """Test the update_document function in the Chroma class.""" + """Test the update_document function in the Chroma class. + + Uses an external document id. 
+ """ # Make a consistent embedding embedding = ConsistentFakeEmbeddings() @@ -265,7 +354,66 @@ def test_chroma_update_document() -> None: docsearch.delete_collection() # Assert that the updated document is returned by the search - assert output == [Document(page_content=updated_content, metadata={"page": "0"})] + assert output == [ + Document(page_content=updated_content, metadata={"page": "0"}, id=document_id) + ] + + assert list(new_embedding) == list(embedding.embed_documents([updated_content])[0]) + assert list(new_embedding) != list(old_embedding) + + +def test_chroma_update_document_with_id() -> None: + """Test the update_document function in the Chroma class. + + Uses an internal document id. + """ + # Make a consistent embedding + embedding = ConsistentFakeEmbeddings() + + # Initial document content and id + initial_content = "foo" + document_id = "doc1" + + # Create an instance of Document with initial content and metadata + original_doc = Document( + page_content=initial_content, metadata={"page": "0"}, id=document_id + ) + + # Initialize a Chroma instance with the original document + docsearch = Chroma.from_documents( + collection_name="test_collection", + documents=[original_doc], + embedding=embedding, + ) + old_embedding = docsearch._collection.peek()["embeddings"][ # type: ignore + docsearch._collection.peek()["ids"].index(document_id) + ] + + # Define updated content for the document + updated_content = "updated foo" + + # Create a new Document instance with the updated content and the same id + updated_doc = Document( + page_content=updated_content, metadata={"page": "0"}, id=document_id + ) + + # Update the document in the Chroma instance + docsearch.update_document(document_id=document_id, document=updated_doc) + + # Perform a similarity search with the updated content + output = docsearch.similarity_search(updated_content, k=1) + + # Assert that the new embedding is correct + new_embedding = docsearch._collection.peek()["embeddings"][ # type: 
ignore + docsearch._collection.peek()["ids"].index(document_id) + ] + + docsearch.delete_collection() + + # Assert that the updated document is returned by the search + assert output == [ + Document(page_content=updated_content, metadata={"page": "0"}, id=document_id) + ] assert list(new_embedding) == list(embedding.embed_documents([updated_content])[0]) assert list(new_embedding) != list(old_embedding) @@ -276,20 +424,22 @@ def test_chroma_with_relevance_score_custom_normalization_fn() -> None: """Test searching with relevance score and custom normalization function.""" texts = ["foo", "bar", "baz"] metadatas = [{"page": str(i)} for i in range(len(texts))] + ids = [f"id_{i}" for i in range(len(texts))] docsearch = Chroma.from_texts( collection_name="test1_collection", texts=texts, embedding=FakeEmbeddings(), metadatas=metadatas, + ids=ids, relevance_score_fn=lambda d: d * 0, collection_metadata={"hnsw:space": "l2"}, ) output = docsearch.similarity_search_with_relevance_scores("foo", k=3) docsearch.delete_collection() assert output == [ - (Document(page_content="foo", metadata={"page": "0"}), 0.0), - (Document(page_content="bar", metadata={"page": "1"}), 0.0), - (Document(page_content="baz", metadata={"page": "2"}), 0.0), + (Document(page_content="foo", metadata={"page": "0"}, id="id_0"), 0.0), + (Document(page_content="bar", metadata={"page": "1"}, id="id_1"), 0.0), + (Document(page_content="baz", metadata={"page": "2"}, id="id_2"), 0.0), ] @@ -314,11 +464,11 @@ def test_chroma_add_documents_no_metadata() -> None: def test_chroma_add_documents_mixed_metadata() -> None: db = Chroma(embedding_function=FakeEmbeddings()) docs = [ - Document(page_content="foo"), - Document(page_content="bar", metadata={"baz": 1}), + Document(page_content="foo", id="0"), + Document(page_content="bar", metadata={"baz": 1}, id="1"), ] ids = ["0", "1"] - actual_ids = db.add_documents(docs, ids=ids) + actual_ids = db.add_documents(docs) search = db.similarity_search("foo bar") 
db.delete_collection()