Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

chroma[patch]: add get_by_ids and fix bug #28516

Merged
merged 4 commits into from
Dec 4, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
38 changes: 38 additions & 0 deletions libs/partners/chroma/langchain_chroma/vectorstores.py
Original file line number Diff line number Diff line change
Expand Up @@ -16,6 +16,7 @@
Iterable,
List,
Optional,
Sequence,
Tuple,
Type,
Union,
Expand Down Expand Up @@ -517,6 +518,11 @@ def add_texts(
"""
if ids is None:
ids = [str(uuid.uuid4()) for _ in texts]
else:
# Assign strings to any null IDs
for idx, _id in enumerate(ids):
if _id is None:
ids[idx] = str(uuid.uuid4())
embeddings = None
texts = list(texts)
if self._embedding_function is not None:
Expand Down Expand Up @@ -1028,6 +1034,38 @@ def get(

return self._collection.get(**kwargs) # type: ignore

def get_by_ids(self, ids: Sequence[str], /) -> list[Document]:
    """Get documents by their IDs.

    Each returned document has its ``id`` field set to the ID under which it
    is stored in the vector store.

    Fewer documents may be returned than requested if some IDs are not found,
    if there are duplicated IDs, or if a stored entry has no text content.

    Users should not assume that the order of the returned documents matches
    the order of the input IDs. Instead, users should rely on the ID field of
    the returned documents.

    This method should **NOT** raise exceptions if no documents are found for
    some IDs.

    Args:
        ids: List of ids to retrieve.

    Returns:
        List of Documents.

    .. versionadded:: 0.2.1
    """
    results = self.get(ids=list(ids))
    return [
        # A missing (None) metadata entry is replaced by an empty dict so that
        # Document construction does not fail on it.
        Document(page_content=doc, metadata=meta or {}, id=doc_id)
        for doc, meta, doc_id in zip(
            results["documents"], results["metadatas"], results["ids"]
        )
        # An entry without text cannot populate page_content; skip it rather
        # than raise, in line with the "should not raise" contract above.
        if doc is not None
    ]

def update_document(self, document_id: str, document: Document) -> None:
"""Update a document in the collection.

Expand Down
23 changes: 21 additions & 2 deletions libs/partners/chroma/poetry.lock

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

4 changes: 4 additions & 0 deletions libs/partners/chroma/pyproject.toml
Original file line number Diff line number Diff line change
Expand Up @@ -90,6 +90,10 @@ python = ">=3.9"
version = ">=0.1.40,<0.3"
python = "<3.9"

[[tool.poetry.group.test.dependencies.langchain-tests]]
path = "../../standard-tests"
develop = true

[tool.poetry.group.codespell.dependencies]
codespell = "^2.2.0"

Expand Down
37 changes: 37 additions & 0 deletions libs/partners/chroma/tests/integration_tests/test_standard.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,37 @@
from typing import AsyncGenerator, Generator

import pytest
from langchain_core.embeddings.fake import DeterministicFakeEmbedding
from langchain_core.vectorstores import VectorStore
from langchain_tests.integration_tests.vectorstores import (
AsyncReadWriteTestSuite,
ReadWriteTestSuite,
)

from langchain_chroma import Chroma


class TestSync(ReadWriteTestSuite):
    @pytest.fixture()
    def vectorstore(self) -> Generator[VectorStore, None, None]:  # type: ignore
        """Yield a Chroma store backed by fake embeddings, then delete it.

        The store is built with a 10-dimensional ``DeterministicFakeEmbedding``
        and its collection is deleted once the consuming test has finished.
        """
        chroma_store = Chroma(embedding_function=DeterministicFakeEmbedding(size=10))
        try:
            yield chroma_store
        finally:
            # Runs whether or not the test body raised.
            chroma_store.delete_collection()


class TestAsync(AsyncReadWriteTestSuite):
    @pytest.fixture()
    async def vectorstore(self) -> AsyncGenerator[VectorStore, None]:  # type: ignore
        """Yield a Chroma store backed by fake embeddings, then delete it.

        The store is built with a 10-dimensional ``DeterministicFakeEmbedding``
        and its collection is deleted once the consuming test has finished.
        """
        chroma_store = Chroma(embedding_function=DeterministicFakeEmbedding(size=10))
        try:
            yield chroma_store
        finally:
            # Runs whether or not the test body raised.
            chroma_store.delete_collection()
Loading