diff --git a/libs/standard-tests/langchain_tests/integration_tests/vectorstores.py b/libs/standard-tests/langchain_tests/integration_tests/vectorstores.py index 08b0358dcfb9e..791e1f570afcd 100644 --- a/libs/standard-tests/langchain_tests/integration_tests/vectorstores.py +++ b/libs/standard-tests/langchain_tests/integration_tests/vectorstores.py @@ -26,7 +26,61 @@ class ReadWriteTestSuite(BaseStandardTests): The fixture should use the `get_embeddings` method to get a pre-defined embeddings model that should be used for this test suite. - """ + + Here is a template: + + .. code-block:: python + + from typing import Generator + + import pytest + from langchain_core.vectorstores import VectorStore + from langchain_parrot_link.vectorstores import ParrotVectorStore + from langchain_tests.integration_tests.vectorstores import ReadWriteTestSuite + + + class TestSync(ReadWriteTestSuite): + @pytest.fixture() + def vectorstore(self) -> Generator[VectorStore, None, None]: # type: ignore + \"\"\"Get an empty vectorstore.\"\"\" + store = ParrotVectorStore(self.get_embeddings()) + # note: store should be EMPTY at this point + # if you need to delete data, you may do so here + try: + yield store + finally: + # cleanup operations, or deleting data + pass + + In the fixture, before the ``yield`` we instantiate an empty vector store. In the + ``finally`` block, we call whatever logic is necessary to bring the vector store + to a clean state. + + Example: + + .. 
code-block:: python + + from typing import Generator + + import pytest + from langchain_core.vectorstores import VectorStore + from langchain_tests.integration_tests.vectorstores import ReadWriteTestSuite + + from langchain_chroma import Chroma + + + class TestSync(ReadWriteTestSuite): + @pytest.fixture() + def vectorstore(self) -> Generator[VectorStore, None, None]: # type: ignore + \"\"\"Get an empty vectorstore.\"\"\" + store = Chroma(embedding_function=self.get_embeddings()) + try: + yield store + finally: + store.delete_collection() + pass + + """ # noqa: E501 @abstractmethod @pytest.fixture @@ -38,17 +92,39 @@ def vectorstore(self) -> VectorStore: @staticmethod def get_embeddings() -> Embeddings: - """A pre-defined embeddings model that should be used for this test.""" + """A pre-defined embeddings model that should be used for this test. + + This currently uses ``DeterministicFakeEmbedding`` from ``langchain-core``, + which uses numpy to generate random numbers based on a hash of the input text. + + The resulting embeddings are not meaningful, but they are deterministic. + """ return DeterministicFakeEmbedding( size=EMBEDDING_SIZE, ) def test_vectorstore_is_empty(self, vectorstore: VectorStore) -> None: - """Test that the vectorstore is empty.""" + """Test that the vectorstore is empty. + + .. dropdown:: Troubleshooting + + If this test fails, check that the test class (i.e., subclass of + ReadWriteTestSuite) initializes an empty vector store in the + ``vectorstore`` fixture. + """ assert vectorstore.similarity_search("foo", k=1) == [] def test_add_documents(self, vectorstore: VectorStore) -> None: - """Test adding documents into the vectorstore.""" + """Test adding documents into the vectorstore. + + .. dropdown:: Troubleshooting + + If this test fails, check that: + + 1. We correctly initialize an empty vector store in the ``vectorstore`` fixture. + 2. Calling ``.similarity_search`` for the top ``k`` similar documents does not threshold by score. + 3.
We do not mutate the original document object when adding it to the vector store (e.g., by adding an ID). + """ # noqa: E501 original_documents = [ Document(page_content="foo", metadata={"id": 1}), Document(page_content="bar", metadata={"id": 2}), @@ -71,11 +147,24 @@ def test_vectorstore_still_empty(self, vectorstore: VectorStore) -> None: This just verifies that the fixture is set up properly to be empty after each test. + + .. dropdown:: Troubleshooting + + If this test fails, check that the test class (i.e., subclass of + ReadWriteTestSuite) correctly clears the vector store in the ``finally`` + block. """ assert vectorstore.similarity_search("foo", k=1) == [] def test_deleting_documents(self, vectorstore: VectorStore) -> None: - """Test deleting documents from the vectorstore.""" + """Test deleting documents from the vectorstore. + + .. dropdown:: Troubleshooting + + If this test fails, check that ``add_documents`` preserves identifiers + passed in through ``ids``, and that ``delete`` correctly removes + documents. + """ documents = [ Document(page_content="foo", metadata={"id": 1}), Document(page_content="bar", metadata={"id": 2}), @@ -87,7 +176,13 @@ def test_deleting_documents(self, vectorstore: VectorStore) -> None: assert documents == [Document(page_content="bar", metadata={"id": 2}, id="2")] def test_deleting_bulk_documents(self, vectorstore: VectorStore) -> None: - """Test that we can delete several documents at once.""" + """Test that we can delete several documents at once. + + .. dropdown:: Troubleshooting + + If this test fails, check that ``delete`` correctly removes multiple + documents when given a list of IDs.
+ """ documents = [ Document(page_content="foo", metadata={"id": 1}), Document(page_content="bar", metadata={"id": 2}), @@ -100,14 +195,27 @@ def test_deleting_bulk_documents(self, vectorstore: VectorStore) -> None: assert documents == [Document(page_content="baz", metadata={"id": 3}, id="3")] def test_delete_missing_content(self, vectorstore: VectorStore) -> None: - """Deleting missing content should not raise an exception.""" + """Deleting missing content should not raise an exception. + + .. dropdown:: Troubleshooting + + If this test fails, check that ``delete`` does not raise an exception + when deleting IDs that do not exist. + """ vectorstore.delete(["1"]) vectorstore.delete(["1", "2", "3"]) def test_add_documents_with_ids_is_idempotent( self, vectorstore: VectorStore ) -> None: - """Adding by ID should be idempotent.""" + """Adding by ID should be idempotent. + + .. dropdown:: Troubleshooting + + If this test fails, check that adding the same document twice with the + same IDs has the same effect as adding it once (i.e., it does not + duplicate the documents). + """ documents = [ Document(page_content="foo", metadata={"id": 1}), Document(page_content="bar", metadata={"id": 2}), @@ -121,7 +229,14 @@ def test_add_documents_with_ids_is_idempotent( ] def test_add_documents_by_id_with_mutation(self, vectorstore: VectorStore) -> None: - """Test that we can overwrite by ID using add_documents.""" + """Test that we can overwrite by ID using add_documents. + + .. dropdown:: Troubleshooting + + If this test fails, check that when ``add_documents`` is called with an + ID that already exists in the vector store, the content is updated + rather than duplicated. 
+ """ documents = [ Document(page_content="foo", metadata={"id": 1}), Document(page_content="bar", metadata={"id": 2}), @@ -150,7 +265,26 @@ def test_add_documents_by_id_with_mutation(self, vectorstore: VectorStore) -> No ] def test_get_by_ids(self, vectorstore: VectorStore) -> None: - """Test get by IDs.""" + """Test get by IDs. + + This test requires that ``get_by_ids`` be implemented on the vector store. + + .. dropdown:: Troubleshooting + + If this test fails, check that ``get_by_ids`` is implemented and returns + documents in the same order as the IDs passed in. + + .. note:: + ``get_by_ids`` was added to the ``VectorStore`` interface in + ``langchain-core`` version 0.2.11. If difficult to implement, this + test can be skipped using a pytest ``xfail`` on the test class: + + .. code-block:: python + + @pytest.mark.xfail(reason=("get_by_ids not implemented.")) + def test_get_by_ids(self, vectorstore: VectorStore) -> None: + super().test_get_by_ids(vectorstore) + """ documents = [ Document(page_content="foo", metadata={"id": 1}), Document(page_content="bar", metadata={"id": 2}), @@ -163,13 +297,50 @@ def test_get_by_ids(self, vectorstore: VectorStore) -> None: ] def test_get_by_ids_missing(self, vectorstore: VectorStore) -> None: - """Test get by IDs with missing IDs.""" + """Test get by IDs with missing IDs. + + .. dropdown:: Troubleshooting + + If this test fails, check that ``get_by_ids`` is implemented and does not + raise an exception when given IDs that do not exist. + + .. note:: + ``get_by_ids`` was added to the ``VectorStore`` interface in + ``langchain-core`` version 0.2.11. If difficult to implement, this + test can be skipped using a pytest ``xfail`` on the test class: + + .. 
code-block:: python + + @pytest.mark.xfail(reason=("get_by_ids not implemented.")) + def test_get_by_ids_missing(self, vectorstore: VectorStore) -> None: + super().test_get_by_ids_missing(vectorstore) + """ # noqa: E501 # This should not raise an exception documents = vectorstore.get_by_ids(["1", "2", "3"]) assert documents == [] def test_add_documents_documents(self, vectorstore: VectorStore) -> None: - """Run add_documents tests.""" + """Run add_documents tests. + + .. dropdown:: Troubleshooting + + If this test fails, check that ``get_by_ids`` is implemented and returns + documents in the same order as the IDs passed in. + + Check also that ``add_documents`` will correctly generate string IDs if + none are provided. + + .. note:: + ``get_by_ids`` was added to the ``VectorStore`` interface in + ``langchain-core`` version 0.2.11. If difficult to implement, this + test can be skipped using a pytest ``xfail`` on the test class: + + .. code-block:: python + + @pytest.mark.xfail(reason=("get_by_ids not implemented.")) + def test_add_documents_documents(self, vectorstore: VectorStore) -> None: + super().test_add_documents_documents(vectorstore) + """ # noqa: E501 documents = [ Document(page_content="foo", metadata={"id": 1}), Document(page_content="bar", metadata={"id": 2}), @@ -181,7 +352,29 @@ def test_add_documents_documents(self, vectorstore: VectorStore) -> None: ] def test_add_documents_with_existing_ids(self, vectorstore: VectorStore) -> None: - """Test that add_documentsing with existing IDs is idempotent.""" + """Test that add_documents with existing IDs is idempotent. + + .. dropdown:: Troubleshooting + + If this test fails, check that ``get_by_ids`` is implemented and returns + documents in the same order as the IDs passed in. + + This test also verifies that: + + 1. IDs specified in the ``Document.id`` field are assigned when adding documents. + 2. If some documents include IDs and others don't, string IDs are generated for the latter. + + ..
note:: + ``get_by_ids`` was added to the ``VectorStore`` interface in + ``langchain-core`` version 0.2.11. If difficult to implement, this + test can be skipped using a pytest ``xfail`` on the test class: + + .. code-block:: python + + @pytest.mark.xfail(reason=("get_by_ids not implemented.")) + def test_add_documents_with_existing_ids(self, vectorstore: VectorStore) -> None: + super().test_add_documents_with_existing_ids(vectorstore) + """ # noqa: E501 documents = [ Document(id="foo", page_content="foo", metadata={"id": 1}), Document(page_content="bar", metadata={"id": 2}),