From e17d426f7ffca670f5783db4938fe6a5f5adeed1 Mon Sep 17 00:00:00 2001
From: Shukri
Date: Wed, 7 Feb 2024 22:21:18 +0100
Subject: [PATCH] Log documents that failed to be ingested (#88)

If a list of documents has some good and some bad documents, e.g. one
using a reserved keyword, then:

1. the good documents will be ingested
2. the uuid and the reason the bad docs could not be ingested are logged
   (ERROR level) to stdout
---
 langchain_weaviate/vectorstores.py           |  9 +++++
 tests/integration_tests/test_vectorstores.py | 41 ++++++++++++++++++++
 2 files changed, 50 insertions(+)

diff --git a/langchain_weaviate/vectorstores.py b/langchain_weaviate/vectorstores.py
index f2b2f9f..a55997e 100644
--- a/langchain_weaviate/vectorstores.py
+++ b/langchain_weaviate/vectorstores.py
@@ -189,6 +189,15 @@ def add_texts(
                 )
 
                 ids.append(_id)
+
+        failed_objs = self._client.batch.failed_objects
+        for obj in failed_objs:
+            err_message = (
+                f"Failed to add object: {obj.original_uuid}\nReason: {obj.message}"
+            )
+
+            logger.error(err_message)
+
         return ids
 
     def _perform_search(
diff --git a/tests/integration_tests/test_vectorstores.py b/tests/integration_tests/test_vectorstores.py
index 1115821..9350026 100644
--- a/tests/integration_tests/test_vectorstores.py
+++ b/tests/integration_tests/test_vectorstores.py
@@ -1,6 +1,7 @@
 """Test Weaviate functionality."""
 import logging
 import os
+import re
 import uuid
 from typing import List, Union
 
@@ -693,3 +694,43 @@ def test_documents_with_many_properties(weaviate_client, embedding_openai):
         "foo", k=1, return_uuids=True, return_properties=["ticker", "categoryid"]
     )[0]
     assert set(doc.metadata.keys()) == {"uuid", "ticker", "categoryid"}
+
+
+def test_ingest_bad_documents(weaviate_client, embedding_openai, caplog):
+    # try to ingest 2 documents
+    docs = [
+        Document(page_content="foo", metadata={"page": 0}),
+        Document(page_content="bar", metadata={"_additional": 1}),
+    ]
+    uuids = [weaviate.util.generate_uuid5(doc) for doc in docs]
+
+    index_name = f"TestIndex_{uuid.uuid4().hex}"
+    text_key = "page_content"
+
+    with caplog.at_level(logging.ERROR):
+        _ = WeaviateVectorStore.from_documents(
+            documents=docs,
+            embedding=embedding_openai,
+            client=weaviate_client,
+            index_name=index_name,
+            text_key=text_key,
+            uuids=uuids,
+        )
+
+    good_doc_uuid, bad_doc_uuid = uuids
+
+    # the bad doc should generate a log message
+    pattern = r"ERROR.*Failed to add object: {}".format(bad_doc_uuid)
+    assert re.search(pattern, caplog.text)
+    assert good_doc_uuid not in caplog.text
+
+    # the good doc should still be ingested
+    total_docs = (
+        weaviate_client.collections.get(index_name)
+        .aggregate.over_all(total_count=True)
+        .total_count
+    )
+    assert total_docs == 1
+    assert weaviate_client.collections.get(index_name).query.fetch_object_by_id(
+        good_doc_uuid
+    )