From eca4146fd60299000eb0e2602585cdc270df1529 Mon Sep 17 00:00:00 2001
From: ZanSara
Date: Wed, 20 Sep 2023 10:45:18 +0200
Subject: [PATCH 1/6] add UrlCacheChecker

---
 .../preview/components/caching/__init__.py    |  0
 .../components/caching/cache_checker.py       | 66 +++++++++++++++++++
 2 files changed, 66 insertions(+)
 create mode 100644 haystack/preview/components/caching/__init__.py
 create mode 100644 haystack/preview/components/caching/cache_checker.py

diff --git a/haystack/preview/components/caching/__init__.py b/haystack/preview/components/caching/__init__.py
new file mode 100644
index 0000000000..e69de29bb2
diff --git a/haystack/preview/components/caching/cache_checker.py b/haystack/preview/components/caching/cache_checker.py
new file mode 100644
index 0000000000..08c6216a77
--- /dev/null
+++ b/haystack/preview/components/caching/cache_checker.py
@@ -0,0 +1,66 @@
+from typing import List, Dict, Any
+
+from haystack.preview import component, Document, default_from_dict, default_to_dict, DeserializationError
+from haystack.preview.document_stores import DocumentStore, DuplicatePolicy, document_store
+
+
+@component
+class UrlCacheChecker:
+    """
+    A component that check if a document coming from a given URL is already present in the store.
+
+    Can be used to implement a caching functionality with a Document Store in web retrieval pipelines.
+    """
+
+    def __init__(self, document_store: DocumentStore, url_field: str = "url"):
+        """
+        Create a UrlCacheChecker component.
+
+        :param policy: The policy to use when encountering duplicate documents (default is DuplicatePolicy.FAIL).
+        """
+        self.document_store = document_store
+        self.url_field = url_field
+
+    def to_dict(self) -> Dict[str, Any]:
+        """
+        Serialize this component to a dictionary.
+        """
+        return default_to_dict(self, document_store=self.document_store.to_dict(), url_field=self.url_field)
+
+    @classmethod
+    def from_dict(cls, data: Dict[str, Any]) -> "UrlCacheChecker":
+        """
+        Deserialize this component from a dictionary.
+        """
+        init_params = data.get("init_parameters", {})
+        if "document_store" not in init_params:
+            raise DeserializationError("Missing 'document_store' in serialization data")
+        if "type" not in init_params["document_store"]:
+            raise DeserializationError("Missing 'type' in document store's serialization data")
+        if init_params["document_store"]["type"] not in document_store.registry:
+            raise DeserializationError(f"DocumentStore of type '{init_params['document_store']['type']}' not found.")
+        docstore_class = document_store.registry[init_params["document_store"]["type"]]
+        docstore = docstore_class.from_dict(init_params["document_store"])
+
+        data["init_parameters"]["document_store"] = docstore
+        data["init_parameters"]["policy"] = DuplicatePolicy[data["init_parameters"]["policy"]]
+        return default_from_dict(cls, data)
+
+    @component.output_types(found=List[Document], missing=List[str])
+    def run(self, urls: List[str]):
+        """
+        Checks if any document coming from the given URL is already present in the store and if so, returns it.
+
+        :param urls: All the URLs the documents may be coming from to hit this cache.
+        """
+        found_documents = []
+        missing_urls = []
+
+        for url in urls:
+            filters = {self.url_field: url}
+            found = self.document_store.filter_documents(filters=filters)
+            if found:
+                found_documents.append(found)
+            else:
+                missing_urls.append(url)
+        return {"found": found_documents, "missing": missing_urls}

From aed094da43e54d358675e26567100637484eb1f1 Mon Sep 17 00:00:00 2001
From: ZanSara
Date: Wed, 20 Sep 2023 10:45:50 +0200
Subject: [PATCH 2/6] rename

---
 .../components/caching/{cache_checker.py => url_cache_checker.py}  | 0
 1 file changed, 0 insertions(+), 0 deletions(-)
 rename haystack/preview/components/caching/{cache_checker.py => url_cache_checker.py} (100%)

diff --git a/haystack/preview/components/caching/cache_checker.py b/haystack/preview/components/caching/url_cache_checker.py
similarity index 100%
rename from haystack/preview/components/caching/cache_checker.py
rename to haystack/preview/components/caching/url_cache_checker.py

From e562e0f97bb7154b542b7c28177361e7b5a140b2 Mon Sep 17 00:00:00 2001
From: ZanSara
Date: Wed, 20 Sep 2023 10:54:24 +0200
Subject: [PATCH 3/6] add tests

---
 .../components/caching/url_cache_checker.py   |  3 +-
 .../caching/test_url_cache_checker.py         | 86 +++++++++++++++++++
 2 files changed, 87 insertions(+), 2 deletions(-)
 create mode 100644 test/preview/components/caching/test_url_cache_checker.py

diff --git a/haystack/preview/components/caching/url_cache_checker.py b/haystack/preview/components/caching/url_cache_checker.py
index 08c6216a77..187444342b 100644
--- a/haystack/preview/components/caching/url_cache_checker.py
+++ b/haystack/preview/components/caching/url_cache_checker.py
@@ -43,7 +43,6 @@ def from_dict(cls, data: Dict[str, Any]) -> "UrlCacheChecker":
         docstore = docstore_class.from_dict(init_params["document_store"])
 
         data["init_parameters"]["document_store"] = docstore
-        data["init_parameters"]["policy"] = DuplicatePolicy[data["init_parameters"]["policy"]]
         return default_from_dict(cls, data)
 
     @component.output_types(found=List[Document], missing=List[str])
@@ -60,7 +59,7 @@ def run(self, urls: List[str]):
             filters = {self.url_field: url}
             found = self.document_store.filter_documents(filters=filters)
             if found:
-                found_documents.append(found)
+                found_documents.extend(found)
             else:
                 missing_urls.append(url)
         return {"found": found_documents, "missing": missing_urls}
diff --git a/test/preview/components/caching/test_url_cache_checker.py b/test/preview/components/caching/test_url_cache_checker.py
new file mode 100644
index 0000000000..42450d184c
--- /dev/null
+++ b/test/preview/components/caching/test_url_cache_checker.py
@@ -0,0 +1,86 @@
+from unittest.mock import MagicMock
+
+import pytest
+
+from haystack.preview import Document, DeserializationError
+from haystack.preview.testing.factory import document_store_class
+from haystack.preview.document_stores.memory import MemoryDocumentStore
+from haystack.preview.components.caching.url_cache_checker import UrlCacheChecker
+from haystack.preview.document_stores import DuplicatePolicy
+
+
+class TestUrlCacheChecker:
+    @pytest.mark.unit
+    def test_to_dict(self):
+        mocked_docstore_class = document_store_class("MockedDocumentStore")
+        component = UrlCacheChecker(document_store=mocked_docstore_class())
+        data = component.to_dict()
+        assert data == {
+            "type": "UrlCacheChecker",
+            "init_parameters": {
+                "document_store": {"type": "MockedDocumentStore", "init_parameters": {}},
+                "url_field": "url",
+            },
+        }
+
+    @pytest.mark.unit
+    def test_to_dict_with_custom_init_parameters(self):
+        mocked_docstore_class = document_store_class("MockedDocumentStore")
+        component = UrlCacheChecker(document_store=mocked_docstore_class(), url_field="my_url_field")
+        data = component.to_dict()
+        assert data == {
+            "type": "UrlCacheChecker",
+            "init_parameters": {
+                "document_store": {"type": "MockedDocumentStore", "init_parameters": {}},
+                "url_field": "my_url_field",
+            },
+        }
+
+    @pytest.mark.unit
+    def test_from_dict(self):
+        mocked_docstore_class = document_store_class("MockedDocumentStore")
+        data = {
+            "type": "UrlCacheChecker",
+            "init_parameters": {
+                "document_store": {"type": "MockedDocumentStore", "init_parameters": {}},
+                "url_field": "my_url_field",
+            },
+        }
+        component = UrlCacheChecker.from_dict(data)
+        assert isinstance(component.document_store, mocked_docstore_class)
+        assert component.url_field == "my_url_field"
+
+    @pytest.mark.unit
+    def test_from_dict_without_docstore(self):
+        data = {"type": "UrlCacheChecker", "init_parameters": {}}
+        with pytest.raises(DeserializationError, match="Missing 'document_store' in serialization data"):
+            UrlCacheChecker.from_dict(data)
+
+    @pytest.mark.unit
+    def test_from_dict_without_docstore_type(self):
+        data = {"type": "UrlCacheChecker", "init_parameters": {"document_store": {"init_parameters": {}}}}
+        with pytest.raises(DeserializationError, match="Missing 'type' in document store's serialization data"):
+            UrlCacheChecker.from_dict(data)
+
+    @pytest.mark.unit
+    def test_from_dict_nonexisting_docstore(self):
+        data = {
+            "type": "UrlCacheChecker",
+            "init_parameters": {"document_store": {"type": "NonexistingDocumentStore", "init_parameters": {}}},
+        }
+        with pytest.raises(DeserializationError, match="DocumentStore of type 'NonexistingDocumentStore' not found."):
+            UrlCacheChecker.from_dict(data)
+
+    @pytest.mark.unit
+    def test_run(self):
+        docstore = MemoryDocumentStore()
+        documents = [
+            Document(text="doc1", metadata={"url": "https://example.com/1"}),
+            Document(text="doc2", metadata={"url": "https://example.com/2"}),
+            Document(text="doc3", metadata={"url": "https://example.com/1"}),
+            Document(text="doc4", metadata={"url": "https://example.com/2"}),
+        ]
+        docstore.write_documents(documents)
+        checker = UrlCacheChecker(docstore)
+        results = checker.run(urls=["https://example.com/1", "https://example.com/5"])
+        assert results == {"found": [documents[0], documents[2]], "missing": ["https://example.com/5"]}

From 991cbe88e892a4f3534b98c038ea3c132f4c8ab5 Mon Sep 17 00:00:00 2001
From: ZanSara
Date: Wed, 20 Sep 2023 11:00:33 +0200
Subject: [PATCH 4/6] reno

---
 releasenotes/notes/url-cache-checker-a0fb3d7ad0bdb8c2.yaml | 6 ++++++
 1 file changed, 6 insertions(+)
 create mode 100644 releasenotes/notes/url-cache-checker-a0fb3d7ad0bdb8c2.yaml

diff --git a/releasenotes/notes/url-cache-checker-a0fb3d7ad0bdb8c2.yaml b/releasenotes/notes/url-cache-checker-a0fb3d7ad0bdb8c2.yaml
new file mode 100644
index 0000000000..d872c8b801
--- /dev/null
+++ b/releasenotes/notes/url-cache-checker-a0fb3d7ad0bdb8c2.yaml
@@ -0,0 +1,6 @@
+---
+preview:
+  - |
+    Add `UrlCacheChecker` to support Web retrieval pipelines.
+    Check if documents coming from a given list of URLs are already present in the store and if so, returns them.
+    All URLs with no matching documents are returned on a separate connection.
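Illustration (not part of the patch series): the sketch below walks through the caching flow that the release note above describes, using only classes that appear in this PR. It reflects the component as it stands at this point in the series, where the output sockets are still named `found` and `missing` (a later commit renames them to `hits` and `misses`). The `fetch_and_convert` helper is hypothetical and stands in for whatever fetcher/converter a real web retrieval pipeline would use.

```python
from haystack.preview import Document
from haystack.preview.document_stores.memory import MemoryDocumentStore
from haystack.preview.components.caching.url_cache_checker import UrlCacheChecker


def fetch_and_convert(url: str) -> Document:
    # Hypothetical stand-in for a real fetcher/converter component.
    return Document(text=f"content of {url}", metadata={"url": url})


document_store = MemoryDocumentStore()
checker = UrlCacheChecker(document_store=document_store, url_field="url")
urls = ["https://example.com/1", "https://example.com/2"]

# First pass: the store is empty, so every URL comes back as a cache miss.
result = checker.run(urls=urls)
document_store.write_documents([fetch_and_convert(url) for url in result["missing"]])

# Second pass: the same URLs are now served from the Document Store instead of the web.
result = checker.run(urls=urls)
assert result["missing"] == []
assert len(result["found"]) == len(urls)
```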
From c834798912e4bd891ea4c5d5563e61d0917b8995 Mon Sep 17 00:00:00 2001
From: ZanSara
Date: Wed, 20 Sep 2023 11:34:55 +0200
Subject: [PATCH 5/6] pylint

---
 haystack/preview/components/caching/url_cache_checker.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/haystack/preview/components/caching/url_cache_checker.py b/haystack/preview/components/caching/url_cache_checker.py
index 187444342b..eb203c8cc7 100644
--- a/haystack/preview/components/caching/url_cache_checker.py
+++ b/haystack/preview/components/caching/url_cache_checker.py
@@ -1,7 +1,7 @@
 from typing import List, Dict, Any
 
 from haystack.preview import component, Document, default_from_dict, default_to_dict, DeserializationError
-from haystack.preview.document_stores import DocumentStore, DuplicatePolicy, document_store
+from haystack.preview.document_stores import DocumentStore, document_store
 
 
 @component

From ef1f55e283ba7d5234057ced9c773d671cbbbf0a Mon Sep 17 00:00:00 2001
From: ZanSara
Date: Wed, 20 Sep 2023 12:00:42 +0200
Subject: [PATCH 6/6] review feedback

---
 .../components/caching/url_cache_checker.py      | 14 ++++++--------
 .../components/caching/test_url_cache_checker.py |  3 +-
 2 files changed, 7 insertions(+), 10 deletions(-)

diff --git a/haystack/preview/components/caching/url_cache_checker.py b/haystack/preview/components/caching/url_cache_checker.py
index eb203c8cc7..c3d87bcfcc 100644
--- a/haystack/preview/components/caching/url_cache_checker.py
+++ b/haystack/preview/components/caching/url_cache_checker.py
@@ -7,16 +7,13 @@
 @component
 class UrlCacheChecker:
     """
-    A component that check if a document coming from a given URL is already present in the store.
-
-    Can be used to implement a caching functionality with a Document Store in web retrieval pipelines.
+    A component checks for the presence of a document from a specific URL in the store. UrlCacheChecker can thus
+    implement caching functionality within web retrieval pipelines that use a Document Store.
     """
 
     def __init__(self, document_store: DocumentStore, url_field: str = "url"):
        """
         Create a UrlCacheChecker component.
-
-        :param policy: The policy to use when encountering duplicate documents (default is DuplicatePolicy.FAIL).
         """
         self.document_store = document_store
         self.url_field = url_field
@@ -45,10 +42,11 @@ def from_dict(cls, data: Dict[str, Any]) -> "UrlCacheChecker":
         data["init_parameters"]["document_store"] = docstore
         return default_from_dict(cls, data)
 
-    @component.output_types(found=List[Document], missing=List[str])
+    @component.output_types(hits=List[Document], misses=List[str])
     def run(self, urls: List[str]):
         """
-        Checks if any document coming from the given URL is already present in the store and if so, returns it.
+        Checks if any document coming from the given URL is already present in the store. If matching documents are
+        found, they are returned. If not, the URL is returned as a miss.
 
         :param urls: All the URLs the documents may be coming from to hit this cache.
         """
         found_documents = []
         missing_urls = []
 
         for url in urls:
             filters = {self.url_field: url}
             found = self.document_store.filter_documents(filters=filters)
             if found:
                 found_documents.extend(found)
             else:
                 missing_urls.append(url)
-        return {"found": found_documents, "missing": missing_urls}
+        return {"hits": found_documents, "misses": missing_urls}
diff --git a/test/preview/components/caching/test_url_cache_checker.py b/test/preview/components/caching/test_url_cache_checker.py
index 42450d184c..d1318c50af 100644
--- a/test/preview/components/caching/test_url_cache_checker.py
+++ b/test/preview/components/caching/test_url_cache_checker.py
@@ -6,7 +6,6 @@ from haystack.preview import Document, DeserializationError
 from haystack.preview.testing.factory import document_store_class
 from haystack.preview.document_stores.memory import MemoryDocumentStore
 from haystack.preview.components.caching.url_cache_checker import UrlCacheChecker
-from haystack.preview.document_stores import DuplicatePolicy
 
 
 class TestUrlCacheChecker:
@@ -83,4 +82,4 @@ def test_run(self):
         docstore.write_documents(documents)
         checker = UrlCacheChecker(docstore)
         results = checker.run(urls=["https://example.com/1", "https://example.com/5"])
-        assert results == {"found": [documents[0], documents[2]], "missing": ["https://example.com/5"]}
+        assert results == {"hits": [documents[0], documents[2]], "misses": ["https://example.com/5"]}