Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

feat: UrlCacheChecker #5841

Merged
merged 6 commits into from
Sep 20, 2023
Merged
Show file tree
Hide file tree
Changes from 5 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Empty file.
65 changes: 65 additions & 0 deletions haystack/preview/components/caching/url_cache_checker.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,65 @@
from typing import List, Dict, Any

from haystack.preview import component, Document, default_from_dict, default_to_dict, DeserializationError
from haystack.preview.document_stores import DocumentStore, document_store


@component
class UrlCacheChecker:
    """
    A component that checks whether documents coming from a given URL are already present in the store.

    Can be used to implement a caching functionality with a Document Store in web retrieval pipelines.
    """

    def __init__(self, document_store: DocumentStore, url_field: str = "url"):
        """
        Create a UrlCacheChecker component.

        :param document_store: The Document Store that is queried for already stored documents.
        :param url_field: Name of the field used as the filter key when querying the store for a URL
            (default is "url").
        """
        self.document_store = document_store
        self.url_field = url_field

    def to_dict(self) -> Dict[str, Any]:
        """
        Serialize this component to a dictionary.

        The document store is serialized through its own `to_dict()` and nested under `document_store`.
        """
        return default_to_dict(self, document_store=self.document_store.to_dict(), url_field=self.url_field)

    @classmethod
    def from_dict(cls, data: Dict[str, Any]) -> "UrlCacheChecker":
        """
        Deserialize this component from a dictionary.

        :param data: The serialized component; its `init_parameters` must contain a serialized
            `document_store` whose `type` is present in the document store registry.
        :raises DeserializationError: If `document_store` is missing, if it has no `type`, or if
            that type is not found in the document store registry.
        """
        init_params = data.get("init_parameters", {})
        if "document_store" not in init_params:
            raise DeserializationError("Missing 'document_store' in serialization data")

        docstore_data = init_params["document_store"]
        if "type" not in docstore_data:
            raise DeserializationError("Missing 'type' in document store's serialization data")
        if docstore_data["type"] not in document_store.registry:
            raise DeserializationError(f"DocumentStore of type '{docstore_data['type']}' not found.")

        docstore_class = document_store.registry[docstore_data["type"]]
        docstore = docstore_class.from_dict(docstore_data)

        # Build a fresh payload instead of writing the live store object back into the caller's
        # dictionary, so the serialized input stays intact and can be reused.
        resolved = {**data, "init_parameters": {**init_params, "document_store": docstore}}
        return default_from_dict(cls, resolved)

    @component.output_types(found=List[Document], missing=List[str])
    def run(self, urls: List[str]):
        """
        Checks if any document coming from the given URLs is already present in the store and if so, returns it.

        The store is queried once per URL, in the order the URLs are given.

        :param urls: All the URLs the documents may be coming from to hit this cache.
        :returns: A dictionary with two keys: `found`, the documents returned by the store for the
            given URLs, and `missing`, the URLs for which the store returned no document.
        """
        found_documents = []
        missing_urls = []

        for url in urls:
            filters = {self.url_field: url}
            found = self.document_store.filter_documents(filters=filters)
            if found:
                found_documents.extend(found)
            else:
                missing_urls.append(url)
        return {"found": found_documents, "missing": missing_urls}
ZanSara marked this conversation as resolved.
Show resolved Hide resolved
6 changes: 6 additions & 0 deletions releasenotes/notes/url-cache-checker-a0fb3d7ad0bdb8c2.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,6 @@
---
preview:
- |
Add `UrlCacheChecker` to support Web retrieval pipelines.
Checks whether documents coming from a given list of URLs are already present in the store and, if so, returns them.
All URLs with no matching documents are returned on a separate connection.
86 changes: 86 additions & 0 deletions test/preview/components/caching/test_url_cache_checker.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,86 @@
from unittest.mock import MagicMock

import pytest

from haystack.preview import Document, DeserializationError
from haystack.preview.testing.factory import document_store_class
from haystack.preview.document_stores.memory import MemoryDocumentStore
from haystack.preview.components.caching.url_cache_checker import UrlCacheChecker
from haystack.preview.document_stores import DuplicatePolicy


class TestUrlCacheChecker:
    """Unit tests for the serialization and the run behavior of UrlCacheChecker."""

    @pytest.mark.unit
    def test_to_dict(self):
        """Serializing with default parameters emits the default url field."""
        store_cls = document_store_class("MockedDocumentStore")
        checker = UrlCacheChecker(document_store=store_cls())
        expected = {
            "type": "UrlCacheChecker",
            "init_parameters": {
                "document_store": {"type": "MockedDocumentStore", "init_parameters": {}},
                "url_field": "url",
            },
        }
        assert checker.to_dict() == expected

    @pytest.mark.unit
    def test_to_dict_with_custom_init_parameters(self):
        """Serializing with a custom url field emits that field name."""
        store_cls = document_store_class("MockedDocumentStore")
        checker = UrlCacheChecker(document_store=store_cls(), url_field="my_url_field")
        expected = {
            "type": "UrlCacheChecker",
            "init_parameters": {
                "document_store": {"type": "MockedDocumentStore", "init_parameters": {}},
                "url_field": "my_url_field",
            },
        }
        assert checker.to_dict() == expected

    @pytest.mark.unit
    def test_from_dict(self):
        """Deserializing rebuilds the document store and restores the url field."""
        store_cls = document_store_class("MockedDocumentStore")
        serialized_store = {"type": "MockedDocumentStore", "init_parameters": {}}
        serialized = {
            "type": "UrlCacheChecker",
            "init_parameters": {"document_store": serialized_store, "url_field": "my_url_field"},
        }
        checker = UrlCacheChecker.from_dict(serialized)
        assert isinstance(checker.document_store, store_cls)
        assert checker.url_field == "my_url_field"

    @pytest.mark.unit
    def test_from_dict_without_docstore(self):
        """Deserializing fails when no document store is serialized."""
        serialized = {"type": "UrlCacheChecker", "init_parameters": {}}
        with pytest.raises(DeserializationError, match="Missing 'document_store' in serialization data"):
            UrlCacheChecker.from_dict(serialized)

    @pytest.mark.unit
    def test_from_dict_without_docstore_type(self):
        """Deserializing fails when the serialized document store has no type."""
        untyped_store = {"init_parameters": {}}
        serialized = {"type": "UrlCacheChecker", "init_parameters": {"document_store": untyped_store}}
        with pytest.raises(DeserializationError, match="Missing 'type' in document store's serialization data"):
            UrlCacheChecker.from_dict(serialized)

    @pytest.mark.unit
    def test_from_dict_nonexisting_docstore(self):
        """Deserializing fails when the document store type is not registered."""
        unknown_store = {"type": "NonexistingDocumentStore", "init_parameters": {}}
        serialized = {"type": "UrlCacheChecker", "init_parameters": {"document_store": unknown_store}}
        with pytest.raises(DeserializationError, match="DocumentStore of type 'NonexistingDocumentStore' not found."):
            UrlCacheChecker.from_dict(serialized)

    @pytest.mark.unit
    def test_run(self):
        """Running returns the stored documents of a known URL and reports the unknown URL as missing."""
        known_url = "https://example.com/1"
        unknown_url = "https://example.com/5"

        docstore = MemoryDocumentStore()
        documents = [
            Document(text="doc1", metadata={"url": "https://example.com/1"}),
            Document(text="doc2", metadata={"url": "https://example.com/2"}),
            Document(text="doc3", metadata={"url": "https://example.com/1"}),
            Document(text="doc4", metadata={"url": "https://example.com/2"}),
        ]
        docstore.write_documents(documents)

        # Only the first and third documents were written with the known URL.
        first_hit, _, second_hit, _ = documents

        checker = UrlCacheChecker(docstore)
        results = checker.run(urls=[known_url, unknown_url])
        assert results == {"found": [first_hit, second_hit], "missing": [unknown_url]}