-
Notifications
You must be signed in to change notification settings - Fork 2k
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
feat: Add
LinkContentFetcher
Haystack 2.0 component (#5724)
* Add LinkContentFetcher * Add release note * Small fixes * Fix pydocs * PR feedback * Remove handlers registration * PR feedback * adjustments * improve tests * initial draft * tests * add proposal * proposal number * reno * fix tests and usage of content and content_type * update branch & fix more tests * mypy * use the new document * add docstring * fix more tests * mypy * fix tests * add e2e * review feedback * improve __str__ * Apply suggestions from code review Co-authored-by: Daria Fokina <[email protected]> * Update haystack/preview/dataclasses/document.py Co-authored-by: Daria Fokina <[email protected]> * improve __str__ * fix tests * fix more tests * fix test * Fix end-of-file-fixer * Post merge fixes * Move e2e tests back into component --------- Co-authored-by: ZanSara <[email protected]> Co-authored-by: Daria Fokina <[email protected]>
- Loading branch information
1 parent
bf6d306
commit 0983fb6
Showing
9 changed files
with
348 additions
and
4 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1 @@ | ||
from haystack.preview.components.fetchers.link_content import LinkContentFetcher |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,163 @@ | ||
import io
import logging
from collections import defaultdict
from datetime import datetime, timezone
from typing import Optional, Dict, List, Callable, Any, IO

import requests
from requests import Response
from requests.exceptions import HTTPError
from tenacity import retry, stop_after_attempt, wait_exponential, retry_if_exception_type, RetryCallState

from haystack import __version__
from haystack.preview import Document
from haystack.preview import component, default_from_dict, default_to_dict
|
||
# Module-level logger, named after this module.
logger = logging.getLogger(__name__)


# User agent sent when the caller does not supply a list of its own.
DEFAULT_USER_AGENT = f"haystack/LinkContentFetcher/{__version__}"


# Base headers for every outgoing request. LinkContentFetcher copies this
# mapping per request and overrides "User-Agent" on the copy.
REQUEST_HEADERS = {
    "accept": "*/*",
    "User-Agent": DEFAULT_USER_AGENT,
    "Accept-Language": "en-US,en;q=0.9,it;q=0.8,es;q=0.7",
    "referer": "https://www.google.com/",
}
|
||
|
||
def text_content_handler(response: Response) -> Dict[str, str]:
    """
    Extract the body of a response as text.

    :param response: Response object from the request.
    :return: A dictionary whose "text" key holds the response body as a string.
    """
    extracted_text = response.text
    return {"text": extracted_text}
|
||
|
||
def binary_content_handler(response: Response) -> Dict[str, IO[bytes]]:
    """
    Extract the body of a response as an in-memory binary stream.

    :param response: Response object from the request.
    :return: A dictionary whose "blob" key holds a file-like object wrapping the raw response bytes.
    """
    raw_stream = io.BytesIO(response.content)
    return {"blob": raw_stream}
|
||
|
||
@component
class LinkContentFetcher:
    """
    LinkContentFetcher fetches content from a URL link and converts it to a Document object.

    The Content-Type of the response selects a handler from `self.handlers`. The handler's output
    (a "text" or "blob" entry) is passed to the Document constructor together with the URL and a
    fetch timestamp. Content types without a registered handler fall back to the text handler.
    """

    def __init__(
        self,
        raise_on_failure: bool = True,
        user_agents: Optional[List[str]] = None,
        retry_attempts: int = 2,
        timeout: int = 3,
    ):
        """
        Creates a LinkContentFetcher instance.

        :param raise_on_failure: A boolean indicating whether to raise an exception when a failure occurs
                                 during content extraction. If False, the error is simply logged and the program
                                 continues. Defaults to True.
        :param user_agents: A list of user agents to use when fetching content. Defaults to None, in which case a
                            default user agent is used.
        :param retry_attempts: The number of times to retry fetching content. Defaults to 2.
        :param timeout: The timeout in seconds for the request. Defaults to 3.
        """
        self.raise_on_failure = raise_on_failure
        self.user_agents = user_agents or [DEFAULT_USER_AGENT]
        self.current_user_agent_idx: int = 0
        self.retry_attempts = retry_attempts
        self.timeout = timeout

        # register default content handlers that extract data from the response;
        # any content type without an explicit entry falls back to the text handler
        self.handlers: Dict[str, Callable[[Response], Dict[str, Any]]] = defaultdict(lambda: text_content_handler)
        self.handlers["text/html"] = text_content_handler
        self.handlers["text/plain"] = text_content_handler
        self.handlers["application/pdf"] = binary_content_handler
        self.handlers["application/octet-stream"] = binary_content_handler

        # The retry policy depends on instance configuration, so the decorated
        # function is built here rather than declared as a regular method.
        @retry(
            reraise=True,
            stop=stop_after_attempt(self.retry_attempts),
            wait=wait_exponential(multiplier=1, min=2, max=10),
            retry=(retry_if_exception_type((HTTPError, requests.RequestException))),
            # This method is invoked only after failed requests (exception raised)
            after=self._switch_user_agent,
        )
        def get_response(url):
            # we need to copy because we modify the headers
            headers = REQUEST_HEADERS.copy()
            headers["User-Agent"] = self.user_agents[self.current_user_agent_idx]
            # read the timeout from the instance so it matches what to_dict() serializes;
            # a falsy value (None or 0) still falls back to 3 seconds, as before
            response = requests.get(url, headers=headers, timeout=self.timeout or 3)
            response.raise_for_status()
            return response

        self._get_response: Callable = get_response

    def to_dict(self) -> Dict[str, Any]:
        """
        Serialize this component to a dictionary.
        """
        return default_to_dict(
            self,
            raise_on_failure=self.raise_on_failure,
            user_agents=self.user_agents,
            retry_attempts=self.retry_attempts,
            timeout=self.timeout,
        )

    @classmethod
    def from_dict(cls, data: Dict[str, Any]) -> "LinkContentFetcher":
        """
        Deserialize this component from a dictionary.
        """
        return default_from_dict(cls, data)

    # The declared output name must match the key of the dictionary returned by run().
    @component.output_types(document=Optional[Document])
    def run(self, url: str):
        """
        Fetches content from a URL and converts it to a Document object.

        :param url: URL to fetch content from.
        :return: A dictionary with a single "document" key. Its value is the Document built from the
                 response, or None if fetching or conversion failed and raise_on_failure is False.
        :raises Exception: Re-raises whatever error occurred while fetching or converting the content
                           if raise_on_failure is True.
        """
        # Use an aware UTC datetime: calling .timestamp() on the naive value returned by
        # datetime.utcnow() would interpret it as local time and yield a shifted epoch.
        fetched_at = int(datetime.now(tz=timezone.utc).timestamp())
        document_data: Dict[str, Any] = {"metadata": {"url": url, "timestamp": fetched_at}}
        try:
            response = self._get_response(url)
            content_type = self._get_content_type(response)
            document_data["mime_type"] = content_type
            handler: Callable = self.handlers[content_type]
            document_data.update(handler(response))
            return {"document": Document(**document_data)}

        except Exception:
            if self.raise_on_failure:
                raise
            # keep the failure non-fatal, but record why it happened
            logger.debug("Couldn't retrieve content from %s", url, exc_info=True)
            return {"document": None}

        finally:
            # every run starts again from the first user agent
            self.current_user_agent_idx = 0

    def _get_content_type(self, response: Response):
        """
        Get the content type of the response.

        Parameters following ";" (such as charset) are dropped. The value is stripped and lower-cased
        because media types are case-insensitive and the handler registry uses lower-case keys.

        :param response: The response object.
        :return: The normalized content type of the response, or an empty string if the header is missing.
        """
        content_type = response.headers.get("Content-Type", "")
        return content_type.split(";")[0].strip().lower()

    def _switch_user_agent(self, retry_state: RetryCallState) -> None:
        """
        Switches the User-Agent for this LinkContentFetcher to the next one in the list of user agents.
        Used by tenacity to retry the requests with a different user agent.

        :param retry_state: The retry state (unused, required by tenacity).
        """
        self.current_user_agent_idx = (self.current_user_agent_idx + 1) % len(self.user_agents)
        logger.debug("Switched user agent to %s", self.user_agents[self.current_user_agent_idx])
5 changes: 5 additions & 0 deletions
5
releasenotes/notes/add-link-content-fetcher-145915976f38e1e0.yaml
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,5 @@ | ||
--- | ||
preview: | ||
- | | ||
Adds LinkContentFetcher component to Haystack 2.0. LinkContentFetcher fetches content from a given URL and | ||
converts it into a Document object, which can then be used within the Haystack 2.0 pipeline. |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -1,6 +1,6 @@ | ||
--- | ||
enhancements: | ||
- | | ||
Refactor PineconeDocumentStore to use metadata instead of namespaces | ||
for distinction between documents with embeddings, documents without | ||
embeddings and labels | ||
Refactor PineconeDocumentStore to use metadata instead of namespaces | ||
for distinction between documents with embeddings, documents without | ||
embeddings and labels |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -2,4 +2,3 @@ | |
fixes: | ||
- | | ||
gpt-35-turbo-16k model from Azure can integrate correctly | ||
Empty file.
170 changes: 170 additions & 0 deletions
170
test/preview/components/fetchers/test_link_content_fetcher.py
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,170 @@ | ||
import io | ||
from unittest.mock import patch, Mock | ||
|
||
import pytest | ||
|
||
from haystack.preview.components.fetchers.link_content import ( | ||
LinkContentFetcher, | ||
text_content_handler, | ||
binary_content_handler, | ||
DEFAULT_USER_AGENT, | ||
) | ||
|
||
HTML_URL = "https://docs.haystack.deepset.ai/docs" | ||
TEXT_URL = "https://raw.githubusercontent.com/deepset-ai/haystack/main/README.md" | ||
PDF_URL = "https://raw.githubusercontent.com/deepset-ai/haystack/b5987a6d8d0714eb2f3011183ab40093d2e4a41a/e2e/samples/pipelines/sample_pdf_1.pdf" | ||
|
||
|
||
@pytest.fixture
def mock_get_link_text_content():
    """
    Patch `requests` in the link_content module so that `get` returns a plain-text response.
    """
    fake_response = Mock(status_code=200, text="Example test response", headers={"Content-Type": "text/plain"})
    with patch("haystack.preview.components.fetchers.link_content.requests") as mock_run:
        mock_run.get.return_value = fake_response
        yield mock_run
|
||
|
||
@pytest.fixture
def mock_get_link_content(test_files_path):
    """
    Patch `requests` in the link_content module so that `get` returns a PDF response.

    :param test_files_path: Fixture giving the directory that contains the test files.
    """
    # read the sample inside a context manager so the file handle is closed deterministically
    with open(test_files_path / "pdf" / "sample_pdf_1.pdf", "rb") as pdf_file:
        pdf_bytes = pdf_file.read()

    with patch("haystack.preview.components.fetchers.link_content.requests") as mock_run:
        mock_run.get.return_value = Mock(
            status_code=200,
            content=pdf_bytes,
            headers={"Content-Type": "application/pdf"},
        )
        yield mock_run
|
||
|
||
class TestLinkContentFetcher:
    """
    Unit and integration tests for LinkContentFetcher.

    Unit tests patch `requests` inside the link_content module; integration tests hit real URLs.
    """

    @pytest.mark.unit
    def test_init(self):
        fetcher = LinkContentFetcher()
        assert fetcher.raise_on_failure is True
        assert fetcher.user_agents == [DEFAULT_USER_AGENT]
        assert fetcher.retry_attempts == 2
        assert fetcher.timeout == 3
        assert fetcher.handlers == {
            "text/html": text_content_handler,
            "text/plain": text_content_handler,
            "application/pdf": binary_content_handler,
            "application/octet-stream": binary_content_handler,
        }
        assert hasattr(fetcher, "_get_response")

    @pytest.mark.unit
    def test_init_with_params(self):
        fetcher = LinkContentFetcher(raise_on_failure=False, user_agents=["test"], retry_attempts=1, timeout=2)
        assert fetcher.raise_on_failure is False
        assert fetcher.user_agents == ["test"]
        assert fetcher.retry_attempts == 1
        assert fetcher.timeout == 2

    @pytest.mark.unit
    def test_to_dict(self):
        fetcher = LinkContentFetcher()
        assert fetcher.to_dict() == {
            "type": "LinkContentFetcher",
            "init_parameters": {
                "raise_on_failure": True,
                "user_agents": [DEFAULT_USER_AGENT],
                "retry_attempts": 2,
                "timeout": 3,
            },
        }

    @pytest.mark.unit
    def test_to_dict_with_params(self):
        fetcher = LinkContentFetcher(raise_on_failure=False, user_agents=["test"], retry_attempts=1, timeout=2)
        assert fetcher.to_dict() == {
            "type": "LinkContentFetcher",
            "init_parameters": {"raise_on_failure": False, "user_agents": ["test"], "retry_attempts": 1, "timeout": 2},
        }

    @pytest.mark.unit
    def test_from_dict(self):
        fetcher = LinkContentFetcher.from_dict(
            {
                "type": "LinkContentFetcher",
                "init_parameters": {
                    "raise_on_failure": False,
                    "user_agents": ["test"],
                    "retry_attempts": 1,
                    "timeout": 2,
                },
            }
        )
        assert fetcher.raise_on_failure is False
        assert fetcher.user_agents == ["test"]
        assert fetcher.retry_attempts == 1
        # every init parameter must survive the round trip, including the timeout
        assert fetcher.timeout == 2

    @pytest.mark.unit
    def test_run_text(self):
        with patch("haystack.preview.components.fetchers.link_content.requests") as mock_run:
            mock_run.get.return_value = Mock(
                status_code=200, text="Example test response", headers={"Content-Type": "text/plain"}
            )
            fetcher = LinkContentFetcher()
            document = fetcher.run("https://www.example.com")["document"]
            assert document.text == "Example test response"
            assert document.metadata["url"] == "https://www.example.com"
            assert "timestamp" in document.metadata

    @pytest.mark.unit
    def test_run_html(self):
        with patch("haystack.preview.components.fetchers.link_content.requests") as mock_run:
            mock_run.get.return_value = Mock(
                status_code=200, text="<h1>Example test response</h1>", headers={"Content-Type": "text/html"}
            )
            fetcher = LinkContentFetcher()
            document = fetcher.run("https://www.example.com")["document"]
            assert document.text == "<h1>Example test response</h1>"
            assert document.metadata["url"] == "https://www.example.com"
            assert "timestamp" in document.metadata

    @pytest.mark.unit
    def test_run_binary(self, test_files_path):
        # use a context manager so the sample file handle is closed
        with open(test_files_path / "pdf" / "sample_pdf_1.pdf", "rb") as pdf_file:
            file_bytes = pdf_file.read()
        with patch("haystack.preview.components.fetchers.link_content.requests") as mock_run:
            mock_run.get.return_value = Mock(
                status_code=200, content=file_bytes, headers={"Content-Type": "application/pdf"}
            )
            fetcher = LinkContentFetcher()
            document = fetcher.run("https://www.example.com")["document"]
            # casting to list to make the blobs comparable
            assert list(document.blob) == list(io.BytesIO(file_bytes))
            assert document.metadata["url"] == "https://www.example.com"
            assert "timestamp" in document.metadata

    @pytest.mark.unit
    def test_run_bad_status_code(self):
        # imported locally to keep this test self-contained
        from requests.exceptions import HTTPError

        # a single attempt avoids the exponential back-off sleep between retries
        fetcher = LinkContentFetcher(raise_on_failure=False, retry_attempts=1)
        mock_response = Mock(status_code=403)
        # A plain Mock never raises from raise_for_status(); make the failure explicit so the
        # test exercises the HTTP error path instead of an incidental error further down.
        mock_response.raise_for_status.side_effect = HTTPError("403 Client Error: Forbidden")
        with patch("haystack.preview.components.fetchers.link_content.requests") as mock_run:
            mock_run.get.return_value = mock_response
            document = fetcher.run("https://www.example.com")["document"]
            assert document is None

    @pytest.mark.integration
    def test_link_content_fetcher_html(self):
        fetcher = LinkContentFetcher()
        document = fetcher.run(HTML_URL)["document"]
        assert document.mime_type == "text/html"
        assert "Introduction to Haystack" in document.text
        assert document.metadata["url"] == HTML_URL

    @pytest.mark.integration
    def test_link_content_fetcher_text(self):
        fetcher = LinkContentFetcher()
        document = fetcher.run(TEXT_URL)["document"]
        assert document.mime_type == "text/plain"
        assert "Haystack" in document.text
        assert document.metadata["url"] == TEXT_URL

    @pytest.mark.integration
    def test_link_content_fetcher_pdf(self):
        fetcher = LinkContentFetcher()
        document = fetcher.run(PDF_URL)["document"]
        assert document.mime_type == "application/octet-stream"  # FIXME Should be "application/pdf"?
        assert document.text is None
        assert document.blob is not None
        assert document.metadata["url"] == PDF_URL
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Binary file not shown.