Skip to content

Commit

Permalink
feat: Add LinkContentFetcher Haystack 2.0 component (#5724)
Browse files Browse the repository at this point in the history
* Add LinkContentFetcher

* Add release note

* Small fixes

* Fix pydocs

* PR feedback

* Remove handlers registration

* PR feedback

* adjustments

* improve tests

* initial draft

* tests

* add proposal

* proposal number

* reno

* fix tests and usage of content and content_type

* update branch & fix more tests

* mypy

* use the new document

* add docstring

* fix more tests

* mypy

* fix tests

* add e2e

* review feedback

* improve __str__

* Apply suggestions from code review

Co-authored-by: Daria Fokina <[email protected]>

* Update haystack/preview/dataclasses/document.py

Co-authored-by: Daria Fokina <[email protected]>

* improve __str__

* fix tests

* fix more tests

* fix test

* Fix end-of-file-fixer

* Post merge fixes

* Move e2e tests back into component

---------

Co-authored-by: ZanSara <[email protected]>
Co-authored-by: Daria Fokina <[email protected]>
  • Loading branch information
3 people authored Sep 20, 2023
1 parent bf6d306 commit 0983fb6
Show file tree
Hide file tree
Showing 9 changed files with 348 additions and 4 deletions.
1 change: 1 addition & 0 deletions haystack/preview/components/fetchers/__init__.py
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
from haystack.preview.components.fetchers.link_content import LinkContentFetcher
163 changes: 163 additions & 0 deletions haystack/preview/components/fetchers/link_content.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,163 @@
import io
import logging
from collections import defaultdict
from datetime import datetime
from typing import Optional, Dict, List, Callable, Any, IO

import requests
from requests import Response
from requests.exceptions import HTTPError
from tenacity import retry, stop_after_attempt, wait_exponential, retry_if_exception_type, RetryCallState
from haystack.preview import component, default_from_dict, default_to_dict

from haystack import __version__
from haystack.preview import Document

# Module-level logger, named after this module.
logger = logging.getLogger(__name__)


# User agent used when the caller does not pass its own list; embeds the haystack version.
DEFAULT_USER_AGENT = f"haystack/LinkContentFetcher/{__version__}"

# Base headers for outgoing requests. LinkContentFetcher copies this dict for every request and
# overrides "User-Agent" on the copy, so this mapping itself is not mutated by the fetcher.
REQUEST_HEADERS = {
"accept": "*/*",
"User-Agent": DEFAULT_USER_AGENT,
"Accept-Language": "en-US,en;q=0.9,it;q=0.8,es;q=0.7",
"referer": "https://www.google.com/",
}


def text_content_handler(response: Response) -> Dict[str, str]:
    """
    Extract the body of a response as text.

    :param response: Response object from the request.
    :return: A dictionary holding the response text under the "text" key.
    """
    body_text = response.text
    return {"text": body_text}


def binary_content_handler(response: Response) -> Dict[str, IO[bytes]]:
    """
    Extract the body of a response as an in-memory binary stream.

    :param response: Response object from the request.
    :return: A dictionary holding a file-like object over the raw response bytes under the "blob" key.
    """
    byte_stream = io.BytesIO(response.content)
    return {"blob": byte_stream}


@component
class LinkContentFetcher:
    """
    LinkContentFetcher fetches content from a URL link and converts it to a Document object.

    The media type of the response selects how the payload is extracted: "text/html" and "text/plain" are
    stored as text, "application/pdf" and "application/octet-stream" are stored as a binary blob, and any
    other media type falls back to the text handler.
    """

    def __init__(
        self,
        raise_on_failure: bool = True,
        user_agents: Optional[List[str]] = None,
        retry_attempts: int = 2,
        timeout: int = 3,
    ):
        """
        Creates a LinkContentFetcher instance.

        :param raise_on_failure: A boolean indicating whether to raise an exception when a failure occurs
                                 during content extraction. If False, the error is simply logged and the
                                 program continues. Defaults to True.
        :param user_agents: A list of user agents to use when fetching content. Defaults to None, in which case a
                            default user agent is used.
        :param retry_attempts: Passed to tenacity's `stop_after_attempt`, i.e. the maximum number of attempts
                               made for a URL, including the first one. Defaults to 2.
        :param timeout: The timeout in seconds for the request. A falsy value falls back to 3. Defaults to 3.
        """
        self.raise_on_failure = raise_on_failure
        self.user_agents = user_agents or [DEFAULT_USER_AGENT]
        self.current_user_agent_idx: int = 0
        self.retry_attempts = retry_attempts
        self.timeout = timeout

        # register default content handlers that extract data from the response;
        # unknown content types fall back to the text handler through the defaultdict factory
        self.handlers: Dict[str, Callable[[Response], Dict[str, Any]]] = defaultdict(lambda: text_content_handler)
        self.handlers["text/html"] = text_content_handler
        self.handlers["text/plain"] = text_content_handler
        self.handlers["application/pdf"] = binary_content_handler
        self.handlers["application/octet-stream"] = binary_content_handler

        # The request function is built here, as a closure, because the retry policy depends on
        # per-instance settings that are not available when the class body is evaluated.
        @retry(
            reraise=True,
            stop=stop_after_attempt(self.retry_attempts),
            wait=wait_exponential(multiplier=1, min=2, max=10),
            retry=(retry_if_exception_type((HTTPError, requests.RequestException))),
            # This method is invoked only after failed requests (exception raised)
            after=self._switch_user_agent,
        )
        def get_response(url):
            # we need to copy because we modify the headers
            headers = REQUEST_HEADERS.copy()
            headers["User-Agent"] = self.user_agents[self.current_user_agent_idx]
            response = requests.get(url, headers=headers, timeout=timeout or 3)
            response.raise_for_status()
            return response

        self._get_response: Callable = get_response

    def to_dict(self) -> Dict[str, Any]:
        """
        Serialize this component to a dictionary.
        """
        return default_to_dict(
            self,
            raise_on_failure=self.raise_on_failure,
            user_agents=self.user_agents,
            retry_attempts=self.retry_attempts,
            timeout=self.timeout,
        )

    @classmethod
    def from_dict(cls, data: Dict[str, Any]) -> "LinkContentFetcher":
        """
        Deserialize this component from a dictionary.
        """
        return default_from_dict(cls, data)

    # The declared output name must match the key of the dictionary returned by run().
    @component.output_types(document=Optional[Document])
    def run(self, url: str):
        """
        Fetches content from a URL and converts it to a Document object.

        :param url: URL to fetch content from.
        :return: A dictionary with a single "document" key. Its value is the Document built from the response,
                 or None if the content could not be fetched and raise_on_failure is False.
        :raises Exception: Whatever error occurred while fetching or converting the content, if
                           raise_on_failure is True.
        """
        # datetime.now().timestamp() yields the real POSIX timestamp. Calling .timestamp() on the naive
        # value returned by utcnow() would interpret it as local time and shift the result by the UTC offset.
        fetched_at = int(datetime.now().timestamp())
        document_data: Dict[str, Any] = {"metadata": {"url": url, "timestamp": fetched_at}}
        try:
            response = self._get_response(url)
            content_type = self._get_content_type(response)
            document_data["mime_type"] = content_type
            handler: Callable = self.handlers[content_type]
            document_data.update(handler(response))
            return {"document": Document(**document_data)}

        except Exception:
            if self.raise_on_failure:
                # bare raise keeps the original traceback untouched
                raise
            # exc_info keeps the reason for the failure available to anyone reading the debug log
            logger.debug("Couldn't retrieve content from %s", url, exc_info=True)
            return {"document": None}

        finally:
            # every run starts again from the first user agent, whatever happened during retries
            self.current_user_agent_idx = 0

    def _get_content_type(self, response: Response):
        """
        Get the content type of the response.

        :param response: The response object.
        :return: The media type of the response without parameters (such as charset), stripped of
                 surrounding whitespace and lowercased. An empty string if the header is missing.
        """
        content_type = response.headers.get("Content-Type", "")
        # Media types are case-insensitive, and the handlers are registered under lowercase keys.
        return content_type.split(";")[0].strip().lower()

    def _switch_user_agent(self, retry_state: RetryCallState) -> None:
        """
        Switches the User-Agent for this LinkContentFetcher to the next one in the list of user agents.
        Used by tenacity to retry the requests with a different user agent.

        :param retry_state: The retry state (unused, required by tenacity).
        """
        # wrap around so that a short list of agents is cycled through
        self.current_user_agent_idx = (self.current_user_agent_idx + 1) % len(self.user_agents)
        logger.debug("Switched user agent to %s", self.user_agents[self.current_user_agent_idx])
Original file line number Diff line number Diff line change
@@ -0,0 +1,5 @@
---
preview:
- |
Adds the LinkContentFetcher component to Haystack 2.0. LinkContentFetcher fetches content from a given URL and
converts it into a Document object, which can then be used within a Haystack 2.0 pipeline.
6 changes: 3 additions & 3 deletions releasenotes/notes/refactor-pinecone-document-store.yaml
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
---
enhancements:
- |
Refactor PineconeDocumentStore to use metadata instead of namespaces
for distinction between documents with embeddings, documents without
embeddings and labels
Refactor PineconeDocumentStore to use metadata instead of namespaces
for distinction between documents with embeddings, documents without
embeddings and labels
Original file line number Diff line number Diff line change
Expand Up @@ -2,4 +2,3 @@
fixes:
- |
gpt-35-turbo-16k model from Azure can integrate correctly
Empty file.
170 changes: 170 additions & 0 deletions test/preview/components/fetchers/test_link_content_fetcher.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,170 @@
import io
from unittest.mock import patch, Mock

import pytest

from haystack.preview.components.fetchers.link_content import (
LinkContentFetcher,
text_content_handler,
binary_content_handler,
DEFAULT_USER_AGENT,
)

# Remote resources requested by the integration tests, one per supported kind of content.
HTML_URL = "https://docs.haystack.deepset.ai/docs"
TEXT_URL = "https://raw.githubusercontent.com/deepset-ai/haystack/main/README.md"
PDF_URL = (
    "https://raw.githubusercontent.com/deepset-ai/haystack/"
    "b5987a6d8d0714eb2f3011183ab40093d2e4a41a/e2e/samples/pipelines/sample_pdf_1.pdf"
)


@pytest.fixture
def mock_get_link_text_content():
    """
    Patch `requests` inside the fetcher module so that every GET returns a canned plain-text response.
    Yields the patched module mock.
    """
    canned_response = Mock(status_code=200, text="Example test response", headers={"Content-Type": "text/plain"})
    with patch("haystack.preview.components.fetchers.link_content.requests") as mock_run:
        mock_run.get.return_value = canned_response
        yield mock_run


@pytest.fixture
def mock_get_link_content(test_files_path):
    """
    Patch `requests` inside the fetcher module so that every GET returns the sample PDF as binary content.
    Yields the patched module mock.

    :param test_files_path: Directory holding the test files (pytest fixture).
    """
    # read_bytes() opens and closes the file itself; open(...).read() left the handle for the garbage collector
    pdf_bytes = (test_files_path / "pdf" / "sample_pdf_1.pdf").read_bytes()
    with patch("haystack.preview.components.fetchers.link_content.requests") as mock_run:
        mock_run.get.return_value = Mock(
            status_code=200,
            content=pdf_bytes,
            headers={"Content-Type": "application/pdf"},
        )
        yield mock_run


class TestLinkContentFetcher:
    """
    Tests for LinkContentFetcher. Unit tests patch `requests` inside the fetcher module;
    integration tests fetch the real URLs defined at the top of this module.
    """

    @pytest.mark.unit
    def test_init(self):
        fetcher = LinkContentFetcher()
        assert fetcher.raise_on_failure is True
        assert fetcher.user_agents == [DEFAULT_USER_AGENT]
        assert fetcher.retry_attempts == 2
        assert fetcher.timeout == 3
        assert fetcher.handlers == {
            "text/html": text_content_handler,
            "text/plain": text_content_handler,
            "application/pdf": binary_content_handler,
            "application/octet-stream": binary_content_handler,
        }
        assert hasattr(fetcher, "_get_response")

    @pytest.mark.unit
    def test_init_with_params(self):
        fetcher = LinkContentFetcher(raise_on_failure=False, user_agents=["test"], retry_attempts=1, timeout=2)
        assert fetcher.raise_on_failure is False
        assert fetcher.user_agents == ["test"]
        assert fetcher.retry_attempts == 1
        assert fetcher.timeout == 2

    @pytest.mark.unit
    def test_to_dict(self):
        fetcher = LinkContentFetcher()
        assert fetcher.to_dict() == {
            "type": "LinkContentFetcher",
            "init_parameters": {
                "raise_on_failure": True,
                "user_agents": [DEFAULT_USER_AGENT],
                "retry_attempts": 2,
                "timeout": 3,
            },
        }

    @pytest.mark.unit
    def test_to_dict_with_params(self):
        fetcher = LinkContentFetcher(raise_on_failure=False, user_agents=["test"], retry_attempts=1, timeout=2)
        assert fetcher.to_dict() == {
            "type": "LinkContentFetcher",
            "init_parameters": {"raise_on_failure": False, "user_agents": ["test"], "retry_attempts": 1, "timeout": 2},
        }

    @pytest.mark.unit
    def test_from_dict(self):
        fetcher = LinkContentFetcher.from_dict(
            {
                "type": "LinkContentFetcher",
                "init_parameters": {
                    "raise_on_failure": False,
                    "user_agents": ["test"],
                    "retry_attempts": 1,
                    "timeout": 2,
                },
            }
        )
        assert fetcher.raise_on_failure is False
        assert fetcher.user_agents == ["test"]
        assert fetcher.retry_attempts == 1
        # every init parameter must survive the round trip, timeout included
        assert fetcher.timeout == 2

    @pytest.mark.unit
    def test_run_text(self):
        with patch("haystack.preview.components.fetchers.link_content.requests") as mock_run:
            mock_run.get.return_value = Mock(
                status_code=200, text="Example test response", headers={"Content-Type": "text/plain"}
            )
            fetcher = LinkContentFetcher()
            document = fetcher.run("https://www.example.com")["document"]
            assert document.text == "Example test response"
            assert document.metadata["url"] == "https://www.example.com"
            assert "timestamp" in document.metadata

    @pytest.mark.unit
    def test_run_html(self):
        with patch("haystack.preview.components.fetchers.link_content.requests") as mock_run:
            mock_run.get.return_value = Mock(
                status_code=200, text="<h1>Example test response</h1>", headers={"Content-Type": "text/html"}
            )
            fetcher = LinkContentFetcher()
            document = fetcher.run("https://www.example.com")["document"]
            assert document.text == "<h1>Example test response</h1>"
            assert document.metadata["url"] == "https://www.example.com"
            assert "timestamp" in document.metadata

    @pytest.mark.unit
    def test_run_binary(self, test_files_path):
        # read_bytes() closes the file; the bare open(...).read() used before leaked the handle
        file_bytes = (test_files_path / "pdf" / "sample_pdf_1.pdf").read_bytes()
        with patch("haystack.preview.components.fetchers.link_content.requests") as mock_run:
            mock_run.get.return_value = Mock(
                status_code=200, content=file_bytes, headers={"Content-Type": "application/pdf"}
            )
            fetcher = LinkContentFetcher()
            document = fetcher.run("https://www.example.com")["document"]
            # casting to list to make the blobs comparable
            assert list(document.blob) == list(io.BytesIO(file_bytes))
            assert document.metadata["url"] == "https://www.example.com"
            assert "timestamp" in document.metadata

    @pytest.mark.unit
    def test_run_bad_status_code(self):
        # local import: only this test needs the exception class
        from requests.exceptions import HTTPError

        # retry_attempts=1 means a single attempt, so tenacity never sleeps between retries here
        fetcher = LinkContentFetcher(raise_on_failure=False, retry_attempts=1)
        mock_response = Mock(status_code=403)
        # A plain Mock never raises from raise_for_status(), so without this side effect the test would
        # only pass because of an unrelated error raised later while reading the mocked headers.
        mock_response.raise_for_status.side_effect = HTTPError("403 Client Error")
        with patch("haystack.preview.components.fetchers.link_content.requests") as mock_run:
            mock_run.get.return_value = mock_response
            document = fetcher.run("https://www.example.com")["document"]
        assert document is None
        mock_response.raise_for_status.assert_called_once()

    @pytest.mark.integration
    def test_link_content_fetcher_html(self):
        fetcher = LinkContentFetcher()
        document = fetcher.run(HTML_URL)["document"]
        assert document.mime_type == "text/html"
        assert "Introduction to Haystack" in document.text
        assert document.metadata["url"] == HTML_URL

    @pytest.mark.integration
    def test_link_content_fetcher_text(self):
        fetcher = LinkContentFetcher()
        document = fetcher.run(TEXT_URL)["document"]
        assert document.mime_type == "text/plain"
        assert "Haystack" in document.text
        assert document.metadata["url"] == TEXT_URL

    @pytest.mark.integration
    def test_link_content_fetcher_pdf(self):
        fetcher = LinkContentFetcher()
        document = fetcher.run(PDF_URL)["document"]
        assert document.mime_type == "application/octet-stream"  # FIXME Should be "application/pdf"?
        assert document.text is None
        assert document.blob is not None
        assert document.metadata["url"] == PDF_URL
6 changes: 6 additions & 0 deletions test/preview/conftest.py
Original file line number Diff line number Diff line change
@@ -1,3 +1,4 @@
from pathlib import Path
from unittest.mock import Mock, patch
import pytest

Expand All @@ -11,3 +12,8 @@ def mock_tokenizer():
tokenizer.encode = lambda text: text.split()
tokenizer.decode = lambda tokens: " ".join(tokens)
return tokenizer


@pytest.fixture()
def test_files_path():
    """Return the path of the `test_files` directory that sits next to this conftest module."""
    conftest_dir = Path(__file__).parent
    return conftest_dir / "test_files"
Binary file added test/preview/test_files/pdf/sample_pdf_1.pdf
Binary file not shown.

0 comments on commit 0983fb6

Please sign in to comment.