Skip to content
This repository has been archived by the owner on Apr 11, 2022. It is now read-only.

Commit

Permalink
Merge pull request #793 from NYPL-Simplified/import-from-unglueit
Browse files Browse the repository at this point in the history
Be cautious when downloading resources from an OPDS feed of free books
  • Loading branch information
leonardr authored Feb 1, 2018
2 parents bfbae76 + 2def5f3 commit b094d79
Show file tree
Hide file tree
Showing 4 changed files with 258 additions and 1 deletion.
105 changes: 105 additions & 0 deletions model.py
Original file line number Diff line number Diff line change
Expand Up @@ -8657,6 +8657,111 @@ def browser_http_get(cls, url, headers, **kwargs):
headers['User-Agent'] = cls.BROWSER_USER_AGENT
return cls.simple_http_get(url, headers, **kwargs)

@classmethod
def cautious_http_get(cls, url, headers, **kwargs):
"""Examine the URL we're about to GET, possibly going so far as to
perform a HEAD request, to avoid making a request (or
following a redirect) to a site known to cause problems.
The motivating case is that unglue.it contains gutenberg.org
links that appear to be direct links to EPUBs, but 1) they're
not direct links to EPUBs, and 2) automated requests to
gutenberg.org quickly result in IP bans. So we don't make those
requests.
"""
do_not_access = kwargs.pop(
'do_not_access', cls.AVOID_WHEN_CAUTIOUS_DOMAINS
)
check_for_redirect = kwargs.pop(
'check_for_redirect', cls.EXERCISE_CAUTION_DOMAINS
)
do_get = kwargs.pop('do_get', cls.simple_http_get)
head_client = kwargs.pop('cautious_head_client', requests.head)

if cls.get_would_be_useful(
url, headers, do_not_access, check_for_redirect,
head_client
):
# Go ahead and make the GET request.
return do_get(url, headers, **kwargs)
else:
logging.info(
"Declining to make non-useful HTTP request to %s", url
)
# 417 Expectation Failed - "... if the server is a proxy,
# the server has unambiguous evidence that the request
# could not be met by the next-hop server."
#
# Not quite accurate, but I think it's the closest match
# to "the HTTP client decided to not even make your
# request".
return (
417,
{"content-type" :
"application/vnd.librarysimplified-did-not-make-request"},
"Cautiously decided not to make a GET request to %s" % url
)

# Sites known to host both free books and redirects to a domain in
# AVOID_WHEN_CAUTIOUS_DOMAINS.
EXERCISE_CAUTION_DOMAINS = ['unglue.it']

# Sites that cause problems for us if we make automated
# HTTP requests to them while trying to find free books.
AVOID_WHEN_CAUTIOUS_DOMAINS = ['gutenberg.org']

@classmethod
def get_would_be_useful(
cls, url, headers, do_not_access=None, check_for_redirect=None,
head_client=None
):
"""Determine whether making a GET request to a given URL is likely to
have a useful result.
:param URL: URL under consideration.
:param headers: Headers that would be sent with the GET request.
:param do_not_access: Domains to which GET requests are not useful.
:param check_for_redirect: Domains to which we should make a HEAD
request, in case they redirect to a `do_not_access` domain.
:param head_client: Function for making the HEAD request, if
one becomes necessary. Should return requests.Response or a mock.
"""
do_not_access = do_not_access or cls.AVOID_WHEN_CAUTIOUS_DOMAINS
check_for_redirect = check_for_redirect or cls.EXERCISE_CAUTION_DOMAINS
head_client = head_client or requests.head

def has_domain(domain, check_against):
"""Is the given `domain` in `check_against`,
or maybe a subdomain of one of the domains in `check_against`?
"""
return any(domain == x or domain.endswith('.' + x)
for x in check_against)

netloc = urlparse.urlparse(url).netloc
if has_domain(netloc, do_not_access):
# The link points directly to a domain we don't want to
# access.
return False

if not has_domain(netloc, check_for_redirect):
# We trust this domain not to redirect to a domain we don't
# want to access.
return True

# We might be fine, or we might get redirected to a domain we
# don't want to access. Make a HEAD request to see what
# happens.
head_response = head_client(url, headers=headers)
if head_response.status_code / 100 != 3:
# It's not a redirect. Go ahead and make the GET request.
return True

# Yes, it's a redirect. Does it redirect to a
# domain we don't want to access?
location = head_response.headers.get('location', '')
netloc = urlparse.urlparse(location).netloc
return not has_domain(netloc, do_not_access)

@property
def is_image(self):
return self.media_type and self.media_type.startswith("image/")
Expand Down
7 changes: 6 additions & 1 deletion opds_import.py
Original file line number Diff line number Diff line change
Expand Up @@ -46,6 +46,7 @@
Identifier,
LicensePool,
Measurement,
Representation,
Subject,
RightsStatus,
)
Expand Down Expand Up @@ -427,7 +428,11 @@ def __init__(self, _db, collection, data_source_name=None,
mirror = MirrorUploader.for_collection(collection)
self.mirror = mirror
self.content_modifier = content_modifier
self.http_get = http_get

# In general, we are cautious when mirroring resources so that
# we don't, e.g. accidentally get our IP banned from
# gutenberg.org.
self.http_get = http_get or Representation.cautious_http_get
self.map_from_collection = map_from_collection

@property
Expand Down
136 changes: 136 additions & 0 deletions tests/test_model.py
Original file line number Diff line number Diff line change
Expand Up @@ -109,6 +109,8 @@
DummyHTTPClient,
)

from testing import MockRequestsResponse

from mock_analytics_provider import MockAnalyticsProvider

class TestSessionManager(DatabaseTest):
Expand Down Expand Up @@ -5112,6 +5114,140 @@ def test_automatic_conversion_svg_to_png(self):
assert thumbnail != hyperlink.resource.representation
eq_(Representation.PNG_MEDIA_TYPE, thumbnail.media_type)

def test_cautious_http_get(self):

h = DummyHTTPClient()
h.queue_response(200, content="yay")

# If the domain is obviously safe, the GET request goes through,
# with no HEAD request being made.
m = Representation.cautious_http_get
status, headers, content = m(
"http://safe.org/", {}, do_not_access=['unsafe.org'],
do_get=h.do_get, cautious_head_client=object()
)
eq_(200, status)
eq_("yay", content)

# If the domain is obviously unsafe, no GET request or HEAD
# request is made.
status, headers, content = m(
"http://unsafe.org/", {}, do_not_access=['unsafe.org'],
do_get=object(), cautious_head_client=object()
)
eq_(417, status)
eq_("Cautiously decided not to make a GET request to http://unsafe.org/",
content)

# If the domain is potentially unsafe, a HEAD request is made,
# and the answer depends on its outcome.

# Here, the HEAD request redirects to a prohibited site.
def mock_redirect(*args, **kwargs):
return MockRequestsResponse(
301, dict(location="http://unsafe.org/")
)
status, headers, content = m(
"http://caution.org/", {},
do_not_access=['unsafe.org'],
check_for_redirect=['caution.org'],
do_get=object(), cautious_head_client=mock_redirect
)
eq_(417, status)
eq_("application/vnd.librarysimplified-did-not-make-request",
headers['content-type'])
eq_("Cautiously decided not to make a GET request to http://caution.org/",
content)

# Here, the HEAD request redirects to an allowed site.
h.queue_response(200, content="good content")
def mock_redirect(*args, **kwargs):
return MockRequestsResponse(
301, dict(location="http://safe.org/")
)
status, headers, content = m(
"http://caution.org/", {},
do_not_access=['unsafe.org'],
check_for_redirect=['caution.org'],
do_get=h.do_get, cautious_head_client=mock_redirect
)
eq_(200, status)
eq_("good content", content)

def test_get_would_be_useful(self):
"""Test the method that determines whether a GET request will go (or
redirect) to a site we don't to make requests to.
"""
safe = Representation.get_would_be_useful

# If get_would_be_useful tries to use this object to make a HEAD
# request, the test will blow up.
fake_head = object()

# Most sites are safe with no HEAD request necessary.
eq_(True, safe("http://www.safe-site.org/book.epub", {},
head_client=fake_head))

# gutenberg.org is problematic, no HEAD request necessary.
eq_(False, safe("http://www.gutenberg.org/book.epub", {},
head_client=fake_head))

# do_not_access controls which domains should always be
# considered unsafe.
eq_(
False, safe(
"http://www.safe-site.org/book.epub", {},
do_not_access=['safe-site.org'], head_client=fake_head
)
)
eq_(
True, safe(
"http://www.gutenberg.org/book.epub", {},
do_not_access=['safe-site.org'], head_client=fake_head
)
)

# Domain match is based on a subdomain match, not a substring
# match.
eq_(True, safe("http://www.not-unsafe-site.org/book.epub", {},
do_not_access=['unsafe-site.org'],
head_client=fake_head))

# Some domains (unglue.it) are known to make surprise
# redirects to unsafe domains. For these, we must make a HEAD
# request to check.

def bad_redirect(*args, **kwargs):
return MockRequestsResponse(
301, dict(
location="http://www.gutenberg.org/a-book.html"
)
)
eq_(False, safe("http://www.unglue.it/book", {},
head_client=bad_redirect))

def good_redirect(*args, **kwargs):
return MockRequestsResponse(
301,
dict(location="http://www.some-other-site.org/a-book.epub")
)
eq_(
True,
safe("http://www.unglue.it/book", {}, head_client=good_redirect)
)

def not_a_redirect(*args, **kwargs):
return MockRequestsResponse(200)
eq_(True, safe("http://www.unglue.it/book", {},
head_client=not_a_redirect))

# The `check_for_redirect` argument controls which domains are
# checked using HEAD requests. Here, we customise it to check
# a site other than unglue.it.
eq_(False, safe("http://www.questionable-site.org/book.epub", {},
check_for_redirect=['questionable-site.org'],
head_client=bad_redirect))


class TestCoverResource(DatabaseTest):

Expand Down
11 changes: 11 additions & 0 deletions tests/test_opds_import.py
Original file line number Diff line number Diff line change
Expand Up @@ -200,6 +200,17 @@ def setup(self):

class TestOPDSImporter(OPDSImporterTest):

def test_constructor(self):
# The default way of making HTTP requests is with
# Representation.cautious_http_get.
importer = OPDSImporter(self._db, collection=None)
eq_(Representation.cautious_http_get, importer.http_get)

# But you can pass in anything you want.
do_get = object()
importer = OPDSImporter(self._db, collection=None, http_get=do_get)
eq_(do_get, importer.http_get)

def test_data_source_autocreated(self):
name = "New data source " + self._str
importer = OPDSImporter(
Expand Down

0 comments on commit b094d79

Please sign in to comment.