Merge pull request #793 from NYPL-Simplified/import-from-unglueit

Be cautious when downloading resources from an OPDS feed of free books
NYPL-Simplified · Feb 1, 2018 · b094d79 · b094d79
2 parents bfbae76 + 2def5f3
commit b094d79
Show file tree

Hide file tree

Showing 4 changed files with 258 additions and 1 deletion.
diff --git a/model.py b/model.py
@@ -8657,6 +8657,111 @@ def browser_http_get(cls, url, headers, **kwargs):
         headers['User-Agent'] = cls.BROWSER_USER_AGENT
         return cls.simple_http_get(url, headers, **kwargs)
 
+    @classmethod
+    def cautious_http_get(cls, url, headers, **kwargs):
+        """Examine the URL we're about to GET, possibly going so far as to
+        perform a HEAD request, to avoid making a request (or
+        following a redirect) to a site known to cause problems.
+
+        The motivating case is that unglue.it contains gutenberg.org
+        links that appear to be direct links to EPUBs, but 1) they're
+        not direct links to EPUBs, and 2) automated requests to
+        gutenberg.org quickly result in IP bans. So we don't make those
+        requests.
+        """
+        do_not_access = kwargs.pop(
+            'do_not_access', cls.AVOID_WHEN_CAUTIOUS_DOMAINS
+        )
+        check_for_redirect = kwargs.pop(
+            'check_for_redirect', cls.EXERCISE_CAUTION_DOMAINS
+        )
+        do_get = kwargs.pop('do_get', cls.simple_http_get)
+        head_client = kwargs.pop('cautious_head_client', requests.head)
+
+        if cls.get_would_be_useful(
+                url, headers, do_not_access, check_for_redirect,
+                head_client
+        ):
+            # Go ahead and make the GET request.
+            return do_get(url, headers, **kwargs)
+        else:
+            logging.info(
+                "Declining to make non-useful HTTP request to %s", url
+            )
+            # 417 Expectation Failed - "... if the server is a proxy,
+            # the server has unambiguous evidence that the request
+            # could not be met by the next-hop server."
+            #
+            # Not quite accurate, but I think it's the closest match
+            # to "the HTTP client decided to not even make your
+            # request".
+            return (
+                417,
+                {"content-type" :
+                 "application/vnd.librarysimplified-did-not-make-request"},
+                "Cautiously decided not to make a GET request to %s" % url
+            )
+
+    # Sites known to host both free books and redirects to a domain in
+    # AVOID_WHEN_CAUTIOUS_DOMAINS.
+    EXERCISE_CAUTION_DOMAINS = ['unglue.it']
+
+    # Sites that cause problems for us if we make automated
+    # HTTP requests to them while trying to find free books.
+    AVOID_WHEN_CAUTIOUS_DOMAINS = ['gutenberg.org']
+
+    @classmethod
+    def get_would_be_useful(
+            cls, url, headers, do_not_access=None, check_for_redirect=None,
+            head_client=None
+    ):
+        """Determine whether making a GET request to a given URL is likely to
+        have a useful result.
+
+        :param URL: URL under consideration.
+        :param headers: Headers that would be sent with the GET request.
+        :param do_not_access: Domains to which GET requests are not useful.
+        :param check_for_redirect: Domains to which we should make a HEAD
+            request, in case they redirect to a `do_not_access` domain.
+        :param head_client: Function for making the HEAD request, if 
+            one becomes necessary. Should return requests.Response or a mock.
+        """
+        do_not_access = do_not_access or cls.AVOID_WHEN_CAUTIOUS_DOMAINS
+        check_for_redirect = check_for_redirect or cls.EXERCISE_CAUTION_DOMAINS
+        head_client = head_client or requests.head
+
+        def has_domain(domain, check_against):
+            """Is the given `domain` in `check_against`, 
+            or maybe a subdomain of one of the domains in `check_against`?
+            """
+            return any(domain == x or domain.endswith('.' + x)
+                       for x in check_against)
+
+        netloc = urlparse.urlparse(url).netloc
+        if has_domain(netloc, do_not_access):
+            # The link points directly to a domain we don't want to
+            # access.
+            return False
+
+        if not has_domain(netloc, check_for_redirect):
+            # We trust this domain not to redirect to a domain we don't
+            # want to access.
+            return True
+
+        # We might be fine, or we might get redirected to a domain we
+        # don't want to access. Make a HEAD request to see what
+        # happens.
+        head_response = head_client(url, headers=headers)
+        if head_response.status_code / 100 != 3:
+            # It's not a redirect. Go ahead and make the GET request.
+            return True
+
+        # Yes, it's a redirect. Does it redirect to a 
+        # domain we don't want to access?
+        location = head_response.headers.get('location', '')
+        netloc = urlparse.urlparse(location).netloc
+        return not has_domain(netloc, do_not_access)
+
     @property
     def is_image(self):
         return self.media_type and self.media_type.startswith("image/")

diff --git a/opds_import.py b/opds_import.py
@@ -46,6 +46,7 @@
     Identifier,
     LicensePool,
     Measurement,
+    Representation,
     Subject,
     RightsStatus,
 )
@@ -427,7 +428,11 @@ def __init__(self, _db, collection, data_source_name=None,
             mirror = MirrorUploader.for_collection(collection)
         self.mirror = mirror
         self.content_modifier = content_modifier
-        self.http_get = http_get
+
+        # In general, we are cautious when mirroring resources so that
+        # we don't, e.g. accidentally get our IP banned from
+        # gutenberg.org.
+        self.http_get = http_get or Representation.cautious_http_get
         self.map_from_collection = map_from_collection
 
     @property

diff --git a/tests/test_model.py b/tests/test_model.py
@@ -109,6 +109,8 @@
     DummyHTTPClient,
 )
 
+from testing import MockRequestsResponse
+
 from mock_analytics_provider import MockAnalyticsProvider
 
 class TestSessionManager(DatabaseTest):
@@ -5112,6 +5114,140 @@ def test_automatic_conversion_svg_to_png(self):
         assert thumbnail != hyperlink.resource.representation
         eq_(Representation.PNG_MEDIA_TYPE, thumbnail.media_type)
 
+    def test_cautious_http_get(self):
+
+        h = DummyHTTPClient()
+        h.queue_response(200, content="yay")
+
+        # If the domain is obviously safe, the GET request goes through,
+        # with no HEAD request being made.
+        m = Representation.cautious_http_get
+        status, headers, content = m(
+            "http://safe.org/", {}, do_not_access=['unsafe.org'],
+            do_get=h.do_get, cautious_head_client=object()
+        )
+        eq_(200, status)
+        eq_("yay", content)
+
+        # If the domain is obviously unsafe, no GET request or HEAD
+        # request is made.
+        status, headers, content = m(
+            "http://unsafe.org/", {}, do_not_access=['unsafe.org'],
+            do_get=object(), cautious_head_client=object()
+        )
+        eq_(417, status)
+        eq_("Cautiously decided not to make a GET request to http://unsafe.org/",
+            content)
+
+        # If the domain is potentially unsafe, a HEAD request is made,
+        # and the answer depends on its outcome.
+
+        # Here, the HEAD request redirects to a prohibited site.
+        def mock_redirect(*args, **kwargs):
+            return MockRequestsResponse(
+                301, dict(location="http://unsafe.org/")
+            )
+        status, headers, content = m(
+            "http://caution.org/", {},
+            do_not_access=['unsafe.org'],
+            check_for_redirect=['caution.org'],
+            do_get=object(), cautious_head_client=mock_redirect
+        )
+        eq_(417, status)
+        eq_("application/vnd.librarysimplified-did-not-make-request",
+            headers['content-type'])
+        eq_("Cautiously decided not to make a GET request to http://caution.org/",
+            content)
+
+        # Here, the HEAD request redirects to an allowed site.
+        h.queue_response(200, content="good content")
+        def mock_redirect(*args, **kwargs):
+            return MockRequestsResponse(
+                301, dict(location="http://safe.org/")
+            )
+        status, headers, content = m(
+            "http://caution.org/", {},
+            do_not_access=['unsafe.org'],
+            check_for_redirect=['caution.org'],
+            do_get=h.do_get, cautious_head_client=mock_redirect
+        )
+        eq_(200, status)
+        eq_("good content", content)
+
+    def test_get_would_be_useful(self):
+        """Test the method that determines whether a GET request will go (or
+        redirect) to a site we don't to make requests to.
+        """
+        safe = Representation.get_would_be_useful
+
+        # If get_would_be_useful tries to use this object to make a HEAD
+        # request, the test will blow up.
+        fake_head = object()
+
+        # Most sites are safe with no HEAD request necessary.
+        eq_(True, safe("http://www.safe-site.org/book.epub", {},
+                       head_client=fake_head))
+
+        # gutenberg.org is problematic, no HEAD request necessary.
+        eq_(False, safe("http://www.gutenberg.org/book.epub", {},
+                        head_client=fake_head))
+
+        # do_not_access controls which domains should always be
+        # considered unsafe.
+        eq_(
+            False, safe(
+                "http://www.safe-site.org/book.epub", {},
+                do_not_access=['safe-site.org'], head_client=fake_head
+            )
+        )
+        eq_(
+            True, safe(
+                "http://www.gutenberg.org/book.epub", {},
+                do_not_access=['safe-site.org'], head_client=fake_head
+            )
+        )
+
+        # Domain match is based on a subdomain match, not a substring
+        # match.
+        eq_(True, safe("http://www.not-unsafe-site.org/book.epub", {},
+                       do_not_access=['unsafe-site.org'],
+                       head_client=fake_head))
+
+        # Some domains (unglue.it) are known to make surprise
+        # redirects to unsafe domains. For these, we must make a HEAD
+        # request to check.
+
+        def bad_redirect(*args, **kwargs):
+            return MockRequestsResponse(
+                301, dict(
+                    location="http://www.gutenberg.org/a-book.html"
+                )
+            )
+        eq_(False, safe("http://www.unglue.it/book", {},
+                        head_client=bad_redirect))
+
+        def good_redirect(*args, **kwargs):
+            return MockRequestsResponse(
+                301,
+                dict(location="http://www.some-other-site.org/a-book.epub")
+            )
+        eq_(
+            True,
+            safe("http://www.unglue.it/book", {}, head_client=good_redirect)
+        )
+
+        def not_a_redirect(*args, **kwargs):
+            return MockRequestsResponse(200)
+        eq_(True, safe("http://www.unglue.it/book", {},
+                       head_client=not_a_redirect))
+
+        # The `check_for_redirect` argument controls which domains are
+        # checked using HEAD requests. Here, we customise it to check
+        # a site other than unglue.it.
+        eq_(False, safe("http://www.questionable-site.org/book.epub", {},
+                        check_for_redirect=['questionable-site.org'],
+                        head_client=bad_redirect))
+
 
 class TestCoverResource(DatabaseTest):
 

diff --git a/tests/test_opds_import.py b/tests/test_opds_import.py
@@ -200,6 +200,17 @@ def setup(self):
 
 class TestOPDSImporter(OPDSImporterTest):
 
+    def test_constructor(self):
+        # The default way of making HTTP requests is with
+        # Representation.cautious_http_get.
+        importer = OPDSImporter(self._db, collection=None)
+        eq_(Representation.cautious_http_get, importer.http_get)
+
+        # But you can pass in anything you want.
+        do_get = object()
+        importer = OPDSImporter(self._db, collection=None, http_get=do_get)
+        eq_(do_get, importer.http_get)
+
     def test_data_source_autocreated(self):
         name = "New data source " + self._str
         importer = OPDSImporter(