diff --git a/splash/browser_tab.py b/splash/browser_tab.py index 61e37cd64..b570abc4f 100644 --- a/splash/browser_tab.py +++ b/splash/browser_tab.py @@ -21,7 +21,7 @@ from splash.qtutils import (OPERATION_QT_CONSTANTS, WrappedSignal, qt2py, qurl2ascii, to_qurl) from splash.render_options import validate_size_str -from splash.qwebpage import SplashQWebPage, SplashQWebView +from splash.qwebpage import SplashQWebPage, SplashQWebView, RenderErrorInfo from splash.exceptions import JsError, OneShotCallbackError, ScriptError from splash.utils import to_bytes from splash.jsutils import ( @@ -71,6 +71,9 @@ def __init__(self, network_manager, splash_proxy_factory, verbosity, self._callback_proxies_to_cancel = weakref.WeakSet() self._js_console = None self._autoload_scripts = [] + self._is_unsupported_content = False + self._unsupported_content_reply = None + self._load_finished_after_unsupported_content_ready = False self.logger = _BrowserTabLogger(uid=self._uid, verbosity=verbosity) self._init_webpage(verbosity, network_manager, splash_proxy_factory, @@ -140,6 +143,8 @@ def _setup_webpage_events(self): self.web_page.mainFrame().loadFinished.connect(self._on_load_finished) self.web_page.mainFrame().urlChanged.connect(self._on_url_changed) self.web_page.mainFrame().javaScriptWindowObjectCleared.connect(self._on_javascript_window_object_cleared) + self.web_page.setForwardUnsupportedContent(True) + self.web_page.unsupportedContent.connect(self._on_unsupported_content) self.logger.add_web_page(self.web_page) def return_result(self, result): @@ -379,6 +384,15 @@ def _on_load_finished(self, ok): This callback is called for all web_page.mainFrame() loadFinished events. 
""" + if self._is_unsupported_content: + if self._unsupported_content_reply.isRunning(): + # XXX: We'll come back later when download finishes + self.logger.log( + 'Still receving unsupported content', min_level=3) + return + else: + self._load_finished_after_unsupported_content_ready = True + self.logger.log('Unsupported content received', min_level=3) if self.web_page.maybe_redirect(ok): self.logger.log("Redirect or other non-fatal error detected", min_level=2) return @@ -426,7 +440,11 @@ def _on_content_ready(self, ok, callback, errback, callback_id): """ This method is called when a QWebPage finishes loading its contents. """ - if self.web_page.maybe_redirect(ok): + if self._is_unsupported_content: + if self._unsupported_content_reply.isRunning(): + # XXX: We'll come back later when download finishes + return + elif self.web_page.maybe_redirect(ok): # XXX: It assumes loadFinished will be called again because # redirect happens. If redirect is detected improperly, # loadFinished won't be called again, and Splash will return @@ -438,6 +456,16 @@ def _on_content_ready(self, ok, callback, errback, callback_id): if self.web_page.is_ok(ok): callback() + elif self._is_unsupported_content: + # XXX: Error downloading unsupported content. + # `self.web_page.error_info` shall be `None` now + error_info = RenderErrorInfo( + 'Network', + int(self._unsupported_content_reply.error()), + six.text_type(self._unsupported_content_reply.errorString()), + six.text_type(self._unsupported_content_reply.url().url()) + ) + errback(error_info) elif self.web_page.error_loading(ok): # XXX: maybe return a meaningful error page instead of generic # error message? 
@@ -512,6 +540,43 @@ def _on_url_changed(self, url):
         self.web_page.har.store_redirect(six.text_type(url.toString()))
         self._cancel_timers(self._timers_to_cancel_on_redirect)
 
+    def _on_unsupported_content_finished(self):
+        """
+        Called when the reply carrying unsupported content finishes.
+
+        Re-emits the main frame's ``loadFinished`` signal with the download
+        outcome, unless ``_on_load_finished`` has already handled
+        a ``loadFinished`` event for the finished reply.
+        """
+        self.logger.log('Unsupported content finished', min_level=3)
+        if self._load_finished_after_unsupported_content_ready:
+            # XXX: loadFinished was already processed after the reply had
+            # stopped running; emitting it here again would deliver the
+            # same signal twice.
+            return
+        # XXX: No loadFinished event has been processed for the finished
+        # reply yet (it is either still to come or was ignored while the
+        # download was running), so signal completion ourselves.
+        download_ok = not self._unsupported_content_reply.error()
+        self.web_page.mainFrame().loadFinished.emit(download_ok)
+
+    def _on_unsupported_content(self, reply):
+        """
+        Slot for the web page's ``unsupportedContent`` signal.
+
+        Remembers ``reply`` and arranges for
+        ``_on_unsupported_content_finished`` to run once it finishes.
+        """
+        self.logger.log('Unsupported content detected', min_level=3)
+        self._unsupported_content_reply = reply
+        self._is_unsupported_content = True
+        if not reply.isFinished():
+            reply.finished.connect(self._on_unsupported_content_finished)
+            return
+        # Already finished. The content might be very short.
+        self.logger.log('Unsupported content already finished', min_level=3)
+        self._on_unsupported_content_finished()
+
     def run_js_file(self, filename, handle_errors=True):
         """
         Load JS library from file ``filename`` to the current frame.
diff --git a/splash/tests/mockserver.py b/splash/tests/mockserver.py
index 4885fb590..8fcbc4af0 100755
--- a/splash/tests/mockserver.py
+++ b/splash/tests/mockserver.py
@@ -743,6 +743,45 @@ def render_GET(self, request):
         return b"ok"
 
 
+class RawBytes(Resource):
+    """
+    Write a hand-built HTTP response straight to the transport.
+
+    Query arguments read from the request:
+
+    * ``length`` - number of zero bytes in the body (1024 if absent);
+    * ``claim_length`` - value of the ``Content-Length`` header
+      (the real body length if absent);
+    * ``delayed_abort`` - if present, ``abortConnection`` is scheduled
+      with ``reactor.callLater(1, ...)`` after the response is written;
+    * ``abort`` - if present, abort the connection right after writing.
+
+    Without either abort argument the connection is closed with
+    ``loseConnection``.
+    """
+
+    def render_GET(self, request):
+        args = request.args
+        transport = request.channel.transport
+
+        body_length = int(args.get(b'length', [1024])[0])
+        payload = b'\x00' * body_length
+        claim_length = int(args.get(b'claim_length', [body_length])[0])
+        header = 'Content-Length: {:d}'.format(claim_length).encode('utf8')
+        # Status line, one header, an empty line, then the body.
+        response = b'\n'.join(
+            (b'HTTP/1.1 200 OK', header, b'', payload))
+        transport.write(response)
+
+        if b'delayed_abort' in args:
+            reactor.callLater(1, transport.abortConnection)
+        elif b'abort' in args:
+            transport.abortConnection()
+        else:
+            transport.loseConnection()
+        return NOT_DONE_YET
+
+
 class Index(Resource):
     isLeaf = True
 
@@ -820,6 +859,8 @@ def __init__(self, http_port, https_port, proxy_port):
         self.putChild(b"bad-content-type", InvalidContentTypeResource())
         self.putChild(b"bad-content-type2", InvalidContentTypeResource2())
 
+        self.putChild(b"raw-bytes", RawBytes())
+
         self.putChild(b"jsredirect", JsRedirect())
         self.putChild(b"jsredirect-to", JsRedirectTo())
         self.putChild(b"jsredirect-slowimage", JsRedirectSlowImage())
diff --git a/splash/tests/test_render.py b/splash/tests/test_render.py
index ff6672def..5c3118b85 100644
--- a/splash/tests/test_render.py
+++ b/splash/tests/test_render.py
@@ -208,6 +208,23 @@ def test_invalid_wait(self):
                               'wait': wait})
             self.assertStatusCode(r, 400)
 
+    def test_unsupported_content(self):
+        """Check status codes for complete and aborted raw-bytes bodies."""
+        expectations = (
+            # Short body (Can be received together with the headers)
+            ("raw-bytes?length=16", 200),
+            # Short body with error
+            ("raw-bytes?length=16&claim_length=32&abort=1", 502),
+            # Long body (May not be received together with the headers)
+            ("raw-bytes?length=1000000", 200),
+            # Long body with error
+            ("raw-bytes?length=100&claim_length=200&delayed_abort=1", 502),
+        )
+        for path, expected_status in expectations:
+            target = self.mockurl(path)
+            response = self.request({"url": target})
+            self.assertStatusCode(response, expected_status)
+
     @pytest.mark.skipif(
         not qt_551_plus(),
         reason="resource_timeout doesn't work in Qt5 < 5.5.1. See issue #269 for details."