From c4d033a18a214926529856bc30ea9d6a0f415d03 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Adri=C3=A1n=20Chaves?= Date: Fri, 31 May 2019 10:55:29 +0200 Subject: [PATCH] Implement a response size limit option --- splash/defaults.py | 3 + splash/network_manager.py | 93 ++++++++++++++++++++++++++++ splash/render_options.py | 7 ++- splash/resources.py | 37 +++++++---- splash/server.py | 17 +++-- splash/tests/test_network_manager.py | 69 +++++++++++++++++++++ 6 files changed, 208 insertions(+), 18 deletions(-) create mode 100644 splash/tests/test_network_manager.py diff --git a/splash/defaults.py b/splash/defaults.py index 05bc15ac4..9a51006f4 100644 --- a/splash/defaults.py +++ b/splash/defaults.py @@ -6,6 +6,9 @@ MAX_TIMEOUT = 90.0 +RESPONSE_SIZE_LIMIT = None +MAX_RESPONSE_SIZE_LIMIT = None + # Default size of browser window. As there're no decorations, this affects # both "window.inner*" and "window.outer*" values. VIEWPORT_SIZE = '1024x768' diff --git a/splash/network_manager.py b/splash/network_manager.py index 29108a505..6ea529d44 100644 --- a/splash/network_manager.py +++ b/splash/network_manager.py @@ -26,10 +26,34 @@ ) from splash.response_middleware import ContentTypeMiddleware from splash import defaults +from splash.qtutils import qt_header_items from splash.utils import to_bytes from splash.cookies import SplashCookieJar +class _InvalidContentLength(ValueError): + + def __init__(self, value): + if isinstance(value, bytes): + value = '0x' + value.hex() + message = 'Invalid Content-Length header value: {}'.format(value) + super().__init__(message) + + +def _get_content_length(reply): + for name, value in qt_header_items(reply): + if bytes(name).lower() == b'content-length': + value = bytes(value).split(b',', 1)[0] + try: + value = value.decode('latin1') + value = int(value) + except (UnicodeDecodeError, ValueError): + raise _InvalidContentLength(value) + if value < 0: + raise _InvalidContentLength(value) + return value + + class NetworkManagerFactory(object): def 
__init__(self, filters_path=None, verbosity=None, allowed_schemes=None, disable_browser_caches=None): verbosity = defaults.VERBOSITY if verbosity is None else verbosity @@ -86,6 +110,7 @@ class ProxiedQNetworkAccessManager(QNetworkAccessManager): * Tracks information about requests/responses and stores it in HAR format, including request and response content. * Allows to set per-request timeouts. + * Handles per-request response size limits. """ _REQUEST_ID = QNetworkRequest.User + 1 _SHOULD_TRACK = QNetworkRequest.User + 2 @@ -398,11 +423,69 @@ def _on_reply_finished(self): content) self.log("Finished downloading {url}", reply) + def _aborted_due_to_size(self, reply, request, sizes_and_sources): + render_options = self._get_render_options(request) + if render_options is None: + return False + option = "response_size_limit" + max_size = render_options.get(option, None) + if max_size is not None: + try: + max_size = int(max_size) + except ValueError: + self.log("Non-integer value received for rendering option " + "'{}': {}".format(option, max_size), min_level=1) + self.log(traceback.format_exc(), min_level=1, format_msg=False) + max_size = None + else: + if max_size < 0: + self.log("The value of rendering option '{}' ({}) must be " + "0 or higher.".format(option, max_size), + min_level=1) + max_size = None + elif (render_options.max_response_size_limit is not None and + max_size > render_options.max_response_size_limit): + self.log("The value of rendering option '{}' ({}) exceeds " + "the maximum value allowed.".format( + option, max_size), + min_level=1) + max_size = None + if max_size is None: + if render_options.max_response_size_limit is not None: + max_size = render_options.max_response_size_limit + else: + max_size = defaults.RESPONSE_SIZE_LIMIT + if max_size is None: + return False + for size, source in sizes_and_sources: + if size is None: + continue + if size <= max_size: + continue + self.log("The {} ({}) exceeds the maximum response size ({}), " + 
"aborting: {{url}}".format(source, size, max_size), + reply, min_level=1) + self.log(render_options, reply, min_level=1, format_msg=False) + reply.abort() + return True + return False + def _on_reply_headers(self): """Signal emitted before reading response body, after getting headers """ reply = self.sender() request = reply.request() + + try: + content_length = _get_content_length(reply) + except _InvalidContentLength as error: + self.log("On response from {{url}}: {}".format(error), + reply, min_level=3) + content_length = None + sizes_and_sources = ((content_length, "Content-Length header"),) + if self._aborted_due_to_size(reply, request, sizes_and_sources): + return + self._handle_reply_cookies(reply) self._run_webpage_callbacks(request, "on_response_headers", reply) @@ -413,6 +496,16 @@ def _on_reply_headers(self): self.log("Headers received for {url}", reply, min_level=3) def _on_reply_download_progress(self, received, total): + reply = self.sender() + request = reply.request() + + sizes_and_sources = ( + (total, "expected response size"), + (received, "size of the response content downloaded so far"), + ) + if self._aborted_due_to_size(reply, request, sizes_and_sources): + return + har = self._get_har() if har is not None: req_id = self._get_request_id() diff --git a/splash/render_options.py b/splash/render_options.py index f85e30b47..213df6712 100644 --- a/splash/render_options.py +++ b/splash/render_options.py @@ -14,8 +14,9 @@ class RenderOptions(object): _REQUIRED = object() - def __init__(self, data, max_timeout): + def __init__(self, data, max_timeout, max_response_size_limit=defaults.MAX_RESPONSE_SIZE_LIMIT): self.data = data + self.max_response_size_limit = max_response_size_limit self.max_timeout = max_timeout @classmethod @@ -29,7 +30,7 @@ def raise_error(cls, argument, description, type='bad_argument', **kwargs): raise BadOption(params) @classmethod - def fromrequest(cls, request, max_timeout): + def fromrequest(cls, request, max_timeout, 
max_response_size_limit=defaults.MAX_RESPONSE_SIZE_LIMIT): """ Initialize options from a Twisted Request. """ @@ -60,7 +61,7 @@ def fromrequest(cls, request, max_timeout): request.content.seek(0) data['uid'] = id(request) - return cls(data, max_timeout) + return cls(data, max_timeout, max_response_size_limit=max_response_size_limit) def get_expired_args(self, cache): """ diff --git a/splash/resources.py b/splash/resources.py index 2c205ef5b..d57bd79e6 100644 --- a/splash/resources.py +++ b/splash/resources.py @@ -17,6 +17,7 @@ import splash from splash.argument_cache import ArgumentCache +from splash import defaults from splash.qtrender import ( HtmlRender, PngRender, JsonRender, HarRender, JpegRender ) @@ -85,17 +86,18 @@ class BaseRenderResource(_ValidatingResource): isLeaf = True content_type = "text/html; charset=utf-8" - def __init__(self, pool, max_timeout, argument_cache): + def __init__(self, pool, max_timeout, argument_cache, max_response_size_limit=defaults.MAX_RESPONSE_SIZE_LIMIT): Resource.__init__(self) self.pool = pool self.js_profiles_path = self.pool.js_profiles_path self.max_timeout = max_timeout self.argument_cache = argument_cache + self.max_response_size_limit = max_response_size_limit def render_GET(self, request): #log.msg("%s %s %s %s" % (id(request), request.method, request.path, request.args)) request.starttime = time.time() - render_options = RenderOptions.fromrequest(request, self.max_timeout) + render_options = RenderOptions.fromrequest(request, self.max_timeout, max_response_size_limit=self.max_response_size_limit) # process argument cache original_options = render_options.data.copy() @@ -281,8 +283,9 @@ def __init__(self, pool, sandboxed, argument_cache, strict, implicit_main, + max_response_size_limit=defaults.MAX_RESPONSE_SIZE_LIMIT, ): - BaseRenderResource.__init__(self, pool, max_timeout, argument_cache) + BaseRenderResource.__init__(self, pool, max_timeout, argument_cache, max_response_size_limit=max_response_size_limit) 
self.sandboxed = sandboxed self.lua_package_path = lua_package_path self.lua_sandbox_allowed_modules = lua_sandbox_allowed_modules @@ -434,20 +437,22 @@ class DemoUI(_ValidatingResource): PATH = b'info' - def __init__(self, pool, lua_enabled, max_timeout): + def __init__(self, pool, lua_enabled, max_timeout, max_response_size_limit=defaults.MAX_RESPONSE_SIZE_LIMIT): Resource.__init__(self) self.pool = pool self.lua_enabled = lua_enabled self.max_timeout = max_timeout + self.max_response_size_limit = max_response_size_limit def _validate_params(self, request): - options = RenderOptions.fromrequest(request, self.max_timeout) + options = RenderOptions.fromrequest(request, self.max_timeout, max_response_size_limit=self.max_response_size_limit) options.get_filters(self.pool) # check params = options.get_common_params(self.pool.js_profiles_path) params.update({ 'save_args': options.get_save_args(), 'load_args': options.get_load_args(), 'timeout': options.get_timeout(), + 'response_size_limit': options.get_response_size_limit(), 'request_body': options.get_request_body(), 'response_body': options.get_response_body(), 'har': 1, @@ -471,6 +476,7 @@ def render_GET(self, request): url = 'http://' + url params['url'] = url timeout = params['timeout'] + response_size_limit = params['response_size_limit'] params = {k: v for k, v in params.items() if v is not None} # disable "phases" HAR Viewer feature @@ -514,6 +520,7 @@ def render_GET(self, request): +
@@ -563,6 +570,7 @@ def render_GET(self, request): "lua_enabled": self.lua_enabled, }), timeout=timeout, + response_size_limit=response_size_limit, url=url, theme=BOOTSTRAP_THEME, cm_resources=CODEMIRROR_RESOURCES if self.lua_enabled else "", @@ -576,6 +584,7 @@ def __init__(self, pool, ui_enabled, lua_enabled, lua_sandbox_enabled, max_timeout, argument_cache_max_entries, strict_lua_runner, + max_response_size_limit=defaults.MAX_RESPONSE_SIZE_LIMIT, ): Resource.__init__(self) self.argument_cache = ArgumentCache(argument_cache_max_entries) @@ -583,11 +592,12 @@ def __init__(self, pool, ui_enabled, lua_enabled, lua_sandbox_enabled, self.lua_enabled = lua_enabled _args = pool, max_timeout, self.argument_cache - self.putChild(b"render.html", RenderHtmlResource(*_args)) - self.putChild(b"render.png", RenderPngResource(*_args)) - self.putChild(b"render.jpeg", RenderJpegResource(*_args)) - self.putChild(b"render.json", RenderJsonResource(*_args)) - self.putChild(b"render.har", RenderHarResource(*_args)) + _kwargs = {'max_response_size_limit': max_response_size_limit} + self.putChild(b"render.html", RenderHtmlResource(*_args, **_kwargs)) + self.putChild(b"render.png", RenderPngResource(*_args, **_kwargs)) + self.putChild(b"render.jpeg", RenderJpegResource(*_args, **_kwargs)) + self.putChild(b"render.json", RenderJsonResource(*_args, **_kwargs)) + self.putChild(b"render.har", RenderHarResource(*_args, **_kwargs)) self.putChild(b"_debug", DebugResource(pool, self.argument_cache)) self.putChild(b"_gc", ClearCachesResource(self.argument_cache)) @@ -605,6 +615,7 @@ def __init__(self, pool, ui_enabled, lua_enabled, lua_sandbox_enabled, max_timeout=max_timeout, argument_cache=self.argument_cache, strict=strict_lua_runner, + max_response_size_limit=max_response_size_limit, ) self.putChild(b"execute", ExecuteLuaScriptResource( implicit_main=False, **lua_kwargs)) @@ -626,9 +637,11 @@ def __init__(self, pool, ui_enabled, lua_enabled, lua_sandbox_enabled, self.putChild(DemoUI.PATH, 
DemoUI( pool=pool, lua_enabled=self.lua_enabled, - max_timeout=max_timeout + max_timeout=max_timeout, + max_response_size_limit=max_response_size_limit, )) self.max_timeout = max_timeout + self.max_response_size_limit = max_response_size_limit def getChild(self, name, request): if name == b"" and self.ui_enabled: @@ -720,6 +733,7 @@ def render_GET(self, request): +
@@ -754,5 +768,6 @@ def render_GET(self, request): }), cm_resources=CODEMIRROR_RESOURCES, timeout=self.max_timeout, + response_size_limit=self.max_response_size_limit, ) return result.encode('utf8') diff --git a/splash/server.py b/splash/server.py index 1aa288a58..819e46c87 100644 --- a/splash/server.py +++ b/splash/server.py @@ -78,6 +78,9 @@ def parse_opts(jupyter=False, argv=sys.argv): help="number of render slots (default: %default)") op.add_option("--max-timeout", type="float", default=defaults.MAX_TIMEOUT, help="maximum allowed value for timeout (default: %default)") + op.add_option("--max-response-size-limit", type="int", + default=defaults.MAX_RESPONSE_SIZE_LIMIT, + help="maximum allowed value for response size limit (default: %default)") op.add_option("--disable-ui", action="store_true", default=False, help="disable web UI") op.add_option("--disable-lua", action="store_true", default=False, @@ -94,6 +97,7 @@ def parse_opts(jupyter=False, argv=sys.argv): opts.port = None opts.slots = None opts.max_timeout = None + opts.max_response_size_limit = None opts.argument_cache_max_entries = None return opts, args @@ -170,7 +174,8 @@ def splash_server(portnum, ip, slots, network_manager_factory, max_timeout, strict_lua_runner=False, argument_cache_max_entries=None, disable_browser_caches=False, - verbosity=None): + verbosity=None, + max_response_size_limit=defaults.MAX_RESPONSE_SIZE_LIMIT): from twisted.internet import reactor from twisted.web.server import Site from splash.resources import Root @@ -181,8 +186,8 @@ def splash_server(portnum, ip, slots, network_manager_factory, max_timeout, verbosity = defaults.VERBOSITY if verbosity is None else verbosity slots = defaults.SLOTS if slots is None else slots - log.msg("verbosity={}, slots={}, argument_cache_max_entries={}, max-timeout={}".format( - verbosity, slots, argument_cache_max_entries, max_timeout + log.msg("verbosity={}, slots={}, argument_cache_max_entries={}, max-timeout={}, 
max-response-size-limit={}".format( + verbosity, slots, argument_cache_max_entries, max_timeout, max_response_size_limit )) pool = RenderPool( @@ -215,6 +220,7 @@ def splash_server(portnum, ip, slots, network_manager_factory, max_timeout, max_timeout=max_timeout, argument_cache_max_entries=argument_cache_max_entries, strict_lua_runner=strict_lua_runner, + max_response_size_limit=max_response_size_limit, ) factory = Site(root) reactor.listenTCP(portnum, factory, interface=ip) @@ -264,6 +270,7 @@ def default_splash_server(portnum, ip, max_timeout, slots=None, verbosity=None, server_factory=splash_server, disable_browser_caches=False, + max_response_size_limit=defaults.MAX_RESPONSE_SIZE_LIMIT, ): from splash import network_manager network_manager_factory = network_manager.NetworkManagerFactory( @@ -293,6 +300,7 @@ def default_splash_server(portnum, ip, max_timeout, slots=None, verbosity=verbosity, max_timeout=max_timeout, argument_cache_max_entries=argument_cache_max_entries, + max_response_size_limit=max_response_size_limit, ) @@ -391,7 +399,8 @@ def main(jupyter=False, argv=sys.argv, server_factory=splash_server): max_timeout=opts.max_timeout, argument_cache_max_entries=opts.argument_cache_max_entries, server_factory=server_factory, - disable_browser_caches=opts.disable_browser_caches + disable_browser_caches=opts.disable_browser_caches, + max_response_size_limit=opts.max_response_size_limit, ) signal.signal(signal.SIGUSR1, lambda s, f: traceback.print_stack(f)) diff --git a/splash/tests/test_network_manager.py b/splash/tests/test_network_manager.py new file mode 100644 index 000000000..d219a3bd3 --- /dev/null +++ b/splash/tests/test_network_manager.py @@ -0,0 +1,69 @@ +from itertools import permutations, product + +from PyQt5.QtNetwork import QNetworkReply + +from splash.network_manager import _get_content_length, _InvalidContentLength + +from pytest import mark, raises + + +class MockReply(QNetworkReply): + + def __init__(self, headers): + super().__init__() + for 
header, value in headers: + self.setRawHeader(header, value) + + +CONTENT_LENGHT_HEADER_VARIANTS = ( + b'Content-Length', + b'content-length', + b'CONTENT-LENGTH', + b'cOntent-length', +) + + +@mark.parametrize( + 'headers,result', + ( + ( + (), + None + ), + *( + ( + ( + (header, value), + ), + result + ) + for (header, (value, result)) in product( + CONTENT_LENGHT_HEADER_VARIANTS, + ( + (b'', _InvalidContentLength), + (b'1', 1), + (b'-1', _InvalidContentLength), + (b'1.0', _InvalidContentLength), + (b'a', _InvalidContentLength), + ('á'.encode('utf-8'), _InvalidContentLength), + ) + ) + ), + *( + ( + ( + (header, b'1,2'), + ), + 1 + ) + for header in CONTENT_LENGHT_HEADER_VARIANTS + ), + ) +) +def test_get_content_length(headers, result): + if result is None or isinstance(result, int): + assert _get_content_length(MockReply(headers)) == result + else: + assert issubclass(result, Exception) + with raises(result): + _get_content_length(MockReply(headers))