From a8a835c9e4885d7a4b87af94cf7e383862aaa7b7 Mon Sep 17 00:00:00 2001 From: Karl Czajkowski Date: Mon, 25 Sep 2023 17:15:26 -0700 Subject: [PATCH] introduce more strict URL pathname validation Use a regexp to enforce strict limits on pathname strings. The full pattern is: `^([-._~A-Za-z0-9/]|%[0-9a-fA-F][0-9a-fA-F])+$` Where the initial character class recognizes: - RFC 3986 non-reserved characters `[-._~A-Za-z0-9]` - RFC 3986 reserved path separator `/` understood by Hatrac And the disjoined patter handles: - Percent-encoded UTF-8 byte sequences `%[0-9a-fA-F][0-9a-fA-F]` A Bad Request error will be raised when clients attempt to use unsupported characters, to make diagnostics easier than if it merely raised Not Found. As a limited escape-hatch, add a top-level config field can override the initial character class of unquoted characters. This could be used to allow additional (out of spec) characters needed to support legacy use cases. This must be used with care. The config value cannot influence recognition of percent-encoded bytes. --- docs/CONFIG.md | 17 +++++++++++++++++ hatrac/rest/core.py | 10 ++++++++++ test/rest-smoketest.sh | 6 ++++++ 3 files changed, 33 insertions(+) diff --git a/docs/CONFIG.md b/docs/CONFIG.md index 9bcf1f0..d889b6a 100644 --- a/docs/CONFIG.md +++ b/docs/CONFIG.md @@ -14,6 +14,7 @@ whole: { "service_prefix": , "database_dsn": , + "allowed_url_char_class": , "max_request_payload_size": , "firewall_acls": { : , ... }, "read_only": , @@ -35,6 +36,22 @@ The connection string used when opening the service database via the `psycopg2` A typical value for a single-host deployment would be `"dbname=hatrac"`. In a more complex deployment, this might include remote database server addresses or other connection options. +### `allowed_url_char_class` + +A Python RE representing the class of single characters allowed in the +pathname component of a Hatrac URL. The default is a strict +definition: `[-._~A-Za-z0-9/]`. This default combines RFC 3986 and +Hatrac URL parsing rules: + +- Non-reserved characters are ASCII alpha-numeric A-Z, a-z, 0-9, and the limited punctuation `-`, `.`, `_`, and `~`. +- Hatrac pathnames can be formed with the non-reserved characters and the `/` path separator, or with percent-encoded UTF-8 sequences. + +The purpose of this configuration field is to allow an administrator +to slightly relax the strict rules and allow for additional bare ASCII +characters to be used outside of this specification. It is recommended +that this feature not be used, and instead deployments be updated to +follow the strict quoting rules described above. + ### `max_request_payload_size` An integer byte count. The default is 134217728, i.e. 128 MiB. diff --git a/hatrac/rest/core.py b/hatrac/rest/core.py index 18c3e7c..d6f925e 100644 --- a/hatrac/rest/core.py +++ b/hatrac/rest/core.py @@ -336,8 +336,18 @@ def _fullname(self, path, name): fullname = '/' + '/'.join(nameparts) return fullname + # RFC 3986 + # + # reserved chars: : / ? # [ ] @ ! $ & ' ( ) * + , ; = + # non-reserved chars: [-._~A-Za-z0-9] + # + def resolve(self, path, name, raise_notfound=True): fullname = self._fullname(path, name) + allowed_char_class = core.config.get("allowed_url_char_class", '[-._~A-Za-z0-9/]') + pat = "^(" + allowed_char_class + "|%[0-9a-fA-F][0-9a-fA-F])+$" + if not re.match(pat, fullname): + raise BadRequest('Request malformed. Hatrac URL path names may only include characters A-Z, a-z, 0-9, "-", ".", "~", or UTF-8 bytes percent-encoded with 2-digit hex values.') return hatrac_ctx.hatrac_directory.name_resolve(fullname, raise_notfound) def resolve_version(self, path, name, version): diff --git a/test/rest-smoketest.sh b/test/rest-smoketest.sh index 71dea03..726244b 100644 --- a/test/rest-smoketest.sh +++ b/test/rest-smoketest.sh @@ -408,6 +408,12 @@ dotest "409::*::*" "/ns-${RUNKEY}/foo/obj1;metadata/content-sha256" -T ${TEST_DA # test path with escaped unicode name "ǝɯɐuǝlᴉɟ ǝpoɔᴉun" TEST_DISPOSITION="filename*=UTF-8''%C7%9D%C9%AF%C9%90u%C7%9Dl%E1%B4%89%C9%9F%20%C7%9Dpo%C9%94%E1%B4%89un" +TEST_NS="${TEST_DISPOSITION:17}" # strip off prefix: filename*=UTF-8'' + +dotest "201::text/uri-list::*" "/ns-${RUNKEY}/${TEST_NS}" -X PUT -H "Content-Type: application/x-hatrac-namespace" +dotest "200::application/json::*" /ns-${RUNKEY}/"${TEST_NS}" +dotest "204::*::*" /ns-${RUNKEY}/"${TEST_NS}" -X DELETE +dotest "404::*::*" /ns-${RUNKEY}/"${TEST_NS}" dotest "204::*::*" /ns-${RUNKEY}/foo/obj1 -X DELETE dotest "404::*::*" "/ns-${RUNKEY}/foo/obj1?cid=smoke" -X DELETE