From b55c8bb976dcddf69fda2593c5707969ca1e13b4 Mon Sep 17 00:00:00 2001 From: Antti Holvikari Date: Sat, 25 May 2024 09:22:12 +0300 Subject: [PATCH] Validate JSON unicode escape sequences with surrogate pairs (#14) Support surrogate pairs in string unicode escape sequences by failing if we encounter an invalid surrogate pair or a lone surrogate. --- json/src/JSON/Internal.ss | 30 ++++++++++++++++++++++++------ json/test/Main.ss | 6 ++++++ 2 files changed, 30 insertions(+), 6 deletions(-) diff --git a/json/src/JSON/Internal.ss b/json/src/JSON/Internal.ss index 0d03a5d0..a2fe92c6 100644 --- a/json/src/JSON/Internal.ss +++ b/json/src/JSON/Internal.ss @@ -175,11 +175,29 @@ [second (read-hex-digit cur)] [third (read-hex-digit cur)] [fourth (read-hex-digit cur)]) - (code-points->pstring - (+ (* 4096 first) - (* 256 second) - (* 16 third) - fourth)))) + (+ (* 4096 first) + (* 256 second) + (* 16 third) + fourth))) + + (define (read-unicode-code-point cur) + (let ([w1 (read-unicode-escape cur)]) + (cond + ; If it's a two-word encoded value we need to read a second escape + [(fx<= #xD800 w1 #xDBFF) + (let* ([_ (expect-char cur #\\)] + [_ (expect-char cur #\u)] + [w2 (read-unicode-escape cur)]) + (if (fx<= #xDC00 w2 #xDFFF) + (fx+ + (fxlogor + (fxsll (fx- w1 #xD800) 10) + (fx- w2 #xDC00)) + #x10000) + (error #f (format "Invalid unicode surrogate pair ~,x ~,x" w1 w2))))] + [(fx<= #xDC00 w1 #xDFFF) + (error #f (format "Invalid unicode escape ~,x" w1))] + [else w1]))) (define (read-escape cur) (let ([ch (pstring-cursor-read-char cur)]) @@ -190,7 +208,7 @@ [(eqv? ch #\n) (pstring #\newline)] [(eqv? ch #\t) (pstring #\tab)] [(eqv? ch #\/) (pstring #\/)] - [(eqv? ch #\u) (read-unicode-escape cur)] + [(eqv? ch #\u) (code-points->pstring (read-unicode-code-point cur))] [else (error #f (format "Invalid string escape ~a" ch))]))) (define (control-char? ch) diff --git a/json/test/Main.ss b/json/test/Main.ss index 2edaa8a3..9cab4afa 100644 --- a/json/test/Main.ss +++ b/json/test/Main.ss @@ -64,6 +64,12 @@ (assert-roundtrip "\"foo \\u03BB \\u03bc\"" "\"foo λ μ\"") (assert-fail "\"\\u03z\"" "Unexpected token 'z', was expecting a hex digit") (assert-fail "\"\\u03" "Unexpected end of input, was expecting a hex digit") + ; surrogate pairs + (assert-roundtrip "\"\\uD801\\uDC37\"" "\"𐐷\"") + (assert-roundtrip "\" \\uD801\\uDC37 foo\"" "\" 𐐷 foo\"") + (assert-fail "\"\\uD801 \"" "Unexpected ' ', was expecting '\\'") + (assert-fail "\"\\uDC37 \"" "Invalid unicode escape DC37") + (assert-fail "\" \\uD801\\uD802 foo\"" "Invalid unicode surrogate pair D801 D802") (display " Leading whitespace\n") (assert-roundtrip "\n\r \"\\nbar\"" "\"\\nbar\"")