Skip to content

Commit

Permalink
Validate JSON unicode escape sequences with surrogate pairs (#14)
Browse files Browse the repository at this point in the history
Support surrogate pairs in string unicode escape sequences by failing if
we encounter an invalid surrogate pair or a lone surrogate.
  • Loading branch information
anttih authored May 25, 2024
1 parent 48119e8 commit b55c8bb
Show file tree
Hide file tree
Showing 2 changed files with 30 additions and 6 deletions.
30 changes: 24 additions & 6 deletions json/src/JSON/Internal.ss
Original file line number Diff line number Diff line change
Expand Up @@ -175,11 +175,29 @@
[second (read-hex-digit cur)]
[third (read-hex-digit cur)]
[fourth (read-hex-digit cur)])
(code-points->pstring
(+ (* 4096 first)
(* 256 second)
(* 16 third)
fourth))))
(+ (* 4096 first)
(* 256 second)
(* 16 third)
fourth)))

(define (read-unicode-code-point cur)
(let ([w1 (read-unicode-escape cur)])
(cond
; If it's a two-word encoded value we need to read a second escape
[(fx<= #xD800 w1 #xDBFF)
(let* ([_ (expect-char cur #\\)]
[_ (expect-char cur #\u)]
[w2 (read-unicode-escape cur)])
(if (fx<= #xDC00 w2 #xDFFF)
(fx+
(fxlogor
(fxsll (fx- w1 #xD800) 10)
(fx- w2 #xDC00))
#x10000)
(error #f (format "Invalid unicode surrogate pair ~,x ~,x" w1 w2))))]
[(fx<= #xDC00 w1 #xDFFF)
(error #f (format "Invalid unicode escape ~,x" w1))]
[else w1])))

(define (read-escape cur)
(let ([ch (pstring-cursor-read-char cur)])
Expand All @@ -190,7 +208,7 @@
[(eqv? ch #\n) (pstring #\newline)]
[(eqv? ch #\t) (pstring #\tab)]
[(eqv? ch #\/) (pstring #\/)]
[(eqv? ch #\u) (read-unicode-escape cur)]
[(eqv? ch #\u) (code-points->pstring (read-unicode-code-point cur))]
[else (error #f (format "Invalid string escape ~a" ch))])))

(define (control-char? ch)
Expand Down
6 changes: 6 additions & 0 deletions json/test/Main.ss
Original file line number Diff line number Diff line change
Expand Up @@ -64,6 +64,12 @@
(assert-roundtrip "\"foo \\u03BB \\u03bc\"" "\"foo λ μ\"")
(assert-fail "\"\\u03z\"" "Unexpected token 'z', was expecting a hex digit")
(assert-fail "\"\\u03" "Unexpected end of input, was expecting a hex digit")
; surrogate pairs
(assert-roundtrip "\"\\uD801\\uDC37\"" "\"𐐷\"")
(assert-roundtrip "\" \\uD801\\uDC37 foo\"" "\" 𐐷 foo\"")
(assert-fail "\"\\uD801 \"" "Unexpected ' ', was expecting '\\'")
(assert-fail "\"\\uDC37 \"" "Invalid unicode escape DC37")
(assert-fail "\" \\uD801\\uD802 foo\"" "Invalid unicode surrogate pair D801 D802")

(display " Leading whitespace\n")
(assert-roundtrip "\n\r \"\\nbar\"" "\"\\nbar\"")
Expand Down

0 comments on commit b55c8bb

Please sign in to comment.