Skip to content

Commit

Permalink
Merge: JSON Parser Optimization
Browse files Browse the repository at this point in the history
This PR improves the speed of JSON parsing with the `string_parser` module. The new version of `parse_json_string` is heavily inspired by Python's implementation, available [here](https://github.com/python/cpython/blob/master/Modules/_json.c)

Run time is usually as good as before or better; Valgrind also gives the new version an advantage, sometimes a significant one.

On `large_escaped` (~120M, lots of escaped characters), we have:

* Before: 13 748 436 458 Ir
* After: 11 687 104 643 Ir

i.e. an improvement of ~18%

Time is:
* Before: 0m4.428s
* After: 0m3.932s

i.e. an improvement of ~13%

Note that on this particular test, although the number of allocations is kept as low as we can, the Boehm GC still eats up much of the runtime, seeing that the real time is ~2.6s.
Perf gives a total time of ~38% in GC_mark and ~7% in GC_cache_miss.

Some more optimizations can be thought of, but this shows how necessary a good GC will be in the future; this program can be a good benchmark for measuring how GC usage can be improved.

For further reference, Python3 on the same test gives the following output on time and Valgrind:

* Time (user): 0m1.760s
* Time (real): 0m2.177s
* Valgrind: 8 845 966 867 Ir

Pull-Request: #2058
Reviewed-by: Jean Privat <[email protected]>
  • Loading branch information
privat committed May 10, 2016
2 parents 0ec0f67 + 5c54927 commit ea71df8
Show file tree
Hide file tree
Showing 4 changed files with 171 additions and 40 deletions.
48 changes: 48 additions & 0 deletions lib/core/text/abstract_text.nit
Original file line number Diff line number Diff line change
Expand Up @@ -1514,6 +1514,42 @@ abstract class Buffer
# In Buffers, the internal sequence of character is mutable
# Thus, `chars` can be used to modify the buffer.
redef fun chars: Sequence[Char] is abstract

# Appends `length` chars from `s` starting at index `from`
#
# Out-of-range requests are clamped to the bounds of `s`:
# a negative `from` is moved to 0 (and `length` is shortened by as much),
# a span running past the end of `s` is cut there,
# and nothing is appended when no char remains to copy.
#
# ~~~nit
# var b = new Buffer
# b.append_substring("abcde", 1, 2)
# assert b == "bc"
# b.append_substring("vwxyz", 2, 3)
# assert b == "bcxyz"
# b.append_substring("ABCDE", 4, 300)
# assert b == "bcxyzE"
# b.append_substring("VWXYZ", 400, 1)
# assert b == "bcxyzE"
# ~~~
fun append_substring(s: Text, from, length: Int) do
	if from < 0 then
		length += from
		from = 0
	end
	# Compare `length` against the room left after `from` rather than
	# computing `length + from`: `from` is >= 0 here, so the subtraction
	# cannot overflow, whereas the sum could for a huge `length`.
	var room = s.length - from
	if length > room then length = room
	if length <= 0 then return
	append_substring_impl(s, from, length)
end

# Unchecked variant of `append_substring`, meant for performance
#
# NOTE: neither `from` nor `length` is validated or clamped here;
# call it only when the requested span is known to lie inside `s`.
fun append_substring_impl(s: Text, from, length: Int) do
	var stop = from + length
	var idx = from
	while idx < stop do
		add s[idx]
		idx += 1
	end
end
end

# View for chars on Buffer objects, extends Sequence
Expand Down Expand Up @@ -1755,6 +1791,18 @@ redef class Char
return cp >= 0xD800 and cp <= 0xDFFF
end

# Is `self` a UTF-16 high surrogate, i.e. a code point in U+D800..U+DBFF ?
fun is_hi_surrogate: Bool do
	var code = code_point
	if code < 0xD800 then return false
	return code <= 0xDBFF
end

# Is `self` a UTF-16 low surrogate, i.e. a code point in U+DC00..U+DFFF ?
fun is_lo_surrogate: Bool do
	var code = code_point
	if code < 0xDC00 then return false
	return code <= 0xDFFF
end

# Length of `self` in a UTF-8 String
fun u8char_len: Int do
var c = self.code_point
Expand Down
67 changes: 37 additions & 30 deletions lib/core/text/flat.nit
Original file line number Diff line number Diff line change
Expand Up @@ -411,7 +411,7 @@ abstract class FlatString

if from < 0 then
count += from
if count < 0 then return ""
if count <= 0 then return ""
from = 0
end

Expand Down Expand Up @@ -924,7 +924,10 @@ class FlatBuffer
is_dirty = true
_bytelen = 0
_length = 0
if written then reset
if written then
_capacity = 16
reset
end
end

# Builds a new `Buffer` and returns it
redef fun empty do return new Buffer
Expand Down Expand Up @@ -1049,6 +1052,21 @@ class FlatBuffer
return new FlatBuffer.with_infos(r_items, byte_length, byte_length, count)
end

# Fast path of `append_substring_impl` for flat sources
#
# When `s` is a `FlatText`, the bytes backing the requested chars are
# copied in a single `copy_to`; any other `Text` falls back to `super`.
# `from` and `length` are not validated (apart from `length <= 0`,
# which appends nothing).
redef fun append_substring_impl(s, from, length) do
	if length <= 0 then return
	if not s isa FlatText then
		super
		return
	end
	var sits = s._items
	var bytest = s.char_to_byte_index(from)
	# Byte offset at which the last char to copy *starts*
	var bytend = s.char_to_byte_index(from + length - 1)
	# That last char may span several bytes in UTF-8: count all of them,
	# otherwise a multi-byte char ending the span would be truncated.
	var btln = bytend - bytest + sits.char_at(bytend).u8char_len
	enlarge(btln + _bytelen)
	sits.copy_to(_items, btln, bytest, _bytelen)
	_bytelen += btln
	_length += length
end

redef fun reverse
do
written = false
Expand Down Expand Up @@ -1351,37 +1369,26 @@ redef class NativeString
#
# Very unsafe, make sure to have room for this char prior to calling this function.
private fun set_char_at(pos: Int, c: Char) do
	var cp = c.code_point
	# ASCII: the code point is stored as is, on a single byte
	if cp < 128 then
		self[pos] = cp.to_b
		return
	end
	# Otherwise write the UTF-8 sequence by hand: a leading byte that
	# announces the length, then continuation bytes (`10xxxxxx`) each
	# carrying 6 bits of the code point.
	var ln = c.u8char_len
	if ln == 2 then
		# 110xxxxx 10xxxxxx
		self[pos] = (0xC0 | ((cp & 0x7C0) >> 6)).to_b
		self[pos + 1] = (0x80 | (cp & 0x3F)).to_b
	else if ln == 3 then
		# 1110xxxx 10xxxxxx 10xxxxxx
		self[pos] = (0xE0 | ((cp & 0xF000) >> 12)).to_b
		self[pos + 1] = (0x80 | ((cp & 0xFC0) >> 6)).to_b
		self[pos + 2] = (0x80 | (cp & 0x3F)).to_b
	else if ln == 4 then
		# 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx
		self[pos] = (0xF0 | ((cp & 0x1C0000) >> 18)).to_b
		self[pos + 1] = (0x80 | ((cp & 0x3F000) >> 12)).to_b
		self[pos + 2] = (0x80 | ((cp & 0xFC0) >> 6)).to_b
		self[pos + 3] = (0x80 | (cp & 0x3F)).to_b
	end
end
end

redef class Int
Expand Down
5 changes: 5 additions & 0 deletions lib/json/static.nit
Original file line number Diff line number Diff line change
Expand Up @@ -439,6 +439,11 @@ redef class JsonParseError
"\"position\":{position.to_json}," +
"\"message\":{message.to_json}\}"
end

# Replaces the whole content of `buf` with the JSON form of this error
#
# `buf` is emptied first, so anything written to it before is discarded;
# `indents` is not used.
redef fun pretty_json_visit(buf, indents)
do
	buf.clear
	buf.append to_json
end
end

redef class Position
Expand Down
91 changes: 81 additions & 10 deletions lib/json/string_parser.nit
Original file line number Diff line number Diff line change
Expand Up @@ -231,32 +231,103 @@ class JSONStringParser
return val
end

private var parse_str_buf = new FlatBuffer

# Parses and returns a Nit string from a JSON String
#
# The char at `pos` is skipped (presumably the opening `"`, to be confirmed
# against the callers), then the content is read up to the closing `"`.
# Runs of plain chars are copied in one go; escape sequences are decoded
# one by one, `\uXXXX` surrogate pairs being merged into a single char.
#
# On success, `pos` is moved right after the closing `"` and the decoded
# string is returned. On malformed input, the error built by
# `make_parse_error` is returned instead and `pos` is left untouched.
fun parse_json_string: Jsonable do
	var src = src
	var ln = src.length
	var p = pos + 1
	# `p == ln` is already out of bounds, hence `>=`
	if p >= ln then return make_parse_error("Malformed JSON String")
	var ret = parse_str_buf
	# The buffer is shared between calls: drop whatever a previous call
	# that bailed out on an error may have left in it
	ret.clear
	var c = src[p]
	# Start of the pending run of chars that need no unescaping
	var chunk_st = p
	while c != '"' do
		if c != '\\' then
			p += 1
			if p >= ln then return make_parse_error("Malformed JSON string")
			c = src[p]
			continue
		end
		# Flush the raw run preceding the backslash, then decode the escape
		ret.append_substring_impl(src, chunk_st, p - chunk_st)
		p += 1
		if p >= ln then return make_parse_error("Malformed Escape sequence in JSON string")
		c = src[p]
		# From here on, `p` is on the char following the escape letter
		p += 1
		if c == 'r' then
			ret.add '\r'
		else if c == 'n' then
			ret.add '\n'
		else if c == 't' then
			ret.add '\t'
		else if c == 'b' then
			ret.add 8.code_point
		else if c == 'f' then
			ret.add '\f'
		else if c == 'u' then
			var cp = hex_quad_at(src, p, ln)
			if cp < 0 then return make_parse_error("Malformed \uXXXX Escape sequence in JSON string")
			p += 4
			if cp >= 0xD800 and cp <= 0xDBFF then
				# High surrogate: a `\uXXXX` low surrogate must follow
				if p + 1 >= ln or src[p] != '\\' or src[p + 1] != 'u' then
					return make_parse_error("Malformed \uXXXX Escape sequence in JSON string")
				end
				p += 2
				var locp = hex_quad_at(src, p, ln)
				# Also rejects the -1 returned on a malformed quad
				if locp < 0xDC00 or locp > 0xDFFF then
					return make_parse_error("Malformed \uXXXX Escape sequence in JSON string")
				end
				p += 4
				cp = ((locp & 0x3FF) | ((cp & 0x3FF) << 10)) + 0x10000
			end
			ret.add cp.code_point
		else
			# `\"`, `\\`, `\/` (and any unknown escape) stand for the char itself
			ret.add c
		end
		if p >= ln then return make_parse_error("Malformed JSON String")
		chunk_st = p
		c = src[p]
	end
	pos = p + 1
	# Every escape adds at least one char to `ret`: an empty buffer means
	# the content is a plain slice of the source
	if ret.is_empty then return src.substring(chunk_st, p - chunk_st)
	ret.append_substring_impl(src, chunk_st, p - chunk_st)
	var rets = ret.to_s
	ret.clear
	return rets
end

# Decodes the 4 hexadecimal digits of `src` starting at index `from`
#
# `ln` is the length of `src`.
# Returns the decoded value (0 to 0xFFFF), or -1 if fewer than 4 chars
# are available or if one of them is not a hexadecimal digit.
private fun hex_quad_at(src: Text, from, ln: Int): Int do
	if from + 4 > ln then return -1
	var res = 0
	for i in [from .. from + 4[ do
		var c = src[i]
		res <<= 4
		if c >= '0' and c <= '9' then
			res += c.code_point - '0'.code_point
		else if c >= 'a' and c <= 'f' then
			res += c.code_point - 'a'.code_point + 10
		else if c >= 'A' and c <= 'F' then
			res += c.code_point - 'A'.code_point + 10
		else
			return -1
		end
	end
	return res
end

# Ignores any character until a JSON separator is encountered
Expand Down

0 comments on commit ea71df8

Please sign in to comment.