Merge: JSON Parser Optimization

This PR improves the speed of JSON parsing with the `string_parser` module. The new version of `parse_json_string` is heavily inspired by Python's implementation, available [here](https://github.com/python/cpython/blob/master/Modules/_json.c). Running time is usually on par with the previous version or better; Valgrind also gives the new version an advantage, sometimes a significant one. On `large_escaped` (~120M, lots of escaped characters), we have: * Before: 13 748 436 458 Ir * After: 11 687 104 643 Ir i.e. an improvement of ~18%. Time is: * Before: 0m4.428s * After: 0m3.932s i.e. an improvement of ~13%. Note that on this particular test, although the number of allocations is kept as low as we can, Boehm still cannibalizes the runtime, seeing that the real time is ~2.6s. Perf reports ~38% of the total time spent in GC_mark and ~7% in GC_cache_miss. Some more optimizations can be thought of, but this shows how important a good GC will be in the future; this program can serve as a good benchmark of how GC usage can be improved. For further reference, Python3 on the same test gives the following output on time and Valgrind: * Time (user): 0m1.760s * Time (real): 0m2.177s * Valgrind: 8 845 966 867 Ir Pull-Request: #2058 Reviewed-by: Jean Privat <[email protected]>
nitlang · May 10, 2016 · ea71df8 · ea71df8
2 parents 0ec0f67 + 5c54927
commit ea71df8
Show file tree

Hide file tree

Showing 4 changed files with 171 additions and 40 deletions.
diff --git a/lib/core/text/abstract_text.nit b/lib/core/text/abstract_text.nit
@@ -1514,6 +1514,42 @@ abstract class Buffer
 	# In Buffers, the internal sequence of character is mutable
 	# Thus, `chars` can be used to modify the buffer.
 	redef fun chars: Sequence[Char] is abstract
+
+	# Appends the `length` characters of `s` that begin at index `from`
+	#
+	# The requested range is clipped to the bounds of `s`; if nothing is
+	# left after clipping, `self` is not modified.
+	#
+	# ~~~nit
+	#	var b = new Buffer
+	#	b.append_substring("abcde", 1, 2)
+	#	assert b == "bc"
+	#	b.append_substring("vwxyz", 2, 3)
+	#	assert b == "bcxyz"
+	#	b.append_substring("ABCDE", 4, 300)
+	#	assert b == "bcxyzE"
+	#	b.append_substring("VWXYZ", 400, 1)
+	#	assert b == "bcxyzE"
+	# ~~~
+	fun append_substring(s: Text, from, length: Int) do
+		var start = from
+		var count = length
+		# A negative start shortens the range by as many characters
+		if start < 0 then
+			count += start
+			start = 0
+		end
+		# Do not run past the end of `s`
+		var total = s.length
+		if (count + start) > total then count = total - start
+		if count > 0 then append_substring_impl(s, start, count)
+	end
+
+	# Unchecked counterpart of `append_substring`, kept lean for performance
+	#
+	# NOTE: `from` and `length` are used exactly as given, without any
+	# validation or clipping; call it only with a range known to be valid.
+	fun append_substring_impl(s: Text, from, length: Int) do
+		for idx in [from .. from + length[ do
+			self.add s[idx]
+		end
+	end
 end
 
 # View for chars on Buffer objects, extends Sequence
@@ -1755,6 +1791,18 @@ redef class Char
 		return cp >= 0xD800 and cp <= 0xDFFF
 	end
 
+	# Does `self` lie in the UTF-16 high surrogate range (U+D800 to U+DBFF) ?
+	fun is_hi_surrogate: Bool do
+		var point = self.code_point
+		if point < 0xD800 then return false
+		return point <= 0xDBFF
+	end
+
+	# Does `self` lie in the UTF-16 low surrogate range (U+DC00 to U+DFFF) ?
+	fun is_lo_surrogate: Bool do
+		var point = self.code_point
+		if point < 0xDC00 then return false
+		return point <= 0xDFFF
+	end
+
 	# Length of `self` in a UTF-8 String
 	fun u8char_len: Int do
 		var c = self.code_point

diff --git a/lib/core/text/flat.nit b/lib/core/text/flat.nit
@@ -411,7 +411,7 @@ abstract class FlatString
 
 		if from < 0 then
 			count += from
-			if count < 0 then return ""
+			if count <= 0 then return ""
 			from = 0
 		end
 
@@ -924,7 +924,10 @@ class FlatBuffer
 		is_dirty = true
 		_bytelen = 0
 		_length = 0
-		if written then reset
+		if written then
+			_capacity = 16
+			reset
+		end
 	end
 
 	redef fun empty do return new Buffer
@@ -1049,6 +1052,21 @@ class FlatBuffer
 		return new FlatBuffer.with_infos(r_items, byte_length, byte_length, count)
 	end
 
+	# Bulk-copy specialization of `append_substring_impl`
+	#
+	# When `s` is a `FlatText`, the bytes backing the requested characters are
+	# copied in a single `copy_to` call; any other kind of text is handed to
+	# the inherited implementation. As in the inherited version, `from` and
+	# `length` are not validated.
+	redef fun append_substring_impl(s, from, length) do
+		if length <= 0 then return
+		if not s isa FlatText then
+			super
+			return
+		end
+		var last = from + length - 1
+		var bytest = s.char_to_byte_index(from)
+		var bytend = s.char_to_byte_index(last)
+		# `bytend` is where the last character starts, so all of its bytes must
+		# be counted: adding a flat 1 truncates a trailing multi-byte character.
+		var btln = bytend - bytest + s[last].u8char_len
+		enlarge(btln + _bytelen)
+		s._items.copy_to(_items, btln, bytest, _bytelen)
+		_bytelen += btln
+		_length += length
+	end
+
 	redef fun reverse
 	do
 		written = false
@@ -1351,37 +1369,26 @@ redef class NativeString
 	#
 	# Very unsafe, make sure to have room for this char prior to calling this function.
 	private fun set_char_at(pos: Int, c: Char) do
-		if c.code_point < 128 then
-			self[pos] = c.code_point.to_b
+		var cp = c.code_point
+		# Code points below 128 are written as a single byte
+		if cp < 128 then
+			self[pos] = cp.to_b
 			return
 		end
 		var ln = c.u8char_len
-		native_set_char(pos, c, ln)
-	end
-
-	private fun native_set_char(pos: Int, c: Char, ln: Int) `{
-		char* dst = self + pos;
-		switch(ln){
-			case 1:
-				dst[0] = c;
-				break;
-			case 2:
-				dst[0] = 0xC0 | ((c & 0x7C0) >> 6);
-				dst[1] = 0x80 | (c & 0x3F);
-				break;
-			case 3:
-				dst[0] = 0xE0 | ((c & 0xF000) >> 12);
-				dst[1] = 0x80 | ((c & 0xFC0) >> 6);
-				dst[2] = 0x80 | (c & 0x3F);
-				break;
-			case 4:
-				dst[0] = 0xF0 | ((c & 0x1C0000) >> 18);
-				dst[1] = 0x80 | ((c & 0x3F000) >> 12);
-				dst[2] = 0x80 | ((c & 0xFC0) >> 6);
-				dst[3] = 0x80 | (c & 0x3F);
-				break;
-		}
-	`}
+		# Multi-byte forms: the first byte carries a length marker (0xC0, 0xE0
+		# or 0xF0) and the top bits of `cp`; each following byte is 0x80 plus
+		# the next 6 bits. A length outside 2..4 writes nothing.
+		if ln == 2 then
+			self[pos] = (0xC0 | ((cp & 0x7C0) >> 6)).to_b
+			self[pos + 1] = (0x80 | (cp & 0x3F)).to_b
+		else if ln == 3 then
+			self[pos] = (0xE0 | ((cp & 0xF000) >> 12)).to_b
+			self[pos + 1] = (0x80 | ((cp & 0xFC0) >> 6)).to_b
+			self[pos + 2] = (0x80 | (cp & 0x3F)).to_b
+		else if ln == 4 then
+			self[pos] = (0xF0 | ((cp & 0x1C0000) >> 18)).to_b
+			self[pos + 1] = (0x80 | ((cp & 0x3F000) >> 12)).to_b
+			self[pos + 2] = (0x80 | ((cp & 0xFC0) >> 6)).to_b
+			self[pos + 3] = (0x80 | (cp & 0x3F)).to_b
+		end
+	end
 end
 
 redef class Int

diff --git a/lib/json/static.nit b/lib/json/static.nit
@@ -439,6 +439,11 @@ redef class JsonParseError
 				"\"position\":{position.to_json}," +
 				"\"message\":{message.to_json}\}"
 	end
+
+	# Replaces the whole content of `buf` with the `to_json` form of `self`
+	#
+	# `indents` is not used.
+	redef fun pretty_json_visit(buf, indents) do
+		buf.clear
+		var rendered = to_json
+		buf.append rendered
+	end
 end
 
 redef class Position

diff --git a/lib/json/string_parser.nit b/lib/json/string_parser.nit
@@ -231,32 +231,103 @@ class JSONStringParser
 		return val
 	end
 
+	# Scratch buffer reused by `parse_json_string` to assemble strings that
+	# contain escape sequences
+	private var parse_str_buf = new FlatBuffer
+
 	# Parses and returns a Nit string from a JSON String
+	#
+	# The character at `pos` is skipped (presumably the opening `"`) and the
+	# input is read up to the next unescaped `"`; on success `pos` is moved
+	# just past that closing quote.
+	# Runs of plain characters are copied in chunks, escape sequences are
+	# decoded one by one, and a `\uXXXX` high surrogate must be followed by a
+	# second `\uXXXX` escape with which it is combined.
+	#
+	# Truncated input or a malformed escape returns the result of
+	# `make_parse_error` instead of a string.
 	fun parse_json_string: Jsonable do
+		var src = src
 		var ln = src.length
 		var p = pos
 		p += 1
-		if p > ln then return make_parse_error("Malformed JSON String")
+		if p >= ln then return make_parse_error("Malformed JSON String")
 		var c = src[p]
-		var st = p
+		var ret = parse_str_buf
+		# A previous call that returned an error may have left partial content
+		if not ret.is_empty then ret.clear
+		var chunk_st = p
 		while c != '"' do
-			if c == '\\' then
-				if p + 1 >= ln then return make_parse_error("Malformed Escape sequence in JSON string")
+			if c != '\\' then
 				p += 1
+				if p >= ln then return make_parse_error("Malformed JSON string")
 				c = src[p]
-				if c == 'u' then
+				continue
+			end
+			# Flush the run of plain characters that precedes the backslash
+			ret.append_substring_impl(src, chunk_st, p - chunk_st)
+			p += 1
+			if p >= ln then return make_parse_error("Malformed Escape sequence in JSON string")
+			c = src[p]
+			if c == 'r' then
+				ret.add '\r'
+				p += 1
+			else if c == 'n' then
+				ret.add '\n'
+				p += 1
+			else if c == 't' then
+				ret.add '\t'
+				p += 1
+			else if c == 'u' then
+				var cp = 0
+				p += 1
+				# Accumulate the four hexadecimal digits, most significant first
+				for i in [0 .. 4[ do
+					cp <<= 4
+					if p >= ln then return make_parse_error("Malformed \uXXXX Escape sequence in JSON string")
+					c = src[p]
+					if c >= '0' and c <= '9' then
+						cp += c.code_point - '0'.code_point
+					else if c >= 'a' and c <= 'f' then
+						cp += c.code_point - 'a'.code_point + 10
+					else if c >= 'A' and c <= 'F' then
+						cp += c.code_point - 'A'.code_point + 10
+					else
+						return make_parse_error("Malformed \uXXXX Escape sequence in JSON string")
+					end
 					p += 1
-					if p + 3 >= ln then return make_parse_error("Bad Unicode escape sequence in string")
-					for i in [0 .. 4[ do if not src[p + i].is_hexdigit then return make_parse_error("Bad Unicode escape sequence in string")
-					p += 3
 				end
+				c = cp.code_point
+				if cp >= 0xD800 and cp <= 0xDBFF then
+					# High surrogate: both the `\` and the `u` of a second
+					# escape must still be inside the input
+					if p + 1 >= ln then return make_parse_error("Malformed \uXXXX Escape sequence in JSON string")
+					c = src[p]
+					if c != '\\' then return make_parse_error("Malformed \uXXXX Escape sequence in JSON string")
+					p += 1
+					c = src[p]
+					if c != 'u' then return make_parse_error("Malformed \uXXXX Escape sequence in JSON string")
+					var locp = 0
+					p += 1
+					for i in [0 .. 4[ do
+						locp <<= 4
+						if p >= ln then return make_parse_error("Malformed \uXXXX Escape sequence in JSON string")
+						c = src[p]
+						if c >= '0' and c <= '9' then
+							locp += c.code_point - '0'.code_point
+						else if c >= 'a' and c <= 'f' then
+							locp += c.code_point - 'a'.code_point + 10
+						else if c >= 'A' and c <= 'F' then
+							locp += c.code_point - 'A'.code_point + 10
+						else
+							return make_parse_error("Malformed \uXXXX Escape sequence in JSON string")
+						end
+						p += 1
+					end
+					# Combine the low 10 bits of each half into one code point
+					c = (((locp & 0x3FF) | ((cp & 0x3FF) << 10)) + 0x10000).code_point
+				end
+				ret.add c
+			else if c == 'b' then
+				ret.add 8.code_point
+				p += 1
+			else if c == 'f' then
+				ret.add '\f'
+				p += 1
+			else
+				# Any other escaped character stands for itself
+				p += 1
+				ret.add c
 			end
-			p += 1
+			# The escape may have consumed the last character of the input
 			if p >= ln then return make_parse_error("Malformed JSON String")
+			chunk_st = p
 			c = src[p]
 		end
 		pos = p + 1
-		return src.substring(st, p - st).unescape_json
+		# No escape was met: the result is a plain slice of the input
+		if ret.is_empty then return src.substring(chunk_st, p - chunk_st)
+		ret.append_substring_impl(src, chunk_st, p - chunk_st)
+		var rets = ret.to_s
+		ret.clear
+		return rets
 	end
 
 	# Ignores any character until a JSON separator is encountered