Merge pull request #15 from secondlife/SRV-439

SRV-439 - performance optimizations for string handling in xml formatting
secondlife · Sep 7, 2023 · b703873 · github-actions · Sep 7, 2023
2 parents a63abbe + 2432466
commit b703873
Show file tree

Hide file tree

Showing 7 changed files with 295 additions and 154 deletions.
diff --git a/.github/workflows/ci.yaml b/.github/workflows/ci.yaml
@@ -13,21 +13,23 @@ jobs:
       matrix:
         python-version: ['2.7', '3.7', '3.8', '3.10']
     runs-on: [ubuntu-latest]
+    container:
+      image: "python:${{ matrix.python-version }}-buster"
     env:
       PYTHON: ${{ matrix.python-version }}
     steps:
       - uses: actions/checkout@v3
         with:
           fetch-depth: 0 # fetch all history for setuptools_scm to be able to read tags
 
-      - uses: actions/setup-python@v4
-        with:
-          python-version: ${{ matrix.python-version }}
-
       - name: Install python dependencies
         run: |
-          pip install wheel build tox
-          pip install .[dev]
+          apt-get update
+          apt-get -y install sudo
+          pip install --upgrade pip
+          sudo chown root .
+          sudo -H pip install wheel build tox
+          sudo -H pip install .[dev]
 
       - name: Determine pyenv
         id: pyenv

diff --git a/llsd/base.py b/llsd/base.py
@@ -31,6 +31,8 @@
 
 ALL_CHARS = str(bytearray(range(256))) if PY2 else bytes(range(256))
 
+MAX_FORMAT_DEPTH = 200
+MAX_PARSE_DEPTH = 200
 
 class _LLSD:
     __metaclass__ = abc.ABCMeta
@@ -209,7 +211,7 @@ def _parse_datestr(datestr):
     return datetime.datetime(year, month, day, hour, minute, second, usec)
 
 
-def _bool_to_python(node):
+def _bool_to_python(node, depth=0):
     "Convert boolean node to a python object."
     val = node.text or ''
     try:
@@ -220,35 +222,35 @@ def _bool_to_python(node):
        return bool(val)
 
 
-def _int_to_python(node):
+def _int_to_python(node, depth=0):
     "Convert integer node to a python object."
     val = node.text or ''
     if not val.strip():
         return 0
     return int(val)
 
 
-def _real_to_python(node):
+def _real_to_python(node, depth=0):
     "Convert floating point node to a python object."
     val = node.text or ''
     if not val.strip():
         return 0.0
     return float(val)
 
 
-def _uuid_to_python(node):
+def _uuid_to_python(node, depth=0):
     "Convert uuid node to a python object."
     if node.text:
         return uuid.UUID(hex=node.text)
     return uuid.UUID(int=0)
 
 
-def _str_to_python(node):
+def _str_to_python(node, depth=0):
     "Convert string node to a python object."
     return node.text or ''
 
 
-def _bin_to_python(node):
+def _bin_to_python(node, depth=0):
     base = node.get('encoding') or 'base64'
     try:
         if base == 'base16':
@@ -267,38 +269,38 @@ def _bin_to_python(node):
         return LLSDParseError("Bad binary data: " + str(exc))
 
 
-def _date_to_python(node):
+def _date_to_python(node, depth=0):
     "Convert date node to a python object."
     val = node.text or ''
     if not val:
         val = "1970-01-01T00:00:00Z"
     return _parse_datestr(val)
 
 
-def _uri_to_python(node):
+def _uri_to_python(node, depth=0):
     "Convert uri node to a python object."
     val = node.text or ''
     return uri(val)
 
 
-def _map_to_python(node):
+def _map_to_python(node, depth=0):
     "Convert map node to a python object."
     result = {}
     for index in range(len(node))[::2]:
         if node[index].text is None:
-            result[''] = _to_python(node[index+1])
+            result[''] = _to_python(node[index+1], depth+1)
         else:
-            result[node[index].text] = _to_python(node[index+1])
+            result[node[index].text] = _to_python(node[index+1], depth+1)
     return result
 
 
-def _array_to_python(node):
+def _array_to_python(node, depth=0):
     "Convert array node to a python object."
-    return [_to_python(child) for child in node]
+    return [_to_python(child, depth+1) for child in node]
 
 
 NODE_HANDLERS = dict(
-    undef=lambda x: None,
+    undef=lambda x,y: None,
     boolean=_bool_to_python,
     integer=_int_to_python,
     real=_real_to_python,
@@ -312,9 +314,12 @@ def _array_to_python(node):
 )
 
 
-def _to_python(node):
+def _to_python(node, depth=0):
     "Convert node to a python object."
-    return NODE_HANDLERS[node.tag](node)
+    if depth > MAX_PARSE_DEPTH:
+        raise LLSDParseError("Cannot parse depth of more than %d" % MAX_PARSE_DEPTH)
+
+    return NODE_HANDLERS[node.tag](node, depth)
 
 
 class LLSDBaseFormatter(object):

diff --git a/llsd/serde_binary.py b/llsd/serde_binary.py
@@ -5,7 +5,7 @@
 import uuid
 
 from llsd.base import (_LLSD, LLSDBaseParser, LLSDSerializationError, BINARY_HEADER,
-                       _str_to_bytes, binary, is_integer, is_string, uri)
+                       MAX_FORMAT_DEPTH, MAX_PARSE_DEPTH, _str_to_bytes, binary, is_integer, is_string, uri)
 
 
 try:
@@ -15,14 +15,13 @@
     # Python 3: 'range()' is already lazy
     pass
 
-
 class LLSDBinaryParser(LLSDBaseParser):
     """
     Parse application/llsd+binary to a python object.
 
     See http://wiki.secondlife.com/wiki/LLSD#Binary_Serialization
     """
-    __slots__ = ['_dispatch', '_keep_binary']
+    __slots__ = ['_dispatch', '_keep_binary', '_depth']
 
     def __init__(self):
         super(LLSDBinaryParser, self).__init__()
@@ -63,6 +62,7 @@ def __init__(self):
         # entries in _dispatch.
         for c, func in _dispatch_dict.items():
             self._dispatch[ord(c)] = func
+        self._depth = 0
 
     def parse(self, something, ignore_binary = False):
         """
@@ -82,6 +82,9 @@ def parse(self, something, ignore_binary = False):
 
     def _parse(self):
         "The actual parser which is called recursively when necessary."
+        if self._depth > MAX_PARSE_DEPTH:
+            self._error("Parse depth exceeded maximum depth of %d." % MAX_PARSE_DEPTH)
+
         cc = self._getc()
         try:
             func = self._dispatch[ord(cc)]
@@ -97,6 +100,7 @@ def _parse_map(self):
         count = 0
         cc = self._getc()
         key = b''
+        self._depth += 1
         while (cc != b'}') and (count < size):
             if cc == b'k':
                 key = self._parse_string()
@@ -110,16 +114,19 @@ def _parse_map(self):
             cc = self._getc()
         if cc != b'}':
             self._error("invalid map close token")
+        self._depth -= 1
         return rv
 
     def _parse_array(self):
         "Parse a single llsd array"
         rv = []
+        self._depth += 1
         size = struct.unpack("!i", self._getc(4))[0]
         for count in range(size):
             rv.append(self._parse())
         if self._getc() != b']':
             self._error("invalid array close token")
+        self._depth -= 1
         return rv
 
     def _parse_string(self):
@@ -164,15 +171,19 @@ def format_binary(something):
 
 def write_binary(stream, something):
     stream.write(b'<?llsd/binary?>\n')
-    _write_binary_recurse(stream, something)
+    _write_binary_recurse(stream, something, 0)
 
 
-def _write_binary_recurse(stream, something):
+def _write_binary_recurse(stream, something, depth):
     "Binary formatter workhorse."
+
+    if depth > MAX_FORMAT_DEPTH:
+        raise LLSDSerializationError("Cannot serialize depth of more than %d" % MAX_FORMAT_DEPTH)
+
     if something is None:
         stream.write(b'!')
     elif isinstance(something, _LLSD):
-        _write_binary_recurse(stream, something.thing)
+        _write_binary_recurse(stream, something.thing, depth)
     elif isinstance(something, bool):
         stream.write(b'1' if something else b'0')
     elif is_integer(something):
@@ -202,27 +213,27 @@ def _write_binary_recurse(stream, something):
         seconds_since_epoch = calendar.timegm(something.timetuple())
         stream.writelines([b'd', struct.pack('<d', seconds_since_epoch)])
     elif isinstance(something, (list, tuple)):
-        _write_list(stream, something)
+        _write_list(stream, something, depth)
     elif isinstance(something, dict):
         stream.writelines([b'{', struct.pack('!i', len(something))])
         for key, value in something.items():
             key = _str_to_bytes(key)
             stream.writelines([b'k', struct.pack('!i', len(key)), key])
-            _write_binary_recurse(stream, value)
+            _write_binary_recurse(stream, value, depth+1)
         stream.write(b'}')
     else:
         try:
-            return _write_list(stream, list(something))
+            return _write_list(stream, list(something), depth)
         except TypeError:
             raise LLSDSerializationError(
                 "Cannot serialize unknown type: %s (%s)" %
                 (type(something), something))
 
 
-def _write_list(stream, something):
+def _write_list(stream, something, depth):
     stream.writelines([b'[', struct.pack('!i', len(something))])
     for item in something:
-        _write_binary_recurse(stream, item)
+        _write_binary_recurse(stream, item, depth+1)
     stream.write(b']')
 
 

diff --git a/llsd/serde_notation.py b/llsd/serde_notation.py
@@ -4,7 +4,7 @@
 import uuid
 
 from llsd.base import (_LLSD, B, LLSDBaseFormatter, LLSDBaseParser, NOTATION_HEADER,
-                       LLSDParseError, LLSDSerializationError, UnicodeType,
+                       MAX_FORMAT_DEPTH, MAX_PARSE_DEPTH, LLSDParseError, LLSDSerializationError, UnicodeType,
                        _format_datestr, _parse_datestr, _str_to_bytes, binary, uri)
 
 
@@ -70,6 +70,7 @@ def __init__(self):
         # Then fill in specific entries based on the dict above.
         for c, func in _dispatch_dict.items():
             self._dispatch[ord(c)] = func
+        self._depth = 0
 
     def parse(self, something, ignore_binary = False):
         """
@@ -107,6 +108,8 @@ def _get_until(self, delim):
 
     def _parse(self, cc):
         "The notation parser workhorse."
+        if self._depth > MAX_PARSE_DEPTH:
+            self._error("Parse depth exceeded max of %d" % MAX_PARSE_DEPTH)
         try:
             func = self._dispatch[ord(cc)]
         except IndexError:
@@ -182,6 +185,7 @@ def _parse_map(self, cc):
         rv = {}
         key = b''
         found_key = False
+        self._depth += 1
         # skip the beginning '{'
         cc = self._getc()
         while (cc != b'}'):
@@ -207,6 +211,7 @@ def _parse_map(self, cc):
             else:
                 self._error("missing separator")
             cc = self._getc()
+        self._depth -= 1
 
         return rv
 
@@ -217,6 +222,7 @@ def _parse_array(self, cc):
         array: [ object, object, object ]
         """
         rv = []
+        self._depth += 1
         # skip the beginning '['
         cc = self._getc()
         while (cc != b']'):
@@ -227,7 +233,7 @@ def _parse_array(self, cc):
                 continue
             rv.append(self._parse(cc))
             cc = self._getc()
-
+        self._depth -= 1
         return rv
 
     def _parse_uuid(self, cc):
@@ -411,6 +417,11 @@ class LLSDNotationFormatter(LLSDBaseFormatter):
 
     See http://wiki.secondlife.com/wiki/LLSD#Notation_Serialization
     """
+
+    def __init__(self):
+        super(LLSDNotationFormatter, self).__init__()
+        self._depth = 0
+
     def _LLSD(self, v):
         return self._generate(v.thing)
     def _UNDEF(self, v):
@@ -443,18 +454,22 @@ def _DATE(self, v):
     def _ARRAY(self, v):
         self.stream.write(b'[')
         delim = b''
+        self._depth += 1
         for item in v:
             self.stream.write(delim)
             self._generate(item)
             delim = b','
+        self._depth -= 1
         self.stream.write(b']')
     def _MAP(self, v):
         self.stream.write(b'{')
         delim = b''
+        self._depth += 1
         for key, value in v.items():
             self.stream.writelines([delim, b"'", self._esc(UnicodeType(key)), b"':"])
             self._generate(value)
             delim = b','
+        self._depth -= 1
         self.stream.write(b'}')
 
     def _esc(self, data, quote=b"'"):
@@ -466,6 +481,9 @@ def _generate(self, something):
 
         :param something: a python object (typically a dict) to be serialized.
         """
+        if self._depth > MAX_FORMAT_DEPTH:
+            raise LLSDSerializationError("Cannot serialize depth of more than %d" % MAX_FORMAT_DEPTH)
+
         t = type(something)
         handler = self.type_map.get(t)
         if handler: