From 1bdb8de83894fdcd70794f81a045f876be807b61 Mon Sep 17 00:00:00 2001 From: Juniper Tyree <50025784+juntyr@users.noreply.github.com> Date: Tue, 9 Apr 2024 05:47:15 +0000 Subject: [PATCH] Speed up escaped byte buf parsing --- src/parse.rs | 110 +++++++++++++++++++++++++++++++++++++++------------ 1 file changed, 85 insertions(+), 25 deletions(-) diff --git a/src/parse.rs b/src/parse.rs index 08fd7fe8c..079aeaa61 100644 --- a/src/parse.rs +++ b/src/parse.rs @@ -214,11 +214,6 @@ impl<'a> Parser<'a> { .find(|c| !condition(c)) .unwrap_or(self.src().len() - from) } - - #[must_use] - pub fn find_char_index(&self, condition: fn(char) -> bool) -> Option<(usize, char)> { - self.src().char_indices().find(|&(_, c)| condition(c)) - } } /// actual parsing of ron tokens @@ -619,7 +614,8 @@ impl<'a> Parser<'a> { Err(_) => parser.set_cursor(cursor_backup), } let cursor_backup = parser.cursor; - match parser.byte_string() { + // we have already checked for strings, which subsume base64 byte strings + match parser.byte_string_no_base64() { Ok(_) => (), // prevent quadratic complexity backtracking for unterminated byte string Err(err @ (Error::ExpectedStringEnd | Error::Eof)) => return Err(err), @@ -1058,7 +1054,13 @@ impl<'a> Parser<'a> { Err(_) => Err(Error::ExpectedByteString), } } - } else if self.consume_str("b\"") { + } else { + self.byte_string_no_base64() + } + } + + pub fn byte_string_no_base64(&mut self) -> Result> { + if self.consume_str("b\"") { self.escaped_byte_string() } else if self.consume_str("br") { self.raw_byte_string() @@ -1121,17 +1123,13 @@ impl<'a> Parser<'a> { } fn escaped_byte_buf(&mut self, encoding: EscapeEncoding) -> Result<(ParsedByteStr<'a>, usize)> { - let (i, end_or_escape) = self - .find_char_index(|c| matches!(c, '\\' | '"')) - .ok_or(Error::ExpectedStringEnd)?; - - if end_or_escape == '"' { - let s = &self.src().as_bytes()[..i]; + // Checking for '"' and '\\' separately is faster than searching for both at the same time + let str_end = self.src().find('"').ok_or(Error::ExpectedStringEnd)?; + let escape = self.src()[..str_end].find('\\'); - // Advance by the number of bytes of the string + 1 for the `"`. - Ok((ParsedByteStr::Slice(s), i + 1)) - } else { - let mut i = i; + if let Some(escape) = escape { + // Now check if escaping is used inside the string + let mut i = escape; let mut s = self.src().as_bytes()[..i].to_vec(); loop { @@ -1149,19 +1147,81 @@ impl<'a> Parser<'a> { }, } - let (new_i, end_or_escape) = self - .find_char_index(|c| matches!(c, '\\' | '"')) - .ok_or(Error::ExpectedStringEnd)?; + // Checking for '"' and '\\' separately is faster than searching for both at the same time + let new_str_end = self.src().find('"').ok_or(Error::ExpectedStringEnd)?; + let new_escape = self.src()[..new_str_end].find('\\'); - i = new_i; - s.extend_from_slice(&self.src().as_bytes()[..i]); - - if end_or_escape == '"' { + if let Some(new_escape) = new_escape { + s.extend_from_slice(&self.src().as_bytes()[..new_escape]); + i = new_escape; + } else { + s.extend_from_slice(&self.src().as_bytes()[..new_str_end]); // Advance to the end of the string + 1 for the `"`. - break Ok((ParsedByteStr::Allocated(s), i + 1)); + break Ok((ParsedByteStr::Allocated(s), new_str_end + 1)); } + + // let new_i = self.src().find(['\\', '"']).ok_or(Error::ExpectedStringEnd)?; + // let end_or_escape = self.src()[new_i..].chars().next().unwrap(); + + // let (new_i, end_or_escape) = self + // .find_char_index(|c| matches!(c, '\\' | '"')) + // .ok_or(Error::ExpectedStringEnd)?; + + // i = new_i; + // s.extend_from_slice(&self.src().as_bytes()[..i]); + + // // if end_or_escape == '"' { + // if self.src()[new_i..].starts_with('"') { + // // Advance to the end of the string + 1 for the `"`. + // break Ok((ParsedByteStr::Allocated(s), i + 1)); + // } + + // i = if let Some(new_i) = self.src().find('"') { + // s.extend_from_slice(&self.src().as_bytes()[..new_i]); + // // Advance to the end of the string + 1 for the `"`. + // break Ok((ParsedByteStr::Allocated(s), new_i + 1)); + // } else if let Some(new_i) = self.src().find('\\') { + // s.extend_from_slice(&self.src().as_bytes()[..new_i]); + // new_i + // } else { + // return Err(Error::ExpectedStringEnd); + // }; + + // let new_i = self.src().find(['\\', '"']).ok_or(Error::ExpectedStringEnd)?; + // let end_or_escape = self.src()[new_i..].chars().next().unwrap(); + + // let (new_i, end_or_escape) = self + // .find_char_index(|c| matches!(c, '\\' | '"')) + // .ok_or(Error::ExpectedStringEnd)?; + + // i = new_i; + // s.extend_from_slice(&self.src().as_bytes()[..i]); + + // if end_or_escape == '"' { + // if self.src()[new_i..].starts_with('"') { + // // Advance to the end of the string + 1 for the `"`. + // break Ok((ParsedByteStr::Allocated(s), i + 1)); + // } } + } else { + let s = &self.src().as_bytes()[..str_end]; + + // Advance by the number of bytes of the string + 1 for the `"`. + Ok((ParsedByteStr::Slice(s), str_end + 1)) } + + // if let Some(i) = self.src().find('"') { + // // First check the happy case of an unescaped string + // // (also finding based on a single char twice is faster) + // let s = &self.src().as_bytes()[..i]; + + // // Advance by the number of bytes of the string + 1 for the `"`. + // Ok((ParsedByteStr::Slice(s), i + 1)) + // } else if let Some(i) = self.src().find('\\') { + + // } else { + // Err(Error::ExpectedStringEnd) + // } } fn raw_byte_buf(&mut self) -> Result<(ParsedByteStr<'a>, usize)> {