optimize double quote string literals

2026-08-02 12:24:49 +02:00 · 2020-12-08 21:32:46 +01:00 · 2020-12-08 21:32:46 +01:00 · 7d6934a265
commit 7d6934a265
parent f7ae4135d6
3 changed files with 152 additions and 136 deletions
--- a/Cargo.toml
+++ b/Cargo.toml
@ -12,6 +12,7 @@ documentation = "https://docs.rs/php-literal-parser"
 logos = "0.11"
 thiserror = "1.0"
 source-span = "2.2"
 memchr = "2.3.4"
 [dev-dependencies]
 maplit = "1.0.2"
--- a/benches/parse.rs
+++ b/benches/parse.rs
@ -13,3 +13,21 @@ fn perf_parse_int_basic(b: &mut Bencher) {
        assert_eq!(parse(input).unwrap(), 12345676);
    });
 }
 #[bench]
 fn perf_str_basic(b: &mut Bencher) {
    let input = r#""aut dolores excepturi rerum est velit ad natus eveniet quo tenetur et fugiat sit velit ipsam nesciunt sint et architecto""#;
    b.iter(|| {
        assert!(parse(input).unwrap().is_string());
    });
 }
 #[bench]
 fn perf_str_escape(b: &mut Bencher) {
    let input = r#""aut dolores excepturi rerum est velit ad natus \"eveniet\" quo tenetur et fugiat sit velit ipsam nesciunt sint et architecto""#;
    b.iter(|| {
        assert!(parse(input).unwrap().is_string());
    });
 }
--- a/src/string.rs
+++ b/src/string.rs
@ -1,7 +1,3 @@
 /// unescaping php string literals borrowed mostly from `escape8259`
 use std::char::decode_utf16;
 use std::iter::{once, Peekable};
 #[derive(Debug, Clone, Eq, PartialEq)]
 /// An error occurred while
@ -12,100 +8,61 @@ type UnescapeResult<T> = Result<T, UnescapeError>;
 // Used to collect output characters and queue u16 values for translation.
 struct UnescapeState {
    // The accumulated characters
-    out: String,
+    out: Vec<u8>,
    // Store a fragment of a large character for later decoding
    stash: u16,
 }
 impl UnescapeState {
    fn new() -> UnescapeState {
        UnescapeState { out: Vec::new() }
    }
    fn with_capacity(capacity: usize) -> UnescapeState {
        UnescapeState {
-            out: String::new(),
+            out: Vec::with_capacity(capacity),
            stash: 0,
        }
    }
    // Collect a new character
-    fn push_char(&mut self, c: char) -> UnescapeResult<()> {
+    fn push_char(&mut self, c: char) {
-        if self.stash != 0 {
+        let mut buff = [0; 8];
-            return Err(UnescapeError);
+        self.out
-        }
+            .extend_from_slice(c.encode_utf8(&mut buff).as_bytes());
        self.out.push(c);
        Ok(())
    }
-    // Collect a new UTF16 word.  This can either be one whole character,
+    fn push_u8(&mut self, c: u8) {
    // or part of a larger character.
    fn push_u16(&mut self, x: u16) -> UnescapeResult<()> {
        let surrogate = x >= 0xD800 && x <= 0xDFFF;
        match (self.stash, surrogate) {
            (0, false) => {
                // The std library only provides utf16 decode of an iterator,
                // so to decode a single character we wrap it in a `once`.
                // Hopefully the compiler will elide most of this extra work.
                match decode_utf16(once(x)).next() {
                    Some(Ok(c)) => {
        self.out.push(c);
    }
                    _ => return Err(UnescapeError),
                }
            }
            (0, true) => self.stash = x,
            (_, false) => {
                return Err(UnescapeError);
            }
            (w, true) => {
                let words = [w, x];
                match decode_utf16(words.iter().copied()).next() {
                    Some(Ok(c)) => {
                        self.out.push(c);
                        self.stash = 0;
                    }
                    _ => return Err(UnescapeError),
                }
            }
        }
        Ok(())
    }
-    // If we queued up part of a UTF-16 encoded word but didn't
+    fn push_raw(&mut self, c: u32) -> UnescapeResult<()> {
-    // finish it, return an error.  Otherwise, consume self and
+        match std::char::from_u32(c) {
-    // return the accumulated String.
+            Some(c) => Ok(self.push_char(c)),
-    fn finalize(self) -> UnescapeResult<String> {
+            None => Err(UnescapeError),
        if self.stash != 0 {
            return Err(UnescapeError);
        }
        Ok(self.out)
        }
    }
-fn parse_u16_hex<S>(s: &mut Peekable<S>, max: Option<u8>) -> UnescapeResult<u16>
+    fn push_slice(&mut self, slice: &[u8]) {
-where
+        self.out.extend_from_slice(slice);
-    S: Iterator<Item = char>,
+    }
-{
+
-    let mut result = 0;
+    fn finalize(self) -> String {
        // this is safe because we only push bytes into the buffer that either
        //   - come from the source &str, and are delimited by a \
        //   - are validated unicode points, utf8 encoded
        unsafe { String::from_utf8_unchecked(self.out) }
    }
 }
 fn parse_u32(
    s: &mut PeekableBytes,
    radix: u32,
    mut result: u32,
    max: Option<u8>,
 ) -> UnescapeResult<u32> {
    let mut max = max.unwrap_or(u8::max_value());
-    while s.peek().map(|c| c.is_ascii_hexdigit()).unwrap_or_default() {
+    while let Some(digit) = s.peek().and_then(|digit| (digit as char).to_digit(radix)) {
-        result *= 16;
+        let _ = s.next(); // consume the digit we peeked
-        result += s.next().unwrap().to_digit(16).unwrap() as u16;
+        result = result.checked_mul(radix).ok_or(UnescapeError)?;
-        max -= 1;
+        result = result.checked_add(digit).ok_or(UnescapeError)?;
        if max == 0 {
            break;
        }
    }
    Ok(result)
 }
 fn parse_u16_oct<S>(s: &mut Peekable<S>, mut result: u16, max: Option<u8>) -> UnescapeResult<u16>
 where
    S: Iterator<Item = char>,
 {
    let mut max = max.unwrap_or(u8::max_value());
    while s.peek().map(|c| c >= &'1' && c <= &'7').unwrap_or_default() {
        let digit = s.next().unwrap();
        dbg!(digit);
        result *= 8;
        result += digit.to_digit(8).unwrap() as u16;
        max -= 1;
        if max == 0 {
            break;
@ -126,80 +83,114 @@ pub fn unescape_single(s: &str) -> UnescapeResult<String> {
                    return Err(UnescapeError);
                }
                Some(d) => match d {
-                    '\\' | '\'' => state.push_char(d)?,
+                    '\\' | '\'' => state.push_char(d),
                    _ => {
-                        state.push_char('\\')?;
+                        state.push_char('\\');
-                        state.push_char(d)?
+                        state.push_char(d)
                    }
                },
            }
        } else {
-            state.push_char(c)?;
+            state.push_char(c);
        }
    }
-    state.finalize()
+    Ok(state.finalize())
 }
-/// Un-escape a string, following php double quote rules
+fn handle_escape<'a>(bytes: &'a [u8], state: &mut UnescapeState) -> UnescapeResult<&'a [u8]> {
-pub fn unescape_double(s: &str) -> UnescapeResult<String> {
+    let mut ins = PeekableBytes::new(bytes);
-    let mut state = UnescapeState::new();
+    debug_assert_eq!(ins.next(), Some(b'\\'));
    let mut ins = s.chars().peekable();
    while let Some(c) = ins.next() {
        if c == '\\' {
    match ins.next() {
        None => {
            return Err(UnescapeError);
        }
        Some(d) => {
            match d {
-                        '$' | '"' | '\\' => state.push_char(d)?,
+                b'$' | b'"' | b'\\' => state.push_u8(d),
-                        'n' => state.push_char('\n')?,   // linefeed
+                b'n' => state.push_u8(b'\n'),   // linefeed
-                        'r' => state.push_char('\r')?,   // carriage return
+                b'r' => state.push_u8(b'\r'),   // carriage return
-                        't' => state.push_char('\t')?,   // tab
+                b't' => state.push_u8(b'\t'),   // tab
-                        'v' => state.push_char('\x0B')?, // vertical tab
+                b'v' => state.push_u8(b'\x0B'), // vertical tab
-                        'f' => state.push_char('\x0C')?, // form feed
+                b'f' => state.push_u8(b'\x0C'), // form feed
-                        'x' => {
+                b'x' => {
-                            let val = parse_u16_hex(&mut ins, Some(2))?;
+                    let val = parse_u32(&mut ins, 16, 0, Some(2))?;
-                            state.push_u16(val)?;
+                    state.push_raw(val)?;
                }
-                        'u' => match ins.next() {
+                b'u' => match ins.next() {
-                            Some('{') => {
+                    Some(b'{') => {
-                                let val = parse_u16_hex(&mut ins, None)?;
+                        let val = parse_u32(&mut ins, 16, 0, None)?;
-                                state.push_u16(val)?;
+                        state.push_raw(val)?;
-                                if !matches!(ins.next(), Some('}')) {
+                        if !matches!(ins.next(), Some(b'}')) {
                            return Err(UnescapeError);
                        }
                    }
                    Some(d) => {
-                                state.push_char('\\')?;
+                        state.push_u8(b'\\');
-                                state.push_char('u')?;
+                        state.push_u8(b'u');
-                                state.push_char(d)?;
+                        state.push_u8(d);
                    }
                    None => {
-                                state.push_char('\\')?;
+                        state.push_u8(b'\\');
-                                state.push_char(d)?;
+                        state.push_u8(d);
                    }
                },
-                        '0'..='7' => {
+                b'0'..=b'7' => {
-                            let val =
+                    let val = parse_u32(&mut ins, 8, (d as char).to_digit(8).unwrap(), Some(3))?;
-                                parse_u16_oct(&mut ins, d.to_digit(8).unwrap() as u16, Some(3))?;
+                    state.push_raw(val)?;
                            state.push_u16(val)?;
                }
                _ => {
-                            state.push_char('\\')?;
+                    state.push_u8(b'\\');
-                            state.push_char(d)?
+                    state.push_u8(d)
                }
            }
        }
    }
-        } else {
+    Ok(ins.as_slice())
-            state.push_char(c)?;
+}
 /// Un-escape a string, following php double quote rules
 pub fn unescape_double(s: &str) -> UnescapeResult<String> {
    let mut state = UnescapeState::with_capacity(s.len());
    let mut bytes = s.as_bytes();
    while let Some(escape_index) = memchr::memchr(b'\\', bytes) {
        state.push_slice(&bytes[0..escape_index]);
        bytes = &bytes[escape_index..];
        bytes = handle_escape(bytes, &mut state)?;
    }
    state.push_slice(&bytes[0..]);
    Ok(state.finalize())
 }
 struct PeekableBytes<'a> {
    slice: &'a [u8],
    pos: usize,
 }
 impl<'a> Iterator for PeekableBytes<'a> {
    type Item = u8;
    fn next(&mut self) -> Option<Self::Item> {
        let byte = self.slice.get(self.pos)?;
        self.pos += 1;
        Some(*byte)
    }
 }
-    state.finalize()
+impl<'a> PeekableBytes<'a> {
    pub fn new(slice: &'a [u8]) -> Self {
        PeekableBytes { slice, pos: 0 }
    }
    pub fn peek(&self) -> Option<u8> {
        self.slice.get(self.pos).copied()
    }
    pub fn as_slice(self) -> &'a [u8] {
        &self.slice[self.pos..]
    }
 }
 #[cfg(test)]
@ -231,7 +222,7 @@ mod tests {
        assert_eq!(unescape_double(r#" \"abc\" "#), Ok(" \"abc\" ".into()));
        assert_eq!(unescape_double(r#"𝄞"#), Ok("𝄞".into()));
        assert_eq!(unescape_double(r#"\𝄞"#), Ok("\\𝄞".into()));
-        assert_eq!(unescape_double(r#"\u{D834}\u{DD1E}"#), Ok("𝄞".into()));
+        assert_eq!(unescape_double(r#"\u{1D11E}"#), Ok("𝄞".into()));
        assert_eq!(unescape_double(r#"\xD834"#), Ok("\u{D8}34".into()));
        assert_eq!(unescape_double(r#"\xDD1E"#), Ok("\u{DD}1E".into()));
        assert_eq!(unescape_double(r#"\xD"#), Ok("\u{D}".into()));
@ -242,5 +233,11 @@ mod tests {
        assert_eq!(unescape_double(r#"\47foo"#), Ok("'foo".into()));
        assert_eq!(unescape_double(r#"\48foo"#), Ok("\u{4}8foo".into()));
        assert_eq!(unescape_double(r#"\87foo"#), Ok("\\87foo".into()));
        assert_eq!(unescape_double(r#"\u{999999}"#), Err(UnescapeError));
        assert_eq!(
            unescape_double(r#"\u{999999999999999999}"#),
            Err(UnescapeError)
        );
    }
 }