optimize double quote string literals

2026-08-02 12:24:49 +02:00 · 2020-12-08 21:32:46 +01:00 · 2020-12-08 21:32:46 +01:00 · 7d6934a265
commit 7d6934a265
parent f7ae4135d6
3 changed files with 152 additions and 136 deletions
--- a/Cargo.toml
+++ b/Cargo.toml
@ -12,6 +12,7 @@ documentation = "https://docs.rs/php-literal-parser"
 logos = "0.11"
 thiserror = "1.0"
 source-span = "2.2"
+memchr = "2.3.4"

 [dev-dependencies]
 maplit = "1.0.2"
--- a/benches/parse.rs
+++ b/benches/parse.rs
@ -13,3 +13,21 @@ fn perf_parse_int_basic(b: &mut Bencher) {
        assert_eq!(parse(input).unwrap(), 12345676);
    });
 }
+
+#[bench]
+fn perf_str_basic(b: &mut Bencher) {
+    let input = r#""aut dolores excepturi rerum est velit ad natus eveniet quo tenetur et fugiat sit velit ipsam nesciunt sint et architecto""#;
+
+    b.iter(|| {
+        assert!(parse(input).unwrap().is_string());
+    });
+}
+
+#[bench]
+fn perf_str_escape(b: &mut Bencher) {
+    let input = r#""aut dolores excepturi rerum est velit ad natus \"eveniet\" quo tenetur et fugiat sit velit ipsam nesciunt sint et architecto""#;
+
+    b.iter(|| {
+        assert!(parse(input).unwrap().is_string());
+    });
+}
--- a/src/string.rs
+++ b/src/string.rs
@ -1,7 +1,3 @@
-/// unescaping php string literals borrowed mostly from `escape8259`
-use std::char::decode_utf16;
-use std::iter::{once, Peekable};
-
 #[derive(Debug, Clone, Eq, PartialEq)]

 /// An error occurred while
@ -12,100 +8,61 @@ type UnescapeResult<T> = Result<T, UnescapeError>;
 // Used to collect output characters and queue u16 values for translation.
 struct UnescapeState {
    // The accumulated characters
-    out: String,
-    // Store a fragment of a large character for later decoding
-    stash: u16,
+    out: Vec<u8>,
 }

 impl UnescapeState {
    fn new() -> UnescapeState {
+        UnescapeState { out: Vec::new() }
+    }
+
+    fn with_capacity(capacity: usize) -> UnescapeState {
        UnescapeState {
-            out: String::new(),
-            stash: 0,
+            out: Vec::with_capacity(capacity),
        }
    }

    // Collect a new character
-    fn push_char(&mut self, c: char) -> UnescapeResult<()> {
-        if self.stash != 0 {
-            return Err(UnescapeError);
-        }
-        self.out.push(c);
-        Ok(())
+    fn push_char(&mut self, c: char) {
+        let mut buff = [0; 8];
+        self.out
+            .extend_from_slice(c.encode_utf8(&mut buff).as_bytes());
    }

-    // Collect a new UTF16 word.  This can either be one whole character,
-    // or part of a larger character.
-    fn push_u16(&mut self, x: u16) -> UnescapeResult<()> {
-        let surrogate = x >= 0xD800 && x <= 0xDFFF;
-        match (self.stash, surrogate) {
-            (0, false) => {
-                // The std library only provides utf16 decode of an iterator,
-                // so to decode a single character we wrap it in a `once`.
-                // Hopefully the compiler will elide most of this extra work.
-                match decode_utf16(once(x)).next() {
-                    Some(Ok(c)) => {
+    fn push_u8(&mut self, c: u8) {
        self.out.push(c);
    }
-                    _ => return Err(UnescapeError),
-                }
-            }
-            (0, true) => self.stash = x,
-            (_, false) => {
-                return Err(UnescapeError);
-            }
-            (w, true) => {
-                let words = [w, x];
-                match decode_utf16(words.iter().copied()).next() {
-                    Some(Ok(c)) => {
-                        self.out.push(c);
-                        self.stash = 0;
-                    }
-                    _ => return Err(UnescapeError),
-                }
-            }
-        }
-        Ok(())
-    }

-    // If we queued up part of a UTF-16 encoded word but didn't
-    // finish it, return an error.  Otherwise, consume self and
-    // return the accumulated String.
-    fn finalize(self) -> UnescapeResult<String> {
-        if self.stash != 0 {
-            return Err(UnescapeError);
+    fn push_raw(&mut self, c: u32) -> UnescapeResult<()> {
+        match std::char::from_u32(c) {
+            Some(c) => Ok(self.push_char(c)),
+            None => Err(UnescapeError),
        }
-        Ok(self.out)
+    }
+
+    fn push_slice(&mut self, slice: &[u8]) {
+        self.out.extend_from_slice(slice);
+    }
+
+    fn finalize(self) -> String {
+        // this is safe because we only push bytes into the buffer that either
+        //   - come from the source &str, and are delimited by a \
+        //   - are validated unicode points, utf8 encoded
+        unsafe { String::from_utf8_unchecked(self.out) }
    }
 }

-fn parse_u16_hex<S>(s: &mut Peekable<S>, max: Option<u8>) -> UnescapeResult<u16>
-where
-    S: Iterator<Item = char>,
-{
-    let mut result = 0;
+fn parse_u32(
+    s: &mut PeekableBytes,
+    radix: u32,
+    mut result: u32,
+    max: Option<u8>,
+) -> UnescapeResult<u32> {
    let mut max = max.unwrap_or(u8::max_value());
-    while s.peek().map(|c| c.is_ascii_hexdigit()).unwrap_or_default() {
-        result *= 16;
-        result += s.next().unwrap().to_digit(16).unwrap() as u16;
-        max -= 1;
-        if max == 0 {
-            break;
-        }
-    }
-    Ok(result)
-}
-
-fn parse_u16_oct<S>(s: &mut Peekable<S>, mut result: u16, max: Option<u8>) -> UnescapeResult<u16>
-where
-    S: Iterator<Item = char>,
-{
-    let mut max = max.unwrap_or(u8::max_value());
-    while s.peek().map(|c| c >= &'1' && c <= &'7').unwrap_or_default() {
-        let digit = s.next().unwrap();
-        dbg!(digit);
-        result *= 8;
-        result += digit.to_digit(8).unwrap() as u16;
+    while let Some(digit) = s.peek().and_then(|digit| (digit as char).to_digit(radix)) {
+        let _ = s.next(); // consume the digit we peeked
+        result = result.checked_mul(radix).ok_or(UnescapeError)?;
+        result = result.checked_add(digit).ok_or(UnescapeError)?;
        max -= 1;
        if max == 0 {
            break;
@ -126,80 +83,114 @@ pub fn unescape_single(s: &str) -> UnescapeResult<String> {
                    return Err(UnescapeError);
                }
                Some(d) => match d {
-                    '\\' | '\'' => state.push_char(d)?,
+                    '\\' | '\'' => state.push_char(d),
                    _ => {
-                        state.push_char('\\')?;
-                        state.push_char(d)?
+                        state.push_char('\\');
+                        state.push_char(d)
                    }
                },
            }
        } else {
-            state.push_char(c)?;
+            state.push_char(c);
        }
    }

-    state.finalize()
+    Ok(state.finalize())
 }

-/// Un-escape a string, following php double quote rules
-pub fn unescape_double(s: &str) -> UnescapeResult<String> {
-    let mut state = UnescapeState::new();
-    let mut ins = s.chars().peekable();
-
-    while let Some(c) = ins.next() {
-        if c == '\\' {
+fn handle_escape<'a>(bytes: &'a [u8], state: &mut UnescapeState) -> UnescapeResult<&'a [u8]> {
+    let mut ins = PeekableBytes::new(bytes);
+    debug_assert_eq!(ins.next(), Some(b'\\'));
    match ins.next() {
        None => {
            return Err(UnescapeError);
        }
        Some(d) => {
            match d {
-                        '$' | '"' | '\\' => state.push_char(d)?,
-                        'n' => state.push_char('\n')?,   // linefeed
-                        'r' => state.push_char('\r')?,   // carriage return
-                        't' => state.push_char('\t')?,   // tab
-                        'v' => state.push_char('\x0B')?, // vertical tab
-                        'f' => state.push_char('\x0C')?, // form feed
-                        'x' => {
-                            let val = parse_u16_hex(&mut ins, Some(2))?;
-                            state.push_u16(val)?;
+                b'$' | b'"' | b'\\' => state.push_u8(d),
+                b'n' => state.push_u8(b'\n'),   // linefeed
+                b'r' => state.push_u8(b'\r'),   // carriage return
+                b't' => state.push_u8(b'\t'),   // tab
+                b'v' => state.push_u8(b'\x0B'), // vertical tab
+                b'f' => state.push_u8(b'\x0C'), // form feed
+                b'x' => {
+                    let val = parse_u32(&mut ins, 16, 0, Some(2))?;
+                    state.push_raw(val)?;
                }
-                        'u' => match ins.next() {
-                            Some('{') => {
-                                let val = parse_u16_hex(&mut ins, None)?;
-                                state.push_u16(val)?;
-                                if !matches!(ins.next(), Some('}')) {
+                b'u' => match ins.next() {
+                    Some(b'{') => {
+                        let val = parse_u32(&mut ins, 16, 0, None)?;
+                        state.push_raw(val)?;
+                        if !matches!(ins.next(), Some(b'}')) {
                            return Err(UnescapeError);
                        }
                    }
                    Some(d) => {
-                                state.push_char('\\')?;
-                                state.push_char('u')?;
-                                state.push_char(d)?;
+                        state.push_u8(b'\\');
+                        state.push_u8(b'u');
+                        state.push_u8(d);
                    }
                    None => {
-                                state.push_char('\\')?;
-                                state.push_char(d)?;
+                        state.push_u8(b'\\');
+                        state.push_u8(d);
                    }
                },
-                        '0'..='7' => {
-                            let val =
-                                parse_u16_oct(&mut ins, d.to_digit(8).unwrap() as u16, Some(3))?;
-                            state.push_u16(val)?;
+                b'0'..=b'7' => {
+                    let val = parse_u32(&mut ins, 8, (d as char).to_digit(8).unwrap(), Some(3))?;
+                    state.push_raw(val)?;
                }
                _ => {
-                            state.push_char('\\')?;
-                            state.push_char(d)?
+                    state.push_u8(b'\\');
+                    state.push_u8(d)
                }
            }
        }
    }
-        } else {
-            state.push_char(c)?;
-        }
+    Ok(ins.as_slice())
+}
+
+/// Un-escape a string, following php double quote rules
+pub fn unescape_double(s: &str) -> UnescapeResult<String> {
+    let mut state = UnescapeState::with_capacity(s.len());
+    let mut bytes = s.as_bytes();
+    while let Some(escape_index) = memchr::memchr(b'\\', bytes) {
+        state.push_slice(&bytes[0..escape_index]);
+        bytes = &bytes[escape_index..];
+        bytes = handle_escape(bytes, &mut state)?;
    }

-    state.finalize()
+    state.push_slice(&bytes[0..]);
+
+    Ok(state.finalize())
+}
+
+struct PeekableBytes<'a> {
+    slice: &'a [u8],
+    pos: usize,
+}
+
+impl<'a> Iterator for PeekableBytes<'a> {
+    type Item = u8;
+
+    fn next(&mut self) -> Option<Self::Item> {
+        let byte = self.slice.get(self.pos)?;
+        self.pos += 1;
+        Some(*byte)
+    }
+}
+
+impl<'a> PeekableBytes<'a> {
+    pub fn new(slice: &'a [u8]) -> Self {
+        PeekableBytes { slice, pos: 0 }
+    }
+
+    pub fn peek(&self) -> Option<u8> {
+        self.slice.get(self.pos).copied()
+    }
+
+    pub fn as_slice(self) -> &'a [u8] {
+        &self.slice[self.pos..]
+    }
 }

 #[cfg(test)]
@ -231,7 +222,7 @@ mod tests {
        assert_eq!(unescape_double(r#" \"abc\" "#), Ok(" \"abc\" ".into()));
        assert_eq!(unescape_double(r#"𝄞"#), Ok("𝄞".into()));
        assert_eq!(unescape_double(r#"\𝄞"#), Ok("\\𝄞".into()));
-        assert_eq!(unescape_double(r#"\u{D834}\u{DD1E}"#), Ok("𝄞".into()));
+        assert_eq!(unescape_double(r#"\u{1D11E}"#), Ok("𝄞".into()));
        assert_eq!(unescape_double(r#"\xD834"#), Ok("\u{D8}34".into()));
        assert_eq!(unescape_double(r#"\xDD1E"#), Ok("\u{DD}1E".into()));
        assert_eq!(unescape_double(r#"\xD"#), Ok("\u{D}".into()));
@ -242,5 +233,11 @@ mod tests {
        assert_eq!(unescape_double(r#"\47foo"#), Ok("'foo".into()));
        assert_eq!(unescape_double(r#"\48foo"#), Ok("\u{4}8foo".into()));
        assert_eq!(unescape_double(r#"\87foo"#), Ok("\\87foo".into()));
+
+        assert_eq!(unescape_double(r#"\u{999999}"#), Err(UnescapeError));
+        assert_eq!(
+            unescape_double(r#"\u{999999999999999999}"#),
+            Err(UnescapeError)
+        );
    }
 }