mirror of
https://codeberg.org/icewind/php-literal-parser.git
synced 2026-06-03 18:44:07 +02:00
optimize double quote string literals
This commit is contained in:
parent
f7ae4135d6
commit
7d6934a265
3 changed files with 152 additions and 136 deletions
|
|
@ -12,6 +12,7 @@ documentation = "https://docs.rs/php-literal-parser"
|
||||||
logos = "0.11"
|
logos = "0.11"
|
||||||
thiserror = "1.0"
|
thiserror = "1.0"
|
||||||
source-span = "2.2"
|
source-span = "2.2"
|
||||||
|
memchr = "2.3.4"
|
||||||
|
|
||||||
[dev-dependencies]
|
[dev-dependencies]
|
||||||
maplit = "1.0.2"
|
maplit = "1.0.2"
|
||||||
|
|
@ -13,3 +13,21 @@ fn perf_parse_int_basic(b: &mut Bencher) {
|
||||||
assert_eq!(parse(input).unwrap(), 12345676);
|
assert_eq!(parse(input).unwrap(), 12345676);
|
||||||
});
|
});
|
||||||
}
|
}
|
||||||
|
|
||||||
|
#[bench]
|
||||||
|
fn perf_str_basic(b: &mut Bencher) {
|
||||||
|
let input = r#""aut dolores excepturi rerum est velit ad natus eveniet quo tenetur et fugiat sit velit ipsam nesciunt sint et architecto""#;
|
||||||
|
|
||||||
|
b.iter(|| {
|
||||||
|
assert!(parse(input).unwrap().is_string());
|
||||||
|
});
|
||||||
|
}
|
||||||
|
|
||||||
|
#[bench]
|
||||||
|
fn perf_str_escape(b: &mut Bencher) {
|
||||||
|
let input = r#""aut dolores excepturi rerum est velit ad natus \"eveniet\" quo tenetur et fugiat sit velit ipsam nesciunt sint et architecto""#;
|
||||||
|
|
||||||
|
b.iter(|| {
|
||||||
|
assert!(parse(input).unwrap().is_string());
|
||||||
|
});
|
||||||
|
}
|
||||||
|
|
|
||||||
239
src/string.rs
239
src/string.rs
|
|
@ -1,7 +1,3 @@
|
||||||
/// unescaping php string literals borrowed mostly from `escape8259`
|
|
||||||
use std::char::decode_utf16;
|
|
||||||
use std::iter::{once, Peekable};
|
|
||||||
|
|
||||||
#[derive(Debug, Clone, Eq, PartialEq)]
|
#[derive(Debug, Clone, Eq, PartialEq)]
|
||||||
|
|
||||||
/// An error occurred while
|
/// An error occurred while
|
||||||
|
|
@ -12,100 +8,61 @@ type UnescapeResult<T> = Result<T, UnescapeError>;
|
||||||
// Used to collect output characters and queue u16 values for translation.
|
// Used to collect output characters and queue u16 values for translation.
|
||||||
struct UnescapeState {
|
struct UnescapeState {
|
||||||
// The accumulated characters
|
// The accumulated characters
|
||||||
out: String,
|
out: Vec<u8>,
|
||||||
// Store a fragment of a large character for later decoding
|
|
||||||
stash: u16,
|
|
||||||
}
|
}
|
||||||
|
|
||||||
impl UnescapeState {
|
impl UnescapeState {
|
||||||
fn new() -> UnescapeState {
|
fn new() -> UnescapeState {
|
||||||
|
UnescapeState { out: Vec::new() }
|
||||||
|
}
|
||||||
|
|
||||||
|
fn with_capacity(capacity: usize) -> UnescapeState {
|
||||||
UnescapeState {
|
UnescapeState {
|
||||||
out: String::new(),
|
out: Vec::with_capacity(capacity),
|
||||||
stash: 0,
|
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
// Collect a new character
|
// Collect a new character
|
||||||
fn push_char(&mut self, c: char) -> UnescapeResult<()> {
|
fn push_char(&mut self, c: char) {
|
||||||
if self.stash != 0 {
|
let mut buff = [0; 8];
|
||||||
return Err(UnescapeError);
|
self.out
|
||||||
}
|
.extend_from_slice(c.encode_utf8(&mut buff).as_bytes());
|
||||||
self.out.push(c);
|
|
||||||
Ok(())
|
|
||||||
}
|
}
|
||||||
|
|
||||||
// Collect a new UTF16 word. This can either be one whole character,
|
fn push_u8(&mut self, c: u8) {
|
||||||
// or part of a larger character.
|
|
||||||
fn push_u16(&mut self, x: u16) -> UnescapeResult<()> {
|
|
||||||
let surrogate = x >= 0xD800 && x <= 0xDFFF;
|
|
||||||
match (self.stash, surrogate) {
|
|
||||||
(0, false) => {
|
|
||||||
// The std library only provides utf16 decode of an iterator,
|
|
||||||
// so to decode a single character we wrap it in a `once`.
|
|
||||||
// Hopefully the compiler will elide most of this extra work.
|
|
||||||
match decode_utf16(once(x)).next() {
|
|
||||||
Some(Ok(c)) => {
|
|
||||||
self.out.push(c);
|
self.out.push(c);
|
||||||
}
|
}
|
||||||
_ => return Err(UnescapeError),
|
|
||||||
}
|
|
||||||
}
|
|
||||||
(0, true) => self.stash = x,
|
|
||||||
(_, false) => {
|
|
||||||
return Err(UnescapeError);
|
|
||||||
}
|
|
||||||
(w, true) => {
|
|
||||||
let words = [w, x];
|
|
||||||
match decode_utf16(words.iter().copied()).next() {
|
|
||||||
Some(Ok(c)) => {
|
|
||||||
self.out.push(c);
|
|
||||||
self.stash = 0;
|
|
||||||
}
|
|
||||||
_ => return Err(UnescapeError),
|
|
||||||
}
|
|
||||||
}
|
|
||||||
}
|
|
||||||
Ok(())
|
|
||||||
}
|
|
||||||
|
|
||||||
// If we queued up part of a UTF-16 encoded word but didn't
|
fn push_raw(&mut self, c: u32) -> UnescapeResult<()> {
|
||||||
// finish it, return an error. Otherwise, consume self and
|
match std::char::from_u32(c) {
|
||||||
// return the accumulated String.
|
Some(c) => Ok(self.push_char(c)),
|
||||||
fn finalize(self) -> UnescapeResult<String> {
|
None => Err(UnescapeError),
|
||||||
if self.stash != 0 {
|
|
||||||
return Err(UnescapeError);
|
|
||||||
}
|
|
||||||
Ok(self.out)
|
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
fn parse_u16_hex<S>(s: &mut Peekable<S>, max: Option<u8>) -> UnescapeResult<u16>
|
fn push_slice(&mut self, slice: &[u8]) {
|
||||||
where
|
self.out.extend_from_slice(slice);
|
||||||
S: Iterator<Item = char>,
|
}
|
||||||
{
|
|
||||||
let mut result = 0;
|
fn finalize(self) -> String {
|
||||||
|
// this is safe because we only push bytes into the buffer that either
|
||||||
|
// - come from the source &str, and are delimited by a \
|
||||||
|
// - are validated unicode points, utf8 encoded
|
||||||
|
unsafe { String::from_utf8_unchecked(self.out) }
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
fn parse_u32(
|
||||||
|
s: &mut PeekableBytes,
|
||||||
|
radix: u32,
|
||||||
|
mut result: u32,
|
||||||
|
max: Option<u8>,
|
||||||
|
) -> UnescapeResult<u32> {
|
||||||
let mut max = max.unwrap_or(u8::max_value());
|
let mut max = max.unwrap_or(u8::max_value());
|
||||||
while s.peek().map(|c| c.is_ascii_hexdigit()).unwrap_or_default() {
|
while let Some(digit) = s.peek().and_then(|digit| (digit as char).to_digit(radix)) {
|
||||||
result *= 16;
|
let _ = s.next(); // consume the digit we peeked
|
||||||
result += s.next().unwrap().to_digit(16).unwrap() as u16;
|
result = result.checked_mul(radix).ok_or(UnescapeError)?;
|
||||||
max -= 1;
|
result = result.checked_add(digit).ok_or(UnescapeError)?;
|
||||||
if max == 0 {
|
|
||||||
break;
|
|
||||||
}
|
|
||||||
}
|
|
||||||
Ok(result)
|
|
||||||
}
|
|
||||||
|
|
||||||
fn parse_u16_oct<S>(s: &mut Peekable<S>, mut result: u16, max: Option<u8>) -> UnescapeResult<u16>
|
|
||||||
where
|
|
||||||
S: Iterator<Item = char>,
|
|
||||||
{
|
|
||||||
let mut max = max.unwrap_or(u8::max_value());
|
|
||||||
while s.peek().map(|c| c >= &'1' && c <= &'7').unwrap_or_default() {
|
|
||||||
let digit = s.next().unwrap();
|
|
||||||
dbg!(digit);
|
|
||||||
result *= 8;
|
|
||||||
result += digit.to_digit(8).unwrap() as u16;
|
|
||||||
max -= 1;
|
max -= 1;
|
||||||
if max == 0 {
|
if max == 0 {
|
||||||
break;
|
break;
|
||||||
|
|
@ -126,80 +83,114 @@ pub fn unescape_single(s: &str) -> UnescapeResult<String> {
|
||||||
return Err(UnescapeError);
|
return Err(UnescapeError);
|
||||||
}
|
}
|
||||||
Some(d) => match d {
|
Some(d) => match d {
|
||||||
'\\' | '\'' => state.push_char(d)?,
|
'\\' | '\'' => state.push_char(d),
|
||||||
_ => {
|
_ => {
|
||||||
state.push_char('\\')?;
|
state.push_char('\\');
|
||||||
state.push_char(d)?
|
state.push_char(d)
|
||||||
}
|
}
|
||||||
},
|
},
|
||||||
}
|
}
|
||||||
} else {
|
} else {
|
||||||
state.push_char(c)?;
|
state.push_char(c);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
state.finalize()
|
Ok(state.finalize())
|
||||||
}
|
}
|
||||||
|
|
||||||
/// Un-escape a string, following php double quote rules
|
fn handle_escape<'a>(bytes: &'a [u8], state: &mut UnescapeState) -> UnescapeResult<&'a [u8]> {
|
||||||
pub fn unescape_double(s: &str) -> UnescapeResult<String> {
|
let mut ins = PeekableBytes::new(bytes);
|
||||||
let mut state = UnescapeState::new();
|
debug_assert_eq!(ins.next(), Some(b'\\'));
|
||||||
let mut ins = s.chars().peekable();
|
|
||||||
|
|
||||||
while let Some(c) = ins.next() {
|
|
||||||
if c == '\\' {
|
|
||||||
match ins.next() {
|
match ins.next() {
|
||||||
None => {
|
None => {
|
||||||
return Err(UnescapeError);
|
return Err(UnescapeError);
|
||||||
}
|
}
|
||||||
Some(d) => {
|
Some(d) => {
|
||||||
match d {
|
match d {
|
||||||
'$' | '"' | '\\' => state.push_char(d)?,
|
b'$' | b'"' | b'\\' => state.push_u8(d),
|
||||||
'n' => state.push_char('\n')?, // linefeed
|
b'n' => state.push_u8(b'\n'), // linefeed
|
||||||
'r' => state.push_char('\r')?, // carriage return
|
b'r' => state.push_u8(b'\r'), // carriage return
|
||||||
't' => state.push_char('\t')?, // tab
|
b't' => state.push_u8(b'\t'), // tab
|
||||||
'v' => state.push_char('\x0B')?, // vertical tab
|
b'v' => state.push_u8(b'\x0B'), // vertical tab
|
||||||
'f' => state.push_char('\x0C')?, // form feed
|
b'f' => state.push_u8(b'\x0C'), // form feed
|
||||||
'x' => {
|
b'x' => {
|
||||||
let val = parse_u16_hex(&mut ins, Some(2))?;
|
let val = parse_u32(&mut ins, 16, 0, Some(2))?;
|
||||||
state.push_u16(val)?;
|
state.push_raw(val)?;
|
||||||
}
|
}
|
||||||
'u' => match ins.next() {
|
b'u' => match ins.next() {
|
||||||
Some('{') => {
|
Some(b'{') => {
|
||||||
let val = parse_u16_hex(&mut ins, None)?;
|
let val = parse_u32(&mut ins, 16, 0, None)?;
|
||||||
state.push_u16(val)?;
|
state.push_raw(val)?;
|
||||||
if !matches!(ins.next(), Some('}')) {
|
if !matches!(ins.next(), Some(b'}')) {
|
||||||
return Err(UnescapeError);
|
return Err(UnescapeError);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
Some(d) => {
|
Some(d) => {
|
||||||
state.push_char('\\')?;
|
state.push_u8(b'\\');
|
||||||
state.push_char('u')?;
|
state.push_u8(b'u');
|
||||||
state.push_char(d)?;
|
state.push_u8(d);
|
||||||
}
|
}
|
||||||
None => {
|
None => {
|
||||||
state.push_char('\\')?;
|
state.push_u8(b'\\');
|
||||||
state.push_char(d)?;
|
state.push_u8(d);
|
||||||
}
|
}
|
||||||
},
|
},
|
||||||
'0'..='7' => {
|
b'0'..=b'7' => {
|
||||||
let val =
|
let val = parse_u32(&mut ins, 8, (d as char).to_digit(8).unwrap(), Some(3))?;
|
||||||
parse_u16_oct(&mut ins, d.to_digit(8).unwrap() as u16, Some(3))?;
|
state.push_raw(val)?;
|
||||||
state.push_u16(val)?;
|
|
||||||
}
|
}
|
||||||
_ => {
|
_ => {
|
||||||
state.push_char('\\')?;
|
state.push_u8(b'\\');
|
||||||
state.push_char(d)?
|
state.push_u8(d)
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
} else {
|
Ok(ins.as_slice())
|
||||||
state.push_char(c)?;
|
}
|
||||||
|
|
||||||
|
/// Un-escape a string, following php double quote rules
|
||||||
|
pub fn unescape_double(s: &str) -> UnescapeResult<String> {
|
||||||
|
let mut state = UnescapeState::with_capacity(s.len());
|
||||||
|
let mut bytes = s.as_bytes();
|
||||||
|
while let Some(escape_index) = memchr::memchr(b'\\', bytes) {
|
||||||
|
state.push_slice(&bytes[0..escape_index]);
|
||||||
|
bytes = &bytes[escape_index..];
|
||||||
|
bytes = handle_escape(bytes, &mut state)?;
|
||||||
|
}
|
||||||
|
|
||||||
|
state.push_slice(&bytes[0..]);
|
||||||
|
|
||||||
|
Ok(state.finalize())
|
||||||
|
}
|
||||||
|
|
||||||
|
struct PeekableBytes<'a> {
|
||||||
|
slice: &'a [u8],
|
||||||
|
pos: usize,
|
||||||
|
}
|
||||||
|
|
||||||
|
impl<'a> Iterator for PeekableBytes<'a> {
|
||||||
|
type Item = u8;
|
||||||
|
|
||||||
|
fn next(&mut self) -> Option<Self::Item> {
|
||||||
|
let byte = self.slice.get(self.pos)?;
|
||||||
|
self.pos += 1;
|
||||||
|
Some(*byte)
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
state.finalize()
|
impl<'a> PeekableBytes<'a> {
|
||||||
|
pub fn new(slice: &'a [u8]) -> Self {
|
||||||
|
PeekableBytes { slice, pos: 0 }
|
||||||
|
}
|
||||||
|
|
||||||
|
pub fn peek(&self) -> Option<u8> {
|
||||||
|
self.slice.get(self.pos).copied()
|
||||||
|
}
|
||||||
|
|
||||||
|
pub fn as_slice(self) -> &'a [u8] {
|
||||||
|
&self.slice[self.pos..]
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
#[cfg(test)]
|
#[cfg(test)]
|
||||||
|
|
@ -231,7 +222,7 @@ mod tests {
|
||||||
assert_eq!(unescape_double(r#" \"abc\" "#), Ok(" \"abc\" ".into()));
|
assert_eq!(unescape_double(r#" \"abc\" "#), Ok(" \"abc\" ".into()));
|
||||||
assert_eq!(unescape_double(r#"𝄞"#), Ok("𝄞".into()));
|
assert_eq!(unescape_double(r#"𝄞"#), Ok("𝄞".into()));
|
||||||
assert_eq!(unescape_double(r#"\𝄞"#), Ok("\\𝄞".into()));
|
assert_eq!(unescape_double(r#"\𝄞"#), Ok("\\𝄞".into()));
|
||||||
assert_eq!(unescape_double(r#"\u{D834}\u{DD1E}"#), Ok("𝄞".into()));
|
assert_eq!(unescape_double(r#"\u{1D11E}"#), Ok("𝄞".into()));
|
||||||
assert_eq!(unescape_double(r#"\xD834"#), Ok("\u{D8}34".into()));
|
assert_eq!(unescape_double(r#"\xD834"#), Ok("\u{D8}34".into()));
|
||||||
assert_eq!(unescape_double(r#"\xDD1E"#), Ok("\u{DD}1E".into()));
|
assert_eq!(unescape_double(r#"\xDD1E"#), Ok("\u{DD}1E".into()));
|
||||||
assert_eq!(unescape_double(r#"\xD"#), Ok("\u{D}".into()));
|
assert_eq!(unescape_double(r#"\xD"#), Ok("\u{D}".into()));
|
||||||
|
|
@ -242,5 +233,11 @@ mod tests {
|
||||||
assert_eq!(unescape_double(r#"\47foo"#), Ok("'foo".into()));
|
assert_eq!(unescape_double(r#"\47foo"#), Ok("'foo".into()));
|
||||||
assert_eq!(unescape_double(r#"\48foo"#), Ok("\u{4}8foo".into()));
|
assert_eq!(unescape_double(r#"\48foo"#), Ok("\u{4}8foo".into()));
|
||||||
assert_eq!(unescape_double(r#"\87foo"#), Ok("\\87foo".into()));
|
assert_eq!(unescape_double(r#"\87foo"#), Ok("\\87foo".into()));
|
||||||
|
|
||||||
|
assert_eq!(unescape_double(r#"\u{999999}"#), Err(UnescapeError));
|
||||||
|
assert_eq!(
|
||||||
|
unescape_double(r#"\u{999999999999999999}"#),
|
||||||
|
Err(UnescapeError)
|
||||||
|
);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
|
||||||
Loading…
Add table
Add a link
Reference in a new issue