minor string code cleanup

This commit is contained in:
Robin Appelman 2020-12-08 21:46:32 +01:00
commit f1c1f99328
2 changed files with 161 additions and 125 deletions

View file

@ -2,7 +2,7 @@ use crate::error::UnexpectedTokenError;
use crate::error::{ExpectToken, InvalidArrayKeyError, ParseError, ResultExt, SpannedError}; use crate::error::{ExpectToken, InvalidArrayKeyError, ParseError, ResultExt, SpannedError};
use crate::lexer::Token; use crate::lexer::Token;
use crate::num::parse_int; use crate::num::parse_int;
use crate::string::{unescape_double, unescape_single, UnescapeError}; use crate::string::parse_string;
use crate::{Key, Value}; use crate::{Key, Value};
use logos::{Lexer, Logos}; use logos::{Lexer, Logos};
use std::collections::HashMap; use std::collections::HashMap;
@ -69,17 +69,6 @@ pub fn parse_lexer<'source>(
Ok(value) Ok(value)
} }
fn parse_string(literal: &str) -> Result<String, UnescapeError> {
let single_quote = literal.bytes().next().unwrap() == b'\'';
let inner = &literal[1..(literal.len()) - 1];
if single_quote {
unescape_single(inner)
} else {
unescape_double(inner)
}
}
fn parse_float(literal: &str) -> Result<f64, ParseFloatError> { fn parse_float(literal: &str) -> Result<f64, ParseFloatError> {
let stripped = literal.replace('_', ""); let stripped = literal.replace('_', "");
stripped.parse() stripped.parse()

View file

@ -12,10 +12,6 @@ struct UnescapeState {
} }
impl UnescapeState { impl UnescapeState {
fn new() -> UnescapeState {
UnescapeState { out: Vec::new() }
}
fn with_capacity(capacity: usize) -> UnescapeState { fn with_capacity(capacity: usize) -> UnescapeState {
UnescapeState { UnescapeState {
out: Vec::with_capacity(capacity), out: Vec::with_capacity(capacity),
@ -71,104 +67,104 @@ fn parse_u32(
Ok(result) Ok(result)
} }
fn handle_single_escape<'a>( trait EscapedString {
bytes: &'a [u8], fn handle_escape<'a>(bytes: &'a [u8], state: &mut UnescapeState) -> UnescapeResult<&'a [u8]>;
state: &mut UnescapeState, }
) -> UnescapeResult<&'a [u8]> {
let mut ins = PeekableBytes::new(bytes); struct SingleQuoteString;
debug_assert_eq!(ins.next(), Some(b'\\'));
match ins.next() { impl EscapedString for SingleQuoteString {
None => { fn handle_escape<'a>(bytes: &'a [u8], state: &mut UnescapeState) -> UnescapeResult<&'a [u8]> {
return Err(UnescapeError); let mut ins = PeekableBytes::new(bytes);
} debug_assert_eq!(ins.next(), Some(b'\\'));
Some(d) => match d { match ins.next() {
b'\\' | b'\'' => state.push_u8(d), None => {
_ => { return Err(UnescapeError);
state.push_u8(b'\\');
state.push_u8(d)
} }
}, Some(d) => match d {
} b'\\' | b'\'' => state.push_u8(d),
Ok(ins.as_slice())
}
/// Un-escape a string, following php single quote rules
pub fn unescape_single(s: &str) -> UnescapeResult<String> {
let mut state = UnescapeState::with_capacity(s.len());
let mut bytes = s.as_bytes();
while let Some(escape_index) = memchr::memchr(b'\\', bytes) {
state.push_slice(&bytes[0..escape_index]);
bytes = &bytes[escape_index..];
bytes = handle_single_escape(bytes, &mut state)?;
}
state.push_slice(&bytes[0..]);
Ok(state.finalize())
}
fn handle_double_escape<'a>(
bytes: &'a [u8],
state: &mut UnescapeState,
) -> UnescapeResult<&'a [u8]> {
let mut ins = PeekableBytes::new(bytes);
debug_assert_eq!(ins.next(), Some(b'\\'));
match ins.next() {
None => {
return Err(UnescapeError);
}
Some(d) => {
match d {
b'$' | b'"' | b'\\' => state.push_u8(d),
b'n' => state.push_u8(b'\n'), // linefeed
b'r' => state.push_u8(b'\r'), // carriage return
b't' => state.push_u8(b'\t'), // tab
b'v' => state.push_u8(b'\x0B'), // vertical tab
b'f' => state.push_u8(b'\x0C'), // form feed
b'x' => {
let val = parse_u32(&mut ins, 16, 0, Some(2))?;
state.push_raw(val)?;
}
b'u' => match ins.next() {
Some(b'{') => {
let val = parse_u32(&mut ins, 16, 0, None)?;
state.push_raw(val)?;
if !matches!(ins.next(), Some(b'}')) {
return Err(UnescapeError);
}
}
Some(d) => {
state.push_u8(b'\\');
state.push_u8(b'u');
state.push_u8(d);
}
None => {
state.push_u8(b'\\');
state.push_u8(d);
}
},
b'0'..=b'7' => {
let val = parse_u32(&mut ins, 8, (d as char).to_digit(8).unwrap(), Some(3))?;
state.push_raw(val)?;
}
_ => { _ => {
state.push_u8(b'\\'); state.push_u8(b'\\');
state.push_u8(d) state.push_u8(d)
} }
} },
} }
Ok(ins.as_slice())
} }
Ok(ins.as_slice())
} }
/// Un-escape a string, following php double quote rules struct DoubleQuoteString;
pub fn unescape_double(s: &str) -> UnescapeResult<String> {
impl EscapedString for DoubleQuoteString {
    /// Decodes the single escape sequence at the start of `bytes` using PHP
    /// double-quote rules and appends the decoded output to `state`.
    ///
    /// `bytes` is expected to begin with the backslash that introduces the
    /// escape. The return value is the part of `bytes` left after the escape.
    ///
    /// Recognised escapes: `\$`, `\"`, `\\`, `\n`, `\r`, `\t`, `\v`, `\f`,
    /// `\x` followed by hex digits, `\u{...}` with hex digits, and octal
    /// escapes starting with `0`-`7`. Any other byte after the backslash is
    /// kept verbatim together with the backslash.
    ///
    /// # Errors
    ///
    /// Returns `UnescapeError` when the input ends right after the backslash,
    /// when a `\u{` escape is not terminated by `}`, or when `parse_u32` or
    /// `push_raw` reject the numeric value.
    fn handle_escape<'a>(bytes: &'a [u8], state: &mut UnescapeState) -> UnescapeResult<&'a [u8]> {
        let mut ins = PeekableBytes::new(bytes);
        // Advance past the backslash *outside* the assertion: the arguments of
        // `debug_assert_eq!` are not evaluated when debug assertions are off,
        // so consuming the byte inside the macro would leave the backslash in
        // place in optimized builds and break every escape.
        let lead = ins.next();
        debug_assert_eq!(lead, Some(b'\\'));
        match ins.next() {
            None => {
                // A lone trailing backslash has nothing to escape.
                return Err(UnescapeError);
            }
            Some(d) => {
                match d {
                    b'$' | b'"' | b'\\' => state.push_u8(d),
                    b'n' => state.push_u8(b'\n'),   // linefeed
                    b'r' => state.push_u8(b'\r'),   // carriage return
                    b't' => state.push_u8(b'\t'),   // tab
                    b'v' => state.push_u8(b'\x0B'), // vertical tab
                    b'f' => state.push_u8(b'\x0C'), // form feed
                    b'x' => {
                        // Radix 16; the `Some(2)` presumably caps the number of
                        // digits read (see `parse_u32`).
                        let val = parse_u32(&mut ins, 16, 0, Some(2))?;
                        state.push_raw(val)?;
                    }
                    b'u' => match ins.next() {
                        Some(b'{') => {
                            let val = parse_u32(&mut ins, 16, 0, None)?;
                            state.push_raw(val)?;
                            // The code point must be closed by `}`.
                            if !matches!(ins.next(), Some(b'}')) {
                                return Err(UnescapeError);
                            }
                        }
                        Some(d) => {
                            // `\u` without `{` is not an escape: emit it unchanged,
                            // including the byte that was just consumed.
                            state.push_u8(b'\\');
                            state.push_u8(b'u');
                            state.push_u8(d);
                        }
                        None => {
                            // Input ended after `\u`; here `d` is still the outer `u`.
                            state.push_u8(b'\\');
                            state.push_u8(d);
                        }
                    },
                    b'0'..=b'7' => {
                        // The first octal digit was already consumed, so it seeds
                        // the accumulated value passed to `parse_u32`.
                        let val =
                            parse_u32(&mut ins, 8, (d as char).to_digit(8).unwrap(), Some(3))?;
                        state.push_raw(val)?;
                    }
                    _ => {
                        // Unknown escape: keep the backslash and the byte as-is.
                        state.push_u8(b'\\');
                        state.push_u8(d)
                    }
                }
            }
        }
        Ok(ins.as_slice())
    }
}
pub fn parse_string(literal: &str) -> Result<String, UnescapeError> {
let inner = &literal[1..(literal.len()) - 1];
if literal.bytes().next().unwrap() == b'\'' {
unescape::<SingleQuoteString>(inner)
} else {
unescape::<DoubleQuoteString>(inner)
}
}
fn unescape<S: EscapedString>(s: &str) -> UnescapeResult<String> {
let mut state = UnescapeState::with_capacity(s.len()); let mut state = UnescapeState::with_capacity(s.len());
let mut bytes = s.as_bytes(); let mut bytes = s.as_bytes();
while let Some(escape_index) = memchr::memchr(b'\\', bytes) { while let Some(escape_index) = memchr::memchr(b'\\', bytes) {
state.push_slice(&bytes[0..escape_index]); state.push_slice(&bytes[0..escape_index]);
bytes = &bytes[escape_index..]; bytes = &bytes[escape_index..];
bytes = handle_double_escape(bytes, &mut state)?; bytes = S::handle_escape(bytes, &mut state)?;
} }
state.push_slice(&bytes[0..]); state.push_slice(&bytes[0..]);
@ -211,44 +207,95 @@ mod tests {
#[test] #[test]
fn test_unescape_single() { fn test_unescape_single() {
assert_eq!(unescape_single(&r#"abc"#), Ok("abc".into())); assert_eq!(unescape::<SingleQuoteString>(&r#"abc"#), Ok("abc".into()));
assert_eq!(unescape_single(&r#"ab\nc"#), Ok("ab\\nc".into()));
assert_eq!(unescape_single(r#"ab\zc"#), Ok("ab\\zc".into()));
assert_eq!(unescape_single(r#" \"abc\" "#), Ok(" \\\"abc\\\" ".into()));
assert_eq!(unescape_single(r#"𝄞"#), Ok("𝄞".into()));
assert_eq!(unescape_single(r#"\𝄞"#), Ok("\\𝄞".into()));
assert_eq!( assert_eq!(
unescape_single(r#"\xD834\xDD1E"#), unescape::<SingleQuoteString>(&r#"ab\nc"#),
Ok("ab\\nc".into())
);
assert_eq!(
unescape::<SingleQuoteString>(r#"ab\zc"#),
Ok("ab\\zc".into())
);
assert_eq!(
unescape::<SingleQuoteString>(r#" \"abc\" "#),
Ok(" \\\"abc\\\" ".into())
);
assert_eq!(unescape::<SingleQuoteString>(r#"𝄞"#), Ok("𝄞".into()));
assert_eq!(unescape::<SingleQuoteString>(r#"\𝄞"#), Ok("\\𝄞".into()));
assert_eq!(
unescape::<SingleQuoteString>(r#"\xD834\xDD1E"#),
Ok("\\xD834\\xDD1E".into()) Ok("\\xD834\\xDD1E".into())
); );
assert_eq!(unescape_single(r#"\xD834"#), Ok("\\xD834".into())); assert_eq!(
assert_eq!(unescape_single(r#"\xDD1E"#), Ok("\\xDD1E".into())); unescape::<SingleQuoteString>(r#"\xD834"#),
assert_eq!(unescape_single("\t"), Ok("\t".into())); Ok("\\xD834".into())
);
assert_eq!(
unescape::<SingleQuoteString>(r#"\xDD1E"#),
Ok("\\xDD1E".into())
);
assert_eq!(unescape::<SingleQuoteString>("\t"), Ok("\t".into()));
} }
#[test] #[test]
fn test_unescape_double() { fn test_unescape_double() {
assert_eq!(unescape_double(&r#"abc"#), Ok("abc".into())); assert_eq!(unescape::<DoubleQuoteString>(&r#"abc"#), Ok("abc".into()));
assert_eq!(unescape_double(&r#"ab\nc"#), Ok("ab\nc".into()));
assert_eq!(unescape_double(r#"ab\zc"#), Ok("ab\\zc".into()));
assert_eq!(unescape_double(r#" \"abc\" "#), Ok(" \"abc\" ".into()));
assert_eq!(unescape_double(r#"𝄞"#), Ok("𝄞".into()));
assert_eq!(unescape_double(r#"\𝄞"#), Ok("\\𝄞".into()));
assert_eq!(unescape_double(r#"\u{1D11E}"#), Ok("𝄞".into()));
assert_eq!(unescape_double(r#"\xD834"#), Ok("\u{D8}34".into()));
assert_eq!(unescape_double(r#"\xDD1E"#), Ok("\u{DD}1E".into()));
assert_eq!(unescape_double(r#"\xD"#), Ok("\u{D}".into()));
assert_eq!(unescape_double("\t"), Ok("\t".into()));
assert_eq!(unescape_double(r#"\u{D834"#), Err(UnescapeError));
assert_eq!(unescape_double(r#"\uD834"#), Ok("\\uD834".into()));
assert_eq!(unescape_double(r#"\u"#), Ok("\\u".into()));
assert_eq!(unescape_double(r#"\47foo"#), Ok("'foo".into()));
assert_eq!(unescape_double(r#"\48foo"#), Ok("\u{4}8foo".into()));
assert_eq!(unescape_double(r#"\87foo"#), Ok("\\87foo".into()));
assert_eq!(unescape_double(r#"\u{999999}"#), Err(UnescapeError));
assert_eq!( assert_eq!(
unescape_double(r#"\u{999999999999999999}"#), unescape::<DoubleQuoteString>(&r#"ab\nc"#),
Ok("ab\nc".into())
);
assert_eq!(
unescape::<DoubleQuoteString>(r#"ab\zc"#),
Ok("ab\\zc".into())
);
assert_eq!(
unescape::<DoubleQuoteString>(r#" \"abc\" "#),
Ok(" \"abc\" ".into())
);
assert_eq!(unescape::<DoubleQuoteString>(r#"𝄞"#), Ok("𝄞".into()));
assert_eq!(unescape::<DoubleQuoteString>(r#"\𝄞"#), Ok("\\𝄞".into()));
assert_eq!(
unescape::<DoubleQuoteString>(r#"\u{1D11E}"#),
Ok("𝄞".into())
);
assert_eq!(
unescape::<DoubleQuoteString>(r#"\xD834"#),
Ok("\u{D8}34".into())
);
assert_eq!(
unescape::<DoubleQuoteString>(r#"\xDD1E"#),
Ok("\u{DD}1E".into())
);
assert_eq!(unescape::<DoubleQuoteString>(r#"\xD"#), Ok("\u{D}".into()));
assert_eq!(unescape::<DoubleQuoteString>("\t"), Ok("\t".into()));
assert_eq!(
unescape::<DoubleQuoteString>(r#"\u{D834"#),
Err(UnescapeError)
);
assert_eq!(
unescape::<DoubleQuoteString>(r#"\uD834"#),
Ok("\\uD834".into())
);
assert_eq!(unescape::<DoubleQuoteString>(r#"\u"#), Ok("\\u".into()));
assert_eq!(
unescape::<DoubleQuoteString>(r#"\47foo"#),
Ok("'foo".into())
);
assert_eq!(
unescape::<DoubleQuoteString>(r#"\48foo"#),
Ok("\u{4}8foo".into())
);
assert_eq!(
unescape::<DoubleQuoteString>(r#"\87foo"#),
Ok("\\87foo".into())
);
assert_eq!(
unescape::<DoubleQuoteString>(r#"\u{999999}"#),
Err(UnescapeError)
);
assert_eq!(
unescape::<DoubleQuoteString>(r#"\u{999999999999999999}"#),
Err(UnescapeError) Err(UnescapeError)
); );
} }