minor string code cleanup

This commit is contained in:
Robin Appelman 2020-12-08 21:46:32 +01:00
commit f1c1f99328
2 changed files with 161 additions and 125 deletions

View file

@ -2,7 +2,7 @@ use crate::error::UnexpectedTokenError;
use crate::error::{ExpectToken, InvalidArrayKeyError, ParseError, ResultExt, SpannedError};
use crate::lexer::Token;
use crate::num::parse_int;
use crate::string::{unescape_double, unescape_single, UnescapeError};
use crate::string::parse_string;
use crate::{Key, Value};
use logos::{Lexer, Logos};
use std::collections::HashMap;
@ -69,17 +69,6 @@ pub fn parse_lexer<'source>(
Ok(value)
}
/// Decodes a quoted string literal into its unescaped contents.
///
/// The first byte of `literal` picks the rule set: a leading `'` sends the
/// inner text through `unescape_single`, anything else goes through
/// `unescape_double`. The first and last bytes (the delimiters) are dropped
/// before unescaping.
///
/// # Panics
///
/// Panics if `literal` is empty, or if dropping one byte from each end does
/// not leave a valid `str` slice (for example a one-byte literal).
fn parse_string(literal: &str) -> Result<String, UnescapeError> {
    let opening = literal.bytes().next().unwrap();
    let body = &literal[1..literal.len() - 1];
    match opening {
        b'\'' => unescape_single(body),
        _ => unescape_double(body),
    }
}
fn parse_float(literal: &str) -> Result<f64, ParseFloatError> {
let stripped = literal.replace('_', "");
stripped.parse()

View file

@ -12,10 +12,6 @@ struct UnescapeState {
}
impl UnescapeState {
/// Creates a state whose `out` buffer is an empty `Vec`.
fn new() -> UnescapeState {
    let out = Vec::new();
    UnescapeState { out }
}
fn with_capacity(capacity: usize) -> UnescapeState {
UnescapeState {
out: Vec::with_capacity(capacity),
@ -71,10 +67,14 @@ fn parse_u32(
Ok(result)
}
fn handle_single_escape<'a>(
bytes: &'a [u8],
state: &mut UnescapeState,
) -> UnescapeResult<&'a [u8]> {
/// Escape-handling strategy that `unescape` is generic over, one implementation per
/// kind of quoted string.
trait EscapedString {
/// Processes the escape sequence found at the start of `bytes`.
///
/// `unescape` calls this with `bytes` beginning at the backslash it located.
/// Implementations write their output to `state` and return the slice from which
/// `unescape` resumes scanning.
fn handle_escape<'a>(bytes: &'a [u8], state: &mut UnescapeState) -> UnescapeResult<&'a [u8]>;
}
/// Marker type whose `EscapedString` impl handles escapes for literals that start with `'`.
struct SingleQuoteString;
impl EscapedString for SingleQuoteString {
fn handle_escape<'a>(bytes: &'a [u8], state: &mut UnescapeState) -> UnescapeResult<&'a [u8]> {
let mut ins = PeekableBytes::new(bytes);
debug_assert_eq!(ins.next(), Some(b'\\'));
match ins.next() {
@ -91,26 +91,12 @@ fn handle_single_escape<'a>(
}
Ok(ins.as_slice())
}
/// Un-escape a string, following php single quote rules
pub fn unescape_single(s: &str) -> UnescapeResult<String> {
let mut state = UnescapeState::with_capacity(s.len());
let mut bytes = s.as_bytes();
while let Some(escape_index) = memchr::memchr(b'\\', bytes) {
state.push_slice(&bytes[0..escape_index]);
bytes = &bytes[escape_index..];
bytes = handle_single_escape(bytes, &mut state)?;
}
state.push_slice(&bytes[0..]);
/// Marker type whose `EscapedString` impl handles escapes for literals that do not start with `'`.
struct DoubleQuoteString;
Ok(state.finalize())
}
fn handle_double_escape<'a>(
bytes: &'a [u8],
state: &mut UnescapeState,
) -> UnescapeResult<&'a [u8]> {
impl EscapedString for DoubleQuoteString {
fn handle_escape<'a>(bytes: &'a [u8], state: &mut UnescapeState) -> UnescapeResult<&'a [u8]> {
let mut ins = PeekableBytes::new(bytes);
debug_assert_eq!(ins.next(), Some(b'\\'));
match ins.next() {
@ -148,7 +134,8 @@ fn handle_double_escape<'a>(
}
},
b'0'..=b'7' => {
let val = parse_u32(&mut ins, 8, (d as char).to_digit(8).unwrap(), Some(3))?;
let val =
parse_u32(&mut ins, 8, (d as char).to_digit(8).unwrap(), Some(3))?;
state.push_raw(val)?;
}
_ => {
@ -160,15 +147,24 @@ fn handle_double_escape<'a>(
}
Ok(ins.as_slice())
}
}
/// Un-escape a string, following php double quote rules
pub fn unescape_double(s: &str) -> UnescapeResult<String> {
/// Decodes a quoted string literal into its unescaped contents.
///
/// The first and last bytes of `literal` (its delimiters) are stripped, and the
/// remaining text is unescaped with `SingleQuoteString` rules when the literal
/// opens with `'`, or `DoubleQuoteString` rules otherwise.
///
/// # Panics
///
/// Panics if `literal` is shorter than two bytes, or if byte offsets `1` and
/// `len - 1` do not fall on `char` boundaries.
pub fn parse_string(literal: &str) -> Result<String, UnescapeError> {
    let body = &literal[1..literal.len() - 1];
    // The slice above has already rejected literals shorter than two bytes,
    // so indexing the opening byte cannot fail here.
    match literal.as_bytes()[0] {
        b'\'' => unescape::<SingleQuoteString>(body),
        _ => unescape::<DoubleQuoteString>(body),
    }
}
fn unescape<S: EscapedString>(s: &str) -> UnescapeResult<String> {
let mut state = UnescapeState::with_capacity(s.len());
let mut bytes = s.as_bytes();
while let Some(escape_index) = memchr::memchr(b'\\', bytes) {
state.push_slice(&bytes[0..escape_index]);
bytes = &bytes[escape_index..];
bytes = handle_double_escape(bytes, &mut state)?;
bytes = S::handle_escape(bytes, &mut state)?;
}
state.push_slice(&bytes[0..]);
@ -211,44 +207,95 @@ mod tests {
#[test]
fn test_unescape_single() {
assert_eq!(unescape_single(&r#"abc"#), Ok("abc".into()));
assert_eq!(unescape_single(&r#"ab\nc"#), Ok("ab\\nc".into()));
assert_eq!(unescape_single(r#"ab\zc"#), Ok("ab\\zc".into()));
assert_eq!(unescape_single(r#" \"abc\" "#), Ok(" \\\"abc\\\" ".into()));
assert_eq!(unescape_single(r#"𝄞"#), Ok("𝄞".into()));
assert_eq!(unescape_single(r#"\𝄞"#), Ok("\\𝄞".into()));
assert_eq!(unescape::<SingleQuoteString>(&r#"abc"#), Ok("abc".into()));
assert_eq!(
unescape_single(r#"\xD834\xDD1E"#),
unescape::<SingleQuoteString>(&r#"ab\nc"#),
Ok("ab\\nc".into())
);
assert_eq!(
unescape::<SingleQuoteString>(r#"ab\zc"#),
Ok("ab\\zc".into())
);
assert_eq!(
unescape::<SingleQuoteString>(r#" \"abc\" "#),
Ok(" \\\"abc\\\" ".into())
);
assert_eq!(unescape::<SingleQuoteString>(r#"𝄞"#), Ok("𝄞".into()));
assert_eq!(unescape::<SingleQuoteString>(r#"\𝄞"#), Ok("\\𝄞".into()));
assert_eq!(
unescape::<SingleQuoteString>(r#"\xD834\xDD1E"#),
Ok("\\xD834\\xDD1E".into())
);
assert_eq!(unescape_single(r#"\xD834"#), Ok("\\xD834".into()));
assert_eq!(unescape_single(r#"\xDD1E"#), Ok("\\xDD1E".into()));
assert_eq!(unescape_single("\t"), Ok("\t".into()));
assert_eq!(
unescape::<SingleQuoteString>(r#"\xD834"#),
Ok("\\xD834".into())
);
assert_eq!(
unescape::<SingleQuoteString>(r#"\xDD1E"#),
Ok("\\xDD1E".into())
);
assert_eq!(unescape::<SingleQuoteString>("\t"), Ok("\t".into()));
}
#[test]
fn test_unescape_double() {
assert_eq!(unescape_double(&r#"abc"#), Ok("abc".into()));
assert_eq!(unescape_double(&r#"ab\nc"#), Ok("ab\nc".into()));
assert_eq!(unescape_double(r#"ab\zc"#), Ok("ab\\zc".into()));
assert_eq!(unescape_double(r#" \"abc\" "#), Ok(" \"abc\" ".into()));
assert_eq!(unescape_double(r#"𝄞"#), Ok("𝄞".into()));
assert_eq!(unescape_double(r#"\𝄞"#), Ok("\\𝄞".into()));
assert_eq!(unescape_double(r#"\u{1D11E}"#), Ok("𝄞".into()));
assert_eq!(unescape_double(r#"\xD834"#), Ok("\u{D8}34".into()));
assert_eq!(unescape_double(r#"\xDD1E"#), Ok("\u{DD}1E".into()));
assert_eq!(unescape_double(r#"\xD"#), Ok("\u{D}".into()));
assert_eq!(unescape_double("\t"), Ok("\t".into()));
assert_eq!(unescape_double(r#"\u{D834"#), Err(UnescapeError));
assert_eq!(unescape_double(r#"\uD834"#), Ok("\\uD834".into()));
assert_eq!(unescape_double(r#"\u"#), Ok("\\u".into()));
assert_eq!(unescape_double(r#"\47foo"#), Ok("'foo".into()));
assert_eq!(unescape_double(r#"\48foo"#), Ok("\u{4}8foo".into()));
assert_eq!(unescape_double(r#"\87foo"#), Ok("\\87foo".into()));
assert_eq!(unescape_double(r#"\u{999999}"#), Err(UnescapeError));
assert_eq!(unescape::<DoubleQuoteString>(&r#"abc"#), Ok("abc".into()));
assert_eq!(
unescape_double(r#"\u{999999999999999999}"#),
unescape::<DoubleQuoteString>(&r#"ab\nc"#),
Ok("ab\nc".into())
);
assert_eq!(
unescape::<DoubleQuoteString>(r#"ab\zc"#),
Ok("ab\\zc".into())
);
assert_eq!(
unescape::<DoubleQuoteString>(r#" \"abc\" "#),
Ok(" \"abc\" ".into())
);
assert_eq!(unescape::<DoubleQuoteString>(r#"𝄞"#), Ok("𝄞".into()));
assert_eq!(unescape::<DoubleQuoteString>(r#"\𝄞"#), Ok("\\𝄞".into()));
assert_eq!(
unescape::<DoubleQuoteString>(r#"\u{1D11E}"#),
Ok("𝄞".into())
);
assert_eq!(
unescape::<DoubleQuoteString>(r#"\xD834"#),
Ok("\u{D8}34".into())
);
assert_eq!(
unescape::<DoubleQuoteString>(r#"\xDD1E"#),
Ok("\u{DD}1E".into())
);
assert_eq!(unescape::<DoubleQuoteString>(r#"\xD"#), Ok("\u{D}".into()));
assert_eq!(unescape::<DoubleQuoteString>("\t"), Ok("\t".into()));
assert_eq!(
unescape::<DoubleQuoteString>(r#"\u{D834"#),
Err(UnescapeError)
);
assert_eq!(
unescape::<DoubleQuoteString>(r#"\uD834"#),
Ok("\\uD834".into())
);
assert_eq!(unescape::<DoubleQuoteString>(r#"\u"#), Ok("\\u".into()));
assert_eq!(
unescape::<DoubleQuoteString>(r#"\47foo"#),
Ok("'foo".into())
);
assert_eq!(
unescape::<DoubleQuoteString>(r#"\48foo"#),
Ok("\u{4}8foo".into())
);
assert_eq!(
unescape::<DoubleQuoteString>(r#"\87foo"#),
Ok("\\87foo".into())
);
assert_eq!(
unescape::<DoubleQuoteString>(r#"\u{999999}"#),
Err(UnescapeError)
);
assert_eq!(
unescape::<DoubleQuoteString>(r#"\u{999999999999999999}"#),
Err(UnescapeError)
);
}