mirror of
https://codeberg.org/icewind/php-literal-parser.git
synced 2026-06-03 10:34:08 +02:00
wip
This commit is contained in:
parent
b3fd3bc6a3
commit
1f64e8af89
6 changed files with 770 additions and 2 deletions
|
|
@ -4,6 +4,8 @@ version = "0.1.0"
|
|||
authors = ["Robin Appelman <robin@icewind.nl>"]
|
||||
edition = "2018"
|
||||
|
||||
# See more keys and their definitions at https://doc.rust-lang.org/cargo/reference/manifest.html
|
||||
|
||||
[dependencies]
|
||||
logos = "0.11"
|
||||
thiserror = "1.0"
|
||||
source-span = "2.2"
|
||||
maplit = "1.0.2"
|
||||
264
src/ast.rs
Normal file
264
src/ast.rs
Normal file
|
|
@ -0,0 +1,264 @@
|
|||
use crate::error::{
|
||||
ExpectToken, InvalidArrayKeyError, ParseError, ResultExt, SpannedError, UnexpectedTokenError,
|
||||
};
|
||||
use crate::lexer::Token;
|
||||
use crate::string::{unescape_double, unescape_single, UnescapeError};
|
||||
use logos::{Lexer, Logos};
|
||||
use std::collections::HashMap;
|
||||
use std::fmt::Debug;
|
||||
|
||||
#[derive(Debug, PartialEq, Clone)]
|
||||
pub enum Value {
|
||||
Bool(bool),
|
||||
Int(i64),
|
||||
Float(f64),
|
||||
String(String),
|
||||
Array(HashMap<Key, Value>),
|
||||
}
|
||||
|
||||
/// A key of a parsed PHP array: either an integer or a string.
#[derive(Clone, Debug, PartialEq, Eq, Hash)]
pub enum Key {
    /// An integer key, either written explicitly or assigned automatically.
    Int(i64),
    /// A string key.
    String(String),
}
|
||||
|
||||
pub fn parse(source: &str) -> Result<Value, SpannedError<ParseError>> {
|
||||
let mut lexer: Lexer<Token> = Token::lexer(source);
|
||||
parse_lexer(source, &mut lexer)
|
||||
}
|
||||
|
||||
pub fn parse_lexer<'source>(
|
||||
source: &'source str,
|
||||
lexer: &mut Lexer<Token>,
|
||||
) -> Result<Value, SpannedError<'source, ParseError>> {
|
||||
let token = lexer
|
||||
.next()
|
||||
.expect_token("bool, int, float, string, array start")
|
||||
.with_span(lexer.span(), source)?;
|
||||
parse_token(token, source, lexer)
|
||||
}
|
||||
|
||||
pub fn parse_token<'source>(
|
||||
token: Token,
|
||||
source: &'source str,
|
||||
lexer: &mut Lexer<Token>,
|
||||
) -> Result<Value, SpannedError<'source, ParseError>> {
|
||||
let value = match token {
|
||||
Token::Bool => parse_literal(token, lexer.slice()).with_span(lexer.span(), source)?,
|
||||
Token::Integer => parse_literal(token, lexer.slice()).with_span(lexer.span(), source)?,
|
||||
Token::Float => parse_literal(token, lexer.slice()).with_span(lexer.span(), source)?,
|
||||
Token::LiteralString => {
|
||||
parse_literal(token, lexer.slice()).with_span(lexer.span(), source)?
|
||||
}
|
||||
Token::Array => Value::Array(parse_array(source, lexer, ArraySyntax::Long)?),
|
||||
Token::SquareOpen => Value::Array(parse_array(source, lexer, ArraySyntax::Short)?),
|
||||
_ => todo!(),
|
||||
};
|
||||
|
||||
Ok(value)
|
||||
}
|
||||
|
||||
fn parse_literal(token: Token, slice: &str) -> Result<Value, ParseError> {
|
||||
match token {
|
||||
Token::Bool => Ok(Value::Bool(slice.parse()?)),
|
||||
Token::Integer => Ok(Value::Int(slice.parse()?)),
|
||||
Token::Float => Ok(Value::Float(slice.parse()?)),
|
||||
Token::LiteralString => Ok(Value::String(parse_string(slice)?)),
|
||||
token => Err(ParseError::UnexpectedToken(UnexpectedTokenError::new(
|
||||
"bool, int, float, string, array start",
|
||||
Some(token),
|
||||
))),
|
||||
}
|
||||
}
|
||||
|
||||
fn parse_string(literal: &str) -> Result<String, UnescapeError> {
|
||||
let single_quote = literal.bytes().next().unwrap() == b'\'';
|
||||
let inner = &literal[1..(literal.len()) - 1];
|
||||
|
||||
if single_quote {
|
||||
unescape_single(inner)
|
||||
} else {
|
||||
unescape_double(inner)
|
||||
}
|
||||
}
|
||||
|
||||
#[derive(Default)]
|
||||
struct ArrayBuilder {
|
||||
next_int_key: i64,
|
||||
data: HashMap<Key, Value>,
|
||||
}
|
||||
|
||||
impl ArrayBuilder {
|
||||
fn push_value(&mut self, value: Value) {
|
||||
let key = Key::Int(self.next_int_key);
|
||||
self.next_int_key += 1;
|
||||
self.data.insert(key, value);
|
||||
}
|
||||
|
||||
fn push_key_value(&mut self, key: Key, value: Value) {
|
||||
if let Key::Int(int) = &key {
|
||||
self.next_int_key = int + 1;
|
||||
}
|
||||
self.data.insert(key, value);
|
||||
}
|
||||
}
|
||||
|
||||
/// The two ways an array literal can be written.
#[derive(PartialEq, Eq)]
enum ArraySyntax {
    /// Square bracket form: `[...]`.
    Short,
    /// Keyword form: `array(...)`.
    Long,
}
|
||||
|
||||
fn parse_array<'source>(
|
||||
source: &'source str,
|
||||
lexer: &mut Lexer<Token>,
|
||||
syntax: ArraySyntax,
|
||||
) -> Result<HashMap<Key, Value>, SpannedError<'source, ParseError>> {
|
||||
let mut builder = ArrayBuilder::default();
|
||||
|
||||
if syntax == ArraySyntax::Long {
|
||||
let open = lexer
|
||||
.next()
|
||||
.expect_token("open bracket")
|
||||
.with_span(lexer.span(), source)?;
|
||||
if !matches!(open, Token::BracketOpen) {
|
||||
return Err(ParseError::UnexpectedToken(UnexpectedTokenError::new(
|
||||
"open bracket",
|
||||
Some(open),
|
||||
)))
|
||||
.with_span(lexer.span(), source);
|
||||
}
|
||||
}
|
||||
|
||||
loop {
|
||||
let key_or_value = parse_lexer(source, lexer)?;
|
||||
let key_or_value_span = lexer.span();
|
||||
let next = lexer
|
||||
.next()
|
||||
.expect_token("close bracket, comma, arrow")
|
||||
.with_span(lexer.span(), source)?;
|
||||
|
||||
match next {
|
||||
Token::BracketClose if syntax == ArraySyntax::Long => {
|
||||
builder.push_value(key_or_value);
|
||||
break;
|
||||
}
|
||||
Token::SquareClose if syntax == ArraySyntax::Short => {
|
||||
builder.push_value(key_or_value);
|
||||
break;
|
||||
}
|
||||
Token::Comma => {
|
||||
builder.push_value(key_or_value);
|
||||
}
|
||||
Token::Arrow => {
|
||||
let value = parse_lexer(source, lexer)?;
|
||||
let key = match key_or_value {
|
||||
Value::Int(int) => Key::Int(int),
|
||||
Value::Float(float) => Key::Int(float as i64),
|
||||
Value::String(str) => Key::String(str),
|
||||
value => {
|
||||
let err = ParseError::InvalidArrayKey(InvalidArrayKeyError(value));
|
||||
let span_err = SpannedError::new(err, key_or_value_span, source);
|
||||
return Err(span_err);
|
||||
}
|
||||
};
|
||||
builder.push_key_value(key, value);
|
||||
|
||||
match lexer
|
||||
.next()
|
||||
.expect_token("close bracket, comma, arrow")
|
||||
.with_span(lexer.span(), source)?
|
||||
{
|
||||
Token::BracketClose if syntax == ArraySyntax::Long => {
|
||||
break;
|
||||
}
|
||||
Token::SquareClose if syntax == ArraySyntax::Short => {
|
||||
break;
|
||||
}
|
||||
Token::Comma => {}
|
||||
token => {
|
||||
return Err(ParseError::UnexpectedToken(UnexpectedTokenError::new(
|
||||
"close bracket, comma, arrow",
|
||||
Some(token),
|
||||
)))
|
||||
.with_span(lexer.span(), source)
|
||||
}
|
||||
}
|
||||
}
|
||||
token => {
|
||||
return Err(ParseError::UnexpectedToken(UnexpectedTokenError::new(
|
||||
"close bracket, comma, arrow",
|
||||
Some(token),
|
||||
)))
|
||||
.with_span(lexer.span(), source)
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
Ok(builder.data)
|
||||
}
|
||||
|
||||
/// Checks scalars, automatically and explicitly keyed arrays, and nesting in
/// both array syntaxes.
#[test]
fn test_parse() {
    use maplit::hashmap;

    // Helpers that keep the expectations short.
    let parsed = |source: &str| parse(source).unwrap();
    let name = |text: &str| Key::String(text.into());

    // Scalars.
    assert_eq!(Value::Bool(true), parsed("true"));
    assert_eq!(Value::Bool(false), parsed("false"));
    assert_eq!(Value::Int(12), parsed("12"));
    assert_eq!(Value::Int(-1), parsed("-1"));
    assert_eq!(Value::Float(1.12), parsed("1.12"));
    assert_eq!(Value::String("test".to_string()), parsed(r#""test""#));

    // Values without keys are numbered from zero.
    let expected = Value::Array(hashmap! {
        Key::Int(0) => Value::Int(3),
        Key::Int(1) => Value::Int(4),
        Key::Int(2) => Value::Int(5),
    });
    assert_eq!(expected, parsed(r#"array(3,4,5)"#));

    // Explicit integer keys are kept as written.
    let expected = Value::Array(hashmap! {
        Key::Int(1) => Value::Int(3),
        Key::Int(3) => Value::Int(4),
        Key::Int(5) => Value::Int(5),
    });
    assert_eq!(expected, parsed(r#"array(1=>3,3=>4,5=>5)"#));

    // Automatic keys continue after an explicit integer key.
    let expected = Value::Array(hashmap! {
        Key::Int(1) => Value::Int(3),
        Key::Int(2) => Value::Int(4),
        Key::Int(3) => Value::Int(5),
    });
    assert_eq!(expected, parsed(r#"array(1=>3,4,5)"#));

    // String keys do not affect the automatic numbering.
    let expected = Value::Array(hashmap! {
        Key::Int(1) => Value::Int(3),
        name("foo") => Value::Int(4),
        Key::Int(2) => Value::Int(5),
    });
    assert_eq!(expected, parsed(r#"array(1=>3,"foo" => 4,5)"#));

    // Nested arrays, long syntax.
    let expected = Value::Array(hashmap! {
        name("foo") => Value::Bool(true),
        name("nested") => Value::Array(hashmap! {
            name("foo") => Value::Bool(false),
        }),
    });
    assert_eq!(
        expected,
        parsed(r#"array("foo" => true, "nested" => array ('foo' => false))"#)
    );

    // Nested arrays, short syntax.
    let expected = Value::Array(hashmap! {
        name("foo") => Value::Bool(true),
        name("nested") => Value::Array(hashmap! {
            name("foo") => Value::Bool(false),
        }),
    });
    assert_eq!(
        expected,
        parsed(r#"["foo" => true, "nested" => ['foo' => false]]"#)
    );
}
|
||||
136
src/error.rs
Normal file
136
src/error.rs
Normal file
|
|
@ -0,0 +1,136 @@
|
|||
use crate::lexer::Token;
|
||||
use crate::string::UnescapeError;
|
||||
use crate::Value;
|
||||
use logos::Span;
|
||||
use source_span::{
|
||||
fmt::{Color, Formatter, Style},
|
||||
DefaultMetrics, Position, SourceBuffer, Span as SourceSpan,
|
||||
};
|
||||
use std::error::Error;
|
||||
use std::fmt::{self, Debug};
|
||||
use std::num::{ParseFloatError, ParseIntError};
|
||||
use std::str::ParseBoolError;
|
||||
use thiserror::Error;
|
||||
|
||||
#[derive(Debug)]
|
||||
pub struct SpannedError<'a, T: Error + Debug> {
|
||||
span: Span,
|
||||
source: &'a str,
|
||||
error: T,
|
||||
}
|
||||
|
||||
impl<'a, T: Error + Debug> SpannedError<'a, T> {
|
||||
pub fn new(error: T, span: Span, source: &'a str) -> Self {
|
||||
SpannedError {
|
||||
span,
|
||||
source,
|
||||
error,
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// Character metrics (tab stop of 4) shared by position calculation and rendering.
const METRICS: DefaultMetrics = DefaultMetrics::with_tab_stop(4);
|
||||
|
||||
impl<'a, T: Error + Debug> fmt::Display for SpannedError<'a, T> {
|
||||
fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
|
||||
let start = get_position(self.source, self.span.start);
|
||||
let end = get_position(self.source, self.span.end);
|
||||
let span = SourceSpan::new(start, end, end.next_line());
|
||||
|
||||
let mut fmt = Formatter::with_margin_color(Color::Blue);
|
||||
let buffer = SourceBuffer::new(
|
||||
self.source.chars().map(|char| Result::<char, ()>::Ok(char)),
|
||||
Position::default(),
|
||||
METRICS,
|
||||
);
|
||||
fmt.add(span, Some(format!("{}", self.error)), Style::Error);
|
||||
let formatted = fmt
|
||||
.render(
|
||||
buffer.iter(),
|
||||
SourceSpan::new(
|
||||
Position::default(),
|
||||
Position::new(usize::max_value() - 1, usize::max_value()),
|
||||
Position::end(),
|
||||
),
|
||||
&METRICS,
|
||||
)
|
||||
.unwrap();
|
||||
write!(f, "{}", formatted)?;
|
||||
Ok(())
|
||||
}
|
||||
}
|
||||
|
||||
fn get_position(text: &str, index: usize) -> Position {
|
||||
let mut pos = Position::default();
|
||||
for char in text.chars().take(index) {
|
||||
pos = pos.next(char, &METRICS);
|
||||
}
|
||||
|
||||
pos
|
||||
}
|
||||
|
||||
#[derive(Error, Debug)]
|
||||
pub enum ParseError {
|
||||
#[error("{0}")]
|
||||
UnexpectedToken(#[from] UnexpectedTokenError),
|
||||
#[error("{0}")]
|
||||
InvalidArrayKey(#[from] InvalidArrayKeyError),
|
||||
#[error("Invalid boolean literal: {0}")]
|
||||
InvalidBoolLiteral(#[from] ParseBoolError),
|
||||
#[error("Invalid integer literal: {0}")]
|
||||
InvalidIntLiteral(#[from] ParseIntError),
|
||||
#[error("Invalid float literal: {0}")]
|
||||
InvalidFloatLiteral(#[from] ParseFloatError),
|
||||
#[error("Invalid string literal")]
|
||||
InvalidStringLiteral,
|
||||
}
|
||||
|
||||
impl From<UnescapeError> for ParseError {
    /// Maps any unescape failure to `InvalidStringLiteral`.
    fn from(_unescape_error: UnescapeError) -> Self {
        Self::InvalidStringLiteral
    }
}
|
||||
|
||||
#[derive(Error, Debug)]
|
||||
#[error("Unexpected token, found {found:?} expected one of {expected}")]
|
||||
pub struct UnexpectedTokenError {
|
||||
expected: &'static str,
|
||||
pub found: Option<Token>,
|
||||
}
|
||||
|
||||
impl UnexpectedTokenError {
|
||||
pub fn new(expected: &'static str, found: Option<Token>) -> Self {
|
||||
UnexpectedTokenError { expected, found }
|
||||
}
|
||||
}
|
||||
|
||||
#[derive(Error, Debug)]
|
||||
#[error("Invalid array key {0:?} expected number or string")]
|
||||
pub struct InvalidArrayKeyError(pub Value);
|
||||
|
||||
pub trait ExpectToken {
|
||||
fn expect_token(self, expected: &'static str) -> Result<Token, UnexpectedTokenError>;
|
||||
}
|
||||
|
||||
impl ExpectToken for Option<Token> {
|
||||
fn expect_token(self, expected: &'static str) -> Result<Token, UnexpectedTokenError> {
|
||||
self.ok_or_else(|| UnexpectedTokenError {
|
||||
expected,
|
||||
found: None,
|
||||
})
|
||||
}
|
||||
}
|
||||
|
||||
pub trait ResultExt<'a, T, E: Error + Debug> {
|
||||
fn with_span(self, span: Span, source: &'a str) -> Result<T, SpannedError<'a, E>>;
|
||||
}
|
||||
|
||||
impl<'a, T, E: Into<ParseError>> ResultExt<'a, T, ParseError> for Result<T, E> {
|
||||
fn with_span(self, span: Span, source: &'a str) -> Result<T, SpannedError<'a, ParseError>> {
|
||||
self.map_err(|error| SpannedError {
|
||||
span,
|
||||
source,
|
||||
error: error.into(),
|
||||
})
|
||||
}
|
||||
}
|
||||
111
src/lexer.rs
Normal file
111
src/lexer.rs
Normal file
|
|
@ -0,0 +1,111 @@
|
|||
use logos::Logos;
|
||||
|
||||
#[derive(Logos, Debug, PartialEq, Clone)]
|
||||
pub enum Token {
|
||||
#[token("array")]
|
||||
Array,
|
||||
#[regex("true|false")]
|
||||
Bool,
|
||||
#[token("=>")]
|
||||
Arrow,
|
||||
#[token("(")]
|
||||
BracketOpen,
|
||||
#[token(")")]
|
||||
BracketClose,
|
||||
#[token("[")]
|
||||
SquareOpen,
|
||||
#[token("]")]
|
||||
SquareClose,
|
||||
#[token(",")]
|
||||
Comma,
|
||||
#[regex("(\"([^\"\\\\]|\\\\.)*\")|(\'([^\'\\\\]|\\\\.)*\')")]
|
||||
LiteralString,
|
||||
#[regex("-?[0-9]*\\.[0-9]+")]
|
||||
Float,
|
||||
#[regex("-?[0-9]+")]
|
||||
Integer,
|
||||
#[error]
|
||||
#[regex(r"[ \t\n\f]+", logos::skip)]
|
||||
Error,
|
||||
}
|
||||
|
||||
/// Lexes a representative array literal and compares the full token stream.
#[test]
fn test_lex() {
    let source = r###"
        array (
            "double" => "quote",
            'single' => 'quote',
            "escaped" => "\"quote\"",
            1 => 2,
            "nested" => [
                "sub" => "key",
            ],
            "array" => [1,2,3,4],
            "bool" => false,
            "negative" => -1,
        )
    "###;

    // One group per line of the source above.
    let expected = vec![
        Token::Array,
        Token::BracketOpen,
        // "double" => "quote",
        Token::LiteralString,
        Token::Arrow,
        Token::LiteralString,
        Token::Comma,
        // 'single' => 'quote',
        Token::LiteralString,
        Token::Arrow,
        Token::LiteralString,
        Token::Comma,
        // "escaped" => "\"quote\"",
        Token::LiteralString,
        Token::Arrow,
        Token::LiteralString,
        Token::Comma,
        // 1 => 2,
        Token::Integer,
        Token::Arrow,
        Token::Integer,
        Token::Comma,
        // "nested" => [
        Token::LiteralString,
        Token::Arrow,
        Token::SquareOpen,
        // "sub" => "key",
        Token::LiteralString,
        Token::Arrow,
        Token::LiteralString,
        Token::Comma,
        // ],
        Token::SquareClose,
        Token::Comma,
        // "array" => [1,2,3,4],
        Token::LiteralString,
        Token::Arrow,
        Token::SquareOpen,
        Token::Integer,
        Token::Comma,
        Token::Integer,
        Token::Comma,
        Token::Integer,
        Token::Comma,
        Token::Integer,
        Token::SquareClose,
        Token::Comma,
        // "bool" => false,
        Token::LiteralString,
        Token::Arrow,
        Token::Bool,
        Token::Comma,
        // "negative" => -1,
        Token::LiteralString,
        Token::Arrow,
        Token::Integer,
        Token::Comma,
        // )
        Token::BracketClose,
    ];

    // Collecting runs the lexer to exhaustion, so a surplus token fails the
    // comparison just like a missing one.
    let actual: Vec<Token> = Token::lexer(source).collect();
    assert_eq!(actual, expected);
}
|
||||
|
|
@ -1,3 +1,11 @@
|
|||
mod ast;
|
||||
mod error;
|
||||
mod lexer;
|
||||
mod string;
|
||||
|
||||
pub use ast::{parse, Key, Value};
|
||||
pub use error::{ParseError, SpannedError};
|
||||
|
||||
#[cfg(test)]
|
||||
mod tests {
|
||||
#[test]
|
||||
|
|
|
|||
247
src/string.rs
Normal file
247
src/string.rs
Normal file
|
|
@ -0,0 +1,247 @@
|
|||
//! Unescaping of PHP string literals, borrowed mostly from `escape8259`.
|
||||
use std::char::decode_utf16;
|
||||
use std::iter::Peekable;
|
||||
|
||||
/// An error occurred while unescaping a string literal.
#[derive(Clone, Debug, PartialEq, Eq)]
pub struct UnescapeError;
|
||||
|
||||
// Shorthand for results whose error type is `UnescapeError`.
type UnescapeResult<T> = Result<T, UnescapeError>;
|
||||
|
||||
/// Collects output characters and queues `u16` values for translation.
struct UnescapeState {
    /// The characters accumulated so far.
    out: String,
    /// A fragment of a large character kept for later decoding; zero when
    /// nothing is pending.
    stash: u16,
}
|
||||
|
||||
impl UnescapeState {
|
||||
fn new() -> UnescapeState {
|
||||
UnescapeState {
|
||||
out: String::new(),
|
||||
stash: 0,
|
||||
}
|
||||
}
|
||||
|
||||
// Collect a new character
|
||||
fn push_char(&mut self, c: char) -> UnescapeResult<()> {
|
||||
if self.stash != 0 {
|
||||
return Err(UnescapeError);
|
||||
}
|
||||
self.out.push(c);
|
||||
Ok(())
|
||||
}
|
||||
|
||||
// Collect a new UTF16 word. This can either be one whole character,
|
||||
// or part of a larger character.
|
||||
fn push_u16(&mut self, x: u16) -> UnescapeResult<()> {
|
||||
let surrogate = x >= 0xD800 && x <= 0xDFFF;
|
||||
match (self.stash, surrogate) {
|
||||
(0, false) => {
|
||||
// The std library only provides utf16 decode of an iterator,
|
||||
// so to decode a single character we wrap it in an array.
|
||||
// Hopefully the compiler will elide most of this extra work.
|
||||
let words = [x];
|
||||
match decode_utf16(words.iter().copied()).next() {
|
||||
Some(Ok(c)) => {
|
||||
self.out.push(c);
|
||||
}
|
||||
_ => return Err(UnescapeError),
|
||||
}
|
||||
}
|
||||
(0, true) => self.stash = x,
|
||||
(_, false) => {
|
||||
return Err(UnescapeError);
|
||||
}
|
||||
(w, true) => {
|
||||
let words = [w, x];
|
||||
match decode_utf16(words.iter().copied()).next() {
|
||||
Some(Ok(c)) => {
|
||||
self.out.push(c);
|
||||
self.stash = 0;
|
||||
}
|
||||
_ => return Err(UnescapeError),
|
||||
}
|
||||
}
|
||||
}
|
||||
Ok(())
|
||||
}
|
||||
|
||||
// If we queued up part of a UTF-16 encoded word but didn't
|
||||
// finish it, return an error. Otherwise, consume self and
|
||||
// return the accumulated String.
|
||||
fn finalize(self) -> UnescapeResult<String> {
|
||||
if self.stash != 0 {
|
||||
return Err(UnescapeError);
|
||||
}
|
||||
Ok(self.out)
|
||||
}
|
||||
}
|
||||
|
||||
fn parse_u16_hex<S>(s: &mut Peekable<S>, max: Option<u8>) -> UnescapeResult<u16>
|
||||
where
|
||||
S: Iterator<Item = char>,
|
||||
{
|
||||
let mut result = 0;
|
||||
let mut max = max.unwrap_or(u8::max_value());
|
||||
while s.peek().map(|c| c.is_ascii_hexdigit()).unwrap_or_default() {
|
||||
result *= 16;
|
||||
result += s.next().unwrap().to_digit(16).unwrap() as u16;
|
||||
max -= 1;
|
||||
if max == 0 {
|
||||
break;
|
||||
}
|
||||
}
|
||||
Ok(result)
|
||||
}
|
||||
|
||||
fn parse_u16_oct<S>(s: &mut Peekable<S>, mut result: u16, max: Option<u8>) -> UnescapeResult<u16>
|
||||
where
|
||||
S: Iterator<Item = char>,
|
||||
{
|
||||
let mut max = max.unwrap_or(u8::max_value());
|
||||
while s.peek().map(|c| c >= &'1' && c <= &'7').unwrap_or_default() {
|
||||
let digit = s.next().unwrap();
|
||||
dbg!(digit);
|
||||
result *= 8;
|
||||
result += digit.to_digit(8).unwrap() as u16;
|
||||
max -= 1;
|
||||
if max == 0 {
|
||||
break;
|
||||
}
|
||||
}
|
||||
Ok(result)
|
||||
}
|
||||
|
||||
/// Un-escape a string, following php single quote rules
|
||||
pub fn unescape_single(s: &str) -> UnescapeResult<String> {
|
||||
let mut state = UnescapeState::new();
|
||||
let mut ins = s.chars();
|
||||
|
||||
while let Some(c) = ins.next() {
|
||||
if c == '\\' {
|
||||
match ins.next() {
|
||||
None => {
|
||||
return Err(UnescapeError);
|
||||
}
|
||||
Some(d) => match d {
|
||||
'\\' | '\'' => state.push_char(d)?,
|
||||
_ => {
|
||||
state.push_char('\\')?;
|
||||
state.push_char(d)?
|
||||
}
|
||||
},
|
||||
}
|
||||
} else {
|
||||
state.push_char(c)?;
|
||||
}
|
||||
}
|
||||
|
||||
state.finalize()
|
||||
}
|
||||
|
||||
/// Un-escape a string, following php double quote rules
|
||||
pub fn unescape_double(s: &str) -> UnescapeResult<String> {
|
||||
let mut state = UnescapeState::new();
|
||||
let mut ins = s.chars().peekable();
|
||||
|
||||
while let Some(c) = ins.next() {
|
||||
if c == '\\' {
|
||||
match ins.next() {
|
||||
None => {
|
||||
return Err(UnescapeError);
|
||||
}
|
||||
Some(d) => {
|
||||
match d {
|
||||
'$' | '"' | '\\' => state.push_char(d)?,
|
||||
'n' => state.push_char('\n')?, // linefeed
|
||||
'r' => state.push_char('\r')?, // carriage return
|
||||
't' => state.push_char('\t')?, // tab
|
||||
'v' => state.push_char('\x0B')?, // vertical tab
|
||||
'f' => state.push_char('\x0C')?, // form feed
|
||||
'x' => {
|
||||
let val = parse_u16_hex(&mut ins, Some(2))?;
|
||||
state.push_u16(val)?;
|
||||
}
|
||||
'u' => match ins.next() {
|
||||
Some('{') => {
|
||||
let val = parse_u16_hex(&mut ins, None)?;
|
||||
state.push_u16(val)?;
|
||||
if !matches!(ins.next(), Some('}')) {
|
||||
return Err(UnescapeError);
|
||||
}
|
||||
}
|
||||
Some(d) => {
|
||||
state.push_char('\\')?;
|
||||
state.push_char('u')?;
|
||||
state.push_char(d)?;
|
||||
}
|
||||
None => {
|
||||
state.push_char('\\')?;
|
||||
state.push_char(d)?;
|
||||
}
|
||||
},
|
||||
'0'..='7' => {
|
||||
let val =
|
||||
parse_u16_oct(&mut ins, d.to_digit(8).unwrap() as u16, Some(3))?;
|
||||
state.push_u16(val)?;
|
||||
}
|
||||
_ => {
|
||||
state.push_char('\\')?;
|
||||
state.push_char(d)?
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
} else {
|
||||
state.push_char(c)?;
|
||||
}
|
||||
}
|
||||
|
||||
state.finalize()
|
||||
}
|
||||
|
||||
#[cfg(test)]
mod tests {
    use super::*;

    /// Single-quote rules: only `\\` and `\'` are unescaped.
    #[test]
    fn test_unescape_single() {
        // (input, expected output)
        let cases: &[(&str, &str)] = &[
            (r#"abc"#, "abc"),
            (r#"ab\nc"#, "ab\\nc"),
            (r#"ab\zc"#, "ab\\zc"),
            (r#" \"abc\" "#, " \\\"abc\\\" "),
            (r#"𝄞"#, "𝄞"),
            (r#"\𝄞"#, "\\𝄞"),
            (r#"\xD834\xDD1E"#, "\\xD834\\xDD1E"),
            (r#"\xD834"#, "\\xD834"),
            (r#"\xDD1E"#, "\\xDD1E"),
            ("\t", "\t"),
        ];

        for (input, expected) in cases {
            assert_eq!(unescape_single(input), Ok(expected.to_string()));
        }
    }

    /// Double-quote rules: control, hex, unicode and octal escapes.
    #[test]
    fn test_unescape_double() {
        // (input, expected output)
        let cases: &[(&str, &str)] = &[
            (r#"abc"#, "abc"),
            (r#"ab\nc"#, "ab\nc"),
            (r#"ab\zc"#, "ab\\zc"),
            (r#" \"abc\" "#, " \"abc\" "),
            (r#"𝄞"#, "𝄞"),
            (r#"\𝄞"#, "\\𝄞"),
            (r#"\u{D834}\u{DD1E}"#, "𝄞"),
            (r#"\xD834"#, "\u{D8}34"),
            (r#"\xDD1E"#, "\u{DD}1E"),
            (r#"\xD"#, "\u{D}"),
            ("\t", "\t"),
            (r#"\uD834"#, "\\uD834"),
            (r#"\u"#, "\\u"),
            (r#"\47foo"#, "'foo"),
            (r#"\48foo"#, "\u{4}8foo"),
            (r#"\87foo"#, "\\87foo"),
        ];

        for (input, expected) in cases {
            assert_eq!(unescape_double(input), Ok(expected.to_string()));
        }

        // A unicode escape without its closing brace is rejected.
        assert_eq!(unescape_double(r#"\u{D834"#), Err(UnescapeError));
    }
}
|
||||
Loading…
Add table
Add a link
Reference in a new issue