This commit is contained in:
Robin Appelman 2020-12-01 21:53:10 +01:00
commit 1f64e8af89
6 changed files with 770 additions and 2 deletions

264
src/ast.rs Normal file
View file

@ -0,0 +1,264 @@
use crate::error::{
ExpectToken, InvalidArrayKeyError, ParseError, ResultExt, SpannedError, UnexpectedTokenError,
};
use crate::lexer::Token;
use crate::string::{unescape_double, unescape_single, UnescapeError};
use logos::{Lexer, Logos};
use std::collections::HashMap;
use std::fmt::Debug;
/// A value produced by parsing a PHP literal.
#[derive(Debug, PartialEq, Clone)]
pub enum Value {
/// A boolean literal.
Bool(bool),
/// An integer literal, stored as a signed 64-bit integer.
Int(i64),
/// A floating point literal.
Float(f64),
/// A string literal with its escape sequences already resolved.
String(String),
/// An array literal, keyed by [`Key`]. Stored in a `HashMap`, so the
/// order in which the elements were written is not preserved.
Array(HashMap<Key, Value>),
}
/// A key of a parsed array: either an integer or a string.
#[derive(Debug, Eq, PartialEq, Hash, Clone)]
pub enum Key {
/// Integer key, either written explicitly or assigned automatically.
Int(i64),
/// String key.
String(String),
}
/// Parses the first PHP literal found in `source` into a [`Value`].
///
/// A fresh lexer is created over `source` and handed to [`parse_lexer`].
/// Tokens following the first complete value are not inspected.
///
/// # Errors
///
/// Returns a [`SpannedError`] pointing at the offending part of `source`
/// when the input cannot be parsed.
pub fn parse(source: &str) -> Result<Value, SpannedError<ParseError>> {
    parse_lexer(source, &mut Token::lexer(source))
}
/// Reads the next token from `lexer` and parses the value that starts there.
///
/// `source` must be the text the lexer was created from; it is only used to
/// attach context to errors.
///
/// # Errors
///
/// Fails with an "unexpected token" error (found: none) when the lexer has
/// no more tokens, or with whatever error [`parse_token`] reports.
pub fn parse_lexer<'source>(
    source: &'source str,
    lexer: &mut Lexer<Token>,
) -> Result<Value, SpannedError<'source, ParseError>> {
    let first = lexer.next();
    // The span must be read after advancing so it covers the token just read.
    let span = lexer.span();
    match first
        .expect_token("bool, int, float, string, array start")
        .with_span(span, source)
    {
        Ok(token) => parse_token(token, source, lexer),
        Err(error) => Err(error),
    }
}
pub fn parse_token<'source>(
token: Token,
source: &'source str,
lexer: &mut Lexer<Token>,
) -> Result<Value, SpannedError<'source, ParseError>> {
let value = match token {
Token::Bool => parse_literal(token, lexer.slice()).with_span(lexer.span(), source)?,
Token::Integer => parse_literal(token, lexer.slice()).with_span(lexer.span(), source)?,
Token::Float => parse_literal(token, lexer.slice()).with_span(lexer.span(), source)?,
Token::LiteralString => {
parse_literal(token, lexer.slice()).with_span(lexer.span(), source)?
}
Token::Array => Value::Array(parse_array(source, lexer, ArraySyntax::Long)?),
Token::SquareOpen => Value::Array(parse_array(source, lexer, ArraySyntax::Short)?),
_ => todo!(),
};
Ok(value)
}
fn parse_literal(token: Token, slice: &str) -> Result<Value, ParseError> {
match token {
Token::Bool => Ok(Value::Bool(slice.parse()?)),
Token::Integer => Ok(Value::Int(slice.parse()?)),
Token::Float => Ok(Value::Float(slice.parse()?)),
Token::LiteralString => Ok(Value::String(parse_string(slice)?)),
token => Err(ParseError::UnexpectedToken(UnexpectedTokenError::new(
"bool, int, float, string, array start",
Some(token),
))),
}
}
/// Strips the surrounding quotes from a string literal and resolves its
/// escape sequences according to the quote style.
///
/// `literal` must include both quote characters; the first byte decides
/// whether single-quote or double-quote rules apply.
///
/// # Panics
///
/// Panics when `literal` is too short to contain both quotes.
fn parse_string(literal: &str) -> Result<String, UnescapeError> {
    let body = &literal[1..literal.len() - 1];
    match literal.as_bytes()[0] {
        b'\'' => unescape_single(body),
        _ => unescape_double(body),
    }
}
/// Accumulates array elements while tracking the integer key that the next
/// element without an explicit key will receive.
#[derive(Default)]
struct ArrayBuilder {
// Key assigned to the next value pushed without an explicit key; starts at 0.
next_int_key: i64,
// Elements collected so far.
data: HashMap<Key, Value>,
}
impl ArrayBuilder {
    /// Stores `value` under the next automatic integer key.
    fn push_value(&mut self, value: Value) {
        let key = Key::Int(self.next_int_key);
        // Saturate rather than overflow when the key space is exhausted.
        self.next_int_key = self.next_int_key.saturating_add(1);
        self.data.insert(key, value);
    }

    /// Stores `value` under an explicit `key`.
    ///
    /// As in PHP, the next automatic key is one past the *largest* integer
    /// key seen so far, so a smaller explicit key must not move it backwards:
    /// `[5 => 'a', 2 => 'b', 'c']` stores `'c'` under key 6.
    fn push_key_value(&mut self, key: Key, value: Value) {
        if let Key::Int(int) = &key {
            self.next_int_key = self.next_int_key.max(int.saturating_add(1));
        }
        self.data.insert(key, value);
    }
}
/// Which of the two array notations is being parsed; decides which closing
/// token ends the array.
#[derive(Eq, PartialEq)]
enum ArraySyntax {
// `[ ... ]`, closed by `Token::SquareClose`.
Short,
// `array( ... )`, closed by `Token::BracketClose`.
Long,
}
fn parse_array<'source>(
source: &'source str,
lexer: &mut Lexer<Token>,
syntax: ArraySyntax,
) -> Result<HashMap<Key, Value>, SpannedError<'source, ParseError>> {
let mut builder = ArrayBuilder::default();
if syntax == ArraySyntax::Long {
let open = lexer
.next()
.expect_token("open bracket")
.with_span(lexer.span(), source)?;
if !matches!(open, Token::BracketOpen) {
return Err(ParseError::UnexpectedToken(UnexpectedTokenError::new(
"open bracket",
Some(open),
)))
.with_span(lexer.span(), source);
}
}
loop {
let key_or_value = parse_lexer(source, lexer)?;
let key_or_value_span = lexer.span();
let next = lexer
.next()
.expect_token("close bracket, comma, arrow")
.with_span(lexer.span(), source)?;
match next {
Token::BracketClose if syntax == ArraySyntax::Long => {
builder.push_value(key_or_value);
break;
}
Token::SquareClose if syntax == ArraySyntax::Short => {
builder.push_value(key_or_value);
break;
}
Token::Comma => {
builder.push_value(key_or_value);
}
Token::Arrow => {
let value = parse_lexer(source, lexer)?;
let key = match key_or_value {
Value::Int(int) => Key::Int(int),
Value::Float(float) => Key::Int(float as i64),
Value::String(str) => Key::String(str),
value => {
let err = ParseError::InvalidArrayKey(InvalidArrayKeyError(value));
let span_err = SpannedError::new(err, key_or_value_span, source);
return Err(span_err);
}
};
builder.push_key_value(key, value);
match lexer
.next()
.expect_token("close bracket, comma, arrow")
.with_span(lexer.span(), source)?
{
Token::BracketClose if syntax == ArraySyntax::Long => {
break;
}
Token::SquareClose if syntax == ArraySyntax::Short => {
break;
}
Token::Comma => {}
token => {
return Err(ParseError::UnexpectedToken(UnexpectedTokenError::new(
"close bracket, comma, arrow",
Some(token),
)))
.with_span(lexer.span(), source)
}
}
}
token => {
return Err(ParseError::UnexpectedToken(UnexpectedTokenError::new(
"close bracket, comma, arrow",
Some(token),
)))
.with_span(lexer.span(), source)
}
}
}
Ok(builder.data)
}
#[test]
fn test_parse() {
    use maplit::hashmap;

    // Each case pairs the expected value with the source that must produce it.
    let cases: Vec<(Value, &str)> = vec![
        (Value::Bool(true), "true"),
        (Value::Bool(false), "false"),
        (Value::Int(12), "12"),
        (Value::Int(-1), "-1"),
        (Value::Float(1.12), "1.12"),
        (Value::String("test".to_string()), r#""test""#),
        // Values without keys are numbered from zero.
        (
            Value::Array(hashmap! {
                Key::Int(0) => Value::Int(3),
                Key::Int(1) => Value::Int(4),
                Key::Int(2) => Value::Int(5),
            }),
            r#"array(3,4,5)"#,
        ),
        // Explicit integer keys are kept as written.
        (
            Value::Array(hashmap! {
                Key::Int(1) => Value::Int(3),
                Key::Int(3) => Value::Int(4),
                Key::Int(5) => Value::Int(5),
            }),
            r#"array(1=>3,3=>4,5=>5)"#,
        ),
        // Automatic keys continue after an explicit integer key.
        (
            Value::Array(hashmap! {
                Key::Int(1) => Value::Int(3),
                Key::Int(2) => Value::Int(4),
                Key::Int(3) => Value::Int(5),
            }),
            r#"array(1=>3,4,5)"#,
        ),
        // String keys do not affect automatic numbering.
        (
            Value::Array(hashmap! {
                Key::Int(1) => Value::Int(3),
                Key::String("foo".into()) => Value::Int(4),
                Key::Int(2) => Value::Int(5),
            }),
            r#"array(1=>3,"foo" => 4,5)"#,
        ),
        // Nested arrays, long syntax.
        (
            Value::Array(hashmap! {
                Key::String("foo".into()) => Value::Bool(true),
                Key::String("nested".into()) => Value::Array(hashmap! {
                    Key::String("foo".into()) => Value::Bool(false),
                }),
            }),
            r#"array("foo" => true, "nested" => array ('foo' => false))"#,
        ),
        // Nested arrays, short syntax.
        (
            Value::Array(hashmap! {
                Key::String("foo".into()) => Value::Bool(true),
                Key::String("nested".into()) => Value::Array(hashmap! {
                    Key::String("foo".into()) => Value::Bool(false),
                }),
            }),
            r#"["foo" => true, "nested" => ['foo' => false]]"#,
        ),
    ];

    for (expected, source) in cases {
        assert_eq!(expected, parse(source).unwrap());
    }
}

136
src/error.rs Normal file
View file

@ -0,0 +1,136 @@
use crate::lexer::Token;
use crate::string::UnescapeError;
use crate::Value;
use logos::Span;
use source_span::{
fmt::{Color, Formatter, Style},
DefaultMetrics, Position, SourceBuffer, Span as SourceSpan,
};
use std::error::Error;
use std::fmt::{self, Debug};
use std::num::{ParseFloatError, ParseIntError};
use std::str::ParseBoolError;
use thiserror::Error;
/// An error of type `T` together with the range of the source text it refers
/// to. Its `Display` implementation renders the source with that range
/// highlighted and labelled with the error message.
#[derive(Debug)]
pub struct SpannedError<'a, T: Error + Debug> {
// Range within `source` that the error applies to.
span: Span,
// The complete text that was being parsed.
source: &'a str,
// The underlying error.
error: T,
}
impl<'a, T: Error + Debug> SpannedError<'a, T> {
pub fn new(error: T, span: Span, source: &'a str) -> Self {
SpannedError {
span,
source,
error,
}
}
}
/// Character metrics (tab stop of 4) shared by position tracking and error rendering.
const METRICS: DefaultMetrics = DefaultMetrics::with_tab_stop(4);
impl<'a, T: Error + Debug> fmt::Display for SpannedError<'a, T> {
fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
let start = get_position(self.source, self.span.start);
let end = get_position(self.source, self.span.end);
let span = SourceSpan::new(start, end, end.next_line());
let mut fmt = Formatter::with_margin_color(Color::Blue);
let buffer = SourceBuffer::new(
self.source.chars().map(|char| Result::<char, ()>::Ok(char)),
Position::default(),
METRICS,
);
fmt.add(span, Some(format!("{}", self.error)), Style::Error);
let formatted = fmt
.render(
buffer.iter(),
SourceSpan::new(
Position::default(),
Position::new(usize::max_value() - 1, usize::max_value()),
Position::end(),
),
&METRICS,
)
.unwrap();
write!(f, "{}", formatted)?;
Ok(())
}
}
/// Computes the line/column position of the byte offset `index` in `text`.
///
/// `index` comes from a lexer span and therefore counts bytes, not
/// characters. Only characters that start before `index` advance the
/// position, so multi-byte characters earlier in the text no longer push the
/// result past the intended location. An `index` beyond the end of `text`
/// yields the position just after the last character.
fn get_position(text: &str, index: usize) -> Position {
    text.char_indices()
        .take_while(|&(offset, _)| offset < index)
        .fold(Position::default(), |pos, (_, c)| pos.next(c, &METRICS))
}
/// Everything that can go wrong while parsing a literal.
#[derive(Error, Debug)]
pub enum ParseError {
/// A token appeared where it is not allowed, or the input ended early.
#[error("{0}")]
UnexpectedToken(#[from] UnexpectedTokenError),
/// A value that cannot be used as an array key appeared before `=>`.
#[error("{0}")]
InvalidArrayKey(#[from] InvalidArrayKeyError),
/// The text of a boolean token could not be converted to `bool`.
#[error("Invalid boolean literal: {0}")]
InvalidBoolLiteral(#[from] ParseBoolError),
/// The text of an integer token could not be converted to an integer.
#[error("Invalid integer literal: {0}")]
InvalidIntLiteral(#[from] ParseIntError),
/// The text of a float token could not be converted to a float.
#[error("Invalid float literal: {0}")]
InvalidFloatLiteral(#[from] ParseFloatError),
/// A string literal could not be unescaped.
#[error("Invalid string literal")]
InvalidStringLiteral,
}
impl From<UnescapeError> for ParseError {
fn from(_: UnescapeError) -> Self {
ParseError::InvalidStringLiteral
}
}
/// A token other than the expected ones was found, or no token at all.
#[derive(Error, Debug)]
#[error("Unexpected token, found {found:?} expected one of {expected}")]
pub struct UnexpectedTokenError {
// Human readable list of what would have been accepted.
expected: &'static str,
/// The offending token, or `None` when the token stream had ended.
pub found: Option<Token>,
}
impl UnexpectedTokenError {
pub fn new(expected: &'static str, found: Option<Token>) -> Self {
UnexpectedTokenError { expected, found }
}
}
/// A value that is neither a number nor a string was used as an array key.
/// Carries the offending value.
#[derive(Error, Debug)]
#[error("Invalid array key {0:?} expected number or string")]
pub struct InvalidArrayKeyError(pub Value);
/// Turns "maybe a token" into "a token or an unexpected-token error".
pub trait ExpectToken {
/// Returns the token, or an [`UnexpectedTokenError`] describing `expected`
/// when there is none.
fn expect_token(self, expected: &'static str) -> Result<Token, UnexpectedTokenError>;
}
impl ExpectToken for Option<Token> {
fn expect_token(self, expected: &'static str) -> Result<Token, UnexpectedTokenError> {
self.ok_or_else(|| UnexpectedTokenError {
expected,
found: None,
})
}
}
/// Extension for results whose error should be located in the source text.
pub trait ResultExt<'a, T, E: Error + Debug> {
/// Wraps the error, if any, in a [`SpannedError`] covering `span` of `source`.
fn with_span(self, span: Span, source: &'a str) -> Result<T, SpannedError<'a, E>>;
}
impl<'a, T, E: Into<ParseError>> ResultExt<'a, T, ParseError> for Result<T, E> {
fn with_span(self, span: Span, source: &'a str) -> Result<T, SpannedError<'a, ParseError>> {
self.map_err(|error| SpannedError {
span,
source,
error: error.into(),
})
}
}

111
src/lexer.rs Normal file
View file

@ -0,0 +1,111 @@
use logos::Logos;
/// Tokens recognised in a PHP literal.
#[derive(Logos, Debug, PartialEq, Clone)]
pub enum Token {
// The `array` keyword that starts a long-syntax array.
#[token("array")]
Array,
// `true` or `false`, lowercase only.
#[regex("true|false")]
Bool,
// Separates a key from its value.
#[token("=>")]
Arrow,
#[token("(")]
BracketOpen,
#[token(")")]
BracketClose,
#[token("[")]
SquareOpen,
#[token("]")]
SquareClose,
#[token(",")]
Comma,
// A double- or single-quoted string, quotes included. A backslash escapes
// the character that follows it, so escaped quotes do not end the literal.
#[regex("(\"([^\"\\\\]|\\\\.)*\")|(\'([^\'\\\\]|\\\\.)*\')")]
LiteralString,
// Optional minus sign, optional integer digits, a dot and at least one
// fractional digit. No exponent notation.
#[regex("-?[0-9]*\\.[0-9]+")]
Float,
// Optional minus sign followed by decimal digits.
#[regex("-?[0-9]+")]
Integer,
// Produced for input that matches no other token. Spaces, tabs, newlines
// and form feeds are skipped rather than reported.
// NOTE(review): carriage return is not in the skipped set — confirm that
// CRLF input is not expected.
#[error]
#[regex(r"[ \t\n\f]+", logos::skip)]
Error,
}
#[test]
fn test_lex() {
    let source = r###"
array (
"double" => "quote",
'single' => 'quote',
"escaped" => "\"quote\"",
1 => 2,
"nested" => [
"sub" => "key",
],
"array" => [1,2,3,4],
"bool" => false,
"negative" => -1,
)
"###;

    // Expected tokens, grouped by the source line that produces them.
    let expected = vec![
        // array (
        Token::Array,
        Token::BracketOpen,
        // "double" => "quote",
        Token::LiteralString,
        Token::Arrow,
        Token::LiteralString,
        Token::Comma,
        // 'single' => 'quote',
        Token::LiteralString,
        Token::Arrow,
        Token::LiteralString,
        Token::Comma,
        // "escaped" => "\"quote\"",
        Token::LiteralString,
        Token::Arrow,
        Token::LiteralString,
        Token::Comma,
        // 1 => 2,
        Token::Integer,
        Token::Arrow,
        Token::Integer,
        Token::Comma,
        // "nested" => [
        Token::LiteralString,
        Token::Arrow,
        Token::SquareOpen,
        // "sub" => "key",
        Token::LiteralString,
        Token::Arrow,
        Token::LiteralString,
        Token::Comma,
        // ],
        Token::SquareClose,
        Token::Comma,
        // "array" => [1,2,3,4],
        Token::LiteralString,
        Token::Arrow,
        Token::SquareOpen,
        Token::Integer,
        Token::Comma,
        Token::Integer,
        Token::Comma,
        Token::Integer,
        Token::Comma,
        Token::Integer,
        Token::SquareClose,
        Token::Comma,
        // "bool" => false,
        Token::LiteralString,
        Token::Arrow,
        Token::Bool,
        Token::Comma,
        // "negative" => -1,
        Token::LiteralString,
        Token::Arrow,
        Token::Integer,
        Token::Comma,
        // )
        Token::BracketClose,
    ];

    let mut lex = Token::lexer(source);
    for token in expected {
        assert_eq!(lex.next(), Some(token));
    }
    // Nothing but skipped whitespace may remain.
    assert_eq!(lex.next(), None);
}

View file

@ -1,3 +1,11 @@
mod ast;
mod error;
mod lexer;
mod string;
pub use ast::{parse, Key, Value};
pub use error::{ParseError, SpannedError};
#[cfg(test)]
mod tests {
#[test]

247
src/string.rs Normal file
View file

@ -0,0 +1,247 @@
//! Unescaping of PHP string literals, borrowed mostly from `escape8259`.
use std::char::decode_utf16;
use std::iter::Peekable;
#[derive(Debug, Clone, Eq, PartialEq)]
/// An error occurred while unescaping a string literal.
///
/// Carries no detail about which escape sequence was invalid.
pub struct UnescapeError;
// Shorthand for results of the unescaping routines in this module.
type UnescapeResult<T> = Result<T, UnescapeError>;
// Used to collect output characters and queue u16 values for translation.
// A surrogate code unit is held back in `stash` until the next code unit
// arrives to be decoded together with it.
struct UnescapeState {
// The accumulated characters
out: String,
// Store a fragment of a large character for later decoding.
// Zero means nothing is pending.
stash: u16,
}
impl UnescapeState {
    // Start with empty output and no pending surrogate.
    fn new() -> UnescapeState {
        UnescapeState {
            out: String::new(),
            stash: 0,
        }
    }

    // Append an ordinary character. This is an error while a surrogate is
    // still waiting for its partner.
    fn push_char(&mut self, c: char) -> UnescapeResult<()> {
        if self.stash == 0 {
            self.out.push(c);
            Ok(())
        } else {
            Err(UnescapeError)
        }
    }

    // Append a UTF-16 code unit. A surrogate arriving with nothing pending is
    // stashed; any other code unit is decoded right away, together with the
    // stashed surrogate if there is one.
    fn push_u16(&mut self, x: u16) -> UnescapeResult<()> {
        let is_surrogate = (0xD800..=0xDFFF).contains(&x);
        let pending = self.stash != 0;

        if !pending && is_surrogate {
            self.stash = x;
            return Ok(());
        }
        if pending && !is_surrogate {
            // A stashed surrogate can only be completed by another surrogate.
            return Err(UnescapeError);
        }

        // The std library decodes UTF-16 from an iterator only, so the one or
        // two code units are placed in an array and a slice of it is decoded.
        let pair = [self.stash, x];
        let units: &[u16] = if pending { &pair[..] } else { &pair[1..] };
        match decode_utf16(units.iter().copied()).next() {
            Some(Ok(c)) => {
                self.out.push(c);
                self.stash = 0;
                Ok(())
            }
            _ => Err(UnescapeError),
        }
    }

    // Hand back the accumulated string, unless half of a surrogate pair was
    // queued and never completed.
    fn finalize(self) -> UnescapeResult<String> {
        if self.stash == 0 {
            Ok(self.out)
        } else {
            Err(UnescapeError)
        }
    }
}
fn parse_u16_hex<S>(s: &mut Peekable<S>, max: Option<u8>) -> UnescapeResult<u16>
where
S: Iterator<Item = char>,
{
let mut result = 0;
let mut max = max.unwrap_or(u8::max_value());
while s.peek().map(|c| c.is_ascii_hexdigit()).unwrap_or_default() {
result *= 16;
result += s.next().unwrap().to_digit(16).unwrap() as u16;
max -= 1;
if max == 0 {
break;
}
}
Ok(result)
}
fn parse_u16_oct<S>(s: &mut Peekable<S>, mut result: u16, max: Option<u8>) -> UnescapeResult<u16>
where
S: Iterator<Item = char>,
{
let mut max = max.unwrap_or(u8::max_value());
while s.peek().map(|c| c >= &'1' && c <= &'7').unwrap_or_default() {
let digit = s.next().unwrap();
dbg!(digit);
result *= 8;
result += digit.to_digit(8).unwrap() as u16;
max -= 1;
if max == 0 {
break;
}
}
Ok(result)
}
/// Un-escape a string, following php single quote rules.
///
/// Only `\\` and `\'` are escape sequences; a backslash followed by any other
/// character is kept as written.
///
/// # Errors
///
/// Returns [`UnescapeError`] when the input ends with a lone backslash.
pub fn unescape_single(s: &str) -> UnescapeResult<String> {
    let mut state = UnescapeState::new();
    let mut chars = s.chars();
    loop {
        let current = match chars.next() {
            Some(current) => current,
            None => break,
        };
        if current != '\\' {
            state.push_char(current)?;
            continue;
        }
        match chars.next() {
            Some(escaped) if escaped == '\\' || escaped == '\'' => state.push_char(escaped)?,
            Some(other) => {
                // Not an escape sequence: keep the backslash.
                state.push_char('\\')?;
                state.push_char(other)?;
            }
            None => return Err(UnescapeError),
        }
    }
    state.finalize()
}
/// Un-escape a string, following php double quote rules.
///
/// Recognised sequences: `\$`, `\"`, `\\`, `\n`, `\r`, `\t`, `\v`, `\e`,
/// `\f`, `\x` followed by one or two hex digits, `\u{...}` with hex digits
/// inside the braces, and a backslash followed by one to three octal digits.
/// A backslash that does not start one of these is kept as written.
///
/// Two consecutive `\u{...}` escapes forming a UTF-16 surrogate pair are
/// combined into one character.
///
/// # Errors
///
/// Returns [`UnescapeError`] when the input ends with a lone backslash, when
/// a `\u{` escape is empty, unterminated or out of range, or when a
/// surrogate is not completed by its partner.
pub fn unescape_double(s: &str) -> UnescapeResult<String> {
    let mut state = UnescapeState::new();
    let mut ins = s.chars().peekable();
    while let Some(c) = ins.next() {
        if c != '\\' {
            state.push_char(c)?;
            continue;
        }
        let d = match ins.next() {
            Some(d) => d,
            None => return Err(UnescapeError),
        };
        match d {
            '$' | '"' | '\\' => state.push_char(d)?,
            'n' => state.push_char('\n')?,   // linefeed
            'r' => state.push_char('\r')?,   // carriage return
            't' => state.push_char('\t')?,   // tab
            'v' => state.push_char('\x0B')?, // vertical tab
            'e' => state.push_char('\x1B')?, // escape
            'f' => state.push_char('\x0C')?, // form feed
            'x' => {
                if ins.peek().map_or(false, |next| next.is_ascii_hexdigit()) {
                    let val = parse_u16_hex(&mut ins, Some(2))?;
                    state.push_u16(val)?;
                } else {
                    // Without hex digits `\x` is not an escape sequence.
                    state.push_char('\\')?;
                    state.push_char('x')?;
                }
            }
            'u' => {
                if ins.peek() == Some(&'{') {
                    ins.next();
                    // An empty `\u{}` is not a valid code point escape.
                    if !ins.peek().map_or(false, |next| next.is_ascii_hexdigit()) {
                        return Err(UnescapeError);
                    }
                    let val = parse_u16_hex(&mut ins, None)?;
                    state.push_u16(val)?;
                    if !matches!(ins.next(), Some('}')) {
                        return Err(UnescapeError);
                    }
                } else {
                    // Not an escape sequence. The following character is left
                    // in place so that it is processed normally (it may start
                    // an escape sequence of its own).
                    state.push_char('\\')?;
                    state.push_char('u')?;
                }
            }
            '0'..='7' => {
                // `d` is the first of at most three octal digits.
                let first = u16::from(d as u8 - b'0');
                let val = parse_u16_oct(&mut ins, first, Some(2))?;
                state.push_u16(val)?;
            }
            _ => {
                state.push_char('\\')?;
                state.push_char(d)?
            }
        }
    }
    state.finalize()
}
#[cfg(test)]
mod tests {
use super::*;
// Single-quote rules: only `\\` and `\'` are escapes, every other backslash
// sequence must come back unchanged.
#[test]
fn test_unescape_single() {
assert_eq!(unescape_single(&r#"abc"#), Ok("abc".into()));
assert_eq!(unescape_single(&r#"ab\nc"#), Ok("ab\\nc".into()));
assert_eq!(unescape_single(r#"ab\zc"#), Ok("ab\\zc".into()));
assert_eq!(unescape_single(r#" \"abc\" "#), Ok(" \\\"abc\\\" ".into()));
assert_eq!(unescape_single(r#"𝄞"#), Ok("𝄞".into()));
assert_eq!(unescape_single(r#"\𝄞"#), Ok("\\𝄞".into()));
assert_eq!(
unescape_single(r#"\xD834\xDD1E"#),
Ok("\\xD834\\xDD1E".into())
);
assert_eq!(unescape_single(r#"\xD834"#), Ok("\\xD834".into()));
assert_eq!(unescape_single(r#"\xDD1E"#), Ok("\\xDD1E".into()));
assert_eq!(unescape_single("\t"), Ok("\t".into()));
}
// Double-quote rules: named, hex, `\u{...}` and octal escapes are resolved.
#[test]
fn test_unescape_double() {
assert_eq!(unescape_double(&r#"abc"#), Ok("abc".into()));
assert_eq!(unescape_double(&r#"ab\nc"#), Ok("ab\nc".into()));
assert_eq!(unescape_double(r#"ab\zc"#), Ok("ab\\zc".into()));
assert_eq!(unescape_double(r#" \"abc\" "#), Ok(" \"abc\" ".into()));
assert_eq!(unescape_double(r#"𝄞"#), Ok("𝄞".into()));
assert_eq!(unescape_double(r#"\𝄞"#), Ok("\\𝄞".into()));
// A surrogate pair written as two escapes becomes one character.
assert_eq!(unescape_double(r#"\u{D834}\u{DD1E}"#), Ok("𝄞".into()));
// `\x` takes at most two hex digits; the rest is ordinary text.
assert_eq!(unescape_double(r#"\xD834"#), Ok("\u{D8}34".into()));
assert_eq!(unescape_double(r#"\xDD1E"#), Ok("\u{DD}1E".into()));
assert_eq!(unescape_double(r#"\xD"#), Ok("\u{D}".into()));
assert_eq!(unescape_double("\t"), Ok("\t".into()));
// Unterminated `\u{` is an error; `\u` without a brace stays literal.
assert_eq!(unescape_double(r#"\u{D834"#), Err(UnescapeError));
assert_eq!(unescape_double(r#"\uD834"#), Ok("\\uD834".into()));
assert_eq!(unescape_double(r#"\u"#), Ok("\\u".into()));
// Octal escapes stop at the first non-octal digit.
assert_eq!(unescape_double(r#"\47foo"#), Ok("'foo".into()));
assert_eq!(unescape_double(r#"\48foo"#), Ok("\u{4}8foo".into()));
assert_eq!(unescape_double(r#"\87foo"#), Ok("\\87foo".into()));
}
}
}