also extra throw statements

This commit is contained in:
Robin Appelman 2024-07-20 18:44:05 +02:00
commit 73e9c08413
13 changed files with 42758 additions and 9021 deletions

View file

@ -90,6 +90,7 @@ version = "0.1.0"
dependencies = [
"cc",
"insta",
"memchr",
"serde",
"serde_json",
"test-case",

View file

@ -18,6 +18,7 @@ serde_json = "1.0.120"
walkdir = "2.5.0"
tree-sitter = "0.22.6"
tree-sitter-php = "0.22.7"
memchr = "2.7.4"
[build-dependencies]
cc = "1.1.6"

View file

@ -1,9 +1,12 @@
use crate::string::{unescape, DoubleQuoteString, SingleQuoteString};
use crate::{LogLevel, LoggingStatement};
use std::borrow::Cow;
use tree_sitter::{Language, Node, Parser, Query, QueryCursor};
/// Bundles the tree-sitter PHP language with the queries that are compiled
/// against it once, so they can be reused for every file that is scanned.
pub struct LogExtractor {
// Grammar the queries below were compiled for.
language: Language,
// Matches member call expressions, capturing the method name and its arguments.
method_query: Query,
// Matches `throw` expressions that create an object, capturing the constructor arguments.
throw_query: Query,
// Matches the string pieces found inside a call's arguments.
string_query: Query,
}
@ -12,18 +15,27 @@ impl LogExtractor {
let language = tree_sitter_php::language_php();
let method_query = Query::new(
&language,
r#"(
member_call_expression
r#"(member_call_expression
name: (name)@name
arguments: (arguments ((argument)+ @args))
arguments: (arguments ((argument)+ @arg))
)"#,
)
.expect("invalid query");
let string_query =
Query::new(&language, r#"(string_content)@string"#).expect("invalid query");
let throw_query = Query::new(
&language,
r#"(throw_expression
(object_creation_expression
(arguments ((argument)+ @arg))
)
)"#,
)
.expect("invalid query");
let string_query = Query::new(&language, r#"[(string_content)(escape_sequence)]@string"#)
.expect("invalid query");
LogExtractor {
language,
method_query,
throw_query,
string_query,
}
}
@ -43,17 +55,33 @@ impl LogExtractor {
let tree = parser.parse(code, None).expect("parse timeout or canceled");
let mut log_call_cursor = QueryCursor::new();
let mut throw_call_cursor = QueryCursor::new();
let log_calls = self.get_log_calls(&mut log_call_cursor, code, tree.root_node());
log_calls
let throw_calls = self.get_throw_calls(&mut throw_call_cursor, code, tree.root_node());
let mut all = log_calls
.chain(throw_calls)
.map(|call| {
let mut string_cursor = QueryCursor::new();
let message_parts = string_cursor
.matches(&self.string_query, call.arguments, code.as_bytes())
.map(|result| {
result.captures[0]
.node
.utf8_text(code.as_bytes())
.unwrap_or("malformed utf8")
let node = result.captures[0].node;
let raw = node.utf8_text(code.as_bytes()).unwrap_or("malformed utf8");
if raw.contains('\\') {
let start_char =
code.as_bytes()[node.parent().unwrap().byte_range().start];
Cow::Owned(
if start_char == b'"' {
unescape::<DoubleQuoteString>(raw)
} else {
unescape::<SingleQuoteString>(raw)
}
.unwrap(),
)
} else {
Cow::Borrowed(raw)
}
})
.collect();
@ -64,8 +92,10 @@ impl LogExtractor {
message_parts,
}
})
.collect::<Vec<_>>()
.into_iter()
.collect::<Vec<_>>();
all.sort_by_key(|statement| statement.line);
all.into_iter()
}
fn get_log_calls<'a>(
@ -91,6 +121,26 @@ impl LogExtractor {
})
})
}
/// Finds every `throw` of a newly created object below `node`.
///
/// Each match of `throw_query` becomes a [`LogCall`] with level
/// [`LogLevel::Exception`]. The first captured constructor argument is used
/// both as the `arguments` node and as the source of the reported line
/// (the zero-based row tree-sitter reports for that node).
///
/// The returned iterator borrows `cursor`, `code` and `self` for `'a`.
fn get_throw_calls<'a>(
    &'a self,
    cursor: &'a mut QueryCursor,
    code: &'a str,
    node: Node<'a>,
) -> impl Iterator<Item = LogCall> + 'a {
    cursor
        .matches(&self.throw_query, node, code.as_bytes())
        // Every match yields a call, so a plain `map` is enough; nothing is filtered out.
        .map(|throw_match| {
            let arguments = throw_match.captures[0].node;
            LogCall {
                level: LogLevel::Exception,
                line: arguments.start_position().row,
                arguments,
            }
        })
}
}
impl Default for LogExtractor {
@ -111,6 +161,7 @@ fn test_extract_logging() {
function test() {
$this->logger->warning("failed to find trash item for $rootTrashedItemName deleted at $rootTrashedItemDate in folder $groupFolderId", ['app' => 'groupfolders']);
$logger->info("foobar");
throw new FooException("foo \"bar\" \' {$blarg}");
}
?>
"#;
@ -123,9 +174,9 @@ fn test_extract_logging() {
line: 3,
level: LogLevel::Warn,
message_parts: vec![
"failed to find trash item for ",
" deleted at ",
" in folder "
"failed to find trash item for ".into(),
" deleted at ".into(),
" in folder ".into()
]
}
);
@ -135,7 +186,22 @@ fn test_extract_logging() {
path: "foo.php",
line: 4,
level: LogLevel::Info,
message_parts: vec!["foobar"]
message_parts: vec!["foobar".into()]
}
);
assert_eq!(
logs[2],
LoggingStatement {
path: "foo.php",
line: 5,
level: LogLevel::Exception,
message_parts: vec![
"foo ".into(),
"\"".into(),
"bar".into(),
"\"".into(),
" \\' ".into()
]
}
);
}

View file

@ -1,5 +1,5 @@
use std::fmt::{Display, Formatter};
use serde::{Deserialize, Deserializer, Serialize, Serializer};
use std::fmt::{Display, Formatter};
#[derive(Debug, Default, PartialEq)]
pub enum LogLevel {
@ -11,6 +11,7 @@ pub enum LogLevel {
Alert,
Critical,
Emergency,
Exception,
#[default]
Unknown,
}
@ -26,6 +27,7 @@ impl LogLevel {
"alert" => Some(LogLevel::Alert),
"critical" => Some(LogLevel::Critical),
"emergency" => Some(LogLevel::Emergency),
"exception" => Some(LogLevel::Exception),
"log" => Some(LogLevel::Unknown),
_ => None,
}
@ -41,6 +43,7 @@ impl LogLevel {
LogLevel::Alert => "alert",
LogLevel::Critical => "critical",
LogLevel::Emergency => "emergency",
LogLevel::Exception => "exception",
LogLevel::Unknown => "log",
}
}
@ -64,9 +67,9 @@ impl Display for LogLevel {
impl<'de> Deserialize<'de> for LogLevel {
fn deserialize<D>(deserializer: D) -> Result<Self, D::Error>
where
D: Deserializer<'de>
D: Deserializer<'de>,
{
let s = <&str>::deserialize(deserializer)?;
Ok(LogLevel::parse(s).unwrap_or_default())
}
}
}

View file

@ -1,6 +1,7 @@
use crate::error::Error;
use crate::extractor::LogExtractor;
use serde::{Deserialize, Serialize};
use std::borrow::Cow;
use std::fs::File;
use std::io::{Read, Write};
use tracing::error;
@ -9,6 +10,7 @@ use walkdir::WalkDir;
pub mod error;
pub mod extractor;
mod level;
pub mod string;
pub use level::LogLevel;
@ -17,7 +19,7 @@ pub struct LoggingStatement<'a> {
level: LogLevel,
path: &'a str,
line: usize,
message_parts: Vec<&'a str>,
message_parts: Vec<Cow<'a, str>>,
}
pub fn extract_dir<W: Write>(root: &str, mut output: W) -> Result<(), Error> {

View file

@ -0,0 +1,326 @@
#[derive(Debug, Clone, Eq, PartialEq)]
/// An error occurred while unescaping a string: an escape sequence was
/// incomplete or did not decode to a valid character, or the result was not
/// valid UTF-8.
pub struct UnescapeError;
/// Result alias shared by the unescaping routines in this module.
type UnescapeResult<T> = Result<T, UnescapeError>;
/// Output buffer for an unescape run.
///
/// Bytes are accumulated untouched and only validated as UTF-8 when the
/// buffer is turned into a `String` by `finalize`.
pub struct UnescapeState {
    // The accumulated output bytes
    out: Vec<u8>,
}

impl UnescapeState {
    /// Creates an empty buffer with room for `capacity` bytes.
    fn with_capacity(capacity: usize) -> UnescapeState {
        let out = Vec::with_capacity(capacity);
        Self { out }
    }

    /// Appends the UTF-8 encoding of `c`.
    fn push_char(&mut self, c: char) {
        // A `char` never needs more than four bytes in UTF-8.
        let mut scratch = [0u8; 4];
        let encoded = c.encode_utf8(&mut scratch);
        self.push_slice(encoded.as_bytes());
    }

    /// Appends a single raw byte.
    fn push_u8(&mut self, c: u8) {
        self.out.push(c);
    }

    /// Appends the character whose scalar value is `c`.
    ///
    /// Fails when `c` is not a valid Unicode scalar value.
    fn push_raw(&mut self, c: u32) -> UnescapeResult<()> {
        let decoded = std::char::from_u32(c).ok_or(UnescapeError)?;
        self.push_char(decoded);
        Ok(())
    }

    /// Appends `slice` verbatim.
    fn push_slice(&mut self, slice: &[u8]) {
        self.out.extend_from_slice(slice);
    }

    /// Consumes the buffer, failing when the collected bytes are not valid UTF-8.
    fn finalize(self) -> UnescapeResult<String> {
        match String::from_utf8(self.out) {
            Ok(text) => Ok(text),
            Err(_) => Err(UnescapeError),
        }
    }
}
/// Reads consecutive digits in `radix` from `s` and folds them into `result`.
///
/// * `result` is the starting value, letting the caller pass in a digit it
///   has already consumed.
/// * `max` limits how many digits are consumed; `None` means no limit and
///   `Some(0)` consumes nothing.
///
/// Parsing stops at the first byte that is not a digit in `radix`; that byte
/// is left in `s`. Returns `Err(UnescapeError)` when the value does not fit
/// in a `u32`.
fn parse_u32(
    s: &mut PeekableBytes,
    radix: u32,
    mut result: u32,
    max: Option<u8>,
) -> UnescapeResult<u32> {
    // Digits still allowed; `None` stays `None` for the whole loop (unlimited).
    let mut remaining = max;
    while remaining != Some(0) {
        let Some(digit) = s.peek().and_then(|byte| char::from(byte).to_digit(radix)) else {
            break;
        };
        let _ = s.next(); // consume the digit we peeked
        result = result.checked_mul(radix).ok_or(UnescapeError)?;
        result = result.checked_add(digit).ok_or(UnescapeError)?;
        if let Some(left) = remaining.as_mut() {
            // Cannot underflow: the loop condition guarantees `*left > 0`.
            *left -= 1;
        }
    }
    Ok(result)
}
/// The escape-sequence rules of one kind of string literal.
pub trait EscapedString {
/// Decodes the single escape sequence at the start of `bytes` (the
/// implementations in this file debug-assert that `bytes[0]` is a backslash),
/// appends the decoded output to `state` and returns the input that follows
/// the consumed sequence.
fn handle_escape<'a>(bytes: &'a [u8], state: &mut UnescapeState) -> UnescapeResult<&'a [u8]>;
}
/// Escape rules of single-quoted strings: only `\\` and `\'` are decoded,
/// every other backslash sequence is kept as written.
pub struct SingleQuoteString;

impl EscapedString for SingleQuoteString {
    /// Decodes the escape at the start of `bytes`; fails when the backslash
    /// is the last byte of the input.
    fn handle_escape<'a>(bytes: &'a [u8], state: &mut UnescapeState) -> UnescapeResult<&'a [u8]> {
        let mut cursor = PeekableBytes::new(bytes);
        let _backslash = cursor.next();
        debug_assert_eq!(_backslash, Some(b'\\'));

        let Some(escaped) = cursor.next() else {
            return Err(UnescapeError);
        };
        // Anything other than an escaped backslash or quote keeps its backslash.
        if !matches!(escaped, b'\\' | b'\'') {
            state.push_u8(b'\\');
        }
        state.push_u8(escaped);

        Ok(cursor.as_slice())
    }
}
/// Escape rules of PHP double-quoted strings.
pub struct DoubleQuoteString;

impl EscapedString for DoubleQuoteString {
    /// Decodes the escape sequence at the start of `bytes`.
    ///
    /// Recognised sequences: `\$`, `\"`, `\\`, `\n`, `\r`, `\t`, `\v`, `\e`,
    /// `\f`, `\xH[H]`, `\u{H+}` and octal `\O[O[O]]`. Anything else keeps its
    /// backslash, as PHP does. Values produced by `\x` and octal escapes are
    /// pushed as the Unicode scalar value with that number.
    ///
    /// # Errors
    ///
    /// Returns `UnescapeError` when the backslash is the last byte, when a
    /// `\u{` sequence is empty, unterminated or names an invalid scalar
    /// value, or when a numeric value overflows.
    fn handle_escape<'a>(bytes: &'a [u8], state: &mut UnescapeState) -> UnescapeResult<&'a [u8]> {
        let mut ins = PeekableBytes::new(bytes);
        let _next = ins.next();
        debug_assert_eq!(_next, Some(b'\\'));

        let d = ins.next().ok_or(UnescapeError)?;
        match d {
            b'$' | b'"' | b'\\' => state.push_u8(d),
            b'n' => state.push_u8(b'\n'),   // linefeed
            b'r' => state.push_u8(b'\r'),   // carriage return
            b't' => state.push_u8(b'\t'),   // tab
            b'v' => state.push_u8(b'\x0B'), // vertical tab
            b'e' => state.push_u8(b'\x1B'), // escape
            b'f' => state.push_u8(b'\x0C'), // form feed
            b'x' => {
                if matches!(ins.peek(), Some(b) if b.is_ascii_hexdigit()) {
                    let val = parse_u32(&mut ins, 16, 0, Some(2))?;
                    state.push_raw(val)?;
                } else {
                    // `\x` needs at least one hex digit; without one it is not an
                    // escape and is kept literally instead of decoding to NUL.
                    state.push_u8(b'\\');
                    state.push_u8(b'x');
                }
            }
            b'u' => {
                // Only peek: when no `{` follows, the next byte must stay in the input
                // so that e.g. a backslash starting another escape is still decoded.
                if ins.peek() == Some(b'{') {
                    let _ = ins.next();
                    // `\u{}` is invalid rather than a NUL character.
                    if !matches!(ins.peek(), Some(b) if b.is_ascii_hexdigit()) {
                        return Err(UnescapeError);
                    }
                    let val = parse_u32(&mut ins, 16, 0, None)?;
                    if ins.next() != Some(b'}') {
                        return Err(UnescapeError);
                    }
                    state.push_raw(val)?;
                } else {
                    state.push_u8(b'\\');
                    state.push_u8(b'u');
                }
            }
            b'0'..=b'7' => {
                // An octal escape is at most three digits long and `d` is the first one.
                let first = u32::from(d - b'0');
                let val = parse_u32(&mut ins, 8, first, Some(2))?;
                // PHP silently wraps octal values to a single byte ("\400" === "\000").
                state.push_raw(val & 0xFF)?;
            }
            _ => {
                state.push_u8(b'\\');
                state.push_u8(d)
            }
        }

        Ok(ins.as_slice())
    }
}
pub fn parse_string(literal: &str) -> Result<String, UnescapeError> {
let inner = &literal[1..(literal.len()) - 1];
if literal.bytes().next().unwrap() == b'\'' {
unescape::<SingleQuoteString>(inner)
} else {
unescape::<DoubleQuoteString>(inner)
}
}
/// Decodes every escape sequence in `s` according to the rules of `S`.
///
/// Text between escapes is copied unchanged; each backslash hands control to
/// `S::handle_escape`, which returns the input left after the sequence.
/// Fails when an escape cannot be decoded or the result is not valid UTF-8.
pub fn unescape<S: EscapedString>(s: &str) -> UnescapeResult<String> {
    let mut state = UnescapeState::with_capacity(s.len());
    let mut remaining = s.as_bytes();
    loop {
        match memchr::memchr(b'\\', remaining) {
            Some(backslash_at) => {
                let (literal, escape) = remaining.split_at(backslash_at);
                state.push_slice(literal);
                remaining = S::handle_escape(escape, &mut state)?;
            }
            None => {
                // No escape left: the tail is copied as-is.
                state.push_slice(remaining);
                return state.finalize();
            }
        }
    }
}
/// A cursor over a byte slice that can look at the next byte without
/// consuming it and hand back the unread remainder.
struct PeekableBytes<'a> {
    slice: &'a [u8],
    pos: usize,
}

impl<'a> Iterator for PeekableBytes<'a> {
    type Item = u8;

    /// Returns the byte under the cursor and advances past it.
    fn next(&mut self) -> Option<Self::Item> {
        let current = self.peek();
        if current.is_some() {
            self.pos += 1;
        }
        current
    }
}

impl<'a> PeekableBytes<'a> {
    /// Starts a cursor at the beginning of `slice`.
    pub fn new(slice: &'a [u8]) -> Self {
        Self { slice, pos: 0 }
    }

    /// Returns the byte under the cursor without advancing.
    pub fn peek(&self) -> Option<u8> {
        match self.slice.get(self.pos) {
            Some(&byte) => Some(byte),
            None => None,
        }
    }

    /// Returns everything that has not been consumed yet.
    pub fn as_slice(&self) -> &'a [u8] {
        let (_consumed, unread) = self.slice.split_at(self.pos);
        unread
    }
}
/// Returns `true` when `string` is written as a canonical decimal integer,
/// the form PHP converts to an integer when a string is used as an array key.
///
/// Canonical means: an optional leading `-`, at least one digit, no `+`
/// sign and no leading zeros. `"0"` qualifies, while `"-0"`, `"-"`, `"08"`
/// and the empty string do not.
///
/// NOTE(review): PHP also keeps keys as strings when the value does not fit
/// in its integer type; the magnitude is not checked here.
pub fn is_array_key_numeric(string: &str) -> bool {
    let digits = string.strip_prefix('-').unwrap_or(string);
    match digits.as_bytes() {
        // A lone zero is only canonical without a sign: "0" is an integer key, "-0" is not.
        [b'0'] => digits.len() == string.len(),
        [b'1'..=b'9', rest @ ..] => rest.iter().all(u8::is_ascii_digit),
        // Empty, a bare "-", a leading zero, or a non-digit first character.
        _ => false,
    }
}
// Unit tests for the array-key check and for both unescaping modes.
#[cfg(test)]
mod tests {
use super::*;
// Plain and negative integers are numeric keys; leading zeros, trailing
// garbage and an explicit plus sign are not.
#[test]
fn test_is_numeric() {
assert!(is_array_key_numeric("123"));
assert!(is_array_key_numeric("-123"));
assert!(is_array_key_numeric("0"));
assert!(!is_array_key_numeric("0123"));
assert!(!is_array_key_numeric("123asd"));
assert!(!is_array_key_numeric("+123"));
}
// Single-quote mode only decodes `\\` and `\'`, so every sequence below must
// come back with its backslash intact.
#[test]
fn test_unescape_single() {
assert_eq!(unescape::<SingleQuoteString>(r#"abc"#), Ok("abc".into()));
assert_eq!(
unescape::<SingleQuoteString>(r#"ab\nc"#),
Ok("ab\\nc".into())
);
assert_eq!(
unescape::<SingleQuoteString>(r#"ab\zc"#),
Ok("ab\\zc".into())
);
// An escaped double quote is not special in a single-quoted string.
assert_eq!(
unescape::<SingleQuoteString>(r#" \"abc\" "#),
Ok(" \\\"abc\\\" ".into())
);
// Multi-byte characters pass through, also directly after a backslash.
assert_eq!(unescape::<SingleQuoteString>(r#"𝄞"#), Ok("𝄞".into()));
assert_eq!(unescape::<SingleQuoteString>(r#"\𝄞"#), Ok("\\𝄞".into()));
// Hex escapes are left untouched in this mode.
assert_eq!(
unescape::<SingleQuoteString>(r#"\xD834\xDD1E"#),
Ok("\\xD834\\xDD1E".into())
);
assert_eq!(
unescape::<SingleQuoteString>(r#"\xD834"#),
Ok("\\xD834".into())
);
assert_eq!(
unescape::<SingleQuoteString>(r#"\xDD1E"#),
Ok("\\xDD1E".into())
);
// Input without any backslash is returned unchanged.
assert_eq!(unescape::<SingleQuoteString>("\t"), Ok("\t".into()));
}
// Double-quote mode decodes the character, hex, octal and `\u{...}` escapes.
#[test]
fn test_unescape_double() {
assert_eq!(unescape::<DoubleQuoteString>(r#"abc"#), Ok("abc".into()));
assert_eq!(
unescape::<DoubleQuoteString>(r#"ab\nc"#),
Ok("ab\nc".into())
);
// Unknown escapes keep their backslash.
assert_eq!(
unescape::<DoubleQuoteString>(r#"ab\zc"#),
Ok("ab\\zc".into())
);
assert_eq!(
unescape::<DoubleQuoteString>(r#" \"abc\" "#),
Ok(" \"abc\" ".into())
);
assert_eq!(unescape::<DoubleQuoteString>(r#"𝄞"#), Ok("𝄞".into()));
assert_eq!(unescape::<DoubleQuoteString>(r#"\𝄞"#), Ok("\\𝄞".into()));
assert_eq!(
unescape::<DoubleQuoteString>(r#"\u{1D11E}"#),
Ok("𝄞".into())
);
// `\x` consumes at most two hex digits; the remaining digits are plain text.
assert_eq!(
unescape::<DoubleQuoteString>(r#"\xD834"#),
Ok("\u{D8}34".into())
);
assert_eq!(
unescape::<DoubleQuoteString>(r#"\xDD1E"#),
Ok("\u{DD}1E".into())
);
assert_eq!(unescape::<DoubleQuoteString>(r#"\xD"#), Ok("\u{D}".into()));
assert_eq!(unescape::<DoubleQuoteString>("\t"), Ok("\t".into()));
// A `\u{` sequence without its closing brace must be rejected.
assert_eq!(
unescape::<DoubleQuoteString>(r#"\u{D834"#),
Err(UnescapeError)
);
// `\u` that is not followed by `{` is not an escape and is kept as written.
assert_eq!(
unescape::<DoubleQuoteString>(r#"\uD834"#),
Ok("\\uD834".into())
);
assert_eq!(unescape::<DoubleQuoteString>(r#"\u"#), Ok("\\u".into()));
// Octal escapes: `\47` is an apostrophe; `8` is not an octal digit, so
// `\48` decodes `\4` and leaves the `8`, and `\8` is no escape at all.
assert_eq!(
unescape::<DoubleQuoteString>(r#"\47foo"#),
Ok("'foo".into())
);
assert_eq!(
unescape::<DoubleQuoteString>(r#"\48foo"#),
Ok("\u{4}8foo".into())
);
assert_eq!(
unescape::<DoubleQuoteString>(r#"\87foo"#),
Ok("\\87foo".into())
);
// Values that are not valid scalar values, or that overflow while being
// parsed, are errors.
assert_eq!(
unescape::<DoubleQuoteString>(r#"\u{999999}"#),
Err(UnescapeError)
);
assert_eq!(
unescape::<DoubleQuoteString>(r#"\u{999999999999999999}"#),
Err(UnescapeError)
);
}
}

View file

@ -3,6 +3,88 @@ source: tests/snapshot.rs
expression: output
---
[
{
"level": "exception",
"path": "/DefaultShareProvider.php",
"line": 129,
"message_parts": [
"invalid share type!"
]
},
{
"level": "exception",
"path": "/DefaultShareProvider.php",
"line": 305,
"message_parts": [
"Group \"",
"\" does not exist"
]
},
{
"level": "exception",
"path": "/DefaultShareProvider.php",
"line": 309,
"message_parts": [
"Recipient not in receiving group"
]
},
{
"level": "exception",
"path": "/DefaultShareProvider.php",
"line": 339,
"message_parts": [
"Recipient does not match"
]
},
{
"level": "exception",
"path": "/DefaultShareProvider.php",
"line": 344,
"message_parts": [
"Invalid shareType"
]
},
{
"level": "exception",
"path": "/DefaultShareProvider.php",
"line": 431,
"message_parts": [
"Group \"",
"\" does not exist"
]
},
{
"level": "exception",
"path": "/DefaultShareProvider.php",
"line": 476,
"message_parts": [
"Recipient does not match"
]
},
{
"level": "exception",
"path": "/DefaultShareProvider.php",
"line": 482,
"message_parts": [
"Invalid shareType"
]
},
{
"level": "exception",
"path": "/DefaultShareProvider.php",
"line": 609,
"message_parts": [
"non-shallow getSharesInFolder is no longer supported"
]
},
{
"level": "exception",
"path": "/DefaultShareProvider.php",
"line": 963,
"message_parts": [
"Invalid backend"
]
},
{
"level": "error",
"path": "/DefaultShareProvider.php",