also extra throw statements

This commit is contained in:
Robin Appelman 2024-07-20 18:44:05 +02:00
commit 73e9c08413
13 changed files with 42758 additions and 9021 deletions

File diff suppressed because it is too large Load diff

File diff suppressed because it is too large Load diff

File diff suppressed because it is too large Load diff

File diff suppressed because it is too large Load diff

File diff suppressed because it is too large Load diff

File diff suppressed because it is too large Load diff

View file

@ -90,6 +90,7 @@ version = "0.1.0"
dependencies = [ dependencies = [
"cc", "cc",
"insta", "insta",
"memchr",
"serde", "serde",
"serde_json", "serde_json",
"test-case", "test-case",

View file

@ -18,6 +18,7 @@ serde_json = "1.0.120"
walkdir = "2.5.0" walkdir = "2.5.0"
tree-sitter = "0.22.6" tree-sitter = "0.22.6"
tree-sitter-php = "0.22.7" tree-sitter-php = "0.22.7"
memchr = "2.7.4"
[build-dependencies] [build-dependencies]
cc = "1.1.6" cc = "1.1.6"

View file

@ -1,9 +1,12 @@
use crate::string::{unescape, DoubleQuoteString, SingleQuoteString};
use crate::{LogLevel, LoggingStatement}; use crate::{LogLevel, LoggingStatement};
use std::borrow::Cow;
use tree_sitter::{Language, Node, Parser, Query, QueryCursor}; use tree_sitter::{Language, Node, Parser, Query, QueryCursor};
pub struct LogExtractor { pub struct LogExtractor {
language: Language, language: Language,
method_query: Query, method_query: Query,
throw_query: Query,
string_query: Query, string_query: Query,
} }
@ -12,18 +15,27 @@ impl LogExtractor {
let language = tree_sitter_php::language_php(); let language = tree_sitter_php::language_php();
let method_query = Query::new( let method_query = Query::new(
&language, &language,
r#"( r#"(member_call_expression
member_call_expression
name: (name)@name name: (name)@name
arguments: (arguments ((argument)+ @args)) arguments: (arguments ((argument)+ @arg))
)"#, )"#,
) )
.expect("invalid query"); .expect("invalid query");
let string_query = let throw_query = Query::new(
Query::new(&language, r#"(string_content)@string"#).expect("invalid query"); &language,
r#"(throw_expression
(object_creation_expression
(arguments ((argument)+ @arg))
)
)"#,
)
.expect("invalid query");
let string_query = Query::new(&language, r#"[(string_content)(escape_sequence)]@string"#)
.expect("invalid query");
LogExtractor { LogExtractor {
language, language,
method_query, method_query,
throw_query,
string_query, string_query,
} }
} }
@ -43,17 +55,33 @@ impl LogExtractor {
let tree = parser.parse(code, None).expect("parse timeout or canceled"); let tree = parser.parse(code, None).expect("parse timeout or canceled");
let mut log_call_cursor = QueryCursor::new(); let mut log_call_cursor = QueryCursor::new();
let mut throw_call_cursor = QueryCursor::new();
let log_calls = self.get_log_calls(&mut log_call_cursor, code, tree.root_node()); let log_calls = self.get_log_calls(&mut log_call_cursor, code, tree.root_node());
log_calls let throw_calls = self.get_throw_calls(&mut throw_call_cursor, code, tree.root_node());
let mut all = log_calls
.chain(throw_calls)
.map(|call| { .map(|call| {
let mut string_cursor = QueryCursor::new(); let mut string_cursor = QueryCursor::new();
let message_parts = string_cursor let message_parts = string_cursor
.matches(&self.string_query, call.arguments, code.as_bytes()) .matches(&self.string_query, call.arguments, code.as_bytes())
.map(|result| { .map(|result| {
result.captures[0] let node = result.captures[0].node;
.node let raw = node.utf8_text(code.as_bytes()).unwrap_or("malformed utf8");
.utf8_text(code.as_bytes())
.unwrap_or("malformed utf8") if raw.contains('\\') {
let start_char =
code.as_bytes()[node.parent().unwrap().byte_range().start];
Cow::Owned(
if start_char == b'"' {
unescape::<DoubleQuoteString>(raw)
} else {
unescape::<SingleQuoteString>(raw)
}
.unwrap(),
)
} else {
Cow::Borrowed(raw)
}
}) })
.collect(); .collect();
@ -64,8 +92,10 @@ impl LogExtractor {
message_parts, message_parts,
} }
}) })
.collect::<Vec<_>>() .collect::<Vec<_>>();
.into_iter()
all.sort_by_key(|statement| statement.line);
all.into_iter()
} }
fn get_log_calls<'a>( fn get_log_calls<'a>(
@ -91,6 +121,26 @@ impl LogExtractor {
}) })
}) })
} }
/// Find the `throw new Foo(...)` expressions below `node` and describe each as a [`LogCall`].
///
/// Every match of `throw_query` yields one `LogCall` with level [`LogLevel::Exception`].
/// `arguments` is the first node captured by the query (`@arg`, an argument of the
/// constructor call) and `line` is the row tree-sitter reports for the start of that node
/// (NOTE(review): tree-sitter rows are presumably zero-based — confirm against callers).
///
/// The returned iterator borrows `cursor`, `code` and `self` for as long as it is used.
fn get_throw_calls<'a>(
    &'a self,
    cursor: &'a mut QueryCursor,
    code: &'a str,
    node: Node<'a>,
) -> impl Iterator<Item = LogCall> + 'a {
    cursor
        .matches(&self.throw_query, node, code.as_bytes())
        // Every match produces a call, so a plain `map` is enough (no filtering happens).
        .map(|throw_match| {
            let arguments = throw_match.captures[0].node;
            LogCall {
                level: LogLevel::Exception,
                line: arguments.start_position().row,
                arguments,
            }
        })
}
} }
impl Default for LogExtractor { impl Default for LogExtractor {
@ -111,6 +161,7 @@ fn test_extract_logging() {
function test() { function test() {
$this->logger->warning("failed to find trash item for $rootTrashedItemName deleted at $rootTrashedItemDate in folder $groupFolderId", ['app' => 'groupfolders']); $this->logger->warning("failed to find trash item for $rootTrashedItemName deleted at $rootTrashedItemDate in folder $groupFolderId", ['app' => 'groupfolders']);
$logger->info("foobar"); $logger->info("foobar");
throw new FooException("foo \"bar\" \' {$blarg}");
} }
?> ?>
"#; "#;
@ -123,9 +174,9 @@ fn test_extract_logging() {
line: 3, line: 3,
level: LogLevel::Warn, level: LogLevel::Warn,
message_parts: vec![ message_parts: vec![
"failed to find trash item for ", "failed to find trash item for ".into(),
" deleted at ", " deleted at ".into(),
" in folder " " in folder ".into()
] ]
} }
); );
@ -135,7 +186,22 @@ fn test_extract_logging() {
path: "foo.php", path: "foo.php",
line: 4, line: 4,
level: LogLevel::Info, level: LogLevel::Info,
message_parts: vec!["foobar"] message_parts: vec!["foobar".into()]
}
);
assert_eq!(
logs[2],
LoggingStatement {
path: "foo.php",
line: 5,
level: LogLevel::Exception,
message_parts: vec![
"foo ".into(),
"\"".into(),
"bar".into(),
"\"".into(),
" \\' ".into()
]
} }
); );
} }

View file

@ -1,5 +1,5 @@
use std::fmt::{Display, Formatter};
use serde::{Deserialize, Deserializer, Serialize, Serializer}; use serde::{Deserialize, Deserializer, Serialize, Serializer};
use std::fmt::{Display, Formatter};
#[derive(Debug, Default, PartialEq)] #[derive(Debug, Default, PartialEq)]
pub enum LogLevel { pub enum LogLevel {
@ -11,6 +11,7 @@ pub enum LogLevel {
Alert, Alert,
Critical, Critical,
Emergency, Emergency,
Exception,
#[default] #[default]
Unknown, Unknown,
} }
@ -26,6 +27,7 @@ impl LogLevel {
"alert" => Some(LogLevel::Alert), "alert" => Some(LogLevel::Alert),
"critical" => Some(LogLevel::Critical), "critical" => Some(LogLevel::Critical),
"emergency" => Some(LogLevel::Emergency), "emergency" => Some(LogLevel::Emergency),
"exception" => Some(LogLevel::Exception),
"log" => Some(LogLevel::Unknown), "log" => Some(LogLevel::Unknown),
_ => None, _ => None,
} }
@ -41,6 +43,7 @@ impl LogLevel {
LogLevel::Alert => "alert", LogLevel::Alert => "alert",
LogLevel::Critical => "critical", LogLevel::Critical => "critical",
LogLevel::Emergency => "emergency", LogLevel::Emergency => "emergency",
LogLevel::Exception => "exception",
LogLevel::Unknown => "log", LogLevel::Unknown => "log",
} }
} }
@ -64,7 +67,7 @@ impl Display for LogLevel {
impl<'de> Deserialize<'de> for LogLevel { impl<'de> Deserialize<'de> for LogLevel {
fn deserialize<D>(deserializer: D) -> Result<Self, D::Error> fn deserialize<D>(deserializer: D) -> Result<Self, D::Error>
where where
D: Deserializer<'de> D: Deserializer<'de>,
{ {
let s = <&str>::deserialize(deserializer)?; let s = <&str>::deserialize(deserializer)?;
Ok(LogLevel::parse(s).unwrap_or_default()) Ok(LogLevel::parse(s).unwrap_or_default())

View file

@ -1,6 +1,7 @@
use crate::error::Error; use crate::error::Error;
use crate::extractor::LogExtractor; use crate::extractor::LogExtractor;
use serde::{Deserialize, Serialize}; use serde::{Deserialize, Serialize};
use std::borrow::Cow;
use std::fs::File; use std::fs::File;
use std::io::{Read, Write}; use std::io::{Read, Write};
use tracing::error; use tracing::error;
@ -9,6 +10,7 @@ use walkdir::WalkDir;
pub mod error; pub mod error;
pub mod extractor; pub mod extractor;
mod level; mod level;
pub mod string;
pub use level::LogLevel; pub use level::LogLevel;
@ -17,7 +19,7 @@ pub struct LoggingStatement<'a> {
level: LogLevel, level: LogLevel,
path: &'a str, path: &'a str,
line: usize, line: usize,
message_parts: Vec<&'a str>, message_parts: Vec<Cow<'a, str>>,
} }
pub fn extract_dir<W: Write>(root: &str, mut output: W) -> Result<(), Error> { pub fn extract_dir<W: Write>(root: &str, mut output: W) -> Result<(), Error> {

View file

@ -0,0 +1,326 @@
/// An error occurred while unescaping a string literal.
///
/// The helpers in this module return it when an escape sequence is cut short or malformed,
/// when an escape names a code point that is not a valid `char`, when a numeric escape
/// overflows, or when the unescaped bytes are not valid UTF-8.
#[derive(Debug, Clone, Eq, PartialEq)]
pub struct UnescapeError;

impl std::fmt::Display for UnescapeError {
    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
        f.write_str("invalid escape sequence in string literal")
    }
}

// Implementing `Error` lets callers propagate this with `?` into `Box<dyn Error>` and friends.
impl std::error::Error for UnescapeError {}

/// Shorthand for results whose only failure mode is [`UnescapeError`].
type UnescapeResult<T> = Result<T, UnescapeError>;
/// Output buffer that the escape handlers write into while a string is being unescaped.
///
/// Bytes are collected as-is; they are only checked for UTF-8 validity when the buffer is
/// turned into a `String` by `finalize`.
pub struct UnescapeState {
    /// Bytes produced so far.
    out: Vec<u8>,
}

impl UnescapeState {
    /// Create an empty buffer with room for `capacity` bytes.
    fn with_capacity(capacity: usize) -> UnescapeState {
        let out = Vec::with_capacity(capacity);
        UnescapeState { out }
    }

    /// Append `c` in its UTF-8 encoding.
    fn push_char(&mut self, c: char) {
        // A `char` never needs more than four bytes in UTF-8.
        let mut utf8 = [0u8; 4];
        let encoded = c.encode_utf8(&mut utf8);
        self.push_slice(encoded.as_bytes());
    }

    /// Append a single raw byte.
    fn push_u8(&mut self, c: u8) {
        self.out.push(c);
    }

    /// Append the character with code point `c`; fails if `c` is not a valid `char`.
    fn push_raw(&mut self, c: u32) -> UnescapeResult<()> {
        let decoded = std::char::from_u32(c).ok_or(UnescapeError)?;
        self.push_char(decoded);
        Ok(())
    }

    /// Append a run of raw bytes.
    fn push_slice(&mut self, slice: &[u8]) {
        self.out.extend_from_slice(slice);
    }

    /// Consume the buffer and return its content, failing if it is not valid UTF-8.
    fn finalize(self) -> UnescapeResult<String> {
        match String::from_utf8(self.out) {
            Ok(text) => Ok(text),
            Err(_) => Err(UnescapeError),
        }
    }
}
/// Parse a run of `radix` digits from the front of `s`, continuing the number in `result`.
///
/// Digits are consumed from `s` until a byte that is not a digit in `radix` is reached, the
/// input ends, or `max` digits have been read (`None` allows up to `u8::MAX` digits). With
/// `max == Some(0)` nothing is consumed and `result` is returned unchanged.
///
/// Returns the accumulated value, or `UnescapeError` if it does not fit in a `u32`.
fn parse_u32(
    s: &mut PeekableBytes,
    radix: u32,
    mut result: u32,
    max: Option<u8>,
) -> UnescapeResult<u32> {
    let limit = max.unwrap_or(u8::MAX);
    // Counting up to the limit (rather than decrementing after each digit) keeps a limit of
    // zero from consuming a digit and underflowing the counter.
    for _ in 0..limit {
        let digit = match s.peek().and_then(|byte| char::from(byte).to_digit(radix)) {
            Some(digit) => digit,
            None => break,
        };
        let _ = s.next(); // consume the digit we peeked
        result = result
            .checked_mul(radix)
            .and_then(|shifted| shifted.checked_add(digit))
            .ok_or(UnescapeError)?;
    }
    Ok(result)
}
/// Escape-sequence rules for one kind of quoted string; used as the type parameter of `unescape`.
pub trait EscapedString {
/// Decode the single escape sequence at the start of `bytes`.
///
/// The implementations in this file expect `bytes[0]` to be the backslash (checked with a
/// debug assertion), write the decoded output to `state`, and return the part of `bytes`
/// that follows the escape sequence. They return `UnescapeError` when the sequence is cut
/// short or malformed.
fn handle_escape<'a>(bytes: &'a [u8], state: &mut UnescapeState) -> UnescapeResult<&'a [u8]>;
}
/// Escape rules for single-quoted strings: only `\\` and `\'` are escape sequences.
pub struct SingleQuoteString;

impl EscapedString for SingleQuoteString {
    /// Decode the backslash sequence at the start of `bytes`.
    ///
    /// `\\` and `\'` produce the escaped byte alone; any other byte after the backslash is
    /// written out together with the backslash. Returns the bytes after the two-byte
    /// sequence, or `UnescapeError` when nothing follows the backslash.
    fn handle_escape<'a>(bytes: &'a [u8], state: &mut UnescapeState) -> UnescapeResult<&'a [u8]> {
        debug_assert_eq!(bytes.first().copied(), Some(b'\\'));
        let (escaped, rest) = match bytes {
            [_, escaped, rest @ ..] => (*escaped, rest),
            _ => return Err(UnescapeError),
        };
        // Anything other than an escaped backslash or quote keeps its backslash.
        if escaped != b'\\' && escaped != b'\'' {
            state.push_u8(b'\\');
        }
        state.push_u8(escaped);
        Ok(rest)
    }
}
/// Escape rules for double-quoted PHP strings.
pub struct DoubleQuoteString;

impl EscapedString for DoubleQuoteString {
    /// Decode the backslash sequence at the start of `bytes`.
    ///
    /// Recognised sequences:
    /// - `\$`, `\"`, `\\`: the escaped byte itself
    /// - `\n`, `\r`, `\t`, `\v`, `\e`, `\f`: the matching control character
    /// - `\x` followed by one or two hex digits
    /// - `\u{...}` with at least one hex digit: the Unicode code point
    /// - `\` followed by one to three octal digits, wrapped to fit in a byte
    ///
    /// Values from `\x` and octal escapes are written as the character with that code
    /// point. Anything else (including `\x` without digits and `\u` without `{`) is not an
    /// escape and is written out unchanged, backslash included.
    ///
    /// Returns the bytes following the sequence. Fails with `UnescapeError` when nothing
    /// follows the backslash, when `\u{` is not followed by hex digits and a closing `}`,
    /// or when the code point is not a valid `char`.
    fn handle_escape<'a>(bytes: &'a [u8], state: &mut UnescapeState) -> UnescapeResult<&'a [u8]> {
        let mut ins = PeekableBytes::new(bytes);
        let _slash = ins.next();
        debug_assert_eq!(_slash, Some(b'\\'));

        let escaped = ins.next().ok_or(UnescapeError)?;
        match escaped {
            b'$' | b'"' | b'\\' => state.push_u8(escaped),
            b'n' => state.push_u8(b'\n'),   // linefeed
            b'r' => state.push_u8(b'\r'),   // carriage return
            b't' => state.push_u8(b'\t'),   // tab
            b'v' => state.push_u8(b'\x0B'), // vertical tab
            b'e' => state.push_u8(b'\x1B'), // escape
            b'f' => state.push_u8(b'\x0C'), // form feed
            b'x' => {
                if matches!(ins.peek(), Some(byte) if byte.is_ascii_hexdigit()) {
                    let val = parse_u32(&mut ins, 16, 0, Some(2))?;
                    state.push_raw(val)?;
                } else {
                    // `\x` needs at least one hex digit; without one it is plain text.
                    state.push_slice(b"\\x");
                }
            }
            b'u' => {
                if ins.peek() == Some(b'{') {
                    let _ = ins.next(); // consume the `{`
                    // `\u{}` is invalid: at least one hex digit is required.
                    if !matches!(ins.peek(), Some(byte) if byte.is_ascii_hexdigit()) {
                        return Err(UnescapeError);
                    }
                    let val = parse_u32(&mut ins, 16, 0, None)?;
                    state.push_raw(val)?;
                    if ins.next() != Some(b'}') {
                        return Err(UnescapeError);
                    }
                } else {
                    // Not a code point escape. The following byte is deliberately left
                    // unconsumed so that a backslash right after `\u` still starts its own
                    // escape sequence.
                    state.push_slice(b"\\u");
                }
            }
            b'0'..=b'7' => {
                // One to three octal digits in total: this one plus at most two more.
                let first = u32::from(escaped - b'0');
                let val = parse_u32(&mut ins, 8, first, Some(2))?;
                // Three digits can exceed a byte; PHP wraps silently ("\400" === "\000").
                state.push_raw(val & 0xFF)?;
            }
            _ => {
                state.push_u8(b'\\');
                state.push_u8(escaped);
            }
        }
        Ok(ins.as_slice())
    }
}
pub fn parse_string(literal: &str) -> Result<String, UnescapeError> {
let inner = &literal[1..(literal.len()) - 1];
if literal.bytes().next().unwrap() == b'\'' {
unescape::<SingleQuoteString>(inner)
} else {
unescape::<DoubleQuoteString>(inner)
}
}
/// Unescape the content of a string literal (without its quotes) using the rules of `S`.
///
/// Text between backslashes is copied unchanged; each backslash sequence is handed to
/// `S::handle_escape`, and scanning resumes after whatever that call consumed. Fails with
/// `UnescapeError` if an escape sequence is rejected or the result is not valid UTF-8.
pub fn unescape<S: EscapedString>(s: &str) -> UnescapeResult<String> {
    let mut remaining = s.as_bytes();
    let mut state = UnescapeState::with_capacity(remaining.len());

    while let Some(backslash_at) = memchr::memchr(b'\\', remaining) {
        let (plain, escaped) = remaining.split_at(backslash_at);
        state.push_slice(plain);
        remaining = S::handle_escape(escaped, &mut state)?;
    }

    // Whatever follows the last escape sequence is plain text.
    state.push_slice(remaining);
    state.finalize()
}
/// Cursor over a byte slice that can look at the next byte without consuming it.
struct PeekableBytes<'a> {
    /// The complete input.
    slice: &'a [u8],
    /// Index of the next byte to hand out; never larger than `slice.len()`.
    pos: usize,
}

impl<'a> Iterator for PeekableBytes<'a> {
    type Item = u8;

    /// Return the next byte and advance, or `None` once the input is used up.
    fn next(&mut self) -> Option<Self::Item> {
        let byte = self.peek();
        if byte.is_some() {
            self.pos += 1;
        }
        byte
    }
}

impl<'a> PeekableBytes<'a> {
    /// Start reading at the beginning of `slice`.
    pub fn new(slice: &'a [u8]) -> Self {
        Self { slice, pos: 0 }
    }

    /// Return the next byte without consuming it.
    pub fn peek(&self) -> Option<u8> {
        self.as_slice().first().copied()
    }

    /// Return the bytes that have not been consumed yet.
    pub fn as_slice(&self) -> &'a [u8] {
        let (_consumed, remaining) = self.slice.split_at(self.pos);
        remaining
    }
}
/// Report whether `string` is written like a canonical decimal integer.
///
/// Accepted are `"0"` and an optional leading `-` followed by a digit `1`-`9` and any number
/// of further digits. Rejected are the empty string, a lone `-`, a leading `+`, leading
/// zeros (`"0123"`, `"-0"`) and anything containing a non-digit. This mirrors the form PHP
/// requires before it casts a string array key to an integer.
///
/// NOTE(review): the magnitude is not checked, so digit strings too large for PHP's integer
/// type are still accepted.
pub fn is_array_key_numeric(string: &str) -> bool {
    let digits = string.strip_prefix('-').unwrap_or(string).as_bytes();
    match digits {
        // A single zero is only canonical without a sign.
        [b'0'] => digits.len() == string.len(),
        [b'1'..=b'9', rest @ ..] => rest.iter().all(u8::is_ascii_digit),
        _ => false,
    }
}
#[cfg(test)]
mod tests {
    use super::*;

    /// Assert that every `(input, expected)` pair unescapes successfully under rule set `S`.
    fn assert_unescapes<S: EscapedString>(cases: &[(&str, &str)]) {
        for &(input, expected) in cases {
            assert_eq!(
                unescape::<S>(input),
                Ok(expected.to_string()),
                "input: {:?}",
                input
            );
        }
    }

    /// Assert that every input is rejected under rule set `S`.
    fn assert_rejects<S: EscapedString>(inputs: &[&str]) {
        for &input in inputs {
            assert_eq!(unescape::<S>(input), Err(UnescapeError), "input: {:?}", input);
        }
    }

    #[test]
    fn test_is_numeric() {
        for &key in &["123", "-123", "0"] {
            assert!(is_array_key_numeric(key), "key: {:?}", key);
        }
        for &key in &["0123", "123asd", "+123"] {
            assert!(!is_array_key_numeric(key), "key: {:?}", key);
        }
    }

    #[test]
    fn test_unescape_single() {
        // Only `\\` and `\'` are escapes; every other backslash stays in the output.
        assert_unescapes::<SingleQuoteString>(&[
            (r#"abc"#, "abc"),
            (r#"ab\nc"#, "ab\\nc"),
            (r#"ab\zc"#, "ab\\zc"),
            (r#" \"abc\" "#, " \\\"abc\\\" "),
            (r#"𝄞"#, "𝄞"),
            (r#"\𝄞"#, "\\𝄞"),
            (r#"\xD834\xDD1E"#, "\\xD834\\xDD1E"),
            (r#"\xD834"#, "\\xD834"),
            (r#"\xDD1E"#, "\\xDD1E"),
            ("\t", "\t"),
        ]);
    }

    #[test]
    fn test_unescape_double() {
        assert_unescapes::<DoubleQuoteString>(&[
            (r#"abc"#, "abc"),
            (r#"ab\nc"#, "ab\nc"),
            (r#"ab\zc"#, "ab\\zc"),
            (r#" \"abc\" "#, " \"abc\" "),
            (r#"𝄞"#, "𝄞"),
            (r#"\𝄞"#, "\\𝄞"),
            (r#"\u{1D11E}"#, "𝄞"),
            // `\x` takes at most two hex digits; the rest is plain text.
            (r#"\xD834"#, "\u{D8}34"),
            (r#"\xDD1E"#, "\u{DD}1E"),
            (r#"\xD"#, "\u{D}"),
            ("\t", "\t"),
            // `\u` without braces is not an escape.
            (r#"\uD834"#, "\\uD834"),
            (r#"\u"#, "\\u"),
            // Octal escapes stop at the first non-octal digit.
            (r#"\47foo"#, "'foo"),
            (r#"\48foo"#, "\u{4}8foo"),
            (r#"\87foo"#, "\\87foo"),
        ]);

        // Unterminated, out-of-range and overflowing code point escapes.
        assert_rejects::<DoubleQuoteString>(&[
            r#"\u{D834"#,
            r#"\u{999999}"#,
            r#"\u{999999999999999999}"#,
        ]);
    }
}

View file

@ -3,6 +3,88 @@ source: tests/snapshot.rs
expression: output expression: output
--- ---
[ [
{
"level": "exception",
"path": "/DefaultShareProvider.php",
"line": 129,
"message_parts": [
"invalid share type!"
]
},
{
"level": "exception",
"path": "/DefaultShareProvider.php",
"line": 305,
"message_parts": [
"Group \"",
"\" does not exist"
]
},
{
"level": "exception",
"path": "/DefaultShareProvider.php",
"line": 309,
"message_parts": [
"Recipient not in receiving group"
]
},
{
"level": "exception",
"path": "/DefaultShareProvider.php",
"line": 339,
"message_parts": [
"Recipient does not match"
]
},
{
"level": "exception",
"path": "/DefaultShareProvider.php",
"line": 344,
"message_parts": [
"Invalid shareType"
]
},
{
"level": "exception",
"path": "/DefaultShareProvider.php",
"line": 431,
"message_parts": [
"Group \"",
"\" does not exist"
]
},
{
"level": "exception",
"path": "/DefaultShareProvider.php",
"line": 476,
"message_parts": [
"Recipient does not match"
]
},
{
"level": "exception",
"path": "/DefaultShareProvider.php",
"line": 482,
"message_parts": [
"Invalid shareType"
]
},
{
"level": "exception",
"path": "/DefaultShareProvider.php",
"line": 609,
"message_parts": [
"non-shallow getSharesInFolder is no longer supported"
]
},
{
"level": "exception",
"path": "/DefaultShareProvider.php",
"line": 963,
"message_parts": [
"Invalid backend"
]
},
{ {
"level": "error", "level": "error",
"path": "/DefaultShareProvider.php", "path": "/DefaultShareProvider.php",