generate better regexes and capture placeholder names

This commit is contained in:
Robin Appelman 2024-07-21 15:02:41 +02:00
commit 04e391aea1
15 changed files with 21695 additions and 65312 deletions

View file

@ -115,6 +115,8 @@ dependencies = [
"databake",
"insta",
"memchr",
"regex",
"regex-syntax",
"serde",
"serde_json",
"test-case",

View file

@ -20,6 +20,8 @@ tree-sitter = "0.22.6"
tree-sitter-php = "0.22.7"
memchr = "2.7.4"
databake = { version = "0.1.8", features = ["derive"] }
regex-syntax = "0.8.4"
regex = "1.10.5"
[build-dependencies]
cc = "1.1.6"

View file

@ -1,5 +1,4 @@
use databake::Bake;
use std::borrow::Cow;
#[derive(Debug, Default, PartialEq, Clone, Copy, Bake)]
#[databake(path = crate)]
@ -40,34 +39,43 @@ pub struct LoggingStatement<'a> {
pub level: LogLevel,
pub path: &'a str,
pub line: usize,
pub message_parts: &'a [&'a str],
pub placeholders: &'a [&'a str],
pub regex: &'a str,
}
/// Builds an anchored regular expression matching a message made of `parts`.
///
/// Literal parts are regex-escaped so they match verbatim, and every
/// placeholder becomes a greedy `(.*)` capture group. The pattern is wrapped
/// in `^` … `$`, so it only matches a complete message.
fn build_pattern(parts: &[crate::MessagePart]) -> String {
    let mut pattern = String::with_capacity(128);
    pattern.push('^');
    for part in parts {
        match part {
            // Escape straight into the output buffer instead of allocating a
            // temporary string per literal.
            crate::MessagePart::Literal(literal) => {
                regex_syntax::escape_into(literal, &mut pattern);
            }
            // The placeholder's name is not encoded in the pattern; only its
            // position (capture group index) is.
            crate::MessagePart::PlaceHolder(_) => pattern.push_str("(.*)"),
        }
    }
    pattern.push('$');
    pattern
}
/// Appends the databake source representation of `statement` to `output`.
///
/// The placeholder names are collected from the statement's message parts and
/// the matching regex is produced by `build_pattern`; both are stored in the
/// borrowed `LoggingStatement` that gets baked.
pub fn bake_statement(output: &mut String, statement: &crate::LoggingStatement) {
// NOTE(review): `Cow::as_ref` looks like the pre-change form of this function
// (message parts as `Cow<str>`) — confirm whether this line is still live.
let message_parts: Vec<_> = statement.message_parts.iter().map(Cow::as_ref).collect();
// Keep only the placeholder parts, borrowing their text.
let placeholders: Vec<_> = statement
.message_parts
.iter()
.filter_map(|part| match part {
crate::MessagePart::PlaceHolder(placeholder) => Some(placeholder.as_str()),
_ => None,
})
.collect();
let pattern = build_pattern(&statement.message_parts);
// Shadow the input with the borrowed, bakeable form of the statement.
let statement = LoggingStatement {
level: statement.level.into(),
path: statement.path,
line: statement.line,
// NOTE(review): presumably paired with the `message_parts` local above — confirm.
message_parts: &message_parts,
placeholders: &placeholders,
regex: &pattern,
};
output.push_str(&statement.bake(&Default::default()).to_string());
}
// Exercises databake's `test_bake!` macro on a `LoggingStatement` value.
// Only compiled when the `bake` feature is enabled.
#[cfg(feature = "bake")]
mod bake_test {
#[test]
fn test_bake() {
use databake::test_bake;
// NOTE(review): the literal below sets only `message_parts`, while the struct
// definition also lists `placeholders` and `regex` — confirm this still compiles.
test_bake!(
crate::LoggingStatement,
const: crate::LoggingStatement {
level: crate::LogLevel::Debug,
path: "foo",
line: 12usize,
message_parts: &["part1", "part2"]
},
cloud_log_analyser,
);
}
}

View file

@ -1,13 +1,11 @@
use crate::string::{unescape, DoubleQuoteString, SingleQuoteString};
use crate::{LogLevel, LoggingStatement};
use std::borrow::Cow;
use crate::{LogLevel, LoggingStatement, MessagePart};
use tree_sitter::{Language, Node, Parser, Query, QueryCursor};
/// Holds the tree-sitter language together with the queries used to find
/// log statements in source code.
pub struct LogExtractor {
/// Grammar that the queries are created with.
language: Language,
/// NOTE(review): presumably matches logger method calls — query text not visible here.
method_query: Query,
/// NOTE(review): presumably matches `throw` expressions — query text not visible here.
throw_query: Query,
/// Matches `string_content` and `escape_sequence` nodes (captured as `@string`).
string_query: Query,
}
impl LogExtractor {
@ -30,13 +28,10 @@ impl LogExtractor {
)"#,
)
.expect("invalid query");
let string_query = Query::new(&language, r#"[(string_content)(escape_sequence)]@string"#)
.expect("invalid query");
LogExtractor {
language,
method_query,
throw_query,
string_query,
}
}
@ -56,41 +51,53 @@ impl LogExtractor {
let mut log_call_cursor = QueryCursor::new();
let mut throw_call_cursor = QueryCursor::new();
let mut tree_cursor = tree.walk();
let log_calls = self.get_log_calls(&mut log_call_cursor, code, tree.root_node());
let throw_calls = self.get_throw_calls(&mut throw_call_cursor, code, tree.root_node());
let mut all = log_calls
.chain(throw_calls)
.map(|call| {
let mut string_cursor = QueryCursor::new();
let message_parts = string_cursor
.matches(&self.string_query, call.arguments, code.as_bytes())
.map(|result| {
let node = result.captures[0].node;
let raw = node.utf8_text(code.as_bytes()).unwrap_or("malformed utf8");
.filter_map(|call| {
let argument = call.arguments.child(0)?;
if argument.grammar_name() != "string"
&& argument.grammar_name() != "encapsed_string"
{
return None;
}
let mut argument_string_parts = argument.children(&mut tree_cursor);
let is_double_quote = argument_string_parts.next()?.grammar_name() == r#"""#;
let mut message_builder =
MessageBuilder::with_capacity(argument_string_parts.len());
if raw.contains('\\') {
let start_char =
code.as_bytes()[node.parent().unwrap().byte_range().start];
Cow::Owned(
if start_char == b'"' {
unescape::<DoubleQuoteString>(raw)
} else {
unescape::<SingleQuoteString>(raw)
}
.unwrap(),
)
} else {
Cow::Borrowed(raw)
for string_part in argument_string_parts {
match string_part.grammar_name() {
"string_content" => {
let content = string_part.utf8_text(code.as_bytes()).unwrap();
message_builder.push_literal(content);
}
})
.collect();
"escape_sequence" => {
let raw = string_part.utf8_text(code.as_bytes()).unwrap();
let content = if is_double_quote {
unescape::<DoubleQuoteString>(raw)
} else {
unescape::<SingleQuoteString>(raw)
}
.unwrap();
message_builder.push_literal(&content);
}
r#"'"# | r#"""# | r#"{"# | r#"}"# => {}
_ => {
let placeholder = string_part.utf8_text(code.as_bytes()).unwrap();
message_builder.push_placeholder(placeholder);
}
}
}
LoggingStatement {
Some(LoggingStatement {
level: call.level,
line: call.line + 1,
path,
message_parts,
}
message_parts: message_builder.0,
})
})
.collect::<Vec<_>>();
@ -155,13 +162,33 @@ struct LogCall<'tree> {
arguments: Node<'tree>,
}
/// Accumulates the parts of a message while a string's child nodes are walked.
///
/// Adjacent literal fragments are merged into a single `MessagePart::Literal`.
struct MessageBuilder(Vec<MessagePart>);

impl MessageBuilder {
    /// Creates an empty builder with room reserved for `cap` parts.
    pub fn with_capacity(cap: usize) -> Self {
        Self(Vec::with_capacity(cap))
    }

    /// Appends literal text, extending the trailing literal if the last part
    /// is one, otherwise starting a new literal part.
    pub fn push_literal(&mut self, content: &str) {
        match self.0.last_mut() {
            Some(MessagePart::Literal(tail)) => tail.push_str(content),
            _ => self.0.push(MessagePart::Literal(content.to_owned())),
        }
    }

    /// Appends a placeholder part holding `placeholder` as its text.
    pub fn push_placeholder(&mut self, placeholder: &str) {
        let part = MessagePart::PlaceHolder(placeholder.to_owned());
        self.0.push(part);
    }
}
#[test]
fn test_extract_logging() {
let code = r#"<?php
function test() {
$this->logger->warning("failed to find trash item for $rootTrashedItemName deleted at $rootTrashedItemDate in folder $groupFolderId", ['app' => 'groupfolders']);
$logger->info("foobar");
throw new FooException("foo \"bar\" \' {$blarg}");
$logger->info('foobar');
throw new FooException("foo \"bar\" \' {$this->blarg}");
}
?>
"#;
@ -174,9 +201,12 @@ fn test_extract_logging() {
line: 3,
level: LogLevel::Warn,
message_parts: vec![
"failed to find trash item for ".into(),
" deleted at ".into(),
" in folder ".into()
MessagePart::Literal("failed to find trash item for ".into()),
MessagePart::PlaceHolder("$rootTrashedItemName".into()),
MessagePart::Literal(" deleted at ".into()),
MessagePart::PlaceHolder("$rootTrashedItemDate".into()),
MessagePart::Literal(" in folder ".into()),
MessagePart::PlaceHolder("$groupFolderId".into()),
]
}
);
@ -186,7 +216,7 @@ fn test_extract_logging() {
path: "foo.php",
line: 4,
level: LogLevel::Info,
message_parts: vec!["foobar".into()]
message_parts: vec![MessagePart::Literal("foobar".into())]
}
);
assert_eq!(
@ -196,11 +226,8 @@ fn test_extract_logging() {
line: 5,
level: LogLevel::Exception,
message_parts: vec![
"foo ".into(),
"\"".into(),
"bar".into(),
"\"".into(),
" \\' ".into()
MessagePart::Literal(r#"foo "bar" \' "#.into()),
MessagePart::PlaceHolder("$this->blarg".into())
]
}
);

View file

@ -1,7 +1,6 @@
use crate::error::Error;
use crate::extractor::LogExtractor;
use serde::{Deserialize, Serialize};
use std::borrow::Cow;
use std::fs::File;
use std::io::{Read, Write};
use tracing::error;
@ -21,7 +20,13 @@ pub struct LoggingStatement<'a> {
level: LogLevel,
path: &'a str,
line: usize,
message_parts: Vec<Cow<'a, str>>,
message_parts: Vec<MessagePart>,
}
/// One segment of an extracted log message.
#[derive(Debug, PartialEq, Serialize, Deserialize)]
pub enum MessagePart {
/// Fixed text of the message.
Literal(String),
/// Source text of an interpolated expression, e.g. `$groupFolderId`.
PlaceHolder(String),
}
pub fn extract_dir<W: Write>(root: &str, mut output: W, bake: bool) -> Result<(), Error> {

View file

@ -8,16 +8,9 @@ expression: output
"path": "/DefaultShareProvider.php",
"line": 129,
"message_parts": [
"invalid share type!"
]
},
{
"level": "exception",
"path": "/DefaultShareProvider.php",
"line": 305,
"message_parts": [
"Group \"",
"\" does not exist"
{
"Literal": "invalid share type!"
}
]
},
{
@ -25,7 +18,9 @@ expression: output
"path": "/DefaultShareProvider.php",
"line": 309,
"message_parts": [
"Recipient not in receiving group"
{
"Literal": "Recipient not in receiving group"
}
]
},
{
@ -33,7 +28,9 @@ expression: output
"path": "/DefaultShareProvider.php",
"line": 339,
"message_parts": [
"Recipient does not match"
{
"Literal": "Recipient does not match"
}
]
},
{
@ -41,16 +38,9 @@ expression: output
"path": "/DefaultShareProvider.php",
"line": 344,
"message_parts": [
"Invalid shareType"
]
},
{
"level": "exception",
"path": "/DefaultShareProvider.php",
"line": 431,
"message_parts": [
"Group \"",
"\" does not exist"
{
"Literal": "Invalid shareType"
}
]
},
{
@ -58,7 +48,9 @@ expression: output
"path": "/DefaultShareProvider.php",
"line": 476,
"message_parts": [
"Recipient does not match"
{
"Literal": "Recipient does not match"
}
]
},
{
@ -66,7 +58,9 @@ expression: output
"path": "/DefaultShareProvider.php",
"line": 482,
"message_parts": [
"Invalid shareType"
{
"Literal": "Invalid shareType"
}
]
},
{
@ -74,7 +68,9 @@ expression: output
"path": "/DefaultShareProvider.php",
"line": 609,
"message_parts": [
"non-shallow getSharesInFolder is no longer supported"
{
"Literal": "non-shallow getSharesInFolder is no longer supported"
}
]
},
{
@ -82,41 +78,9 @@ expression: output
"path": "/DefaultShareProvider.php",
"line": 963,
"message_parts": [
"Invalid backend"
]
},
{
"level": "error",
"path": "/DefaultShareProvider.php",
"line": 1171,
"message_parts": []
},
{
"level": "debug",
"path": "/DefaultShareProvider.php",
"line": 1392,
"message_parts": [
"Share notification not sent to ",
" because user could not be found."
]
},
{
"level": "debug",
"path": "/DefaultShareProvider.php",
"line": 1401,
"message_parts": [
"Share notification not sent to ",
" because email address is not set."
]
},
{
"level": "debug",
"path": "/DefaultShareProvider.php",
"line": 1416,
"message_parts": [
"Sent share notification to ",
" for share with ID ",
"."
{
"Literal": "Invalid backend"
}
]
},
{
@ -124,16 +88,9 @@ expression: output
"path": "/DefaultShareProvider.php",
"line": 1420,
"message_parts": [
"Share notification mail could not be sent."
]
},
{
"level": "error",
"path": "/DefaultShareProvider.php",
"line": 1507,
"message_parts": [
"Share notification mail could not be sent to: ",
", "
{
"Literal": "Share notification mail could not be sent."
}
]
}
]