generate better regexes and capture placeholder names

This commit is contained in:
Robin Appelman 2024-07-21 15:02:41 +02:00
commit 04e391aea1
15 changed files with 21695 additions and 65312 deletions

7
README.md Normal file
View file

@ -0,0 +1,7 @@
# cloud-log-analyser
### Updating baked data
```sh
rm -r data/src/data
nix build .#extracted-logs-rust
cp -rL result data/src/data && chmod -R +w data/src/data
```

File diff suppressed because it is too large Load diff

File diff suppressed because it is too large Load diff

File diff suppressed because it is too large Load diff

File diff suppressed because it is too large Load diff

File diff suppressed because it is too large Load diff

File diff suppressed because it is too large Load diff

View file

@ -26,10 +26,11 @@ impl From<i64> for LogLevel {
}
}
/// A logging statement described entirely by `'static` data.
#[derive(Debug, PartialEq)]
#[derive(Debug, PartialEq, Clone)]
pub struct LoggingStatement {
/// Level of the statement.
pub level: LogLevel,
/// Path of the source file containing the statement.
pub path: &'static str,
/// Line number of the statement within `path`.
pub line: usize,
/// Text fragments of the message.
pub message_parts: &'static [&'static str],
/// Names of the placeholders that appear in the message.
pub placeholders: &'static [&'static str],
/// Regular expression source for matching the message.
pub regex: &'static str,
}

View file

@ -115,6 +115,8 @@ dependencies = [
"databake",
"insta",
"memchr",
"regex",
"regex-syntax",
"serde",
"serde_json",
"test-case",

View file

@ -20,6 +20,8 @@ tree-sitter = "0.22.6"
tree-sitter-php = "0.22.7"
memchr = "2.7.4"
databake = { version = "0.1.8", features = ["derive"] }
regex-syntax = "0.8.4"
regex = "1.10.5"
[build-dependencies]
cc = "1.1.6"

View file

@ -1,5 +1,4 @@
use databake::Bake;
use std::borrow::Cow;
#[derive(Debug, Default, PartialEq, Clone, Copy, Bake)]
#[databake(path = crate)]
@ -40,34 +39,43 @@ pub struct LoggingStatement<'a> {
pub level: LogLevel,
pub path: &'a str,
pub line: usize,
pub message_parts: &'a [&'a str],
pub placeholders: &'a [&'a str],
pub regex: &'a str,
}
/// Builds the source of an anchored regular expression for a message.
///
/// Literal parts are escaped so they match verbatim, and every placeholder is
/// replaced by a `(.*)` capture group. The result is wrapped in `^` and `$`.
fn build_pattern(parts: &[crate::MessagePart]) -> String {
    let mut pattern = String::with_capacity(128);
    pattern.push('^');
    for part in parts {
        match part {
            // Escape directly into the buffer instead of allocating a
            // temporary string for every literal.
            crate::MessagePart::Literal(literal) => {
                regex_syntax::escape_into(literal, &mut pattern);
            }
            // The placeholder name is not part of the pattern itself.
            crate::MessagePart::PlaceHolder(_) => pattern.push_str("(.*)"),
        }
    }
    pattern.push('$');
    pattern
}
/// Converts an extracted statement into the baked `LoggingStatement` form and
/// appends the string form of its `bake` output to `output`.
pub fn bake_statement(output: &mut String, statement: &crate::LoggingStatement) {
let message_parts: Vec<_> = statement.message_parts.iter().map(Cow::as_ref).collect();
// Keep only the placeholder parts; literal parts are dropped here.
let placeholders: Vec<_> = statement
.message_parts
.iter()
.filter_map(|part| match part {
crate::MessagePart::PlaceHolder(placeholder) => Some(placeholder.as_str()),
_ => None,
})
.collect();
// Regex source built from all message parts by `build_pattern`.
let pattern = build_pattern(&statement.message_parts);
// Shadows the argument with the borrowed representation that gets baked.
let statement = LoggingStatement {
level: statement.level.into(),
path: statement.path,
line: statement.line,
message_parts: &message_parts,
placeholders: &placeholders,
regex: &pattern,
};
output.push_str(&statement.bake(&Default::default()).to_string());
}
// Exercises the baking of `LoggingStatement` through databake's `test_bake!`
// macro. The module is only compiled when the `bake` feature is enabled.
#[cfg(feature = "bake")]
mod bake_test {
#[test]
fn test_bake() {
use databake::test_bake;
test_bake!(
crate::LoggingStatement,
const: crate::LoggingStatement {
level: crate::LogLevel::Debug,
path: "foo",
line: 12usize,
message_parts: &["part1", "part2"]
},
cloud_log_analyser,
);
}
}

View file

@ -1,13 +1,11 @@
use crate::string::{unescape, DoubleQuoteString, SingleQuoteString};
use crate::{LogLevel, LoggingStatement};
use std::borrow::Cow;
use crate::{LogLevel, LoggingStatement, MessagePart};
use tree_sitter::{Language, Node, Parser, Query, QueryCursor};
/// Holds the tree-sitter language together with the compiled queries used by
/// the extractor.
pub struct LogExtractor {
language: Language,
// NOTE(review): presumably the query behind `get_log_calls` — confirm.
method_query: Query,
// NOTE(review): presumably the query behind `get_throw_calls` — confirm.
throw_query: Query,
// Captures `string_content` and `escape_sequence` nodes (see the query text in `new`).
string_query: Query,
}
impl LogExtractor {
@ -30,13 +28,10 @@ impl LogExtractor {
)"#,
)
.expect("invalid query");
let string_query = Query::new(&language, r#"[(string_content)(escape_sequence)]@string"#)
.expect("invalid query");
LogExtractor {
language,
method_query,
throw_query,
string_query,
}
}
@ -56,41 +51,53 @@ impl LogExtractor {
let mut log_call_cursor = QueryCursor::new();
let mut throw_call_cursor = QueryCursor::new();
let mut tree_cursor = tree.walk();
let log_calls = self.get_log_calls(&mut log_call_cursor, code, tree.root_node());
let throw_calls = self.get_throw_calls(&mut throw_call_cursor, code, tree.root_node());
let mut all = log_calls
.chain(throw_calls)
.map(|call| {
let mut string_cursor = QueryCursor::new();
let message_parts = string_cursor
.matches(&self.string_query, call.arguments, code.as_bytes())
.map(|result| {
let node = result.captures[0].node;
let raw = node.utf8_text(code.as_bytes()).unwrap_or("malformed utf8");
.filter_map(|call| {
let argument = call.arguments.child(0)?;
if argument.grammar_name() != "string"
&& argument.grammar_name() != "encapsed_string"
{
return None;
}
let mut argument_string_parts = argument.children(&mut tree_cursor);
let is_double_quote = argument_string_parts.next()?.grammar_name() == r#"""#;
let mut message_builder =
MessageBuilder::with_capacity(argument_string_parts.len());
if raw.contains('\\') {
let start_char =
code.as_bytes()[node.parent().unwrap().byte_range().start];
Cow::Owned(
if start_char == b'"' {
for string_part in argument_string_parts {
match string_part.grammar_name() {
"string_content" => {
let content = string_part.utf8_text(code.as_bytes()).unwrap();
message_builder.push_literal(content);
}
"escape_sequence" => {
let raw = string_part.utf8_text(code.as_bytes()).unwrap();
let content = if is_double_quote {
unescape::<DoubleQuoteString>(raw)
} else {
unescape::<SingleQuoteString>(raw)
}
.unwrap(),
)
} else {
Cow::Borrowed(raw)
.unwrap();
message_builder.push_literal(&content);
}
r#"'"# | r#"""# | r#"{"# | r#"}"# => {}
_ => {
let placeholder = string_part.utf8_text(code.as_bytes()).unwrap();
message_builder.push_placeholder(placeholder);
}
}
}
})
.collect();
LoggingStatement {
Some(LoggingStatement {
level: call.level,
line: call.line + 1,
path,
message_parts,
}
message_parts: message_builder.0,
})
})
.collect::<Vec<_>>();
@ -155,13 +162,33 @@ struct LogCall<'tree> {
arguments: Node<'tree>,
}
/// Collects the parts of a message in order, merging consecutive literal
/// fragments into a single `MessagePart::Literal`.
struct MessageBuilder(Vec<MessagePart>);

impl MessageBuilder {
    /// Creates an empty builder with room for `cap` parts.
    pub fn with_capacity(cap: usize) -> Self {
        Self(Vec::with_capacity(cap))
    }

    /// Appends literal text.
    ///
    /// When the most recent part is already a literal the text is added to
    /// it; otherwise a new literal part is started.
    pub fn push_literal(&mut self, content: &str) {
        match self.0.last_mut() {
            Some(MessagePart::Literal(tail)) => tail.push_str(content),
            _ => self.0.push(MessagePart::Literal(String::from(content))),
        }
    }

    /// Appends a placeholder part; placeholders are never merged.
    pub fn push_placeholder(&mut self, placeholder: &str) {
        let part = MessagePart::PlaceHolder(String::from(placeholder));
        self.0.push(part);
    }
}
#[test]
fn test_extract_logging() {
let code = r#"<?php
function test() {
$this->logger->warning("failed to find trash item for $rootTrashedItemName deleted at $rootTrashedItemDate in folder $groupFolderId", ['app' => 'groupfolders']);
$logger->info("foobar");
throw new FooException("foo \"bar\" \' {$blarg}");
$logger->info('foobar');
throw new FooException("foo \"bar\" \' {$this->blarg}");
}
?>
"#;
@ -174,9 +201,12 @@ fn test_extract_logging() {
line: 3,
level: LogLevel::Warn,
message_parts: vec![
"failed to find trash item for ".into(),
" deleted at ".into(),
" in folder ".into()
MessagePart::Literal("failed to find trash item for ".into()),
MessagePart::PlaceHolder("$rootTrashedItemName".into()),
MessagePart::Literal(" deleted at ".into()),
MessagePart::PlaceHolder("$rootTrashedItemDate".into()),
MessagePart::Literal(" in folder ".into()),
MessagePart::PlaceHolder("$groupFolderId".into()),
]
}
);
@ -186,7 +216,7 @@ fn test_extract_logging() {
path: "foo.php",
line: 4,
level: LogLevel::Info,
message_parts: vec!["foobar".into()]
message_parts: vec![MessagePart::Literal("foobar".into())]
}
);
assert_eq!(
@ -196,11 +226,8 @@ fn test_extract_logging() {
line: 5,
level: LogLevel::Exception,
message_parts: vec![
"foo ".into(),
"\"".into(),
"bar".into(),
"\"".into(),
" \\' ".into()
MessagePart::Literal(r#"foo "bar" \' "#.into()),
MessagePart::PlaceHolder("$this->blarg".into())
]
}
);

View file

@ -1,7 +1,6 @@
use crate::error::Error;
use crate::extractor::LogExtractor;
use serde::{Deserialize, Serialize};
use std::borrow::Cow;
use std::fs::File;
use std::io::{Read, Write};
use tracing::error;
@ -21,7 +20,13 @@ pub struct LoggingStatement<'a> {
level: LogLevel,
path: &'a str,
line: usize,
message_parts: Vec<Cow<'a, str>>,
message_parts: Vec<MessagePart>,
}
/// One segment of an extracted message.
#[derive(Debug, Clone, PartialEq, Eq, Serialize, Deserialize)]
pub enum MessagePart {
    /// Fixed text that appears verbatim in the message.
    Literal(String),
    /// An interpolated expression, kept as its source text
    /// (for example `$groupFolderId`).
    PlaceHolder(String),
}
pub fn extract_dir<W: Write>(root: &str, mut output: W, bake: bool) -> Result<(), Error> {

View file

@ -8,16 +8,9 @@ expression: output
"path": "/DefaultShareProvider.php",
"line": 129,
"message_parts": [
"invalid share type!"
]
},
{
"level": "exception",
"path": "/DefaultShareProvider.php",
"line": 305,
"message_parts": [
"Group \"",
"\" does not exist"
"Literal": "invalid share type!"
}
]
},
{
@ -25,7 +18,9 @@ expression: output
"path": "/DefaultShareProvider.php",
"line": 309,
"message_parts": [
"Recipient not in receiving group"
{
"Literal": "Recipient not in receiving group"
}
]
},
{
@ -33,7 +28,9 @@ expression: output
"path": "/DefaultShareProvider.php",
"line": 339,
"message_parts": [
"Recipient does not match"
{
"Literal": "Recipient does not match"
}
]
},
{
@ -41,16 +38,9 @@ expression: output
"path": "/DefaultShareProvider.php",
"line": 344,
"message_parts": [
"Invalid shareType"
]
},
{
"level": "exception",
"path": "/DefaultShareProvider.php",
"line": 431,
"message_parts": [
"Group \"",
"\" does not exist"
"Literal": "Invalid shareType"
}
]
},
{
@ -58,7 +48,9 @@ expression: output
"path": "/DefaultShareProvider.php",
"line": 476,
"message_parts": [
"Recipient does not match"
{
"Literal": "Recipient does not match"
}
]
},
{
@ -66,7 +58,9 @@ expression: output
"path": "/DefaultShareProvider.php",
"line": 482,
"message_parts": [
"Invalid shareType"
{
"Literal": "Invalid shareType"
}
]
},
{
@ -74,7 +68,9 @@ expression: output
"path": "/DefaultShareProvider.php",
"line": 609,
"message_parts": [
"non-shallow getSharesInFolder is no longer supported"
{
"Literal": "non-shallow getSharesInFolder is no longer supported"
}
]
},
{
@ -82,41 +78,9 @@ expression: output
"path": "/DefaultShareProvider.php",
"line": 963,
"message_parts": [
"Invalid backend"
]
},
{
"level": "error",
"path": "/DefaultShareProvider.php",
"line": 1171,
"message_parts": []
},
{
"level": "debug",
"path": "/DefaultShareProvider.php",
"line": 1392,
"message_parts": [
"Share notification not sent to ",
" because user could not be found."
]
},
{
"level": "debug",
"path": "/DefaultShareProvider.php",
"line": 1401,
"message_parts": [
"Share notification not sent to ",
" because email address is not set."
]
},
{
"level": "debug",
"path": "/DefaultShareProvider.php",
"line": 1416,
"message_parts": [
"Sent share notification to ",
" for share with ID ",
"."
"Literal": "Invalid backend"
}
]
},
{
@ -124,16 +88,9 @@ expression: output
"path": "/DefaultShareProvider.php",
"line": 1420,
"message_parts": [
"Share notification mail could not be sent."
]
},
{
"level": "error",
"path": "/DefaultShareProvider.php",
"line": 1507,
"message_parts": [
"Share notification mail could not be sent to: ",
", "
"Literal": "Share notification mail could not be sent."
}
]
}
]

View file

@ -1,5 +1,5 @@
use cloud_log_analyser_data::{LogLevel, LoggingStatement};
use regex::{escape, Regex, RegexBuilder};
use regex::Regex;
pub struct LogMatch {
level: LogLevel,
@ -11,8 +11,8 @@ impl LogMatch {
/// Builds the match entry for a single logging statement.
///
/// # Panics
///
/// Panics if the pattern for the statement cannot be compiled
/// (`Regex::new(..).unwrap()`).
pub fn new(statement: &LoggingStatement) -> LogMatch {
LogMatch {
level: statement.level,
pattern: build_pattern(statement.message_parts),
pattern_length: statement.message_parts.iter().copied().map(str::len).sum(),
pattern: Regex::new(statement.regex).unwrap(),
// NOTE(review): this is the length of the regex source, so anchors,
// escapes and `(.*)` groups are counted too — confirm that is intended.
pattern_length: statement.regex.len(),
}
}
}
@ -48,25 +48,6 @@ impl Matcher {
}
}
/// Compiles a regex that matches the given literal parts in order.
///
/// Every part is escaped and followed by a `(.*)` group, so arbitrary text
/// may follow each part, including the last one. The pattern is not anchored.
///
/// # Panics
///
/// Panics with "Failed to build regex" if the pattern cannot be compiled.
fn build_pattern(parts: &[&str]) -> Regex {
    let mut pattern = String::with_capacity(128);
    for part in parts {
        pattern.push_str(&escape(part));
        pattern.push_str("(.*)");
    }
    RegexBuilder::new(&pattern)
        .build()
        .expect("Failed to build regex")
}
/// Checks that the built pattern requires every part, in order, while
/// allowing arbitrary text after each of them.
#[test]
fn test_build_pattern() {
    // `build_pattern` takes a slice, so the array literal has to be borrowed.
    let regex = build_pattern(&["foobar", "asd"]);
    assert!(regex.is_match("foobar with asd and more"));
    assert!(regex.is_match("foobarasd"));
    assert!(!regex.is_match("fooasd"));
}
#[test]
fn test_matcher() {
let statements = &[
@ -74,32 +55,29 @@ fn test_matcher() {
line: 68,
level: LogLevel::Exception,
path: "foo",
message_parts: vec!["Not allowed to rename a shared album".into()],
placeholders: &[],
regex: "^Not allowed to rename a shared album$",
},
LoggingStatement {
line: 69,
level: LogLevel::Error,
path: "bar",
message_parts: vec![
"You are not allowed to edit link shares that you don".into(),
"'".into(),
"t own".into(),
],
placeholders: &[],
regex: "^You are not allowed to edit link shares that you don't own$",
},
LoggingStatement {
line: 69,
level: LogLevel::Error,
path: "asd",
message_parts: vec![
"Unsupported query value for mimetype: ".into(),
", only values in the format \"mime/type\" or \"mime/%\" are supported".into(),
],
placeholders: &["$mimeType"],
regex: r#"^Unsupported query value for mimetype: (.*), only values in the format "mime/type" or "mime/%" are supported$"#,
},
LoggingStatement {
line: 68,
level: LogLevel::Exception,
path: "short",
message_parts: vec!["Not allowed to rename".into()],
placeholders: &["$path"],
regex: "^Not allowed to rename (.*)$",
},
];
let matcher = Matcher::new(statements);