prepare for more flexible matchers/extractors

This commit is contained in:
Robin Appelman 2025-11-03 20:13:05 +01:00
commit e20f0d7661
9 changed files with 401 additions and 115 deletions

7
Cargo.lock generated
View file

@ -235,6 +235,7 @@ dependencies = [
"hex",
"home",
"main_error",
"maplit",
"notify-debouncer-full",
"regex",
"serde",
@ -368,6 +369,12 @@ version = "0.1.2"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "155db5e86c6e45ee456bf32fad5a290ee1f7151c2faca27ea27097568da67d1a"
[[package]]
name = "maplit"
version = "1.0.2"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "3e2e65a1a2e43cfcb47a895c4c8b10d1f4a61097f9f254f183aee60cad9c651d"
[[package]]
name = "memchr"
version = "2.7.6"

View file

@ -19,4 +19,7 @@ tracing-subscriber = "0.3.20"
notify-debouncer-full = "0.6.0"
ctrlc = "3.5.0"
sha2 = "0.11.0-rc.2"
hex = "0.4.3"
hex = "0.4.3"
[dev-dependencies]
maplit = "1.0.2"

View file

@ -1,7 +1,9 @@
use crate::matchers::get_matcher;
use crate::rule::Rule;
use home::home_dir;
use regex::Regex;
use serde::Deserialize;
use std::collections::HashMap;
use std::error::Error;
use std::fs::read_to_string;
use std::path::{Path, PathBuf};
use thiserror::Error;
@ -55,39 +57,40 @@ pub fn normalize_path<P: Into<String> + AsRef<str>>(path: P) -> String {
#[derive(Debug, Deserialize)]
pub(crate) struct RuleConfig {
name: Option<String>,
referrer: Option<String>,
url: Option<String>,
#[serde(rename = "move")]
target: Option<String>,
rename: Option<String>,
pub target: Option<String>,
pub rename: Option<String>,
#[serde(flatten)]
pub matchers: HashMap<String, String>,
}
impl TryFrom<RuleConfig> for Rule {
type Error = RuleError;
fn try_from(value: RuleConfig) -> Result<Self, Self::Error> {
if value.name.is_none() && value.referrer.is_none() && value.url.is_none() {
if value.matchers.is_empty() {
return Err(RuleError::NoMatches);
}
if value.rename.is_none() && value.target.is_none() {
return Err(RuleError::NoAction);
}
fn parse_rule(val: Option<String>) -> Result<Option<Regex>, RuleError> {
let Some(val) = val else {
return Ok(None);
};
Ok(Some(
Regex::new(&val).map_err(|error| RuleError::Regex { input: val, error })?,
))
}
let matchers = value
.matchers
.into_iter()
.map(|(name, value)| {
let res = get_matcher(&name, &value)
.ok_or_else(|| RuleError::UnknownRule(name.clone()))?;
res.map_err(|error| RuleError::InvalidRule {
field: name,
value,
error,
})
})
.collect::<Result<Vec<_>, _>>()?;
Ok(Rule {
name: parse_rule(value.name)?,
referrer: parse_rule(value.referrer)?,
url: parse_rule(value.url)?,
matchers,
target: value.target.map(normalize_path),
rename: value.rename,
})
@ -114,6 +117,12 @@ pub enum RuleError {
NoMatches,
#[error("at least one action rule needs to be defined")]
NoAction,
#[error("invalid regex {input}: {error:#}")]
Regex { input: String, error: regex::Error },
#[error("Unknown match rule '{0}'")]
UnknownRule(String),
#[error("Invalid match rule {field} = '{value}': {error:#}")]
InvalidRule {
field: String,
value: String,
error: Box<dyn Error>,
},
}

77
src/extractor/mod.rs Normal file
View file

@ -0,0 +1,77 @@
use crate::file::FileInfo;
use std::borrow::{Borrow, Cow};
use std::collections::HashMap;
use std::error::Error;
use std::hash::Hash;
/// An [`Extractor`] that delegates to an ordered list of other extractors.
///
/// A lookup is answered by the first inner extractor that returns `Some` for
/// the requested field, so extractors added earlier take precedence.
#[derive(Default)]
pub struct MultiExtractor<'a> {
    extractors: Vec<DynCow<'a>>,
}

/// An extractor trait object that is either borrowed or owned (boxed).
enum DynCow<'a> {
    Ref(&'a dyn Extractor),
    Box(Box<dyn Extractor>),
}

impl<'a> AsRef<dyn Extractor + 'a> for DynCow<'a> {
    /// Gives uniform access to the wrapped extractor regardless of ownership.
    fn as_ref(&self) -> &(dyn Extractor + 'a) {
        match self {
            DynCow::Ref(borrowed) => *borrowed,
            DynCow::Box(owned) => &**owned,
        }
    }
}

impl<'a> MultiExtractor<'a> {
    /// Creates an empty list with room reserved for `cap` extractors.
    pub fn with_capacity(cap: usize) -> Self {
        let extractors = Vec::with_capacity(cap);
        Self { extractors }
    }

    /// Appends a borrowed extractor; it is consulted after all earlier ones.
    pub fn push(&mut self, extractor: &'a dyn Extractor) {
        let entry = DynCow::Ref(extractor);
        self.extractors.push(entry);
    }

    /// Appends an owned extractor; it is consulted after all earlier ones.
    pub fn push_box(&mut self, extractor: Box<dyn Extractor>) {
        let entry = DynCow::Box(extractor);
        self.extractors.push(entry);
    }
}

impl Extractor for MultiExtractor<'_> {
    /// Returns the first `Some` result (value or error) produced by the inner
    /// extractors, or `None` when none of them knows `field`.
    fn extract<'this>(&'this self, field: &str) -> Option<Result<Cow<'this, str>, Box<dyn Error>>> {
        for candidate in &self.extractors {
            if let Some(outcome) = candidate.as_ref().extract(field) {
                return Some(outcome);
            }
        }
        None
    }
}

/// A source of named string values.
pub trait Extractor {
    /// Looks up the value for `field`.
    ///
    /// Returns `None` when this extractor does not provide `field`,
    /// `Some(Err(_))` when producing the value failed, and `Some(Ok(_))` with
    /// the value (borrowed from `self` or freshly allocated) otherwise.
    fn extract<'this>(&'this self, field: &str) -> Option<Result<Cow<'this, str>, Box<dyn Error>>>;
}
/// [`Extractor`] exposing values taken directly from a [`FileInfo`].
pub struct FileInfoExtractor<'a> {
    file: &'a FileInfo,
}

impl<'a> FileInfoExtractor<'a> {
    /// Wraps a borrowed `FileInfo`.
    pub fn new(file: &'a FileInfo) -> Self {
        Self { file }
    }
}

impl Extractor for FileInfoExtractor<'_> {
    /// Provides `"mtime"` (the file's `mtime` rendered as a decimal string);
    /// every other field yields `None`.
    fn extract<'this>(&'this self, field: &str) -> Option<Result<Cow<'this, str>, Box<dyn Error>>> {
        if field != "mtime" {
            return None;
        }
        // The string is built on demand, hence the owned `Cow`.
        let rendered = self.file.mtime.to_string();
        Some(Ok(Cow::Owned(rendered)))
    }
}
/// Any map with string-like keys and string-like values can act as an
/// [`Extractor`]: a field is looked up as a key and its value is returned
/// borrowed from the map.
///
/// Keys and values may be different types (e.g. `HashMap<String, &str>`), the
/// map may hold non-`'static` borrows, and any hasher is accepted. Lookups
/// never produce an error; an absent key yields `None`.
impl<K, V, S> Extractor for HashMap<K, V, S>
where
    K: Borrow<str> + Eq + Hash,
    V: AsRef<str>,
    S: std::hash::BuildHasher,
{
    fn extract<'this>(&'this self, field: &str) -> Option<Result<Cow<'this, str>, Box<dyn Error>>> {
        self.get(field)
            .map(|value| Ok(Cow::Borrowed(value.as_ref())))
    }
}

View file

@ -14,7 +14,6 @@ pub struct FileInfo {
pub sha256: [u8; 32],
#[allow(dead_code)]
pub mtime: u64,
pub mtime_str: String,
pub size: u64,
}
@ -46,7 +45,6 @@ impl FileInfo {
referrer: None,
sha256: [0; 32],
mtime,
mtime_str: mtime.to_string(),
size: stat.size(),
};

View file

@ -1,6 +1,6 @@
use crate::config::{normalize_path, GaltonConfig};
use crate::file::FileInfo;
use crate::rule::{Rule, RuleMatch};
use crate::rule::{Rule, RuleResult};
use clap::builder::styling::{AnsiColor, Effects};
use clap::builder::Styles;
use clap::{Parser, Subcommand};
@ -18,7 +18,9 @@ use std::time::Duration;
use tracing::{debug, error, info, instrument};
mod config;
mod extractor;
mod file;
mod matchers;
mod rule;
fn styles() -> Styles {
@ -166,7 +168,7 @@ fn is_part(path: &Path) -> bool {
fn maybe_link(source: Option<&Path>, target: Option<&str>) {
if let (Some(source), Some(target)) = (source, target) {
if Path::new(target).exists() {
if Path::new(target).is_symlink() {
if let Err(error) = remove_file(target) {
error!(%error, "failed to remove link target");
return;
@ -183,11 +185,18 @@ fn maybe_link(source: Option<&Path>, target: Option<&str>) {
}
}
fn match_file(file: &FileInfo, rules: &[Rule]) -> Option<RuleMatch> {
fn match_file(file: &FileInfo, rules: &[Rule]) -> Option<RuleResult> {
for rule in rules {
if let Some(result) = rule.matches(file) {
debug!(?rule, ?result, "found matching rule");
return Some(result);
match result {
Ok(result) => {
debug!(?rule, ?result, "found matching rule");
return Some(result);
}
Err(error) => {
error!(?rule, %error, "error matching rule");
}
}
}
}
None

109
src/matchers/filemeta.rs Normal file
View file

@ -0,0 +1,109 @@
use crate::extractor::Extractor;
use crate::file::FileInfo;
use crate::matchers::{regex_matches, Matcher};
use regex::Regex;
use std::borrow::Cow;
use std::error::Error;
/// The file metadata attributes that can be matched by rule key.
#[derive(Debug)]
enum FileMetaField {
    Name,
    Referrer,
    Url,
}

impl FileMetaField {
    /// Maps a rule key (`"name"`, `"referrer"` or `"url"`) to its field.
    ///
    /// The comparison is exact; any other key returns `None`.
    fn new(s: &str) -> Option<FileMetaField> {
        let field = match s {
            "name" => Self::Name,
            "referrer" => Self::Referrer,
            "url" => Self::Url,
            _ => return None,
        };
        Some(field)
    }

    /// Returns the rule key for this field; the inverse of [`FileMetaField::new`].
    fn as_str(&self) -> &'static str {
        match *self {
            Self::Name => "name",
            Self::Referrer => "referrer",
            Self::Url => "url",
        }
    }
}
/// Matcher that tests a single file metadata field against a regular expression.
#[derive(Debug)]
pub struct FileMetaMatcher {
/// Which metadata field of the file is inspected.
field: FileMetaField,
/// Pattern the selected field's value is matched against.
regex: Regex,
}
/// Builds a [`FileMetaMatcher`] from a rule key and its pattern.
///
/// Returns `None` when `name` is not one of the file metadata keys, and
/// `Some(Err(_))` when `value` is not a valid regular expression.
pub fn parse(name: &str, value: &str) -> Option<Result<FileMetaMatcher, regex::Error>> {
    let field = FileMetaField::new(name)?;
    let parsed = match Regex::new(value) {
        Ok(regex) => Ok(FileMetaMatcher { field, regex }),
        Err(error) => Err(error),
    };
    Some(parsed)
}
impl FileMetaMatcher {
fn value<'a>(&self, file: &'a FileInfo) -> Option<&'a str> {
match self.field {
FileMetaField::Name => Some(file.name()),
FileMetaField::Referrer => file.referrer.as_deref(),
FileMetaField::Url => file.url.as_deref(),
}
}
}
impl Matcher for FileMetaMatcher {
    /// The rule key of the field this matcher inspects.
    fn name(&self) -> &str {
        self.field.as_str()
    }

    /// Matches the configured field of `file` against the pattern.
    ///
    /// Returns `None` when the field has no value or the pattern does not
    /// match; otherwise an extractor over the pattern's named capture groups.
    /// This matcher never returns `Some(Err(_))`.
    fn matches(&self, file: &FileInfo) -> Option<Result<Box<dyn Extractor>, Box<dyn Error>>> {
        let haystack = self.value(file)?;
        regex_matches(&self.regex, haystack).map(|matches| {
            let extractor: Box<dyn Extractor> = Box::new(RegexMatchExtractor { matches });
            Ok(extractor)
        })
    }
}
/// Extractor over the captured groups of a successful regex match.
struct RegexMatchExtractor {
    /// `(capture group name, matched text)` pairs.
    matches: Vec<(String, String)>,
}

impl Extractor for RegexMatchExtractor {
    /// Returns the text captured under the name `field`, borrowed from `self`.
    ///
    /// When several entries share a name the first one wins; an unknown name
    /// yields `None`. This extractor never produces an error.
    // The `'this` lifetime is spelled out to mirror the trait declaration
    // rather than hiding it inside `Cow<str>`.
    fn extract<'this>(&'this self, field: &str) -> Option<Result<Cow<'this, str>, Box<dyn Error>>> {
        self.matches
            .iter()
            .find(|(name, _)| name.as_str() == field)
            .map(|(_, value)| Ok(Cow::Borrowed(value.as_str())))
    }
}
/// Exercises `parse` + `FileMetaMatcher::matches` on name and referrer rules.
#[test]
fn test_file_meta_matcher() {
    // Two fixtures that differ only in extension and referrer.
    let png_file = FileInfo {
        path: "/tmp/test.png".into(),
        url: Some("https://example.com/test.png".into()),
        referrer: Some("https://example.com/images".into()),
        sha256: [0; 32],
        mtime: 1234,
        size: 100,
    };
    let txt_file = FileInfo {
        path: "/tmp/test.txt".into(),
        url: Some("https://example.com/test.txt".into()),
        referrer: Some("https://example.com/downloads".into()),
        sha256: [0; 32],
        mtime: 1234,
        size: 100,
    };

    // A name rule matches only the .txt file and exposes its named capture.
    let by_name = parse("name", r#"(?<txt_name>.+)\.txt"#).unwrap().unwrap();
    assert!(by_name.matches(&png_file).is_none());
    let captured = by_name.matches(&txt_file).unwrap().unwrap();
    assert_eq!(captured.extract("txt_name").unwrap().unwrap(), "test");

    // A referrer rule matches only the file downloaded from ".../downloads".
    let by_referrer = parse("referrer", r#"downloads"#).unwrap().unwrap();
    assert!(by_referrer.matches(&png_file).is_none());
    assert!(by_referrer.matches(&txt_file).is_some());
}

49
src/matchers/mod.rs Normal file
View file

@ -0,0 +1,49 @@
use crate::extractor::Extractor;
use crate::file::FileInfo;
use regex::Regex;
use std::error::Error;
use std::fmt::Debug;
use thiserror::Error;
mod filemeta;
/// Error for a match rule that could not be parsed.
///
/// Displays as `Malformed match rule '<input>': <error>`.
// NOTE(review): nothing in the visible part of this module constructs this
// type — confirm whether it is used elsewhere or is still a placeholder.
#[derive(Debug, Error)]
#[error("Malformed match rule '{input}': {error}")]
pub struct MatcherParseError {
/// Text shown quoted in the message; presumably the rejected rule value.
input: String,
/// Underlying cause, shown after the colon in the message.
error: Box<dyn Error>,
}
/// Type-erases a concrete matcher parse result: the matcher becomes a
/// `Box<dyn Matcher>` and the error a `Box<dyn Error>`.
fn map_result<T: Matcher + 'static, E: Error + 'static>(
    res: Result<T, E>,
) -> Result<Box<dyn Matcher>, Box<dyn Error>> {
    match res {
        Ok(matcher) => Ok(Box::new(matcher)),
        Err(err) => Err(Box::new(err)),
    }
}
pub fn get_matcher(name: &str, value: &str) -> Option<Result<Box<dyn Matcher>, Box<dyn Error>>> {
if let Some(res) = filemeta::parse(name, value) {
return Some(map_result(res));
}
None
}
/// A single match condition that can be evaluated against a [`FileInfo`].
pub trait Matcher: Debug {
/// Name identifying this matcher, e.g. the rule key it was created from.
fn name(&self) -> &str;
/// Tests `file` against this matcher.
///
/// Returns `None` when the file does not match, `Some(Err(_))` when the
/// check itself failed, and `Some(Ok(_))` with an extractor for the values
/// captured by the match otherwise.
fn matches(&self, file: &FileInfo) -> Option<Result<Box<dyn Extractor>, Box<dyn Error>>>;
}
/// Matches `string` against `regex` and collects the named capture groups.
///
/// Returns `None` when the regex does not match. Otherwise returns one
/// `(group name, matched text)` pair for every *named* group that took part
/// in the match; the implicit whole-match group, unnamed groups and groups
/// that did not participate are left out.
fn regex_matches(regex: &Regex, string: &str) -> Option<Vec<(String, String)>> {
    let captures = regex.captures(string)?;
    let mut named = Vec::new();
    // Index 0 is the whole match, which is never reported.
    for (group, name) in captures.iter().zip(regex.capture_names()).skip(1) {
        if let (Some(found), Some(name)) = (group, name) {
            named.push((name.to_owned(), found.as_str().to_owned()));
        }
    }
    Some(named)
}

View file

@ -1,117 +1,142 @@
use crate::extractor::{Extractor, FileInfoExtractor, MultiExtractor};
use crate::file::FileInfo;
use crate::matchers::Matcher;
use regex::Regex;
use serde::Deserialize;
use std::borrow::Cow;
use std::collections::HashMap;
use std::error::Error;
use std::sync::OnceLock;
use thiserror::Error;
#[derive(Debug, Deserialize)]
#[serde(try_from = "crate::config::RuleConfig")]
pub struct Rule {
pub name: Option<Regex>,
pub referrer: Option<Regex>,
pub url: Option<Regex>,
pub matchers: Vec<Box<dyn Matcher>>,
pub target: Option<String>,
pub rename: Option<String>,
}
#[derive(Debug)]
pub struct RuleMatch {
pub struct RuleResult {
pub target: Option<String>,
pub rename: Option<String>,
}
#[derive(Hash, PartialEq, Eq, Debug)]
enum CaptureName<'a> {
Named(&'a str),
Unnamed(usize),
}
impl<'a> CaptureName<'a> {
pub fn to_str(&self) -> Cow<'a, str> {
match self {
CaptureName::Named(s) => Cow::Borrowed(s),
CaptureName::Unnamed(1) => Cow::Borrowed("1"),
CaptureName::Unnamed(2) => Cow::Borrowed("2"),
CaptureName::Unnamed(3) => Cow::Borrowed("3"),
CaptureName::Unnamed(4) => Cow::Borrowed("4"),
CaptureName::Unnamed(5) => Cow::Borrowed("5"),
CaptureName::Unnamed(6) => Cow::Borrowed("6"),
CaptureName::Unnamed(7) => Cow::Borrowed("7"),
CaptureName::Unnamed(8) => Cow::Borrowed("8"),
CaptureName::Unnamed(9) => Cow::Borrowed("9"),
CaptureName::Unnamed(10) => Cow::Borrowed("10"),
CaptureName::Unnamed(i) => Cow::Owned(i.to_string()),
}
}
}
impl Rule {
pub fn matches(&self, file: &FileInfo) -> Option<RuleMatch> {
let mut captures: HashMap<CaptureName, &str> = HashMap::new();
captures.insert(CaptureName::Named("mtime"), &file.mtime_str);
pub fn matches(&self, file: &FileInfo) -> Option<Result<RuleResult, RuleMatchError>> {
let file_extractor = FileInfoExtractor::new(file);
let mut extractors = MultiExtractor::with_capacity(self.matchers.len() + 1);
extractors.push(&file_extractor);
if let Some(name) = &self.name {
if !extract_matches(name, file.name(), &mut captures) {
return None;
}
}
if let Some(referrer) = &self.referrer {
if !extract_matches(
referrer,
file.referrer.as_deref().unwrap_or_default(),
&mut captures,
) {
return None;
}
}
if let Some(url) = &self.url {
if !extract_matches(url, file.url.as_deref().unwrap_or_default(), &mut captures) {
return None;
}
}
let apply = |input| apply_captures(input, &captures);
Some(RuleMatch {
target: self.target.as_deref().map(apply),
rename: self.rename.as_deref().map(apply),
})
}
}
fn apply_captures(input: &str, captures: &HashMap<CaptureName, &str>) -> String {
let mut output = input.to_string();
for (name, value) in captures {
let name = name.to_str();
if output.contains(name.as_ref()) && output.contains('$') {
output = output.replace(&format!("${name}"), value);
}
}
output
}
fn extract_matches<'a, 'b>(
regex: &'a Regex,
string: &'b str,
output: &mut HashMap<CaptureName<'a>, &'b str>,
) -> bool {
match regex.captures(string) {
Some(caps) => {
for (i, (m, name)) in caps.iter().zip(regex.capture_names()).enumerate().skip(1) {
if let Some(m) = m {
let cap_name = match name {
Some(name) => CaptureName::Named(name),
None => CaptureName::Unnamed(i),
};
output.insert(cap_name, m.as_str());
for matcher in &self.matchers {
match matcher.matches(file)? {
Ok(extractor) => extractors.push_box(extractor),
Err(error) => {
return Some(Err(RuleMatchError::Matcher {
field: matcher.name().into(),
error,
}));
}
}
}
None => {
return false;
}
let apply = |input: Option<&str>| {
let Some(input) = input else { return Ok(None) };
apply_extractors(input, &extractors).map(Some)
};
let target = match apply(self.target.as_deref()) {
Ok(target) => target,
Err(e) => return Some(Err(e)),
};
let rename = match apply(self.rename.as_deref()) {
Ok(target) => target,
Err(e) => return Some(Err(e)),
};
Some(Ok(RuleResult { target, rename }))
}
true
}
/// Lazily compiled pattern recognising `${name}` and `$name` placeholders.
static SUBST_REGEX: OnceLock<Regex> = OnceLock::new();

/// Replaces every placeholder in `input` with the value `extractor` provides.
///
/// Two placeholder forms are recognised:
/// * `${name}` — `name` is any non-empty run of characters other than `}` and `)`;
/// * `$name` — `name` is the longest run of ASCII letters and digits after the
///   `$`, so names containing other characters (such as `_` or `-`) must use
///   the braced form.
///
/// # Errors
/// * [`RuleMatchError::UnknownSubstitution`] when the extractor does not know
///   a placeholder's name;
/// * [`RuleMatchError::Matcher`] when the extractor fails to produce the value.
fn apply_extractors<E: Extractor>(input: &str, extractor: &E) -> Result<String, RuleMatchError> {
    let pattern = SUBST_REGEX
        .get_or_init(|| Regex::new(r#"\$(\{([^})]+)}|([a-zA-Z0-9]+))"#).unwrap());

    // Same shape as `Regex::replace_all`, but able to bail out with an error.
    let mut placeholders = pattern.captures_iter(input).peekable();
    if placeholders.peek().is_none() {
        return Ok(input.to_string());
    }

    let mut output = String::with_capacity(input.len());
    let mut cursor = 0;
    for placeholder in placeholders {
        // Group 0 (the whole placeholder) exists for every match.
        let whole = placeholder.get(0).unwrap();
        output.push_str(&input[cursor..whole.start()]);

        // Exactly one of the two name groups takes part in any match:
        // group 2 for the braced form, group 3 for the bare form.
        let name = placeholder
            .get(2)
            .or_else(|| placeholder.get(3))
            .unwrap()
            .as_str();
        let value = match extractor.extract(name) {
            Some(Ok(value)) => value,
            Some(Err(error)) => {
                return Err(RuleMatchError::Matcher {
                    field: name.into(),
                    error,
                })
            }
            None => return Err(RuleMatchError::UnknownSubstitution { name: name.into() }),
        };
        output.push_str(&value);
        cursor = whole.end();
    }
    output.push_str(&input[cursor..]);
    Ok(output)
}
/// Errors produced while evaluating a rule against a file.
#[derive(Debug, Error)]
pub enum RuleMatchError {
/// A matcher or extractor reported an error; `field` names the matcher or
/// the placeholder being resolved.
#[error("Error matching {field}: {error:#}")]
Matcher {
field: String,
error: Box<dyn Error>,
},
/// A placeholder referred to a name for which no value is available.
#[error("Unknown substitution {name}")]
UnknownSubstitution { name: String },
}
/// Covers both placeholder forms of `apply_extractors`, the pass-through path
/// for input without placeholders, and the unknown-name error.
#[test]
fn test_apply_extractors() {
    use maplit::hashmap;

    let extractor = hashmap! {
        "foo" => "bar",
        "longer-key" => "value"
    };

    // Input without placeholders is returned unchanged.
    assert_eq!(
        "no placeholders",
        apply_extractors("no placeholders", &extractor).unwrap()
    );

    // Bare form: `$name`.
    assert_eq!(
        "test bar",
        apply_extractors("test $foo", &extractor).unwrap()
    );
    // The bare form takes the longest alphanumeric run, so this asks for "foobar".
    assert!(apply_extractors("$foobar", &extractor).is_err());

    // Braced form delimits the name explicitly...
    assert_eq!("barbar", apply_extractors("${foo}bar", &extractor).unwrap());
    // ...and is required for names that are not purely alphanumeric.
    assert_eq!(
        "value",
        apply_extractors("${longer-key}", &extractor).unwrap()
    );

    // Several placeholders in one input keep the surrounding text intact.
    assert_eq!(
        "a bar b value c",
        apply_extractors("a $foo b ${longer-key} c", &extractor).unwrap()
    );
}
/// A rule with a single name matcher and a fixed target applies to a
/// matching file and leaves `rename` unset.
#[test]
fn test_rule() {
    use crate::matchers::get_matcher;

    let txt_file = FileInfo {
        path: "/tmp/test.txt".into(),
        url: Some("https://example.com/test.txt".into()),
        referrer: Some("https://example.com/downloads".into()),
        sha256: [0; 32],
        mtime: 1234,
        size: 100,
    };

    let name_matcher = get_matcher("name", r#"\.txt"#).unwrap().unwrap();
    let rule = Rule {
        matchers: vec![name_matcher],
        target: Some("/target/dir".into()),
        rename: None,
    };

    let result = rule.matches(&txt_file).unwrap().unwrap();
    assert!(result.rename.is_none());
    assert_eq!(Some("/target/dir"), result.target.as_deref());
}