add xpath extractor

This commit is contained in:
Robin Appelman 2025-11-03 21:21:25 +01:00
commit b423323473
8 changed files with 1198 additions and 57 deletions

957
Cargo.lock generated

File diff suppressed because it is too large Load diff

View file

@ -20,6 +20,7 @@ notify-debouncer-full = "0.6.0"
ctrlc = "3.5.0" ctrlc = "3.5.0"
sha2 = "0.11.0-rc.2" sha2 = "0.11.0-rc.2"
hex = "0.4.3" hex = "0.4.3"
xrust = "1.3.0"
[dev-dependencies] [dev-dependencies]
maplit = "1.0.2" maplit = "1.0.2"

View file

@ -41,8 +41,12 @@ And two action options:
- `move`: directory to move the file into, will be created if necessary - `move`: directory to move the file into, will be created if necessary
- `rename`: rename the file - `rename`: rename the file
Action options can refer to capture groups from the match options and the file Action options can refer to the following data extracted from the rule to
mtime to dynamically set the target directory and name. dynamically set the target directory and name.
- Any named regex capture group
- `mtime`: unix timestamp of the downloaded file
- `xpath('...')`: an XPath expression evaluated against the contents of the downloaded file
Multiple rule sections can be configured, the first matching rule will be used. Multiple rule sections can be configured, the first matching rule will be used.
@ -73,6 +77,15 @@ url = "https://www.paypal.com"
move = "~/Downloads/Paypal Statements/$mtime.csv" # $mtime is set to the unix timestamp move = "~/Downloads/Paypal Statements/$mtime.csv" # $mtime is set to the unix timestamp
``` ```
Rename based on an XPath expression
```toml
[[rule]]
name = ".+\\.(lss)"
move = "~/Livesplits/${xpath('//GameName/text()')}"
rename = "${xpath('//CategoryName/text()')} - ${xpath('//Metadata/Variables/Variable[contains(@name, \"Subcategory\")]/text()')}.lss"
```
### Url and referrer ### Url and referrer
Galton uses the standard `user.xdg.origin.url` and `user.xdg.referrer.url` Galton uses the standard `user.xdg.origin.url` and `user.xdg.referrer.url`
@ -92,7 +105,7 @@ to automatically create a symlink to the new location of the file.
symlink = "~/Downloads/last" symlink = "~/Downloads/last"
``` ```
## Lastest download symlink ## Remove duplicate downloads
Galton can also be used to clean up duplicate downloads, when enabled, it will Galton can also be used to clean up duplicate downloads, when enabled, it will
check for any existing file with the same contents in the target directory and check for any existing file with the same contents in the target directory and

View file

@ -1,8 +1,11 @@
mod xpath;
use crate::file::FileInfo; use crate::file::FileInfo;
use std::borrow::{Borrow, Cow}; use std::borrow::{Borrow, Cow};
use std::collections::HashMap; use std::collections::HashMap;
use std::error::Error; use std::error::Error;
use std::hash::Hash; use std::hash::Hash;
pub use xpath::XPathExtractor;
#[derive(Default)] #[derive(Default)]
pub struct MultiExtractor<'a> { pub struct MultiExtractor<'a> {

132
src/extractor/xpath.rs Normal file
View file

@ -0,0 +1,132 @@
use crate::extractor::Extractor;
use crate::file::{FileError, FileInfo};
use std::borrow::Cow;
use std::error::Error;
use std::path::PathBuf;
use thiserror::Error;
use xrust::parser::xml::parse as xmlparse;
use xrust::parser::xpath::parse;
use xrust::transform::context::{ContextBuilder, StaticContextBuilder};
use xrust::trees::smite::RNode;
use xrust::{Error as XPathParseError, Item, Node, SequenceTrait};
/// Extractor that answers `xpath('…')` fields by querying the contents of a file.
pub struct XPathExtractor<'a> {
    /// File whose contents the XPath expressions are evaluated against.
    file: &'a FileInfo,
}

impl<'a> XPathExtractor<'a> {
    /// Creates an extractor borrowing `file`; nothing is read at construction time.
    pub fn new(file: &'a FileInfo) -> Self {
        Self { file }
    }
}
impl Extractor for XPathExtractor<'_> {
fn extract<'this>(&'this self, field: &str) -> Option<Result<Cow<'this, str>, Box<dyn Error>>> {
let query = field
.strip_prefix("xpath('")
.and_then(|query| query.strip_suffix("')"))?;
let transform = match parse::<RNode>(query, None) {
Ok(transform) => transform,
Err(error) => {
return Some(Err(XPathError::Parse {
query: query.into(),
error,
}
.into()));
}
};
let content = match self.file.str_content() {
Ok(content) => content,
Err(error) => return Some(Err(XPathError::ReadFile(error).into())),
};
let xml = match xmlparse(RNode::new_document(), content, None) {
Ok(xml) => xml,
Err(error) => {
return Some(Err(XPathError::ParseXml {
error,
path: self.file.path.clone().into(),
}
.into()));
}
};
let mut static_context = StaticContextBuilder::new()
.message(|_| Ok(()))
.fetcher(|_| Ok(String::new()))
.parser(|_| unreachable!())
.build();
let context = ContextBuilder::new().context(vec![Item::Node(xml)]).build();
let sequence = match context.dispatch(&mut static_context, &transform) {
Ok(res) => res,
Err(error) => {
return Some(Err(XPathError::MatchError {
error,
query: query.into(),
}
.into()));
}
};
Some(Ok(sequence.to_xml().into()))
}
}
/// Failures that can occur while resolving an `xpath('…')` field.
#[derive(Debug, Error)]
enum XPathError {
/// The XPath expression itself could not be parsed.
#[error("Failed to parse xpath '{query}': {error:#}")]
Parse {
error: XPathParseError,
query: String,
},
/// The file's contents could not be obtained as text; the underlying
/// error is displayed unchanged.
#[error(transparent)]
ReadFile(FileError),
/// The file's contents could not be parsed as XML.
#[error("Failed to parse xml '{}': {error:#}", path.display())]
ParseXml {
error: XPathParseError,
path: PathBuf,
},
/// Evaluating the parsed expression against the document failed.
#[error("Failed to match xpath '{query}': {error:#}")]
MatchError {
error: XPathParseError,
query: String,
},
}
/// Checks that a text-node query against an in-memory LiveSplit file yields the node's text.
#[test]
fn test_xpath() {
    use std::cell::OnceCell;

    let splits = r#"<?xml version="1.0" encoding="UTF-8"?>
<Run version="1.7.0">
<GameIcon></GameIcon>
<GameName>Hollow Knight: Silksong</GameName>
<CategoryName>Any%</CategoryName>
<Metadata>
<Run id=""/>
<Platform usesEmulator="False"/>
<Variables>
<Variable name="Any% Subcategory">No Major Glitches</Variable>
</Variables>
</Metadata>
<Offset>00:00:00</Offset>
</Run>"#;

    // The content cell is pre-filled so the extractor never touches the disk.
    let lss_file = FileInfo {
        path: "/tmp/test.lss".into(),
        url: None,
        referrer: None,
        sha256: [0; 32],
        mtime: 1234,
        size: 100,
        content: OnceCell::from(splits.as_bytes().to_vec()),
    };

    let extractor = XPathExtractor::new(&lss_file);
    let game_name = extractor
        .extract("xpath('//GameName/text()')")
        .unwrap()
        .unwrap();

    assert_eq!("Hollow Knight: Silksong", game_name);
}

View file

@ -1,9 +1,11 @@
use hex::FromHex; use hex::FromHex;
use sha2::{Digest, Sha256}; use sha2::{Digest, Sha256};
use std::fs::File; use std::cell::OnceCell;
use std::fs::{read, File};
use std::io::Read; use std::io::Read;
use std::os::unix::fs::MetadataExt; use std::os::unix::fs::MetadataExt;
use std::path::{Path, PathBuf}; use std::path::{Path, PathBuf};
use std::str::Utf8Error;
use std::time::UNIX_EPOCH; use std::time::UNIX_EPOCH;
use thiserror::Error; use thiserror::Error;
@ -15,6 +17,7 @@ pub struct FileInfo {
#[allow(dead_code)] #[allow(dead_code)]
pub mtime: u64, pub mtime: u64,
pub size: u64, pub size: u64,
pub content: OnceCell<Vec<u8>>,
} }
impl FileInfo { impl FileInfo {
@ -46,6 +49,7 @@ impl FileInfo {
sha256: [0; 32], sha256: [0; 32],
mtime, mtime,
size: stat.size(), size: stat.size(),
content: OnceCell::new(),
}; };
let attributes = xattr::list(path).unwrap_or_default(); let attributes = xattr::list(path).unwrap_or_default();
@ -119,6 +123,30 @@ impl FileInfo {
error, error,
})?) })?)
} }
fn read_content(&self) -> Result<Vec<u8>, FileError> {
let path = Path::new(&self.path);
read(path).map_err(|error| FileError::Read {
path: path.into(),
error,
})
}
/// Returns the raw bytes of the file, reading them from disk on first use.
///
/// A successful read is stored in the `content` cell and reused by later
/// calls. A failed read is returned to the caller and nothing is stored, so
/// the next call tries again.
pub fn content(&self) -> Result<&[u8], FileError> {
    if let Some(cached) = self.content.get() {
        return Ok(cached.as_slice());
    }

    // Read before initialising the cell so a failure leaves it empty.
    // `get_or_init` then stores the bytes without any panicking `unwrap`.
    let bytes = self.read_content()?;
    Ok(self.content.get_or_init(|| bytes).as_slice())
}
pub fn str_content(&self) -> Result<&str, FileError> {
let raw = self.content()?;
std::str::from_utf8(raw).map_err(|error| FileError::Utf8 {
error,
path: self.path.clone().into(),
})
}
} }
fn load_or_calculate_hash(path: impl AsRef<Path>) -> Result<[u8; 32], std::io::Error> { fn load_or_calculate_hash(path: impl AsRef<Path>) -> Result<[u8; 32], std::io::Error> {
@ -163,9 +191,16 @@ pub enum FileError {
path: PathBuf, path: PathBuf,
error: std::io::Error, error: std::io::Error,
}, },
#[error("Failed to store hash for: {}", path.display())] #[error("Failed to store hash for {}: {error:#}", path.display())]
StoreHash { StoreHash {
path: PathBuf, path: PathBuf,
error: std::io::Error, error: std::io::Error,
}, },
#[error("Failed to read {}: {error:#}", path.display())]
Read {
path: PathBuf,
error: std::io::Error,
},
#[error("File {} is not valid utf8: {error:#}", path.display())]
Utf8 { error: Utf8Error, path: PathBuf },
} }

View file

@ -80,6 +80,8 @@ impl Extractor for RegexMatchExtractor {
#[test] #[test]
fn test_file_meta_matcher() { fn test_file_meta_matcher() {
use std::cell::OnceCell;
let txt_file = FileInfo { let txt_file = FileInfo {
path: "/tmp/test.txt".into(), path: "/tmp/test.txt".into(),
url: Some("https://example.com/test.txt".into()), url: Some("https://example.com/test.txt".into()),
@ -87,6 +89,7 @@ fn test_file_meta_matcher() {
sha256: [0; 32], sha256: [0; 32],
mtime: 1234, mtime: 1234,
size: 100, size: 100,
content: OnceCell::new(),
}; };
let png_file = FileInfo { let png_file = FileInfo {
path: "/tmp/test.png".into(), path: "/tmp/test.png".into(),
@ -95,6 +98,7 @@ fn test_file_meta_matcher() {
sha256: [0; 32], sha256: [0; 32],
mtime: 1234, mtime: 1234,
size: 100, size: 100,
content: OnceCell::new(),
}; };
let txt_matcher = parse("name", r#"(?<txt_name>.+)\.txt"#).unwrap().unwrap(); let txt_matcher = parse("name", r#"(?<txt_name>.+)\.txt"#).unwrap().unwrap();

View file

@ -1,4 +1,4 @@
use crate::extractor::{Extractor, FileInfoExtractor, MultiExtractor}; use crate::extractor::{Extractor, FileInfoExtractor, MultiExtractor, XPathExtractor};
use crate::file::FileInfo; use crate::file::FileInfo;
use crate::matchers::Matcher; use crate::matchers::Matcher;
use regex::Regex; use regex::Regex;
@ -24,8 +24,11 @@ pub struct RuleResult {
impl Rule { impl Rule {
pub fn matches(&self, file: &FileInfo) -> Option<Result<RuleResult, RuleMatchError>> { pub fn matches(&self, file: &FileInfo) -> Option<Result<RuleResult, RuleMatchError>> {
let file_extractor = FileInfoExtractor::new(file); let file_extractor = FileInfoExtractor::new(file);
let mut extractors = MultiExtractor::with_capacity(self.matchers.len() + 1); let xpath_extractor = XPathExtractor::new(file);
let mut extractors = MultiExtractor::with_capacity(self.matchers.len() + 2);
extractors.push(&file_extractor); extractors.push(&file_extractor);
extractors.push(&xpath_extractor);
for matcher in &self.matchers { for matcher in &self.matchers {
match matcher.matches(file)? { match matcher.matches(file)? {
@ -61,7 +64,7 @@ static SUBST_REGEX: OnceLock<Regex> = OnceLock::new();
fn apply_extractors<E: Extractor>(input: &str, extractor: &E) -> Result<String, RuleMatchError> { fn apply_extractors<E: Extractor>(input: &str, extractor: &E) -> Result<String, RuleMatchError> {
let subst_regex = let subst_regex =
SUBST_REGEX.get_or_init(|| Regex::new(r#"\$(\{([^})]+)}|([a-zA-Z0-9]+))"#).unwrap()); SUBST_REGEX.get_or_init(|| Regex::new(r#"\$(\{([^}]+)}|([a-zA-Z0-9]+))"#).unwrap());
// copied from `Regex::replace_all` adjusted to support returning errors // copied from `Regex::replace_all` adjusted to support returning errors
let mut it = subst_regex.captures_iter(input).enumerate().peekable(); let mut it = subst_regex.captures_iter(input).enumerate().peekable();
@ -121,6 +124,8 @@ fn test_apply_extractors() {
#[test] #[test]
fn test_rule() { fn test_rule() {
use crate::matchers::get_matcher; use crate::matchers::get_matcher;
use std::cell::OnceCell;
let rule = Rule { let rule = Rule {
matchers: vec![get_matcher("name", r#"\.txt"#).unwrap().unwrap()], matchers: vec![get_matcher("name", r#"\.txt"#).unwrap().unwrap()],
target: Some("/target/dir".into()), target: Some("/target/dir".into()),
@ -134,6 +139,7 @@ fn test_rule() {
sha256: [0; 32], sha256: [0; 32],
mtime: 1234, mtime: 1234,
size: 100, size: 100,
content: OnceCell::new(),
}; };
let result = rule.matches(&txt_file).unwrap().unwrap(); let result = rule.matches(&txt_file).unwrap().unwrap();