add xpath extractor

2026-08-02 12:14:51 +02:00 · 2025-11-03 21:21:25 +01:00 · 2025-11-03 21:21:25 +01:00 · b423323473
commit b423323473
parent e20f0d7661
8 changed files with 1198 additions and 57 deletions
--- a/Cargo.lock
+++ b/Cargo.lock
--- a/Cargo.toml
+++ b/Cargo.toml
@ -20,6 +20,7 @@ notify-debouncer-full = "0.6.0"
 ctrlc = "3.5.0"
 sha2 = "0.11.0-rc.2"
 hex = "0.4.3"
 xrust = "1.3.0"
 [dev-dependencies]
 maplit = "1.0.2"
--- a/README.md
+++ b/README.md
@ -41,8 +41,12 @@ And two action options:
 - `move`: directory to move the file into, will be created if necessary
 - `rename`: rename the file
-Action options can refer to capture groups from the match options and the file
+Action options can refer to the following data extracted from the rule to
-mtime to dynamically set the target directory and name.
+dynamically set the target directory and name.
 - Any named regex capture group
 - `mtime`: unix timestamp of the downloaded file
 - `xpath('....')`: an XPath expression to evaluate against the downloaded file
 Multiple rule sections can be configured, the first matching rule will be used.
@ -73,6 +77,15 @@ url = "https://www.paypal.com"
 move = "~/Downloads/Paypal Statements/$mtime.csv" # $mtime is set to the unix timestamp
 ```
 Rename based on an XPath expression
 ```toml
 [[rule]]
 name = ".+\\.(lss)"
 move = "~/Livesplits/${xpath('//GameName/text()')}"
 rename = "${xpath('//CategoryName/text()')} - ${xpath('//Metadata/Variables/Variable[contains(@name, \"Subcategory\")]/text()')}.lss"
 ```
 ### Url and referrer
 Galton uses the standard `user.xdg.origin.url` and `user.xdg.referrer.url`
@ -92,7 +105,7 @@ to automatically create a symlink to the new location of the file.
 symlink = "~/Downloads/last"
 ```
-## Lastest download symlink
+## Remove duplicate downloads
 Galton can also be used to clean up duplicate downloads, when enabled, it will
 check for any existing file with the same contents in the target directory and
--- a/src/extractor/mod.rs
+++ b/src/extractor/mod.rs
@ -1,8 +1,11 @@
 mod xpath;
 use crate::file::FileInfo;
 use std::borrow::{Borrow, Cow};
 use std::collections::HashMap;
 use std::error::Error;
 use std::hash::Hash;
 pub use xpath::XPathExtractor;
 #[derive(Default)]
 pub struct MultiExtractor<'a> {
--- a/src/extractor/xpath.rs
+++ b/src/extractor/xpath.rs
@ -0,0 +1,132 @@
 use crate::extractor::Extractor;
 use crate::file::{FileError, FileInfo};
 use std::borrow::Cow;
 use std::error::Error;
 use std::path::PathBuf;
 use thiserror::Error;
 use xrust::parser::xml::parse as xmlparse;
 use xrust::parser::xpath::parse;
 use xrust::transform::context::{ContextBuilder, StaticContextBuilder};
 use xrust::trees::smite::RNode;
 use xrust::{Error as XPathParseError, Item, Node, SequenceTrait};
 /// Extractor that answers `xpath('<expression>')` fields by evaluating the
 /// expression against the content of a single file.
 pub struct XPathExtractor<'a> {
    /// File whose content is parsed as XML when a field is extracted.
    file: &'a FileInfo,
 }
 impl<'a> XPathExtractor<'a> {
    /// Wraps `file` without reading or parsing anything yet.
    pub fn new(file: &'a FileInfo) -> Self {
        Self { file }
    }
 }
 impl Extractor for XPathExtractor<'_> {
    fn extract<'this>(&'this self, field: &str) -> Option<Result<Cow<'this, str>, Box<dyn Error>>> {
        let query = field
            .strip_prefix("xpath('")
            .and_then(|query| query.strip_suffix("')"))?;
        let transform = match parse::<RNode>(query, None) {
            Ok(transform) => transform,
            Err(error) => {
                return Some(Err(XPathError::Parse {
                    query: query.into(),
                    error,
                }
                .into()));
            }
        };
        let content = match self.file.str_content() {
            Ok(content) => content,
            Err(error) => return Some(Err(XPathError::ReadFile(error).into())),
        };
        let xml = match xmlparse(RNode::new_document(), content, None) {
            Ok(xml) => xml,
            Err(error) => {
                return Some(Err(XPathError::ParseXml {
                    error,
                    path: self.file.path.clone().into(),
                }
                .into()));
            }
        };
        let mut static_context = StaticContextBuilder::new()
            .message(|_| Ok(()))
            .fetcher(|_| Ok(String::new()))
            .parser(|_| unreachable!())
            .build();
        let context = ContextBuilder::new().context(vec![Item::Node(xml)]).build();
        let sequence = match context.dispatch(&mut static_context, &transform) {
            Ok(res) => res,
            Err(error) => {
                return Some(Err(XPathError::MatchError {
                    error,
                    query: query.into(),
                }
                .into()));
            }
        };
        Some(Ok(sequence.to_xml().into()))
    }
 }
 /// Failures that can occur while resolving an `xpath('...')` field.
 #[derive(Debug, Error)]
 enum XPathError {
    /// The XPath expression itself could not be parsed.
    #[error("Failed to parse xpath '{query}': {error:#}")]
    Parse {
        error: XPathParseError,
        query: String,
    },
    /// The file content could not be obtained as text; the message of the
    /// wrapped [`FileError`] is shown unchanged.
    #[error(transparent)]
    ReadFile(FileError),
    /// The file content was read but is not parseable as XML.
    #[error("Failed to parse xml '{}': {error:#}", path.display())]
    ParseXml {
        error: XPathParseError,
        path: PathBuf,
    },
    /// Evaluating the parsed expression against the document failed.
    #[error("Failed to match xpath '{query}': {error:#}")]
    MatchError {
        error: XPathParseError,
        query: String,
    },
 }
 /// Extracting a text node through `xpath('...')` yields its string value.
 #[test]
 fn test_xpath() {
    use std::cell::OnceCell;
    let xml = r#"<?xml version="1.0" encoding="UTF-8"?>
    <Run version="1.7.0">
      <GameIcon></GameIcon>
      <GameName>Hollow Knight: Silksong</GameName>
      <CategoryName>Any%</CategoryName>
      <Metadata>
        <Run id=""/>
        <Platform usesEmulator="False"/>
        <Variables>
          <Variable name="Any% Subcategory">No Major Glitches</Variable>
        </Variables>
      </Metadata>
      <Offset>00:00:00</Offset>
    </Run>"#;
    let file = FileInfo {
        path: "/tmp/test.lss".into(),
        url: None,
        referrer: None,
        sha256: [0; 32],
        mtime: 1234,
        size: 100,
        // Content is pre-populated with the fixture bytes.
        content: OnceCell::from(Vec::from(xml.as_bytes())),
    };
    let extractor = XPathExtractor::new(&file);
    let game_name = extractor
        .extract("xpath('//GameName/text()')")
        .unwrap()
        .unwrap();
    assert_eq!("Hollow Knight: Silksong", game_name);
 }
--- a/src/file.rs
+++ b/src/file.rs
@ -1,9 +1,11 @@
 use hex::FromHex;
 use sha2::{Digest, Sha256};
-use std::fs::File;
+use std::cell::OnceCell;
 use std::fs::{read, File};
 use std::io::Read;
 use std::os::unix::fs::MetadataExt;
 use std::path::{Path, PathBuf};
 use std::str::Utf8Error;
 use std::time::UNIX_EPOCH;
 use thiserror::Error;
@ -15,6 +17,7 @@ pub struct FileInfo {
    #[allow(dead_code)]
    pub mtime: u64,
    pub size: u64,
    pub content: OnceCell<Vec<u8>>,
 }
 impl FileInfo {
@ -46,6 +49,7 @@ impl FileInfo {
            sha256: [0; 32],
            mtime,
            size: stat.size(),
            content: OnceCell::new(),
        };
        let attributes = xattr::list(path).unwrap_or_default();
@ -119,6 +123,30 @@ impl FileInfo {
                error,
            })?)
    }
    fn read_content(&self) -> Result<Vec<u8>, FileError> {
        let path = Path::new(&self.path);
        read(path).map_err(|error| FileError::Read {
            path: path.into(),
            error,
        })
    }
    /// Returns the file's bytes, loading them on first use and serving the
    /// cached copy on every later call.
    ///
    /// # Errors
    /// Propagates the error from `read_content` when the cache is empty and the
    /// file cannot be read; nothing is cached in that case.
    pub fn content(&self) -> Result<&[u8], FileError> {
        match self.content.get() {
            Some(cached) => Ok(cached.as_slice()),
            None => {
                let bytes = self.read_content()?;
                // The cell is known to be empty here, so this stores `bytes`.
                Ok(self.content.get_or_init(|| bytes).as_slice())
            }
        }
    }
    pub fn str_content(&self) -> Result<&str, FileError> {
        let raw = self.content()?;
        std::str::from_utf8(raw).map_err(|error| FileError::Utf8 {
            error,
            path: self.path.clone().into(),
        })
    }
 }
 fn load_or_calculate_hash(path: impl AsRef<Path>) -> Result<[u8; 32], std::io::Error> {
@ -163,9 +191,16 @@ pub enum FileError {
        path: PathBuf,
        error: std::io::Error,
    },
-    #[error("Failed to store hash for: {}", path.display())]
+    #[error("Failed to store hash for {}: {error:#}", path.display())]
    StoreHash {
        path: PathBuf,
        error: std::io::Error,
    },
    #[error("Failed to read {}: {error:#}", path.display())]
    Read {
        path: PathBuf,
        error: std::io::Error,
    },
    #[error("File {} is not valid utf8: {error:#}", path.display())]
    Utf8 { error: Utf8Error, path: PathBuf },
 }
--- a/src/matchers/filemeta.rs
+++ b/src/matchers/filemeta.rs
@ -80,6 +80,8 @@ impl Extractor for RegexMatchExtractor {
 #[test]
 fn test_file_meta_matcher() {
    use std::cell::OnceCell;
    let txt_file = FileInfo {
        path: "/tmp/test.txt".into(),
        url: Some("https://example.com/test.txt".into()),
@ -87,6 +89,7 @@ fn test_file_meta_matcher() {
        sha256: [0; 32],
        mtime: 1234,
        size: 100,
        content: OnceCell::new(),
    };
    let png_file = FileInfo {
        path: "/tmp/test.png".into(),
@ -95,6 +98,7 @@ fn test_file_meta_matcher() {
        sha256: [0; 32],
        mtime: 1234,
        size: 100,
        content: OnceCell::new(),
    };
    let txt_matcher = parse("name", r#"(?<txt_name>.+)\.txt"#).unwrap().unwrap();
--- a/src/rule.rs
+++ b/src/rule.rs
@ -1,4 +1,4 @@
-use crate::extractor::{Extractor, FileInfoExtractor, MultiExtractor};
+use crate::extractor::{Extractor, FileInfoExtractor, MultiExtractor, XPathExtractor};
 use crate::file::FileInfo;
 use crate::matchers::Matcher;
 use regex::Regex;
@ -24,8 +24,11 @@ pub struct RuleResult {
 impl Rule {
    pub fn matches(&self, file: &FileInfo) -> Option<Result<RuleResult, RuleMatchError>> {
        let file_extractor = FileInfoExtractor::new(file);
-        let mut extractors = MultiExtractor::with_capacity(self.matchers.len() + 1);
+        let xpath_extractor = XPathExtractor::new(file);
        let mut extractors = MultiExtractor::with_capacity(self.matchers.len() + 2);
        extractors.push(&file_extractor);
        extractors.push(&xpath_extractor);
        for matcher in &self.matchers {
            match matcher.matches(file)? {
@ -61,7 +64,7 @@ static SUBST_REGEX: OnceLock<Regex> = OnceLock::new();
 fn apply_extractors<E: Extractor>(input: &str, extractor: &E) -> Result<String, RuleMatchError> {
    let subst_regex =
-        SUBST_REGEX.get_or_init(|| Regex::new(r#"\$(\{([^})]+)}|([a-zA-Z0-9]+))"#).unwrap());
+        SUBST_REGEX.get_or_init(|| Regex::new(r#"\$(\{([^}]+)}|([a-zA-Z0-9]+))"#).unwrap());
    // copied from `Regex::replace_all` adjusted to support returning errors
    let mut it = subst_regex.captures_iter(input).enumerate().peekable();
@ -121,6 +124,8 @@ fn test_apply_extractors() {
 #[test]
 fn test_rule() {
    use crate::matchers::get_matcher;
    use std::cell::OnceCell;
    let rule = Rule {
        matchers: vec![get_matcher("name", r#"\.txt"#).unwrap().unwrap()],
        target: Some("/target/dir".into()),
@ -134,6 +139,7 @@ fn test_rule() {
        sha256: [0; 32],
        mtime: 1234,
        size: 100,
        content: OnceCell::new(),
    };
    let result = rule.matches(&txt_file).unwrap().unwrap();