mirror of
https://codeberg.org/icewind/galton.git
synced 2026-06-03 10:24:07 +02:00
add xpath extractor
This commit is contained in:
parent
e20f0d7661
commit
b423323473
8 changed files with 1198 additions and 57 deletions
957
Cargo.lock
generated
957
Cargo.lock
generated
File diff suppressed because it is too large
Load diff
|
|
@ -20,6 +20,7 @@ notify-debouncer-full = "0.6.0"
|
||||||
ctrlc = "3.5.0"
|
ctrlc = "3.5.0"
|
||||||
sha2 = "0.11.0-rc.2"
|
sha2 = "0.11.0-rc.2"
|
||||||
hex = "0.4.3"
|
hex = "0.4.3"
|
||||||
|
xrust = "1.3.0"
|
||||||
|
|
||||||
[dev-dependencies]
|
[dev-dependencies]
|
||||||
maplit = "1.0.2"
|
maplit = "1.0.2"
|
||||||
19
README.md
19
README.md
|
|
@ -41,8 +41,12 @@ And two action options:
|
||||||
- `move`: directory to move the file into, will be created if necessary
|
- `move`: directory to move the file into, will be created if necessary
|
||||||
- `rename`: rename the file
|
- `rename`: rename the file
|
||||||
|
|
||||||
Action options can refer to capture groups from the match options and the file
|
Action options can refer to the following data extracted from the rule to
|
||||||
mtime to dynamically set the target directory and name.
|
dynamically set the target directory and name.
|
||||||
|
|
||||||
|
- Any named regex capture group
|
||||||
|
- `mtime`: unix timestamp of the downloaded file
|
||||||
|
- `xpath('....')`: an xpath expression to match on the downloaded file
|
||||||
|
|
||||||
Multiple rule sections can be configured, the first matching rule will be used.
|
Multiple rule sections can be configured, the first matching rule will be used.
|
||||||
|
|
||||||
|
|
@ -73,6 +77,15 @@ url = "https://www.paypal.com"
|
||||||
move = "~/Downloads/Paypal Statements/$mtime.csv" # $mtime is set to the unix timestamp
|
move = "~/Downloads/Paypal Statements/$mtime.csv" # $mtime is set to the unix timestamp
|
||||||
```
|
```
|
||||||
|
|
||||||
|
Rename based on an XPath expression
|
||||||
|
|
||||||
|
```toml
|
||||||
|
[[rule]]
|
||||||
|
name = ".+\\.(lss)"
|
||||||
|
move = "~/Livesplits/${xpath('//GameName/text()')}"
|
||||||
|
rename = "${xpath('//CategoryName/text()')} - ${xpath('//Metadata/Variables/Variable[contains(@name, \"Subcategory\")]/text()')}.lss"
|
||||||
|
```
|
||||||
|
|
||||||
### Url and referrer
|
### Url and referrer
|
||||||
|
|
||||||
Galton uses the standard `user.xdg.origin.url` and `user.xdg.referrer.url`
|
Galton uses the standard `user.xdg.origin.url` and `user.xdg.referrer.url`
|
||||||
|
|
@ -92,7 +105,7 @@ to automatically create a symlink to the new location of the file.
|
||||||
symlink = "~/Downloads/last"
|
symlink = "~/Downloads/last"
|
||||||
```
|
```
|
||||||
|
|
||||||
## Lastest download symlink
|
## Remove duplicate downloads
|
||||||
|
|
||||||
Galton can also be used to clean up duplicate downloads, when enabled, it will
|
Galton can also be used to clean up duplicate downloads, when enabled, it will
|
||||||
check for any existing file with the same contents in the target directory and
|
check for any existing file with the same contents in the target directory and
|
||||||
|
|
|
||||||
|
|
@ -1,8 +1,11 @@
|
||||||
|
mod xpath;
|
||||||
|
|
||||||
use crate::file::FileInfo;
|
use crate::file::FileInfo;
|
||||||
use std::borrow::{Borrow, Cow};
|
use std::borrow::{Borrow, Cow};
|
||||||
use std::collections::HashMap;
|
use std::collections::HashMap;
|
||||||
use std::error::Error;
|
use std::error::Error;
|
||||||
use std::hash::Hash;
|
use std::hash::Hash;
|
||||||
|
pub use xpath::XPathExtractor;
|
||||||
|
|
||||||
#[derive(Default)]
|
#[derive(Default)]
|
||||||
pub struct MultiExtractor<'a> {
|
pub struct MultiExtractor<'a> {
|
||||||
|
|
|
||||||
132
src/extractor/xpath.rs
Normal file
132
src/extractor/xpath.rs
Normal file
|
|
@ -0,0 +1,132 @@
|
||||||
|
use crate::extractor::Extractor;
|
||||||
|
use crate::file::{FileError, FileInfo};
|
||||||
|
use std::borrow::Cow;
|
||||||
|
use std::error::Error;
|
||||||
|
use std::path::PathBuf;
|
||||||
|
use thiserror::Error;
|
||||||
|
use xrust::parser::xml::parse as xmlparse;
|
||||||
|
use xrust::parser::xpath::parse;
|
||||||
|
use xrust::transform::context::{ContextBuilder, StaticContextBuilder};
|
||||||
|
use xrust::trees::smite::RNode;
|
||||||
|
use xrust::{Error as XPathParseError, Item, Node, SequenceTrait};
|
||||||
|
|
||||||
|
pub struct XPathExtractor<'a> {
|
||||||
|
file: &'a FileInfo,
|
||||||
|
}
|
||||||
|
|
||||||
|
impl<'a> XPathExtractor<'a> {
|
||||||
|
pub fn new(file: &'a FileInfo) -> XPathExtractor<'a> {
|
||||||
|
XPathExtractor { file }
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
impl Extractor for XPathExtractor<'_> {
|
||||||
|
fn extract<'this>(&'this self, field: &str) -> Option<Result<Cow<'this, str>, Box<dyn Error>>> {
|
||||||
|
let query = field
|
||||||
|
.strip_prefix("xpath('")
|
||||||
|
.and_then(|query| query.strip_suffix("')"))?;
|
||||||
|
let transform = match parse::<RNode>(query, None) {
|
||||||
|
Ok(transform) => transform,
|
||||||
|
Err(error) => {
|
||||||
|
return Some(Err(XPathError::Parse {
|
||||||
|
query: query.into(),
|
||||||
|
error,
|
||||||
|
}
|
||||||
|
.into()));
|
||||||
|
}
|
||||||
|
};
|
||||||
|
|
||||||
|
let content = match self.file.str_content() {
|
||||||
|
Ok(content) => content,
|
||||||
|
Err(error) => return Some(Err(XPathError::ReadFile(error).into())),
|
||||||
|
};
|
||||||
|
let xml = match xmlparse(RNode::new_document(), content, None) {
|
||||||
|
Ok(xml) => xml,
|
||||||
|
Err(error) => {
|
||||||
|
return Some(Err(XPathError::ParseXml {
|
||||||
|
error,
|
||||||
|
path: self.file.path.clone().into(),
|
||||||
|
}
|
||||||
|
.into()));
|
||||||
|
}
|
||||||
|
};
|
||||||
|
|
||||||
|
let mut static_context = StaticContextBuilder::new()
|
||||||
|
.message(|_| Ok(()))
|
||||||
|
.fetcher(|_| Ok(String::new()))
|
||||||
|
.parser(|_| unreachable!())
|
||||||
|
.build();
|
||||||
|
let context = ContextBuilder::new().context(vec![Item::Node(xml)]).build();
|
||||||
|
let sequence = match context.dispatch(&mut static_context, &transform) {
|
||||||
|
Ok(res) => res,
|
||||||
|
Err(error) => {
|
||||||
|
return Some(Err(XPathError::MatchError {
|
||||||
|
error,
|
||||||
|
query: query.into(),
|
||||||
|
}
|
||||||
|
.into()));
|
||||||
|
}
|
||||||
|
};
|
||||||
|
|
||||||
|
Some(Ok(sequence.to_xml().into()))
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
#[derive(Debug, Error)]
|
||||||
|
enum XPathError {
|
||||||
|
#[error("Failed to parse xpath '{query}': {error:#}")]
|
||||||
|
Parse {
|
||||||
|
error: XPathParseError,
|
||||||
|
query: String,
|
||||||
|
},
|
||||||
|
#[error(transparent)]
|
||||||
|
ReadFile(FileError),
|
||||||
|
#[error("Failed to parse xml '{}': {error:#}", path.display())]
|
||||||
|
ParseXml {
|
||||||
|
error: XPathParseError,
|
||||||
|
path: PathBuf,
|
||||||
|
},
|
||||||
|
#[error("Failed to match xpath '{query}': {error:#}")]
|
||||||
|
MatchError {
|
||||||
|
error: XPathParseError,
|
||||||
|
query: String,
|
||||||
|
},
|
||||||
|
}
|
||||||
|
|
||||||
|
#[test]
|
||||||
|
fn test_xpath() {
|
||||||
|
let xml = r#"<?xml version="1.0" encoding="UTF-8"?>
|
||||||
|
<Run version="1.7.0">
|
||||||
|
<GameIcon></GameIcon>
|
||||||
|
<GameName>Hollow Knight: Silksong</GameName>
|
||||||
|
<CategoryName>Any%</CategoryName>
|
||||||
|
<Metadata>
|
||||||
|
<Run id=""/>
|
||||||
|
<Platform usesEmulator="False"/>
|
||||||
|
<Variables>
|
||||||
|
<Variable name="Any% Subcategory">No Major Glitches</Variable>
|
||||||
|
</Variables>
|
||||||
|
</Metadata>
|
||||||
|
<Offset>00:00:00</Offset>
|
||||||
|
</Run>"#;
|
||||||
|
|
||||||
|
use std::cell::OnceCell;
|
||||||
|
|
||||||
|
let xml_file = FileInfo {
|
||||||
|
path: "/tmp/test.lss".into(),
|
||||||
|
url: None,
|
||||||
|
referrer: None,
|
||||||
|
sha256: [0; 32],
|
||||||
|
mtime: 1234,
|
||||||
|
size: 100,
|
||||||
|
content: OnceCell::from(xml.as_bytes().to_vec()),
|
||||||
|
};
|
||||||
|
let matcher = XPathExtractor::new(&xml_file);
|
||||||
|
assert_eq!(
|
||||||
|
"Hollow Knight: Silksong",
|
||||||
|
matcher
|
||||||
|
.extract("xpath('//GameName/text()')")
|
||||||
|
.unwrap()
|
||||||
|
.unwrap()
|
||||||
|
);
|
||||||
|
}
|
||||||
39
src/file.rs
39
src/file.rs
|
|
@ -1,9 +1,11 @@
|
||||||
use hex::FromHex;
|
use hex::FromHex;
|
||||||
use sha2::{Digest, Sha256};
|
use sha2::{Digest, Sha256};
|
||||||
use std::fs::File;
|
use std::cell::OnceCell;
|
||||||
|
use std::fs::{read, File};
|
||||||
use std::io::Read;
|
use std::io::Read;
|
||||||
use std::os::unix::fs::MetadataExt;
|
use std::os::unix::fs::MetadataExt;
|
||||||
use std::path::{Path, PathBuf};
|
use std::path::{Path, PathBuf};
|
||||||
|
use std::str::Utf8Error;
|
||||||
use std::time::UNIX_EPOCH;
|
use std::time::UNIX_EPOCH;
|
||||||
use thiserror::Error;
|
use thiserror::Error;
|
||||||
|
|
||||||
|
|
@ -15,6 +17,7 @@ pub struct FileInfo {
|
||||||
#[allow(dead_code)]
|
#[allow(dead_code)]
|
||||||
pub mtime: u64,
|
pub mtime: u64,
|
||||||
pub size: u64,
|
pub size: u64,
|
||||||
|
pub content: OnceCell<Vec<u8>>,
|
||||||
}
|
}
|
||||||
|
|
||||||
impl FileInfo {
|
impl FileInfo {
|
||||||
|
|
@ -46,6 +49,7 @@ impl FileInfo {
|
||||||
sha256: [0; 32],
|
sha256: [0; 32],
|
||||||
mtime,
|
mtime,
|
||||||
size: stat.size(),
|
size: stat.size(),
|
||||||
|
content: OnceCell::new(),
|
||||||
};
|
};
|
||||||
|
|
||||||
let attributes = xattr::list(path).unwrap_or_default();
|
let attributes = xattr::list(path).unwrap_or_default();
|
||||||
|
|
@ -119,6 +123,30 @@ impl FileInfo {
|
||||||
error,
|
error,
|
||||||
})?)
|
})?)
|
||||||
}
|
}
|
||||||
|
|
||||||
|
fn read_content(&self) -> Result<Vec<u8>, FileError> {
|
||||||
|
let path = Path::new(&self.path);
|
||||||
|
read(path).map_err(|error| FileError::Read {
|
||||||
|
path: path.into(),
|
||||||
|
error,
|
||||||
|
})
|
||||||
|
}
|
||||||
|
|
||||||
|
pub fn content(&self) -> Result<&[u8], FileError> {
|
||||||
|
if let Some(content) = self.content.get() {
|
||||||
|
return Ok(content);
|
||||||
|
}
|
||||||
|
self.content.set(self.read_content()?).unwrap();
|
||||||
|
Ok(self.content.get().unwrap())
|
||||||
|
}
|
||||||
|
|
||||||
|
pub fn str_content(&self) -> Result<&str, FileError> {
|
||||||
|
let raw = self.content()?;
|
||||||
|
std::str::from_utf8(raw).map_err(|error| FileError::Utf8 {
|
||||||
|
error,
|
||||||
|
path: self.path.clone().into(),
|
||||||
|
})
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
fn load_or_calculate_hash(path: impl AsRef<Path>) -> Result<[u8; 32], std::io::Error> {
|
fn load_or_calculate_hash(path: impl AsRef<Path>) -> Result<[u8; 32], std::io::Error> {
|
||||||
|
|
@ -163,9 +191,16 @@ pub enum FileError {
|
||||||
path: PathBuf,
|
path: PathBuf,
|
||||||
error: std::io::Error,
|
error: std::io::Error,
|
||||||
},
|
},
|
||||||
#[error("Failed to store hash for: {}", path.display())]
|
#[error("Failed to store hash for {}: {error:#}", path.display())]
|
||||||
StoreHash {
|
StoreHash {
|
||||||
path: PathBuf,
|
path: PathBuf,
|
||||||
error: std::io::Error,
|
error: std::io::Error,
|
||||||
},
|
},
|
||||||
|
#[error("Failed to read {}: {error:#}", path.display())]
|
||||||
|
Read {
|
||||||
|
path: PathBuf,
|
||||||
|
error: std::io::Error,
|
||||||
|
},
|
||||||
|
#[error("File {} is not valid utf8: {error:#}", path.display())]
|
||||||
|
Utf8 { error: Utf8Error, path: PathBuf },
|
||||||
}
|
}
|
||||||
|
|
|
||||||
|
|
@ -80,6 +80,8 @@ impl Extractor for RegexMatchExtractor {
|
||||||
|
|
||||||
#[test]
|
#[test]
|
||||||
fn test_file_meta_matcher() {
|
fn test_file_meta_matcher() {
|
||||||
|
use std::cell::OnceCell;
|
||||||
|
|
||||||
let txt_file = FileInfo {
|
let txt_file = FileInfo {
|
||||||
path: "/tmp/test.txt".into(),
|
path: "/tmp/test.txt".into(),
|
||||||
url: Some("https://example.com/test.txt".into()),
|
url: Some("https://example.com/test.txt".into()),
|
||||||
|
|
@ -87,6 +89,7 @@ fn test_file_meta_matcher() {
|
||||||
sha256: [0; 32],
|
sha256: [0; 32],
|
||||||
mtime: 1234,
|
mtime: 1234,
|
||||||
size: 100,
|
size: 100,
|
||||||
|
content: OnceCell::new(),
|
||||||
};
|
};
|
||||||
let png_file = FileInfo {
|
let png_file = FileInfo {
|
||||||
path: "/tmp/test.png".into(),
|
path: "/tmp/test.png".into(),
|
||||||
|
|
@ -95,6 +98,7 @@ fn test_file_meta_matcher() {
|
||||||
sha256: [0; 32],
|
sha256: [0; 32],
|
||||||
mtime: 1234,
|
mtime: 1234,
|
||||||
size: 100,
|
size: 100,
|
||||||
|
content: OnceCell::new(),
|
||||||
};
|
};
|
||||||
|
|
||||||
let txt_matcher = parse("name", r#"(?<txt_name>.+)\.txt"#).unwrap().unwrap();
|
let txt_matcher = parse("name", r#"(?<txt_name>.+)\.txt"#).unwrap().unwrap();
|
||||||
|
|
|
||||||
12
src/rule.rs
12
src/rule.rs
|
|
@ -1,4 +1,4 @@
|
||||||
use crate::extractor::{Extractor, FileInfoExtractor, MultiExtractor};
|
use crate::extractor::{Extractor, FileInfoExtractor, MultiExtractor, XPathExtractor};
|
||||||
use crate::file::FileInfo;
|
use crate::file::FileInfo;
|
||||||
use crate::matchers::Matcher;
|
use crate::matchers::Matcher;
|
||||||
use regex::Regex;
|
use regex::Regex;
|
||||||
|
|
@ -24,8 +24,11 @@ pub struct RuleResult {
|
||||||
impl Rule {
|
impl Rule {
|
||||||
pub fn matches(&self, file: &FileInfo) -> Option<Result<RuleResult, RuleMatchError>> {
|
pub fn matches(&self, file: &FileInfo) -> Option<Result<RuleResult, RuleMatchError>> {
|
||||||
let file_extractor = FileInfoExtractor::new(file);
|
let file_extractor = FileInfoExtractor::new(file);
|
||||||
let mut extractors = MultiExtractor::with_capacity(self.matchers.len() + 1);
|
let xpath_extractor = XPathExtractor::new(file);
|
||||||
|
|
||||||
|
let mut extractors = MultiExtractor::with_capacity(self.matchers.len() + 2);
|
||||||
extractors.push(&file_extractor);
|
extractors.push(&file_extractor);
|
||||||
|
extractors.push(&xpath_extractor);
|
||||||
|
|
||||||
for matcher in &self.matchers {
|
for matcher in &self.matchers {
|
||||||
match matcher.matches(file)? {
|
match matcher.matches(file)? {
|
||||||
|
|
@ -61,7 +64,7 @@ static SUBST_REGEX: OnceLock<Regex> = OnceLock::new();
|
||||||
|
|
||||||
fn apply_extractors<E: Extractor>(input: &str, extractor: &E) -> Result<String, RuleMatchError> {
|
fn apply_extractors<E: Extractor>(input: &str, extractor: &E) -> Result<String, RuleMatchError> {
|
||||||
let subst_regex =
|
let subst_regex =
|
||||||
SUBST_REGEX.get_or_init(|| Regex::new(r#"\$(\{([^})]+)}|([a-zA-Z0-9]+))"#).unwrap());
|
SUBST_REGEX.get_or_init(|| Regex::new(r#"\$(\{([^}]+)}|([a-zA-Z0-9]+))"#).unwrap());
|
||||||
|
|
||||||
// copied from `Regex::replace_all` adjusted to support returning errors
|
// copied from `Regex::replace_all` adjusted to support returning errors
|
||||||
let mut it = subst_regex.captures_iter(input).enumerate().peekable();
|
let mut it = subst_regex.captures_iter(input).enumerate().peekable();
|
||||||
|
|
@ -121,6 +124,8 @@ fn test_apply_extractors() {
|
||||||
#[test]
|
#[test]
|
||||||
fn test_rule() {
|
fn test_rule() {
|
||||||
use crate::matchers::get_matcher;
|
use crate::matchers::get_matcher;
|
||||||
|
use std::cell::OnceCell;
|
||||||
|
|
||||||
let rule = Rule {
|
let rule = Rule {
|
||||||
matchers: vec![get_matcher("name", r#"\.txt"#).unwrap().unwrap()],
|
matchers: vec![get_matcher("name", r#"\.txt"#).unwrap().unwrap()],
|
||||||
target: Some("/target/dir".into()),
|
target: Some("/target/dir".into()),
|
||||||
|
|
@ -134,6 +139,7 @@ fn test_rule() {
|
||||||
sha256: [0; 32],
|
sha256: [0; 32],
|
||||||
mtime: 1234,
|
mtime: 1234,
|
||||||
size: 100,
|
size: 100,
|
||||||
|
content: OnceCell::new(),
|
||||||
};
|
};
|
||||||
|
|
||||||
let result = rule.matches(&txt_file).unwrap().unwrap();
|
let result = rule.matches(&txt_file).unwrap().unwrap();
|
||||||
|
|
|
||||||
Loading…
Add table
Add a link
Reference in a new issue