add xpath extractor

This commit is contained in:
Robin Appelman 2025-11-03 21:21:25 +01:00
commit b423323473
8 changed files with 1198 additions and 57 deletions

View file

@ -1,8 +1,11 @@
mod xpath;
use crate::file::FileInfo;
use std::borrow::{Borrow, Cow};
use std::collections::HashMap;
use std::error::Error;
use std::hash::Hash;
pub use xpath::XPathExtractor;
#[derive(Default)]
pub struct MultiExtractor<'a> {

132
src/extractor/xpath.rs Normal file
View file

@ -0,0 +1,132 @@
use crate::extractor::Extractor;
use crate::file::{FileError, FileInfo};
use std::borrow::Cow;
use std::error::Error;
use std::path::PathBuf;
use thiserror::Error;
use xrust::parser::xml::parse as xmlparse;
use xrust::parser::xpath::parse;
use xrust::transform::context::{ContextBuilder, StaticContextBuilder};
use xrust::trees::smite::RNode;
use xrust::{Error as XPathParseError, Item, Node, SequenceTrait};
/// Extractor that answers `xpath('<query>')` fields by evaluating the query
/// against the content of the borrowed file, parsed as XML.
pub struct XPathExtractor<'a> {
/// File whose content is read and parsed when a query is evaluated.
file: &'a FileInfo,
}
impl<'a> XPathExtractor<'a> {
pub fn new(file: &'a FileInfo) -> XPathExtractor<'a> {
XPathExtractor { file }
}
}
impl Extractor for XPathExtractor<'_> {
fn extract<'this>(&'this self, field: &str) -> Option<Result<Cow<'this, str>, Box<dyn Error>>> {
let query = field
.strip_prefix("xpath('")
.and_then(|query| query.strip_suffix("')"))?;
let transform = match parse::<RNode>(query, None) {
Ok(transform) => transform,
Err(error) => {
return Some(Err(XPathError::Parse {
query: query.into(),
error,
}
.into()));
}
};
let content = match self.file.str_content() {
Ok(content) => content,
Err(error) => return Some(Err(XPathError::ReadFile(error).into())),
};
let xml = match xmlparse(RNode::new_document(), content, None) {
Ok(xml) => xml,
Err(error) => {
return Some(Err(XPathError::ParseXml {
error,
path: self.file.path.clone().into(),
}
.into()));
}
};
let mut static_context = StaticContextBuilder::new()
.message(|_| Ok(()))
.fetcher(|_| Ok(String::new()))
.parser(|_| unreachable!())
.build();
let context = ContextBuilder::new().context(vec![Item::Node(xml)]).build();
let sequence = match context.dispatch(&mut static_context, &transform) {
Ok(res) => res,
Err(error) => {
return Some(Err(XPathError::MatchError {
error,
query: query.into(),
}
.into()));
}
};
Some(Ok(sequence.to_xml().into()))
}
}
/// Errors produced while evaluating an `xpath('…')` field.
#[derive(Debug, Error)]
enum XPathError {
/// The XPath query itself could not be parsed.
#[error("Failed to parse xpath '{query}': {error:#}")]
Parse {
error: XPathParseError,
query: String,
},
/// The file content could not be obtained as text; the underlying
/// `FileError` message is shown unchanged.
#[error(transparent)]
ReadFile(FileError),
/// The file content could not be parsed as XML.
#[error("Failed to parse xml '{}': {error:#}", path.display())]
ParseXml {
error: XPathParseError,
path: PathBuf,
},
/// Evaluating the parsed query against the document failed.
#[error("Failed to match xpath '{query}': {error:#}")]
MatchError {
error: XPathParseError,
query: String,
},
}
/// Checks that a text-node query returns the text of the matching element
/// from an in-memory XML document.
#[test]
fn test_xpath() {
    use std::cell::OnceCell;

    let document = r#"<?xml version="1.0" encoding="UTF-8"?>
<Run version="1.7.0">
<GameIcon></GameIcon>
<GameName>Hollow Knight: Silksong</GameName>
<CategoryName>Any%</CategoryName>
<Metadata>
<Run id=""/>
<Platform usesEmulator="False"/>
<Variables>
<Variable name="Any% Subcategory">No Major Glitches</Variable>
</Variables>
</Metadata>
<Offset>00:00:00</Offset>
</Run>"#;

    // The content cell is pre-filled, so the extractor never touches the disk.
    let file = FileInfo {
        path: "/tmp/test.lss".into(),
        url: None,
        referrer: None,
        sha256: [0; 32],
        mtime: 1234,
        size: 100,
        content: OnceCell::from(document.as_bytes().to_vec()),
    };

    let extractor = XPathExtractor::new(&file);
    let extracted = extractor
        .extract("xpath('//GameName/text()')")
        .unwrap()
        .unwrap();
    assert_eq!("Hollow Knight: Silksong", extracted);
}

View file

@ -1,9 +1,11 @@
use hex::FromHex;
use sha2::{Digest, Sha256};
use std::fs::File;
use std::cell::OnceCell;
use std::fs::{read, File};
use std::io::Read;
use std::os::unix::fs::MetadataExt;
use std::path::{Path, PathBuf};
use std::str::Utf8Error;
use std::time::UNIX_EPOCH;
use thiserror::Error;
@ -15,6 +17,7 @@ pub struct FileInfo {
#[allow(dead_code)]
pub mtime: u64,
pub size: u64,
pub content: OnceCell<Vec<u8>>,
}
impl FileInfo {
@ -46,6 +49,7 @@ impl FileInfo {
sha256: [0; 32],
mtime,
size: stat.size(),
content: OnceCell::new(),
};
let attributes = xattr::list(path).unwrap_or_default();
@ -119,6 +123,30 @@ impl FileInfo {
error,
})?)
}
fn read_content(&self) -> Result<Vec<u8>, FileError> {
let path = Path::new(&self.path);
read(path).map_err(|error| FileError::Read {
path: path.into(),
error,
})
}
/// Returns the raw bytes of the file, reading them on first use and
/// serving every later call from the `content` cell.
///
/// A failed read is returned to the caller and leaves the cell empty, so a
/// later call tries again.
pub fn content(&self) -> Result<&[u8], FileError> {
    match self.content.get() {
        Some(cached) => Ok(cached.as_slice()),
        None => {
            let loaded = self.read_content()?;
            // The cell was empty just above, so this stores `loaded` and
            // hands back a reference to it.
            Ok(self.content.get_or_init(|| loaded).as_slice())
        }
    }
}
pub fn str_content(&self) -> Result<&str, FileError> {
let raw = self.content()?;
std::str::from_utf8(raw).map_err(|error| FileError::Utf8 {
error,
path: self.path.clone().into(),
})
}
}
fn load_or_calculate_hash(path: impl AsRef<Path>) -> Result<[u8; 32], std::io::Error> {
@ -163,9 +191,16 @@ pub enum FileError {
path: PathBuf,
error: std::io::Error,
},
#[error("Failed to store hash for: {}", path.display())]
#[error("Failed to store hash for {}: {error:#}", path.display())]
StoreHash {
path: PathBuf,
error: std::io::Error,
},
#[error("Failed to read {}: {error:#}", path.display())]
Read {
path: PathBuf,
error: std::io::Error,
},
#[error("File {} is not valid utf8: {error:#}", path.display())]
Utf8 { error: Utf8Error, path: PathBuf },
}

View file

@ -80,6 +80,8 @@ impl Extractor for RegexMatchExtractor {
#[test]
fn test_file_meta_matcher() {
use std::cell::OnceCell;
let txt_file = FileInfo {
path: "/tmp/test.txt".into(),
url: Some("https://example.com/test.txt".into()),
@ -87,6 +89,7 @@ fn test_file_meta_matcher() {
sha256: [0; 32],
mtime: 1234,
size: 100,
content: OnceCell::new(),
};
let png_file = FileInfo {
path: "/tmp/test.png".into(),
@ -95,6 +98,7 @@ fn test_file_meta_matcher() {
sha256: [0; 32],
mtime: 1234,
size: 100,
content: OnceCell::new(),
};
let txt_matcher = parse("name", r#"(?<txt_name>.+)\.txt"#).unwrap().unwrap();

View file

@ -1,4 +1,4 @@
use crate::extractor::{Extractor, FileInfoExtractor, MultiExtractor};
use crate::extractor::{Extractor, FileInfoExtractor, MultiExtractor, XPathExtractor};
use crate::file::FileInfo;
use crate::matchers::Matcher;
use regex::Regex;
@ -24,8 +24,11 @@ pub struct RuleResult {
impl Rule {
pub fn matches(&self, file: &FileInfo) -> Option<Result<RuleResult, RuleMatchError>> {
let file_extractor = FileInfoExtractor::new(file);
let mut extractors = MultiExtractor::with_capacity(self.matchers.len() + 1);
let xpath_extractor = XPathExtractor::new(file);
let mut extractors = MultiExtractor::with_capacity(self.matchers.len() + 2);
extractors.push(&file_extractor);
extractors.push(&xpath_extractor);
for matcher in &self.matchers {
match matcher.matches(file)? {
@ -61,7 +64,7 @@ static SUBST_REGEX: OnceLock<Regex> = OnceLock::new();
fn apply_extractors<E: Extractor>(input: &str, extractor: &E) -> Result<String, RuleMatchError> {
let subst_regex =
SUBST_REGEX.get_or_init(|| Regex::new(r#"\$(\{([^})]+)}|([a-zA-Z0-9]+))"#).unwrap());
SUBST_REGEX.get_or_init(|| Regex::new(r#"\$(\{([^}]+)}|([a-zA-Z0-9]+))"#).unwrap());
// copied from `Regex::replace_all` adjusted to support returning errors
let mut it = subst_regex.captures_iter(input).enumerate().peekable();
@ -121,6 +124,8 @@ fn test_apply_extractors() {
#[test]
fn test_rule() {
use crate::matchers::get_matcher;
use std::cell::OnceCell;
let rule = Rule {
matchers: vec![get_matcher("name", r#"\.txt"#).unwrap().unwrap()],
target: Some("/target/dir".into()),
@ -134,6 +139,7 @@ fn test_rule() {
sha256: [0; 32],
mtime: 1234,
size: 100,
content: OnceCell::new(),
};
let result = rule.matches(&txt_file).unwrap().unwrap();