add remove-duplicates option

This commit is contained in:
Robin Appelman 2025-10-22 19:32:51 +02:00
commit 6705debd2a
8 changed files with 308 additions and 30 deletions

View file

@ -31,6 +31,8 @@ impl GaltonConfig {
/// Options for watch mode, deserialized from the configuration.
///
/// Every field is optional in the config source: the struct derives `Default`
/// and `remove_duplicates` carries `#[serde(default)]`.
#[derive(Debug, Deserialize, Default)]
pub struct WatchConfig {
// NOTE(review): presumably the location of the symlink that watch mode keeps
// pointing at the most recently handled file -- confirm against the accessor
// in `impl WatchConfig`.
symlink: Option<String>,
/// Read from the `remove-duplicates` key; `false` when the key is absent.
///
/// The watch command forwards this flag to `handle_watch_event`, where it
/// turns on the duplicate check performed while handling a file.
#[serde(rename = "remove-duplicates", default)]
pub remove_duplicates: bool,
}
impl WatchConfig {

View file

@ -1,3 +1,8 @@
use hex::FromHex;
use sha2::{Digest, Sha256};
use std::fs::File;
use std::io::Read;
use std::os::unix::fs::MetadataExt;
use std::path::{Path, PathBuf};
use std::time::UNIX_EPOCH;
use thiserror::Error;
@ -6,9 +11,11 @@ pub struct FileInfo {
pub path: String,
pub url: Option<String>,
pub referrer: Option<String>,
pub sha256: [u8; 32],
#[allow(dead_code)]
pub mtime: u64,
pub mtime_str: String,
pub size: u64,
}
impl FileInfo {
@ -37,8 +44,10 @@ impl FileInfo {
path: path.into(),
url: None,
referrer: None,
sha256: [0; 32],
mtime,
mtime_str: mtime.to_string(),
size: stat.size(),
};
let attributes = xattr::list(path).unwrap_or_default();
@ -52,12 +61,33 @@ impl FileInfo {
match attr {
"user.xdg.origin.url" => file.url = Some(val),
"user.xdg.referrer.url" => file.referrer = Some(val),
"user.checksum.sha256" => {
if let Ok(sha) = <[u8; 32]>::from_hex(&val) {
file.sha256 = sha;
}
}
_ => {}
}
}
}
}
if file.sha256 == [0; 32] {
file.sha256 = hash_file(path).map_err(|error| FileError::Hash {
path: path.into(),
error,
})?;
xattr::set(
path,
"user.checksum.sha256",
hex::encode(file.sha256).as_bytes(),
)
.map_err(|error| FileError::StoreHash {
path: path.into(),
error,
})?;
}
Ok(file)
}
@ -74,6 +104,51 @@ impl FileInfo {
.map(|(parent, _)| parent)
.unwrap_or("")
}
pub fn is_duplicate(&self, other: impl AsRef<Path>) -> Result<bool, FileError> {
let other = other.as_ref();
let other_stat = other.metadata().map_err(|error| FileError::Stat {
path: other.into(),
error,
})?;
if other_stat.size() != self.size {
return Ok(false);
}
Ok(self.sha256
== load_or_calculate_hash(other).map_err(|error| FileError::Hash {
path: other.into(),
error,
})?)
}
}
/// Returns the SHA-256 digest of the file at `path`.
///
/// A digest stored in the `user.checksum.sha256` extended attribute is used
/// when it is present, valid UTF-8 and decodes to exactly 32 bytes of hex.
/// Otherwise the file is hashed and the hex-encoded digest is written back to
/// that attribute before returning.
///
/// # Errors
///
/// Returns the I/O error from hashing the file or from storing the attribute.
/// Failures while *reading* the attribute are not errors; they only trigger a
/// fresh calculation.
fn load_or_calculate_hash(path: impl AsRef<Path>) -> Result<[u8; 32], std::io::Error> {
    const CHECKSUM_ATTR: &str = "user.checksum.sha256";

    let path = path.as_ref();

    let cached = xattr::get(path, CHECKSUM_ATTR)
        .ok()
        .flatten()
        .and_then(|raw| String::from_utf8(raw).ok())
        .and_then(|text| <[u8; 32]>::from_hex(text.as_str()).ok());

    match cached {
        Some(digest) => Ok(digest),
        None => {
            let digest = hash_file(path)?;
            xattr::set(path, CHECKSUM_ATTR, hex::encode(digest).as_bytes())?;
            Ok(digest)
        }
    }
}
/// Calculates the SHA-256 digest of the file at `path`.
///
/// The file is streamed through a fixed 8 KiB buffer, so memory use does not
/// depend on the file size.
///
/// # Errors
///
/// Returns the I/O error from opening or reading the file. Reads that fail
/// with [`std::io::ErrorKind::Interrupted`] are retried instead of reported.
fn hash_file(path: impl AsRef<Path>) -> Result<[u8; 32], std::io::Error> {
    let mut file = File::open(path)?;
    let mut buffer = [0u8; 8192];
    let mut hasher = Sha256::new();

    loop {
        let count = match file.read(&mut buffer) {
            // End of file.
            Ok(0) => break,
            Ok(count) => count,
            // An interrupted read is non-fatal per the `Read::read` contract;
            // aborting here would fail the whole hash because of a signal.
            Err(error) if error.kind() == std::io::ErrorKind::Interrupted => continue,
            Err(error) => return Err(error),
        };
        hasher.update(&buffer[..count]);
    }

    Ok(hasher.finalize().0)
}
#[derive(Debug, Error)]
@ -85,4 +160,14 @@ pub enum FileError {
path: PathBuf,
error: std::io::Error,
},
#[error("Failed to hash: {}", path.display())]
Hash {
path: PathBuf,
error: std::io::Error,
},
#[error("Failed to store hash for: {}", path.display())]
StoreHash {
path: PathBuf,
error: std::io::Error,
},
}

View file

@ -5,10 +5,10 @@ use clap::builder::styling::{AnsiColor, Effects};
use clap::builder::Styles;
use clap::{Parser, Subcommand};
use main_error::MainResult;
use notify_debouncer_full::notify::event::{AccessKind, AccessMode};
use notify_debouncer_full::notify::event::{AccessKind, AccessMode, ModifyKind, RenameMode};
use notify_debouncer_full::notify::{EventKind, RecursiveMode};
use notify_debouncer_full::{new_debouncer, DebounceEventResult};
use std::fs::{copy, create_dir_all, remove_file, rename};
use std::fs::{copy, create_dir_all, read_dir, remove_file, rename};
use std::io::ErrorKind;
use std::os::unix::fs::symlink;
use std::path::{Path, PathBuf};
@ -63,7 +63,7 @@ fn main() -> MainResult {
match args.command {
Commands::File { path } => {
let file = FileInfo::load(path)?;
handle_file(&file, &config.rule);
handle_file(&file, &config.rule, false);
}
Commands::Watch { path, recursive } => {
let path = normalize_path(path);
@ -87,7 +87,12 @@ fn main() -> MainResult {
}
})?;
for res in rx {
handle_watch_event(res, &rules, symlink.as_deref());
handle_watch_event(
res,
&rules,
symlink.as_deref(),
config.watch.remove_duplicates,
);
}
}
}
@ -95,24 +100,55 @@ fn main() -> MainResult {
Ok(())
}
fn handle_watch_event(result: DebounceEventResult, rules: &[Rule], link_target: Option<&str>) {
fn handle_watch_event(
result: DebounceEventResult,
rules: &[Rule],
link_target: Option<&str>,
remove_duplicates: bool,
) {
let handle_path = |path: &Path| {
// give originfox time to set xattr
sleep(Duration::from_millis(200));
if is_part(path) {
debug!("skipping part file");
return;
}
match FileInfo::load(path) {
Ok(file) => maybe_link(
handle_file(&file, rules, remove_duplicates).as_deref(),
link_target,
),
Err(error) => {
error!(%error, "failed to load file info");
}
}
};
match result {
Ok(events) => {
for event in events {
if event.kind == EventKind::Access(AccessKind::Close(AccessMode::Write)) {
for path in &event.paths {
debug!("write event for {}", path.display());
// give originfox time to set xattr
sleep(Duration::from_millis(200));
match FileInfo::load(path) {
Ok(file) => {
maybe_link(handle_file(&file, rules).as_deref(), link_target)
}
Err(error) => {
error!(%error, "failed to load file info");
}
match event.kind {
EventKind::Access(AccessKind::Close(AccessMode::Write)) => {
for path in &event.paths {
debug!("write event for {}", path.display());
handle_path(path);
}
}
EventKind::Modify(ModifyKind::Name(RenameMode::Both)) => {
if event.paths.len() == 2 {
let from = &event.paths[0];
let to = &event.paths[1];
debug!("rename event for {} -> {}", from.display(), to.display());
if is_part(from) && !is_part(to) {
handle_path(to);
}
} else {
error!("Invalid rename event");
}
}
_ => {}
}
}
}
@ -124,7 +160,11 @@ fn handle_watch_event(result: DebounceEventResult, rules: &[Rule], link_target:
}
}
fn maybe_link(source: Option<&str>, target: Option<&str>) {
/// Returns `true` when the extension of `path` is exactly `part`.
///
/// The comparison is case-sensitive; a path without an extension is never a
/// part file.
fn is_part(path: &Path) -> bool {
    path.extension().is_some_and(|extension| extension == "part")
}
fn maybe_link(source: Option<&Path>, target: Option<&str>) {
if let (Some(source), Some(target)) = (source, target) {
if Path::new(target).exists() {
if let Err(error) = remove_file(target) {
@ -134,7 +174,7 @@ fn maybe_link(source: Option<&str>, target: Option<&str>) {
}
match symlink(source, target) {
Ok(()) => {
info!(to = target, from = source, "created symlink");
info!(to = target, from = %source.display(), "created symlink");
}
Err(error) => {
error!(%error, "failed to link target");
@ -154,10 +194,22 @@ fn match_file(file: &FileInfo, rules: &[Rule]) -> Option<RuleMatch> {
}
#[instrument(skip_all, fields(file = file.path))]
fn handle_file(file: &FileInfo, rules: &[Rule]) -> Option<String> {
fn handle_file(file: &FileInfo, rules: &[Rule], remove_duplicates: bool) -> Option<PathBuf> {
let Some(result) = match_file(file, rules) else {
info!("no matches");
return None;
info!(url = file.url, "removing duplicate");
if remove_duplicates {
let parent = Path::new(&file.path).parent().unwrap();
if let Some(duplicate) = has_duplicate(file, parent) {
info!(url = file.url, diplicate = %duplicate.display(), "removing duplicate");
if let Err(error) = remove_file(&file.path) {
error!(%error, "failed to remove duplicate");
}
return Some(duplicate);
}
}
return Some(file.path.clone().into());
};
let parent = result.target.as_deref().unwrap_or_else(|| file.parent());
@ -168,12 +220,22 @@ fn handle_file(file: &FileInfo, rules: &[Rule]) -> Option<String> {
return None;
}
if remove_duplicates {
if let Some(duplicate) = has_duplicate(file, parent) {
info!(url = file.url, diplicate = %duplicate.display(), "removing duplicate");
if let Err(error) = remove_file(&file.path) {
error!(%error, "failed to remove duplicate");
}
return Some(duplicate);
}
}
let target = format!("{parent}/{name}");
match cross_storage_move(&file.path, &target) {
Ok(()) => {
info!(target, "moved file");
Some(target)
Some(target.into())
}
Err(error) => {
info!(target, ?error, "failed to moved file");
@ -182,6 +244,37 @@ fn handle_file(file: &FileInfo, rules: &[Rule]) -> Option<String> {
}
}
/// Searches `dir` (non-recursively) for a regular file with the same content
/// as `file` and returns the path of the first one found.
///
/// Entries that are `file` itself are never reported. Besides the literal path
/// comparison, a match is also discarded when it resolves to the same
/// canonical path as `file`, e.g. a symlink in `dir` that points at `file`, or
/// a differently spelled path. Callers delete `file` when a duplicate is
/// reported, so reporting the file as its own duplicate would destroy the only
/// copy.
///
/// Failures are logged and never propagated: an unreadable directory yields
/// `None`, and an entry that cannot be compared is skipped.
fn has_duplicate(file: &FileInfo, dir: impl AsRef<Path>) -> Option<PathBuf> {
    let dir = match read_dir(dir) {
        Ok(dir) => dir,
        Err(error) => {
            error!(%error, "failed to list target directory");
            return None;
        }
    };

    // Resolved once up front; `None` (e.g. the file vanished) disables the
    // canonical-path check and leaves only the literal comparison.
    let own_path = std::fs::canonicalize(&file.path).ok();

    for entry in dir.flatten() {
        let path = entry.path();
        if path.to_str() == Some(file.path.as_str()) {
            continue;
        }
        if !path.is_file() {
            continue;
        }

        match file.is_duplicate(&path) {
            Ok(true) => {
                // Only pay for the resolution on an actual content match.
                let resolves_to_self =
                    own_path.is_some() && std::fs::canonicalize(&path).ok() == own_path;
                if !resolves_to_self {
                    return Some(path);
                }
            }
            Ok(false) => {}
            Err(error) => {
                error!(%error, path = %path.display(), "failed to determine if a file is duplicate");
            }
        }
    }

    None
}
fn cross_storage_move(source: impl AsRef<Path>, target: impl AsRef<Path>) -> std::io::Result<()> {
let source = source.as_ref();
let target = target.as_ref();