add remove-duplicates option

This commit is contained in:
Robin Appelman 2025-10-22 19:32:51 +02:00
commit 6705debd2a
8 changed files with 308 additions and 30 deletions

86
Cargo.lock generated
View file

@ -73,6 +73,15 @@ version = "2.9.4"
source = "registry+https://github.com/rust-lang/crates.io-index" source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "2261d10cca569e4643e526d8dc2e62e433cc8aba21ab764233731f8d369bf394" checksum = "2261d10cca569e4643e526d8dc2e62e433cc8aba21ab764233731f8d369bf394"
[[package]]
name = "block-buffer"
version = "0.11.0-rc.5"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "e9ef36a6fcdb072aa548f3da057640ec10859eb4e91ddf526ee648d50c76a949"
dependencies = [
"hybrid-array",
]
[[package]] [[package]]
name = "cfg-if" name = "cfg-if"
version = "1.0.3" version = "1.0.3"
@ -131,6 +140,30 @@ version = "1.0.4"
source = "registry+https://github.com/rust-lang/crates.io-index" source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "b05b61dc5112cbb17e4b6cd61790d9845d13888356391624cbe7e41efeac1e75" checksum = "b05b61dc5112cbb17e4b6cd61790d9845d13888356391624cbe7e41efeac1e75"
[[package]]
name = "const-oid"
version = "0.10.1"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "0dabb6555f92fb9ee4140454eb5dcd14c7960e1225c6d1a6cc361f032947713e"
[[package]]
name = "cpufeatures"
version = "0.2.17"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "59ed5838eebb26a2bb2e58f6d5b5316989ae9d08bab10e0e6d103e656d1b0280"
dependencies = [
"libc",
]
[[package]]
name = "crypto-common"
version = "0.2.0-rc.4"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "6a8235645834fbc6832939736ce2f2d08192652269e11010a6240f61b908a1c6"
dependencies = [
"hybrid-array",
]
[[package]] [[package]]
name = "ctrlc" name = "ctrlc"
version = "3.5.0" version = "3.5.0"
@ -142,6 +175,17 @@ dependencies = [
"windows-sys 0.61.2", "windows-sys 0.61.2",
] ]
[[package]]
name = "digest"
version = "0.11.0-rc.3"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "dac89f8a64533a9b0eaa73a68e424db0fb1fd6271c74cc0125336a05f090568d"
dependencies = [
"block-buffer",
"const-oid",
"crypto-common",
]
[[package]] [[package]]
name = "dispatch" name = "dispatch"
version = "0.2.0" version = "0.2.0"
@ -161,7 +205,7 @@ source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "39cab71617ae0d63f51a36d69f866391735b51691dbda63cf6f96d042b63efeb" checksum = "39cab71617ae0d63f51a36d69f866391735b51691dbda63cf6f96d042b63efeb"
dependencies = [ dependencies = [
"libc", "libc",
"windows-sys 0.60.2", "windows-sys 0.61.2",
] ]
[[package]] [[package]]
@ -188,11 +232,13 @@ version = "0.1.0"
dependencies = [ dependencies = [
"clap", "clap",
"ctrlc", "ctrlc",
"hex",
"home", "home",
"main_error", "main_error",
"notify-debouncer-full", "notify-debouncer-full",
"regex", "regex",
"serde", "serde",
"sha2",
"thiserror", "thiserror",
"toml", "toml",
"tracing", "tracing",
@ -212,6 +258,12 @@ version = "0.5.0"
source = "registry+https://github.com/rust-lang/crates.io-index" source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "2304e00983f87ffb38b55b444b5e3b60a884b5d30c0fca7d82fe33449bbe55ea" checksum = "2304e00983f87ffb38b55b444b5e3b60a884b5d30c0fca7d82fe33449bbe55ea"
[[package]]
name = "hex"
version = "0.4.3"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "7f24254aa9a54b5c858eaee2f5bccdb46aaf0e486a595ed5fd8f86ba55232a70"
[[package]] [[package]]
name = "home" name = "home"
version = "0.5.11" version = "0.5.11"
@ -221,6 +273,15 @@ dependencies = [
"windows-sys 0.59.0", "windows-sys 0.59.0",
] ]
[[package]]
name = "hybrid-array"
version = "0.4.5"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "f471e0a81b2f90ffc0cb2f951ae04da57de8baa46fa99112b062a5173a5088d0"
dependencies = [
"typenum",
]
[[package]] [[package]]
name = "indexmap" name = "indexmap"
version = "2.11.4" version = "2.11.4"
@ -380,7 +441,7 @@ version = "0.50.3"
source = "registry+https://github.com/rust-lang/crates.io-index" source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "7957b9740744892f114936ab4a57b3f487491bbeafaf8083688b16841a4240e5" checksum = "7957b9740744892f114936ab4a57b3f487491bbeafaf8083688b16841a4240e5"
dependencies = [ dependencies = [
"windows-sys 0.60.2", "windows-sys 0.61.2",
] ]
[[package]] [[package]]
@ -458,7 +519,7 @@ dependencies = [
"errno", "errno",
"libc", "libc",
"linux-raw-sys", "linux-raw-sys",
"windows-sys 0.60.2", "windows-sys 0.61.2",
] ]
[[package]] [[package]]
@ -509,6 +570,17 @@ dependencies = [
"serde_core", "serde_core",
] ]
[[package]]
name = "sha2"
version = "0.11.0-rc.2"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "d1e3878ab0f98e35b2df35fe53201d088299b41a6bb63e3e34dada2ac4abd924"
dependencies = [
"cfg-if",
"cpufeatures",
"digest",
]
[[package]] [[package]]
name = "sharded-slab" name = "sharded-slab"
version = "0.1.7" version = "0.1.7"
@ -666,6 +738,12 @@ dependencies = [
"tracing-log", "tracing-log",
] ]
[[package]]
name = "typenum"
version = "1.19.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "562d481066bde0658276a35467c4af00bdc6ee726305698a55b86e61d7ad82bb"
[[package]] [[package]]
name = "unicode-ident" name = "unicode-ident"
version = "1.0.19" version = "1.0.19"
@ -706,7 +784,7 @@ version = "0.1.11"
source = "registry+https://github.com/rust-lang/crates.io-index" source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "c2a7b1c03c876122aa43f3020e6c3c3ee5c05081c9a00739faf7503aeba10d22" checksum = "c2a7b1c03c876122aa43f3020e6c3c3ee5c05081c9a00739faf7503aeba10d22"
dependencies = [ dependencies = [
"windows-sys 0.60.2", "windows-sys 0.61.2",
] ]
[[package]] [[package]]

View file

@ -17,4 +17,6 @@ main_error = "0.1.2"
tracing = "0.1.41" tracing = "0.1.41"
tracing-subscriber = "0.3.20" tracing-subscriber = "0.3.20"
notify-debouncer-full = "0.6.0" notify-debouncer-full = "0.6.0"
ctrlc = "3.5.0" ctrlc = "3.5.0"
sha2 = "0.11.0-rc.2"
hex = "0.4.3"

View file

@ -92,5 +92,13 @@ to automatically create a symlink to the new location of the file.
symlink = "~/Downloads/last" symlink = "~/Downloads/last"
``` ```
Note that this symlink will only be set for files that match any of the ## Remove duplicate downloads
configured rules.
Galton can also be used to clean up duplicate downloads. When enabled, it will
check for any existing file with the same contents in the target directory and
delete the newly downloaded file if a duplicate is found.
```toml
[watch]
remove-duplicates = true
```

View file

@ -1,5 +1,6 @@
[watch] [watch]
symlink = "~/Downloads/last" symlink = "~/Downloads/last"
remove-duplicates = true
[[rule]] [[rule]]
name = "\\.(csv|CSV)" name = "\\.(csv|CSV)"

View file

@ -9,7 +9,10 @@ with lib; let
format = pkgs.formats.toml {}; format = pkgs.formats.toml {};
removeNulls = filterAttrs (_: val: val != null); removeNulls = filterAttrs (_: val: val != null);
configFile = format.generate "galton.toml" { configFile = format.generate "galton.toml" {
watch = removeNulls {inherit (cfg) symlink;}; watch = removeNulls {
inherit (cfg) symlink;
remove-duplicates = cfg.removeDuplicates;
};
rule = map removeNulls cfg.rules; rule = map removeNulls cfg.rules;
}; };
in { in {
@ -28,6 +31,12 @@ in {
description = "Create a symlink to matched files"; description = "Create a symlink to matched files";
}; };
removeDuplicates = mkOption {
type = types.bool;
default = false;
description = "Remove duplicate downloads";
};
rules = mkOption { rules = mkOption {
default = []; default = [];
type = types.listOf (types.submodule { type = types.listOf (types.submodule {

View file

@ -31,6 +31,8 @@ impl GaltonConfig {
#[derive(Debug, Deserialize, Default)] #[derive(Debug, Deserialize, Default)]
pub struct WatchConfig { pub struct WatchConfig {
symlink: Option<String>, symlink: Option<String>,
#[serde(rename = "remove-duplicates", default)]
pub remove_duplicates: bool,
} }
impl WatchConfig { impl WatchConfig {

View file

@ -1,3 +1,8 @@
use hex::FromHex;
use sha2::{Digest, Sha256};
use std::fs::File;
use std::io::Read;
use std::os::unix::fs::MetadataExt;
use std::path::{Path, PathBuf}; use std::path::{Path, PathBuf};
use std::time::UNIX_EPOCH; use std::time::UNIX_EPOCH;
use thiserror::Error; use thiserror::Error;
@ -6,9 +11,11 @@ pub struct FileInfo {
pub path: String, pub path: String,
pub url: Option<String>, pub url: Option<String>,
pub referrer: Option<String>, pub referrer: Option<String>,
pub sha256: [u8; 32],
#[allow(dead_code)] #[allow(dead_code)]
pub mtime: u64, pub mtime: u64,
pub mtime_str: String, pub mtime_str: String,
pub size: u64,
} }
impl FileInfo { impl FileInfo {
@ -37,8 +44,10 @@ impl FileInfo {
path: path.into(), path: path.into(),
url: None, url: None,
referrer: None, referrer: None,
sha256: [0; 32],
mtime, mtime,
mtime_str: mtime.to_string(), mtime_str: mtime.to_string(),
size: stat.size(),
}; };
let attributes = xattr::list(path).unwrap_or_default(); let attributes = xattr::list(path).unwrap_or_default();
@ -52,12 +61,33 @@ impl FileInfo {
match attr { match attr {
"user.xdg.origin.url" => file.url = Some(val), "user.xdg.origin.url" => file.url = Some(val),
"user.xdg.referrer.url" => file.referrer = Some(val), "user.xdg.referrer.url" => file.referrer = Some(val),
"user.checksum.sha256" => {
if let Ok(sha) = <[u8; 32]>::from_hex(&val) {
file.sha256 = sha;
}
}
_ => {} _ => {}
} }
} }
} }
} }
if file.sha256 == [0; 32] {
file.sha256 = hash_file(path).map_err(|error| FileError::Hash {
path: path.into(),
error,
})?;
xattr::set(
path,
"user.checksum.sha256",
hex::encode(file.sha256).as_bytes(),
)
.map_err(|error| FileError::StoreHash {
path: path.into(),
error,
})?;
}
Ok(file) Ok(file)
} }
@ -74,6 +104,51 @@ impl FileInfo {
.map(|(parent, _)| parent) .map(|(parent, _)| parent)
.unwrap_or("") .unwrap_or("")
} }
/// Reports whether the file at `other` has the same contents as this file.
///
/// The sizes are compared first, so the hash of `other` is only looked up
/// (or computed) when both files have the same length.
///
/// # Errors
///
/// Returns [`FileError::Stat`] when the metadata of `other` cannot be read and
/// [`FileError::Hash`] when its hash cannot be loaded or calculated.
pub fn is_duplicate(&self, other: impl AsRef<Path>) -> Result<bool, FileError> {
    let candidate = other.as_ref();

    let candidate_size = match candidate.metadata() {
        Ok(stat) => stat.size(),
        Err(error) => {
            return Err(FileError::Stat {
                path: candidate.into(),
                error,
            });
        }
    };

    // A different length already rules out identical contents.
    if candidate_size != self.size {
        return Ok(false);
    }

    let candidate_hash = match load_or_calculate_hash(candidate) {
        Ok(hash) => hash,
        Err(error) => {
            return Err(FileError::Hash {
                path: candidate.into(),
                error,
            });
        }
    };

    Ok(self.sha256 == candidate_hash)
}
}
/// Returns the SHA-256 hash of the file at `path`, using the cached value when available.
///
/// The cache is the `user.checksum.sha256` extended attribute. It is used when
/// it can be read and decodes as 32 hex-encoded bytes; otherwise the file is
/// hashed and the result is written back to that attribute.
///
/// # Errors
///
/// Returns the I/O error from hashing the file or from storing the attribute.
fn load_or_calculate_hash(path: impl AsRef<Path>) -> Result<[u8; 32], std::io::Error> {
    let path = path.as_ref();

    // Any failure while reading or decoding the cached value just means "no cache".
    let cached = xattr::get(path, "user.checksum.sha256")
        .ok()
        .flatten()
        .and_then(|raw| String::from_utf8(raw).ok())
        .and_then(|text| <[u8; 32]>::from_hex(text.as_str()).ok());

    match cached {
        Some(hash) => Ok(hash),
        None => {
            let hash = hash_file(path)?;
            xattr::set(path, "user.checksum.sha256", hex::encode(hash).as_bytes())?;
            Ok(hash)
        }
    }
}
fn hash_file(path: impl AsRef<Path>) -> Result<[u8; 32], std::io::Error> {
let mut file = File::open(path)?;
let mut buffer = [0u8; 8192];
let mut hasher = Sha256::new();
loop {
let count = file.read(&mut buffer)?;
if count == 0 {
break;
}
hasher.update(&buffer[..count]);
}
Ok(hasher.finalize().0)
} }
#[derive(Debug, Error)] #[derive(Debug, Error)]
@ -85,4 +160,14 @@ pub enum FileError {
path: PathBuf, path: PathBuf,
error: std::io::Error, error: std::io::Error,
}, },
#[error("Failed to hash: {}", path.display())]
Hash {
path: PathBuf,
error: std::io::Error,
},
#[error("Failed to store hash for: {}", path.display())]
StoreHash {
path: PathBuf,
error: std::io::Error,
},
} }

View file

@ -5,10 +5,10 @@ use clap::builder::styling::{AnsiColor, Effects};
use clap::builder::Styles; use clap::builder::Styles;
use clap::{Parser, Subcommand}; use clap::{Parser, Subcommand};
use main_error::MainResult; use main_error::MainResult;
use notify_debouncer_full::notify::event::{AccessKind, AccessMode}; use notify_debouncer_full::notify::event::{AccessKind, AccessMode, ModifyKind, RenameMode};
use notify_debouncer_full::notify::{EventKind, RecursiveMode}; use notify_debouncer_full::notify::{EventKind, RecursiveMode};
use notify_debouncer_full::{new_debouncer, DebounceEventResult}; use notify_debouncer_full::{new_debouncer, DebounceEventResult};
use std::fs::{copy, create_dir_all, remove_file, rename}; use std::fs::{copy, create_dir_all, read_dir, remove_file, rename};
use std::io::ErrorKind; use std::io::ErrorKind;
use std::os::unix::fs::symlink; use std::os::unix::fs::symlink;
use std::path::{Path, PathBuf}; use std::path::{Path, PathBuf};
@ -63,7 +63,7 @@ fn main() -> MainResult {
match args.command { match args.command {
Commands::File { path } => { Commands::File { path } => {
let file = FileInfo::load(path)?; let file = FileInfo::load(path)?;
handle_file(&file, &config.rule); handle_file(&file, &config.rule, false);
} }
Commands::Watch { path, recursive } => { Commands::Watch { path, recursive } => {
let path = normalize_path(path); let path = normalize_path(path);
@ -87,7 +87,12 @@ fn main() -> MainResult {
} }
})?; })?;
for res in rx { for res in rx {
handle_watch_event(res, &rules, symlink.as_deref()); handle_watch_event(
res,
&rules,
symlink.as_deref(),
config.watch.remove_duplicates,
);
} }
} }
} }
@ -95,24 +100,55 @@ fn main() -> MainResult {
Ok(()) Ok(())
} }
fn handle_watch_event(result: DebounceEventResult, rules: &[Rule], link_target: Option<&str>) { fn handle_watch_event(
result: DebounceEventResult,
rules: &[Rule],
link_target: Option<&str>,
remove_duplicates: bool,
) {
let handle_path = |path: &Path| {
// give originfox time to set xattr
sleep(Duration::from_millis(200));
if is_part(path) {
debug!("skipping part file");
return;
}
match FileInfo::load(path) {
Ok(file) => maybe_link(
handle_file(&file, rules, remove_duplicates).as_deref(),
link_target,
),
Err(error) => {
error!(%error, "failed to load file info");
}
}
};
match result { match result {
Ok(events) => { Ok(events) => {
for event in events { for event in events {
if event.kind == EventKind::Access(AccessKind::Close(AccessMode::Write)) { match event.kind {
for path in &event.paths { EventKind::Access(AccessKind::Close(AccessMode::Write)) => {
debug!("write event for {}", path.display()); for path in &event.paths {
// give originfox time to set xattr debug!("write event for {}", path.display());
sleep(Duration::from_millis(200)); handle_path(path);
match FileInfo::load(path) {
Ok(file) => {
maybe_link(handle_file(&file, rules).as_deref(), link_target)
}
Err(error) => {
error!(%error, "failed to load file info");
}
} }
} }
EventKind::Modify(ModifyKind::Name(RenameMode::Both)) => {
if event.paths.len() == 2 {
let from = &event.paths[0];
let to = &event.paths[1];
debug!("rename event for {} -> {}", from.display(), to.display());
if is_part(from) && !is_part(to) {
handle_path(to);
}
} else {
error!("Invalid rename event");
}
}
_ => {}
} }
} }
} }
@ -124,7 +160,11 @@ fn handle_watch_event(result: DebounceEventResult, rules: &[Rule], link_target:
} }
} }
fn maybe_link(source: Option<&str>, target: Option<&str>) { fn is_part(path: &Path) -> bool {
path.extension().and_then(|ext| ext.to_str()) == Some("part")
}
fn maybe_link(source: Option<&Path>, target: Option<&str>) {
if let (Some(source), Some(target)) = (source, target) { if let (Some(source), Some(target)) = (source, target) {
if Path::new(target).exists() { if Path::new(target).exists() {
if let Err(error) = remove_file(target) { if let Err(error) = remove_file(target) {
@ -134,7 +174,7 @@ fn maybe_link(source: Option<&str>, target: Option<&str>) {
} }
match symlink(source, target) { match symlink(source, target) {
Ok(()) => { Ok(()) => {
info!(to = target, from = source, "created symlink"); info!(to = target, from = %source.display(), "created symlink");
} }
Err(error) => { Err(error) => {
error!(%error, "failed to link target"); error!(%error, "failed to link target");
@ -154,10 +194,22 @@ fn match_file(file: &FileInfo, rules: &[Rule]) -> Option<RuleMatch> {
} }
#[instrument(skip_all, fields(file = file.path))] #[instrument(skip_all, fields(file = file.path))]
fn handle_file(file: &FileInfo, rules: &[Rule]) -> Option<String> { fn handle_file(file: &FileInfo, rules: &[Rule], remove_duplicates: bool) -> Option<PathBuf> {
let Some(result) = match_file(file, rules) else { let Some(result) = match_file(file, rules) else {
info!("no matches"); info!(url = file.url, "removing duplicate");
return None;
if remove_duplicates {
let parent = Path::new(&file.path).parent().unwrap();
if let Some(duplicate) = has_duplicate(file, parent) {
info!(url = file.url, diplicate = %duplicate.display(), "removing duplicate");
if let Err(error) = remove_file(&file.path) {
error!(%error, "failed to remove duplicate");
}
return Some(duplicate);
}
}
return Some(file.path.clone().into());
}; };
let parent = result.target.as_deref().unwrap_or_else(|| file.parent()); let parent = result.target.as_deref().unwrap_or_else(|| file.parent());
@ -168,12 +220,22 @@ fn handle_file(file: &FileInfo, rules: &[Rule]) -> Option<String> {
return None; return None;
} }
if remove_duplicates {
if let Some(duplicate) = has_duplicate(file, parent) {
info!(url = file.url, diplicate = %duplicate.display(), "removing duplicate");
if let Err(error) = remove_file(&file.path) {
error!(%error, "failed to remove duplicate");
}
return Some(duplicate);
}
}
let target = format!("{parent}/{name}"); let target = format!("{parent}/{name}");
match cross_storage_move(&file.path, &target) { match cross_storage_move(&file.path, &target) {
Ok(()) => { Ok(()) => {
info!(target, "moved file"); info!(target, "moved file");
Some(target) Some(target.into())
} }
Err(error) => { Err(error) => {
info!(target, ?error, "failed to moved file"); info!(target, ?error, "failed to moved file");
@ -182,6 +244,37 @@ fn handle_file(file: &FileInfo, rules: &[Rule]) -> Option<String> {
} }
} }
/// Searches `dir` for a regular file whose contents match `file`.
///
/// Returns the path of the first matching entry, or `None` when no entry
/// matches or the directory cannot be listed. Entries that cannot be compared
/// are logged and skipped.
///
/// Only real regular files are considered. Symlinks are skipped because a link
/// inside `dir` may resolve to `file` itself, which would make the file look
/// like its own duplicate.
/// NOTE(review): the configured watch symlink is the likely case here; confirm.
fn has_duplicate(file: &FileInfo, dir: impl AsRef<Path>) -> Option<PathBuf> {
    let dir = match read_dir(dir) {
        Ok(dir) => dir,
        Err(error) => {
            error!(%error, "failed to list target directory");
            return None;
        }
    };

    for entry in dir.flatten() {
        let path = entry.path();

        // Never compare the file against itself.
        if path.to_str() == Some(file.path.as_str()) {
            continue;
        }

        // `DirEntry::file_type` does not traverse symlinks, unlike `Path::is_file`,
        // so links and non-file entries are both rejected here.
        let is_regular_file = entry
            .file_type()
            .map(|kind| kind.is_file())
            .unwrap_or(false);
        if !is_regular_file {
            continue;
        }

        match file.is_duplicate(&path) {
            Ok(true) => return Some(path),
            Ok(false) => {}
            Err(error) => {
                error!(%error, path = %path.display(), "failed to determine if a file is duplicate");
            }
        }
    }

    None
}
fn cross_storage_move(source: impl AsRef<Path>, target: impl AsRef<Path>) -> std::io::Result<()> { fn cross_storage_move(source: impl AsRef<Path>, target: impl AsRef<Path>) -> std::io::Result<()> {
let source = source.as_ref(); let source = source.as_ref();
let target = target.as_ref(); let target = target.as_ref();