add remove-duplicates option

This commit is contained in:
Robin Appelman 2025-10-22 19:32:51 +02:00
commit 6705debd2a
8 changed files with 308 additions and 30 deletions

86
Cargo.lock generated
View file

@ -73,6 +73,15 @@ version = "2.9.4"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "2261d10cca569e4643e526d8dc2e62e433cc8aba21ab764233731f8d369bf394"
[[package]]
name = "block-buffer"
version = "0.11.0-rc.5"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "e9ef36a6fcdb072aa548f3da057640ec10859eb4e91ddf526ee648d50c76a949"
dependencies = [
"hybrid-array",
]
[[package]]
name = "cfg-if"
version = "1.0.3"
@ -131,6 +140,30 @@ version = "1.0.4"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "b05b61dc5112cbb17e4b6cd61790d9845d13888356391624cbe7e41efeac1e75"
[[package]]
name = "const-oid"
version = "0.10.1"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "0dabb6555f92fb9ee4140454eb5dcd14c7960e1225c6d1a6cc361f032947713e"
[[package]]
name = "cpufeatures"
version = "0.2.17"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "59ed5838eebb26a2bb2e58f6d5b5316989ae9d08bab10e0e6d103e656d1b0280"
dependencies = [
"libc",
]
[[package]]
name = "crypto-common"
version = "0.2.0-rc.4"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "6a8235645834fbc6832939736ce2f2d08192652269e11010a6240f61b908a1c6"
dependencies = [
"hybrid-array",
]
[[package]]
name = "ctrlc"
version = "3.5.0"
@ -142,6 +175,17 @@ dependencies = [
"windows-sys 0.61.2",
]
[[package]]
name = "digest"
version = "0.11.0-rc.3"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "dac89f8a64533a9b0eaa73a68e424db0fb1fd6271c74cc0125336a05f090568d"
dependencies = [
"block-buffer",
"const-oid",
"crypto-common",
]
[[package]]
name = "dispatch"
version = "0.2.0"
@ -161,7 +205,7 @@ source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "39cab71617ae0d63f51a36d69f866391735b51691dbda63cf6f96d042b63efeb"
dependencies = [
"libc",
"windows-sys 0.60.2",
"windows-sys 0.61.2",
]
[[package]]
@ -188,11 +232,13 @@ version = "0.1.0"
dependencies = [
"clap",
"ctrlc",
"hex",
"home",
"main_error",
"notify-debouncer-full",
"regex",
"serde",
"sha2",
"thiserror",
"toml",
"tracing",
@ -212,6 +258,12 @@ version = "0.5.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "2304e00983f87ffb38b55b444b5e3b60a884b5d30c0fca7d82fe33449bbe55ea"
[[package]]
name = "hex"
version = "0.4.3"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "7f24254aa9a54b5c858eaee2f5bccdb46aaf0e486a595ed5fd8f86ba55232a70"
[[package]]
name = "home"
version = "0.5.11"
@ -221,6 +273,15 @@ dependencies = [
"windows-sys 0.59.0",
]
[[package]]
name = "hybrid-array"
version = "0.4.5"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "f471e0a81b2f90ffc0cb2f951ae04da57de8baa46fa99112b062a5173a5088d0"
dependencies = [
"typenum",
]
[[package]]
name = "indexmap"
version = "2.11.4"
@ -380,7 +441,7 @@ version = "0.50.3"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "7957b9740744892f114936ab4a57b3f487491bbeafaf8083688b16841a4240e5"
dependencies = [
"windows-sys 0.60.2",
"windows-sys 0.61.2",
]
[[package]]
@ -458,7 +519,7 @@ dependencies = [
"errno",
"libc",
"linux-raw-sys",
"windows-sys 0.60.2",
"windows-sys 0.61.2",
]
[[package]]
@ -509,6 +570,17 @@ dependencies = [
"serde_core",
]
[[package]]
name = "sha2"
version = "0.11.0-rc.2"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "d1e3878ab0f98e35b2df35fe53201d088299b41a6bb63e3e34dada2ac4abd924"
dependencies = [
"cfg-if",
"cpufeatures",
"digest",
]
[[package]]
name = "sharded-slab"
version = "0.1.7"
@ -666,6 +738,12 @@ dependencies = [
"tracing-log",
]
[[package]]
name = "typenum"
version = "1.19.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "562d481066bde0658276a35467c4af00bdc6ee726305698a55b86e61d7ad82bb"
[[package]]
name = "unicode-ident"
version = "1.0.19"
@ -706,7 +784,7 @@ version = "0.1.11"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "c2a7b1c03c876122aa43f3020e6c3c3ee5c05081c9a00739faf7503aeba10d22"
dependencies = [
"windows-sys 0.60.2",
"windows-sys 0.61.2",
]
[[package]]

View file

@ -18,3 +18,5 @@ tracing = "0.1.41"
tracing-subscriber = "0.3.20"
notify-debouncer-full = "0.6.0"
ctrlc = "3.5.0"
sha2 = "0.11.0-rc.2"
hex = "0.4.3"

View file

@ -92,5 +92,13 @@ to automatically create a symlink to the new location of the file.
symlink = "~/Downloads/last"
```
Note that this symlink will only be set for files that match any of the
configured rules.
## Removing duplicate downloads
Galton can also be used to clean up duplicate downloads. When enabled, it will
check for any existing file with the same contents in the target directory and
delete the newly downloaded file if a duplicate is found.
```toml
[watch]
remove-duplicates = true
```

View file

@ -1,5 +1,6 @@
[watch]
symlink = "~/Downloads/last"
remove-duplicates = true
[[rule]]
name = "\\.(csv|CSV)"

View file

@ -9,7 +9,10 @@ with lib; let
format = pkgs.formats.toml {};
removeNulls = filterAttrs (_: val: val != null);
configFile = format.generate "galton.toml" {
watch = removeNulls {inherit (cfg) symlink;};
watch = removeNulls {
inherit (cfg) symlink;
remove-duplicates = cfg.removeDuplicates;
};
rule = map removeNulls cfg.rules;
};
in {
@ -28,6 +31,12 @@ in {
description = "Create a symlink to matched files";
};
removeDuplicates = mkOption {
type = types.bool;
default = false;
description = "Remove duplicate downloads";
};
rules = mkOption {
default = [];
type = types.listOf (types.submodule {

View file

@ -31,6 +31,8 @@ impl GaltonConfig {
/// Watch-mode settings, deserialized from the configuration file.
///
/// NOTE(review): presumably this is the `[watch]` table of the TOML config
/// (the README example sets both keys under `[watch]`) — confirm against
/// `GaltonConfig`.
#[derive(Debug, Deserialize, Default)]
pub struct WatchConfig {
/// Optional symlink location; absent from the config means `None`.
symlink: Option<String>,
/// Whether duplicate downloads should be removed.
///
/// Read from the `remove-duplicates` key; falls back to the type's default
/// (`false`) when the key is missing.
#[serde(rename = "remove-duplicates", default)]
pub remove_duplicates: bool,
}
impl WatchConfig {

View file

@ -1,3 +1,8 @@
use hex::FromHex;
use sha2::{Digest, Sha256};
use std::fs::File;
use std::io::Read;
use std::os::unix::fs::MetadataExt;
use std::path::{Path, PathBuf};
use std::time::UNIX_EPOCH;
use thiserror::Error;
@ -6,9 +11,11 @@ pub struct FileInfo {
pub path: String,
pub url: Option<String>,
pub referrer: Option<String>,
pub sha256: [u8; 32],
#[allow(dead_code)]
pub mtime: u64,
pub mtime_str: String,
pub size: u64,
}
impl FileInfo {
@ -37,8 +44,10 @@ impl FileInfo {
path: path.into(),
url: None,
referrer: None,
sha256: [0; 32],
mtime,
mtime_str: mtime.to_string(),
size: stat.size(),
};
let attributes = xattr::list(path).unwrap_or_default();
@ -52,12 +61,33 @@ impl FileInfo {
match attr {
"user.xdg.origin.url" => file.url = Some(val),
"user.xdg.referrer.url" => file.referrer = Some(val),
"user.checksum.sha256" => {
if let Ok(sha) = <[u8; 32]>::from_hex(&val) {
file.sha256 = sha;
}
}
_ => {}
}
}
}
}
if file.sha256 == [0; 32] {
file.sha256 = hash_file(path).map_err(|error| FileError::Hash {
path: path.into(),
error,
})?;
xattr::set(
path,
"user.checksum.sha256",
hex::encode(file.sha256).as_bytes(),
)
.map_err(|error| FileError::StoreHash {
path: path.into(),
error,
})?;
}
Ok(file)
}
@ -74,6 +104,51 @@ impl FileInfo {
.map(|(parent, _)| parent)
.unwrap_or("")
}
pub fn is_duplicate(&self, other: impl AsRef<Path>) -> Result<bool, FileError> {
let other = other.as_ref();
let other_stat = other.metadata().map_err(|error| FileError::Stat {
path: other.into(),
error,
})?;
if other_stat.size() != self.size {
return Ok(false);
}
Ok(self.sha256
== load_or_calculate_hash(other).map_err(|error| FileError::Hash {
path: other.into(),
error,
})?)
}
}
/// Returns the SHA-256 digest of the file at `path`, using a cached value
/// when one is available.
///
/// The digest is looked up in the `user.checksum.sha256` extended attribute,
/// which is expected to hold the hex-encoded hash. If the attribute is
/// missing, unreadable, not valid UTF-8 or not a 32-byte hex string, the file
/// is hashed and the result is written back to the attribute.
///
/// Writing the attribute is best-effort: the file may be read-only or live
/// on a filesystem without extended attribute support, and a failure to
/// cache must not discard a digest that was computed successfully.
///
/// NOTE(review): a cached digest is trusted as-is; nothing here detects a
/// file that was modified after the attribute was written.
///
/// # Errors
///
/// Returns the underlying I/O error when the file cannot be opened or read
/// for hashing.
fn load_or_calculate_hash(path: impl AsRef<Path>) -> Result<[u8; 32], std::io::Error> {
    let path = path.as_ref();

    if let Ok(Some(val)) = xattr::get(path, "user.checksum.sha256") {
        if let Ok(Ok(hash)) = String::from_utf8(val).as_deref().map(<[u8; 32]>::from_hex) {
            return Ok(hash);
        }
    }

    let hash = hash_file(path)?;

    // The attribute is only a cache, so a failed write is deliberately ignored:
    // the caller still gets the correct digest and the next lookup will simply
    // hash the file again.
    let _ = xattr::set(path, "user.checksum.sha256", hex::encode(hash).as_bytes());

    Ok(hash)
}
/// Computes the SHA-256 digest of the file at `path`.
///
/// The file is streamed through the hasher in 8 KiB chunks, so it is never
/// held in memory as a whole.
///
/// # Errors
///
/// Returns the I/O error raised while opening or reading the file.
fn hash_file(path: impl AsRef<Path>) -> Result<[u8; 32], std::io::Error> {
    let mut reader = File::open(path)?;
    let mut hasher = Sha256::new();
    let mut chunk = [0u8; 8192];

    loop {
        match reader.read(&mut chunk)? {
            // A zero-length read signals end of file.
            0 => return Ok(hasher.finalize().0),
            len => hasher.update(&chunk[..len]),
        }
    }
}
#[derive(Debug, Error)]
@ -85,4 +160,14 @@ pub enum FileError {
path: PathBuf,
error: std::io::Error,
},
#[error("Failed to hash: {}", path.display())]
Hash {
path: PathBuf,
error: std::io::Error,
},
#[error("Failed to store hash for: {}", path.display())]
StoreHash {
path: PathBuf,
error: std::io::Error,
},
}

View file

@ -5,10 +5,10 @@ use clap::builder::styling::{AnsiColor, Effects};
use clap::builder::Styles;
use clap::{Parser, Subcommand};
use main_error::MainResult;
use notify_debouncer_full::notify::event::{AccessKind, AccessMode};
use notify_debouncer_full::notify::event::{AccessKind, AccessMode, ModifyKind, RenameMode};
use notify_debouncer_full::notify::{EventKind, RecursiveMode};
use notify_debouncer_full::{new_debouncer, DebounceEventResult};
use std::fs::{copy, create_dir_all, remove_file, rename};
use std::fs::{copy, create_dir_all, read_dir, remove_file, rename};
use std::io::ErrorKind;
use std::os::unix::fs::symlink;
use std::path::{Path, PathBuf};
@ -63,7 +63,7 @@ fn main() -> MainResult {
match args.command {
Commands::File { path } => {
let file = FileInfo::load(path)?;
handle_file(&file, &config.rule);
handle_file(&file, &config.rule, false);
}
Commands::Watch { path, recursive } => {
let path = normalize_path(path);
@ -87,7 +87,12 @@ fn main() -> MainResult {
}
})?;
for res in rx {
handle_watch_event(res, &rules, symlink.as_deref());
handle_watch_event(
res,
&rules,
symlink.as_deref(),
config.watch.remove_duplicates,
);
}
}
}
@ -95,25 +100,56 @@ fn main() -> MainResult {
Ok(())
}
fn handle_watch_event(result: DebounceEventResult, rules: &[Rule], link_target: Option<&str>) {
match result {
Ok(events) => {
for event in events {
if event.kind == EventKind::Access(AccessKind::Close(AccessMode::Write)) {
for path in &event.paths {
debug!("write event for {}", path.display());
fn handle_watch_event(
result: DebounceEventResult,
rules: &[Rule],
link_target: Option<&str>,
remove_duplicates: bool,
) {
let handle_path = |path: &Path| {
// give originfox time to set xattr
sleep(Duration::from_millis(200));
match FileInfo::load(path) {
Ok(file) => {
maybe_link(handle_file(&file, rules).as_deref(), link_target)
if is_part(path) {
debug!("skipping part file");
return;
}
match FileInfo::load(path) {
Ok(file) => maybe_link(
handle_file(&file, rules, remove_duplicates).as_deref(),
link_target,
),
Err(error) => {
error!(%error, "failed to load file info");
}
}
};
match result {
Ok(events) => {
for event in events {
match event.kind {
EventKind::Access(AccessKind::Close(AccessMode::Write)) => {
for path in &event.paths {
debug!("write event for {}", path.display());
handle_path(path);
}
}
EventKind::Modify(ModifyKind::Name(RenameMode::Both)) => {
if event.paths.len() == 2 {
let from = &event.paths[0];
let to = &event.paths[1];
debug!("rename event for {} -> {}", from.display(), to.display());
if is_part(from) && !is_part(to) {
handle_path(to);
}
} else {
error!("Invalid rename event");
}
}
_ => {}
}
}
}
Err(errors) => {
@ -124,7 +160,11 @@ fn handle_watch_event(result: DebounceEventResult, rules: &[Rule], link_target:
}
}
fn maybe_link(source: Option<&str>, target: Option<&str>) {
/// Returns `true` when the extension of `path` is exactly `part`.
///
/// The comparison is case-sensitive, and a path without an extension
/// (including a dot-file such as `.part`) yields `false`.
fn is_part(path: &Path) -> bool {
    path.extension().map_or(false, |ext| ext == "part")
}
fn maybe_link(source: Option<&Path>, target: Option<&str>) {
if let (Some(source), Some(target)) = (source, target) {
if Path::new(target).exists() {
if let Err(error) = remove_file(target) {
@ -134,7 +174,7 @@ fn maybe_link(source: Option<&str>, target: Option<&str>) {
}
match symlink(source, target) {
Ok(()) => {
info!(to = target, from = source, "created symlink");
info!(to = target, from = %source.display(), "created symlink");
}
Err(error) => {
error!(%error, "failed to link target");
@ -154,10 +194,22 @@ fn match_file(file: &FileInfo, rules: &[Rule]) -> Option<RuleMatch> {
}
#[instrument(skip_all, fields(file = file.path))]
fn handle_file(file: &FileInfo, rules: &[Rule]) -> Option<String> {
fn handle_file(file: &FileInfo, rules: &[Rule], remove_duplicates: bool) -> Option<PathBuf> {
let Some(result) = match_file(file, rules) else {
info!("no matches");
return None;
info!(url = file.url, "removing duplicate");
if remove_duplicates {
let parent = Path::new(&file.path).parent().unwrap();
if let Some(duplicate) = has_duplicate(file, parent) {
info!(url = file.url, diplicate = %duplicate.display(), "removing duplicate");
if let Err(error) = remove_file(&file.path) {
error!(%error, "failed to remove duplicate");
}
return Some(duplicate);
}
}
return Some(file.path.clone().into());
};
let parent = result.target.as_deref().unwrap_or_else(|| file.parent());
@ -168,12 +220,22 @@ fn handle_file(file: &FileInfo, rules: &[Rule]) -> Option<String> {
return None;
}
if remove_duplicates {
if let Some(duplicate) = has_duplicate(file, parent) {
info!(url = file.url, diplicate = %duplicate.display(), "removing duplicate");
if let Err(error) = remove_file(&file.path) {
error!(%error, "failed to remove duplicate");
}
return Some(duplicate);
}
}
let target = format!("{parent}/{name}");
match cross_storage_move(&file.path, &target) {
Ok(()) => {
info!(target, "moved file");
Some(target)
Some(target.into())
}
Err(error) => {
info!(target, ?error, "failed to moved file");
@ -182,6 +244,37 @@ fn handle_file(file: &FileInfo, rules: &[Rule]) -> Option<String> {
}
}
fn has_duplicate(file: &FileInfo, dir: impl AsRef<Path>) -> Option<PathBuf> {
let dir = match read_dir(dir) {
Ok(dir) => dir,
Err(error) => {
error!(%error, "failed to list target directory");
return None;
}
};
for entry in dir.flatten() {
let path = entry.path();
if path.to_str() == Some(file.path.as_str()) {
continue;
}
if !path.is_file() {
continue;
}
match file.is_duplicate(&path) {
Ok(is_dup) => {
if is_dup {
return Some(path);
}
}
Err(error) => {
error!(%error, path = %path.display(), "failed to determine if a file is duplicate");
}
}
}
None
}
fn cross_storage_move(source: impl AsRef<Path>, target: impl AsRef<Path>) -> std::io::Result<()> {
let source = source.as_ref();
let target = target.as_ref();