tar(.gz) support

This commit is contained in:
Robin Appelman 2024-11-04 16:31:28 +01:00
commit 7a524ae1d4
6 changed files with 287 additions and 56 deletions

88
Cargo.lock generated
View file

@ -402,6 +402,28 @@ version = "1.0.1"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "5443807d6dff69373d433ab9ef5378ad8df50ca6298caf15de6e52e24aaf54d5"
[[package]]
name = "errno"
version = "0.3.9"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "534c5cf6194dfab3db3242765c03bbe257cf92f22b38f6bc0c58d59108a820ba"
dependencies = [
"libc",
"windows-sys 0.52.0",
]
[[package]]
name = "filetime"
version = "0.2.25"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "35c0522e981e68cbfa8c3f978441a5f34b30b96e146b33cd3359176b50fe8586"
dependencies = [
"cfg-if",
"libc",
"libredox",
"windows-sys 0.59.0",
]
[[package]]
name = "flate2"
version = "1.0.31"
@ -540,9 +562,26 @@ dependencies = [
[[package]]
name = "libc"
version = "0.2.155"
version = "0.2.159"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "97b3888a4aecf77e811145cadf6eef5901f4782c53886191b2f693f24761847c"
checksum = "561d97a539a36e26a9a5fad1ea11a3039a67714694aaa379433e580854bc3dc5"
[[package]]
name = "libredox"
version = "0.1.3"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "c0ff37bd590ca25063e35af745c343cb7a0271906fb7b37e4813e8f79f00268d"
dependencies = [
"bitflags 2.6.0",
"libc",
"redox_syscall",
]
[[package]]
name = "linux-raw-sys"
version = "0.4.14"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "78b3ae25bc7c8c38cec158d1f2757ee79e9b3740fbc7ccf0e59e4b08d793fa89"
[[package]]
name = "lock_api"
@ -585,6 +624,7 @@ dependencies = [
"regex",
"serde",
"serde_json",
"tar",
"thiserror",
"tikv-jemallocator",
"time",
@ -903,6 +943,19 @@ version = "0.8.4"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "7a66a03ae7c801facd77a29370b4faec201768915ac14a721ba36f20bc9c209b"
[[package]]
name = "rustix"
version = "0.38.37"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "8acb788b847c24f28525660c4d7758620a7210875711f79e7f663cc152726811"
dependencies = [
"bitflags 2.6.0",
"errno",
"libc",
"linux-raw-sys",
"windows-sys 0.52.0",
]
[[package]]
name = "rustversion"
version = "1.0.17"
@ -1077,6 +1130,17 @@ dependencies = [
"unicode-ident",
]
[[package]]
name = "tar"
version = "0.4.42"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "4ff6c40d3aedb5e06b57c6f669ad17ab063dd1e63d977c6a88e7f4dfa4f04020"
dependencies = [
"filetime",
"libc",
"xattr",
]
[[package]]
name = "termcolor"
version = "1.4.1"
@ -1269,6 +1333,15 @@ dependencies = [
"windows-targets 0.52.6",
]
[[package]]
name = "windows-sys"
version = "0.59.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "1e38bc4d79ed67fd075bcc251a1c39b32a1776bbe92e5bef1f0bf1f8c531853b"
dependencies = [
"windows-targets 0.52.6",
]
[[package]]
name = "windows-targets"
version = "0.48.5"
@ -1390,6 +1463,17 @@ version = "0.52.6"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "589f6da84c646204747d1270a2a5661ea66ed1cced2631d546fdfb155959f9ec"
[[package]]
name = "xattr"
version = "1.3.1"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "8da84f1a25939b27f6820d92aed108f83ff920fdf11a7b19366c27c4cda81d4f"
dependencies = [
"libc",
"linux-raw-sys",
"rustix",
]
[[package]]
name = "zerocopy"
version = "0.7.35"

View file

@ -2,7 +2,7 @@
name = "logsmash"
version = "0.1.5"
edition = "2021"
rust-version = "1.74.0"
rust-version = "1.75.0"
license = "GPL-3"
[dependencies]
@ -25,6 +25,7 @@ ahash = "0.8.11"
base64 = "0.21.7"
derive_more = { version = "1.0.0-beta.6", features = ["from"] }
rayon = "1.10.0"
tar = "0.4.42"
[target.'cfg(not(target_os = "windows"))'.dependencies]
tikv-jemallocator = "0.6.0"

View file

@ -1,3 +1,4 @@
use std::string::FromUtf8Error;
use thiserror::Error;
use zip::result::ZipError;
@ -25,4 +26,6 @@ pub enum ReadError {
MultipleFiles,
#[error("archive contains no files")]
NoFiles,
#[error("log file contained non-utf8 characters: {0:#}")]
Utf8(#[from] FromUtf8Error),
}

View file

@ -1,53 +0,0 @@
use crate::error::ReadError;
use flate2::read::GzDecoder;
use std::fs::File;
use std::io::Read;
use zip::ZipArchive;
/// A log file whose complete text is held in memory.
pub struct LogFile {
    content: String,
}

impl LogFile {
    /// Reads the log stored at `path` into memory.
    ///
    /// The container format is picked from the file name:
    /// * `.zip` – the archive must hold exactly one entry whose name does not
    ///   start with `__MACOSX`; that entry is read as text,
    /// * `.gz` – the file is decompressed with gzip,
    /// * anything else – the file is read as plain text.
    ///
    /// # Errors
    ///
    /// Returns [`ReadError::MultipleFiles`] or [`ReadError::NoFiles`] when a zip
    /// archive does not contain exactly one candidate entry, and propagates any
    /// error raised while opening, decoding or reading the file.
    pub fn open(path: &str) -> Result<LogFile, ReadError> {
        let mut file = File::open(path)?;

        let content = if path.ends_with(".zip") {
            let mut archive = ZipArchive::new(file)?;
            // Keep only the indices so the archive is no longer borrowed by the names.
            let indices: Vec<usize> = archive
                .file_names()
                .enumerate()
                .filter(|(_, name)| !name.starts_with("__MACOSX"))
                .map(|(index, _)| index)
                .collect();
            let index = match indices.as_slice() {
                [] => return Err(ReadError::NoFiles),
                [only] => *only,
                _ => return Err(ReadError::MultipleFiles),
            };

            let mut entry = archive.by_index(index)?;
            let mut text = String::with_capacity(entry.size() as usize);
            entry.read_to_string(&mut text)?;
            text
        } else if path.ends_with(".gz") {
            let mut text = String::new();
            GzDecoder::new(file).read_to_string(&mut text)?;
            text
        } else {
            let mut text = String::new();
            file.read_to_string(&mut text)?;
            text
        };

        Ok(LogFile { content })
    }

    /// Iterates over the lines of the log.
    pub fn iter(&self) -> impl Iterator<Item = &str> + Send + '_ {
        self.content.lines()
    }

    /// Returns the line at `index` (zero-based), or `None` if the log has fewer lines.
    pub fn nth(&self, index: usize) -> Option<&str> {
        self.content.lines().nth(index)
    }
}

125
src/logfile/archive.rs Normal file
View file

@ -0,0 +1,125 @@
use crate::error::ReadError;
use itertools::Either;
use std::borrow::Cow;
use std::io::{Read, Seek};
use std::iter::empty;
use std::sync::Mutex;
/// A container of named entries (implemented in this file for zip and tar archives).
pub trait Archive {
/// The entry type yielded by [`Archive::entries`]; it may borrow from the
/// archive for the lifetime `'a`.
type Entry<'a>: ArchiveEntry
where
Self: 'a;
/// Returns an iterator over the entries of this archive.
fn entries(&mut self) -> impl Iterator<Item = Self::Entry<'_>>;
}
/// A single entry of an [`Archive`].
pub trait ArchiveEntry {
/// Returns the name of the entry, borrowed or owned.
fn name(&self) -> Cow<str>;
/// Consumes the entry and returns its content as raw bytes, or a
/// [`ReadError`] if the content could not be obtained.
fn extract(self) -> Result<Vec<u8>, ReadError>;
}
/// Upper bound, in bytes, on the capacity [`read_to_vec`] reserves up front (1 GiB).
const GB: usize = 1_073_741_824;

/// Reads `reader` to the end and returns everything it produced.
///
/// `size` is only a capacity hint. Callers pass the size recorded in the
/// archive metadata, which is not trustworthy, so the up-front reservation is
/// capped at [`GB`]: a bogus header can no longer force a huge allocation, and
/// small entries no longer reserve a gigabyte each. Content larger than the
/// hint is still read completely because the vector grows on demand.
///
/// # Errors
///
/// Propagates any I/O error raised while reading, converted into [`ReadError`].
fn read_to_vec<R: Read>(size: usize, mut reader: R) -> Result<Vec<u8>, ReadError> {
    // `min`, not `max`: the constant is a ceiling for the reservation, not a floor.
    let mut buff = Vec::with_capacity(size.min(GB));
    reader.read_to_end(&mut buff)?;
    Ok(buff)
}
/// A zip archive usable through the [`Archive`] trait.
///
/// The underlying `zip::ZipArchive` sits behind a [`Mutex`] so that it can be
/// read through a shared reference.
pub struct ZipArchive<R>(Mutex<zip::ZipArchive<R>>);

impl<R: Read + Seek> ZipArchive<R> {
    /// Opens the zip archive provided by `reader`.
    ///
    /// # Errors
    ///
    /// Propagates the error reported by `zip::ZipArchive::new`, converted into
    /// [`ReadError`].
    pub fn new(reader: R) -> Result<Self, ReadError> {
        let inner = zip::ZipArchive::new(reader)?;
        Ok(Self(Mutex::new(inner)))
    }
}
/// An entry of a [`ZipArchive`]; it stores only the entry's index and name plus
/// a reference to the archive it belongs to.
pub struct ZipEntry<'a, R> {
// index of the entry inside the archive
id: usize,
// name of the entry inside the archive
pub path: String,
// archive this entry belongs to
archive: &'a ZipArchive<R>,
}
impl<R: Read + Seek> ZipArchive<R> {
    /// Reads the full content of the entry with index `id`.
    ///
    /// # Errors
    ///
    /// Propagates the error from looking up the entry or from reading it.
    ///
    /// # Panics
    ///
    /// Panics if the internal mutex is poisoned.
    fn extract(&self, id: usize) -> Result<Vec<u8>, ReadError> {
        let mut guard = self.0.lock().unwrap();
        let entry = guard.by_index(id)?;
        // The size recorded in the archive is used as a capacity hint.
        let size_hint = entry.size() as usize;
        read_to_vec(size_hint, entry)
    }
}
impl<R: Read + Seek> ArchiveEntry for ZipEntry<'_, R> {
    /// Returns the entry's stored name without copying it.
    fn name(&self) -> Cow<str> {
        Cow::Borrowed(self.path.as_str())
    }

    /// Reads the entry's content from the archive it belongs to.
    fn extract(self) -> Result<Vec<u8>, ReadError> {
        let Self { id, archive, .. } = self;
        archive.extract(id)
    }
}
impl<R: Read + Seek> Archive for ZipArchive<R> {
type Entry<'a> = ZipEntry<'a, R> where R: 'a;
fn entries(&mut self) -> impl Iterator<Item = Self::Entry<'_>> {
let names = self
.0
.lock()
.unwrap()
.file_names()
.map(String::from)
.collect::<Vec<_>>();
names.into_iter().enumerate().map(|(id, path)| Self::Entry {
id,
path,
archive: self,
})
}
}
/// A tar archive usable through the [`Archive`] trait.
pub struct TarArchive<R: Read>(tar::Archive<R>);

impl<R: Read> TarArchive<R> {
    /// Wraps `reader` as a tar archive.
    ///
    /// This implementation always returns `Ok`; nothing is read here.
    pub fn new(reader: R) -> Result<Self, ReadError> {
        let inner = tar::Archive::new(reader);
        Ok(Self(inner))
    }
}
/// An entry of a [`TarArchive`] whose name and content have already been read
/// into memory.
pub struct TarEntry {
    name: String,
    content: Vec<u8>,
}

impl TarEntry {
    /// Reads the name and the full content of `entry`.
    ///
    /// If the entry's path cannot be obtained, the name `"invalid path"` is
    /// used instead.
    ///
    /// # Errors
    ///
    /// Propagates any error raised while reading the content.
    pub fn new<R: Read>(entry: tar::Entry<R>) -> Result<Self, ReadError> {
        // work around tar "in-order" requirement by just caching everything :(
        let name = entry
            .path()
            .map(|path| path.display().to_string())
            .unwrap_or_else(|_| String::from("invalid path"));
        let size_hint = entry.size() as usize;
        let content = read_to_vec(size_hint, entry)?;
        Ok(Self { name, content })
    }
}
impl ArchiveEntry for TarEntry {
    /// Returns the cached name without copying it.
    fn name(&self) -> Cow<str> {
        Cow::Borrowed(self.name.as_str())
    }

    /// Hands out the cached content; this never fails.
    fn extract(self) -> Result<Vec<u8>, ReadError> {
        let Self { content, .. } = self;
        Ok(content)
    }
}
impl<R: Read> Archive for TarArchive<R> {
    type Entry<'a> = TarEntry where R: 'a;

    /// Yields every entry that could be read, each converted with [`TarEntry::new`].
    ///
    /// Failures are dropped silently: if the entry listing cannot be started
    /// the iterator is empty, and any entry that fails to parse or to convert
    /// is skipped.
    fn entries(&mut self) -> impl Iterator<Item = Self::Entry<'_>> {
        if let Ok(raw_entries) = self.0.entries() {
            Either::Left(raw_entries.flatten().flat_map(TarEntry::new))
        } else {
            Either::Right(empty())
        }
    }
}

71
src/logfile/mod.rs Normal file
View file

@ -0,0 +1,71 @@
mod archive;
use crate::error::ReadError;
use crate::logfile::archive::{Archive, ArchiveEntry, TarArchive, ZipArchive};
use flate2::read::GzDecoder;
use std::fs::File;
use std::io::Read;
/// A log file whose complete text is held in memory.
pub struct LogFile {
    content: String,
}

impl LogFile {
    /// Reads the log stored at `path` into memory.
    ///
    /// The container format is picked from the file name:
    /// * `.zip` – the single log entry of the zip archive is used,
    /// * `.gz` – the file is decompressed with gzip and the remaining name
    ///   (without `.gz`) decides between tar and plain text,
    /// * `.tar` – the single log entry of the tar archive is used,
    /// * anything else – the file is read as plain text.
    ///
    /// # Errors
    ///
    /// Propagates any error raised while opening, decoding or reading the file,
    /// and the errors of `select_file` for archives.
    pub fn open(path: &str) -> Result<LogFile, ReadError> {
        let file = File::open(path)?;

        // Zip needs `Seek`, so it is handled before the streaming formats.
        if path.ends_with(".zip") {
            let mut archive = ZipArchive::new(file)?;
            let content = select_file(&mut archive)?;
            return Ok(LogFile { content });
        }

        if let Some(inner_name) = path.strip_suffix(".gz") {
            return Self::open_no_seek(inner_name, GzDecoder::new(file));
        }

        // `File` implements `Read` itself; no boxing is needed.
        Self::open_no_seek(path, file)
    }

    /// Reads a log from a stream that only supports sequential reads.
    ///
    /// `path` is used solely to decide whether `file` holds a tar archive
    /// (name ends in `.tar`) or plain text.
    fn open_no_seek<R: Read>(path: &str, mut file: R) -> Result<LogFile, ReadError> {
        if path.ends_with(".tar") {
            let mut archive = TarArchive::new(file)?;
            let content = select_file(&mut archive)?;
            Ok(LogFile { content })
        } else {
            let mut content = String::new();
            file.read_to_string(&mut content)?;
            Ok(LogFile { content })
        }
    }

    /// Iterates over the lines of the log.
    pub fn iter(&self) -> impl Iterator<Item = &str> + Send + '_ {
        self.content.lines()
    }

    /// Returns the line at `index` (zero-based), or `None` if the log has fewer lines.
    pub fn nth(&self, index: usize) -> Option<&str> {
        self.iter().nth(index)
    }
}
/// Picks the single log file out of `archive` and returns its content as text.
///
/// Entries whose name starts with `__MACOSX` are ignored, and so are entries
/// whose name ends with `/`: those are directories, which can never be the log
/// itself. Without this an archive created from a directory holding one log
/// would be rejected as containing multiple files.
///
/// # Errors
///
/// * [`ReadError::MultipleFiles`] if more than one candidate entry remains,
/// * [`ReadError::NoFiles`] if no candidate entry remains,
/// * the error from extracting the entry, or the UTF-8 conversion error if the
///   content is not valid UTF-8.
fn select_file<A: Archive>(archive: &mut A) -> Result<String, ReadError> {
    let mut candidates = archive
        .entries()
        .filter(|entry| {
            let name = entry.name();
            !name.starts_with("__MACOSX") && !name.ends_with('/')
        })
        .collect::<Vec<_>>();

    // todo: present a picker instead
    if candidates.len() > 1 {
        return Err(ReadError::MultipleFiles);
    }
    let Some(entry) = candidates.pop() else {
        return Err(ReadError::NoFiles);
    };

    let raw = entry.extract()?;
    Ok(String::from_utf8(raw)?)
}