perf: Faster cde rejection (#255)
* Use the tempfile crate instead of the tempdir crate (which is deprecated) https://github.com/rust-lang-deprecated/tempdir?tab=readme-ov-file#deprecation-note * perf: Add benchmark that measures the rejection speed of a large non-zip file * perf: Speed up non-zip rejection by increasing END_WINDOW_SIZE I tested several END_WINDOW_SIZEs across 2 machines: Machine 1: macOS 15.0.1, aarch64 (apfs /tmp) 512: test parse_large_non_zip ... bench: 30,450,608 ns/iter (+/- 673,910) 4096: test parse_large_non_zip ... bench: 7,741,366 ns/iter (+/- 521,101) 8192: test parse_large_non_zip ... bench: 5,807,443 ns/iter (+/- 546,227) 16384: test parse_large_non_zip ... bench: 4,794,314 ns/iter (+/- 419,114) 32768: test parse_large_non_zip ... bench: 4,262,897 ns/iter (+/- 397,582) 65536: test parse_large_non_zip ... bench: 4,060,847 ns/iter (+/- 280,964) Machine 2: Debian testing, x86_64 (tmpfs /tmp) 512: test parse_large_non_zip ... bench: 65,132,581 ns/iter (+/- 7,429,976) 4096: test parse_large_non_zip ... bench: 14,109,503 ns/iter (+/- 2,892,086) 8192: test parse_large_non_zip ... bench: 9,942,500 ns/iter (+/- 1,886,063) 16384: test parse_large_non_zip ... bench: 8,205,851 ns/iter (+/- 2,902,041) 32768: test parse_large_non_zip ... bench: 7,012,011 ns/iter (+/- 2,222,879) 65536: test parse_large_non_zip ... bench: 6,577,275 ns/iter (+/- 881,546) In both cases END_WINDOW_SIZE=8192 performed about 6x better than 512 and >8192 didn't make much of a difference on top of that. * perf: Speed up non-zip rejection by limiting search for EOCDR. I benchmarked several search sizes across 2 machines (these benches are using an 8192 END_WINDOW_SIZE): Machine 1: macOS 15.0.1, aarch64 (apfs /tmp) whole file: test parse_large_non_zip ... bench: 5,773,801 ns/iter (+/- 411,277) last 128k: test parse_large_non_zip ... bench: 54,402 ns/iter (+/- 4,126) last 66,000: test parse_large_non_zip ... bench: 36,152 ns/iter (+/- 4,293) Machine 2: Debian testing, x86_64 (tmpfs /tmp) whole file: test parse_large_non_zip ... bench: 9,942,306 ns/iter (+/- 1,963,522) last 128k: test parse_large_non_zip ... bench: 73,604 ns/iter (+/- 16,662) last 66,000: test parse_large_non_zip ... bench: 41,349 ns/iter (+/- 16,812) As you might expect these significantly increase the rejection speed for large non-zip files. 66,000 was the number previously used by zip-rs. It was changed to zero in7a55945743
. 128K is what Info-Zip uses[1]. This seems like a reasonable (non-zero) choice for compatibility reasons. [1] Info-zip is extremely old and doesn't not have an official git repo to link to. However, an unofficial fork can be found here:bb0c4755d4/zipfile.c (L4073)
--------- Co-authored-by: Chris Hennick <4961925+Pr0methean@users.noreply.github.com>
This commit is contained in:
parent
b8257f81bd
commit
73143a0ad6
6 changed files with 34 additions and 11 deletions
|
@ -62,7 +62,7 @@ walkdir = "2.5.0"
|
|||
time = { workspace = true, features = ["formatting", "macros"] }
|
||||
anyhow = "1"
|
||||
clap = { version = "=4.4.18", features = ["derive"] }
|
||||
tempdir = "0.3.7"
|
||||
tempfile = "3"
|
||||
|
||||
[features]
|
||||
aes-crypto = ["aes", "constant_time_eq", "hmac", "pbkdf2", "sha1", "rand", "zeroize"]
|
||||
|
|
|
@ -5,7 +5,7 @@ use std::io::{self, prelude::*, Cursor};
|
|||
|
||||
use bencher::Bencher;
|
||||
use getrandom::getrandom;
|
||||
use tempdir::TempDir;
|
||||
use tempfile::TempDir;
|
||||
use zip::write::SimpleFileOptions;
|
||||
use zip::{result::ZipResult, CompressionMethod, ZipArchive, ZipWriter};
|
||||
|
||||
|
@ -102,7 +102,7 @@ fn parse_stream_archive(bench: &mut Bencher) {
|
|||
let bytes = generate_random_archive(STREAM_ZIP_ENTRIES, STREAM_FILE_SIZE).unwrap();
|
||||
|
||||
/* Write to a temporary file path to incur some filesystem overhead from repeated reads */
|
||||
let dir = TempDir::new("stream-bench").unwrap();
|
||||
let dir = TempDir::with_prefix("stream-bench").unwrap();
|
||||
let out = dir.path().join("bench-out.zip");
|
||||
fs::write(&out, &bytes).unwrap();
|
||||
|
||||
|
@ -116,11 +116,27 @@ fn parse_stream_archive(bench: &mut Bencher) {
|
|||
bench.bytes = bytes.len() as u64;
|
||||
}
|
||||
|
||||
fn parse_large_non_zip(bench: &mut Bencher) {
|
||||
const FILE_SIZE: usize = 17_000_000;
|
||||
|
||||
// Create a large file that doesn't have a zip header (generating random data _might_ make a zip magic
|
||||
// number somewhere which is _not_ what we're trying to test).
|
||||
let dir = TempDir::with_prefix("large-non-zip-bench").unwrap();
|
||||
let file = dir.path().join("zeros");
|
||||
let buf = vec![0u8; FILE_SIZE];
|
||||
fs::write(&file, &buf).unwrap();
|
||||
|
||||
bench.iter(|| {
|
||||
assert!(zip::ZipArchive::new(std::fs::File::open(&file).unwrap()).is_err());
|
||||
})
|
||||
}
|
||||
|
||||
benchmark_group!(
|
||||
benches,
|
||||
read_metadata,
|
||||
parse_archive_with_comment,
|
||||
parse_zip64_archive_with_comment,
|
||||
parse_stream_archive,
|
||||
parse_large_non_zip,
|
||||
);
|
||||
benchmark_main!(benches);
|
||||
|
|
|
@ -1777,7 +1777,7 @@ mod test {
|
|||
use crate::CompressionMethod::Stored;
|
||||
use crate::{ZipArchive, ZipWriter};
|
||||
use std::io::{Cursor, Read, Write};
|
||||
use tempdir::TempDir;
|
||||
use tempfile::TempDir;
|
||||
|
||||
#[test]
|
||||
fn invalid_offset() {
|
||||
|
@ -1979,7 +1979,7 @@ mod test {
|
|||
v.extend_from_slice(include_bytes!("../tests/data/symlink.zip"));
|
||||
let mut reader = ZipArchive::new(Cursor::new(v)).unwrap();
|
||||
assert!(reader.by_index(0).unwrap().is_symlink());
|
||||
let tempdir = TempDir::new("test_is_symlink")?;
|
||||
let tempdir = TempDir::with_prefix("test_is_symlink")?;
|
||||
reader.extract(&tempdir).unwrap();
|
||||
assert!(tempdir.path().join("bar").is_symlink());
|
||||
Ok(())
|
||||
|
|
11
src/spec.rs
11
src/spec.rs
|
@ -353,9 +353,16 @@ impl Zip32CentralDirectoryEnd {
|
|||
return Err(ZipError::InvalidArchive("Invalid zip header"));
|
||||
}
|
||||
|
||||
let search_lower_bound = 0;
|
||||
// The End Of Central Directory Record should be the last thing in
|
||||
// the file and so searching the last 65557 bytes of the file should
|
||||
// be enough. However, not all zips are well-formed and other
|
||||
// programs may consume zips with extra junk at the end without
|
||||
// error, so we go back 128K to be compatible with them. 128K is
|
||||
// arbitrary, but it matches what Info-Zip does.
|
||||
const EOCDR_SEARCH_SIZE: u64 = 128 * 1024;
|
||||
let search_lower_bound = file_length.saturating_sub(EOCDR_SEARCH_SIZE);
|
||||
|
||||
const END_WINDOW_SIZE: usize = 512;
|
||||
const END_WINDOW_SIZE: usize = 8192;
|
||||
/* TODO: use static_assertions!() */
|
||||
debug_assert!(END_WINDOW_SIZE > mem::size_of::<Magic>());
|
||||
|
||||
|
|
|
@ -2,13 +2,13 @@
|
|||
#[cfg(all(unix, feature = "_deflate-any"))]
|
||||
fn extract_should_respect_links() {
|
||||
use std::{fs, io, path::PathBuf, str::FromStr};
|
||||
use tempdir::TempDir;
|
||||
use tempfile::TempDir;
|
||||
use zip::ZipArchive;
|
||||
|
||||
let mut v = Vec::new();
|
||||
v.extend_from_slice(include_bytes!("data/pandoc_soft_links.zip"));
|
||||
let mut archive = ZipArchive::new(io::Cursor::new(v)).expect("couldn't open test zip file");
|
||||
let temp_dir = TempDir::new("pandoc_soft_links").unwrap();
|
||||
let temp_dir = TempDir::with_prefix("pandoc_soft_links").unwrap();
|
||||
archive.extract(&temp_dir).unwrap();
|
||||
|
||||
let symlink_path = temp_dir.path().join("pandoc-3.2-arm64/bin/pandoc-lua");
|
||||
|
|
|
@ -2,11 +2,11 @@
|
|||
#[test]
|
||||
fn repro_old423() -> zip::result::ZipResult<()> {
|
||||
use std::io;
|
||||
use tempdir::TempDir;
|
||||
use tempfile::TempDir;
|
||||
use zip::ZipArchive;
|
||||
|
||||
let mut v = Vec::new();
|
||||
v.extend_from_slice(include_bytes!("data/lin-ub_iwd-v11.zip"));
|
||||
let mut archive = ZipArchive::new(io::Cursor::new(v)).expect("couldn't open test zip file");
|
||||
archive.extract(TempDir::new("repro_old423")?)
|
||||
archive.extract(TempDir::with_prefix("repro_old423")?)
|
||||
}
|
||||
|
|
Loading…
Add table
Reference in a new issue