* Use the tempfile crate instead of the tempdir crate (which is deprecated) https://github.com/rust-lang-deprecated/tempdir?tab=readme-ov-file#deprecation-note * perf: Add benchmark that measures the rejection speed of a large non-zip file * perf: Speed up non-zip rejection by increasing END_WINDOW_SIZE I tested several END_WINDOW_SIZEs across 2 machines: Machine 1: macOS 15.0.1, aarch64 (apfs /tmp) 512: test parse_large_non_zip ... bench: 30,450,608 ns/iter (+/- 673,910) 4096: test parse_large_non_zip ... bench: 7,741,366 ns/iter (+/- 521,101) 8192: test parse_large_non_zip ... bench: 5,807,443 ns/iter (+/- 546,227) 16384: test parse_large_non_zip ... bench: 4,794,314 ns/iter (+/- 419,114) 32768: test parse_large_non_zip ... bench: 4,262,897 ns/iter (+/- 397,582) 65536: test parse_large_non_zip ... bench: 4,060,847 ns/iter (+/- 280,964) Machine 2: Debian testing, x86_64 (tmpfs /tmp) 512: test parse_large_non_zip ... bench: 65,132,581 ns/iter (+/- 7,429,976) 4096: test parse_large_non_zip ... bench: 14,109,503 ns/iter (+/- 2,892,086) 8192: test parse_large_non_zip ... bench: 9,942,500 ns/iter (+/- 1,886,063) 16384: test parse_large_non_zip ... bench: 8,205,851 ns/iter (+/- 2,902,041) 32768: test parse_large_non_zip ... bench: 7,012,011 ns/iter (+/- 2,222,879) 65536: test parse_large_non_zip ... bench: 6,577,275 ns/iter (+/- 881,546) In both cases END_WINDOW_SIZE=8192 performed about 6x better than 512 and >8192 didn't make much of a difference on top of that. * perf: Speed up non-zip rejection by limiting search for EOCDR. I benchmarked several search sizes across 2 machines (these benches are using an 8192 END_WINDOW_SIZE): Machine 1: macOS 15.0.1, aarch64 (apfs /tmp) whole file: test parse_large_non_zip ... bench: 5,773,801 ns/iter (+/- 411,277) last 128k: test parse_large_non_zip ... bench: 54,402 ns/iter (+/- 4,126) last 66,000: test parse_large_non_zip ... 
bench: 36,152 ns/iter (+/- 4,293) Machine 2: Debian testing, x86_64 (tmpfs /tmp) whole file: test parse_large_non_zip ... bench: 9,942,306 ns/iter (+/- 1,963,522) last 128k: test parse_large_non_zip ... bench: 73,604 ns/iter (+/- 16,662) last 66,000: test parse_large_non_zip ... bench: 41,349 ns/iter (+/- 16,812) As you might expect, these changes significantly increase the rejection speed for large non-zip files. 66,000 was the number previously used by zip-rs. It was changed to zero in commit 7a55945743. 128K is what Info-Zip uses[1]. This seems like a reasonable (non-zero) choice for compatibility reasons. [1] Info-Zip is extremely old and does not have an official git repo to link to. However, an unofficial fork can be found here: bb0c4755d4/zipfile.c (L4073)
--------- Co-authored-by: Chris Hennick <4961925+Pr0methean@users.noreply.github.com>
142 lines
4.4 KiB
Rust
142 lines
4.4 KiB
Rust
use bencher::{benchmark_group, benchmark_main};
|
|
|
|
use std::fs;
|
|
use std::io::{self, prelude::*, Cursor};
|
|
|
|
use bencher::Bencher;
|
|
use getrandom::getrandom;
|
|
use tempfile::TempDir;
|
|
use zip::write::SimpleFileOptions;
|
|
use zip::{result::ZipResult, CompressionMethod, ZipArchive, ZipWriter};
|
|
|
|
const FILE_COUNT: usize = 15_000;
|
|
const FILE_SIZE: usize = 1024;
|
|
|
|
fn generate_random_archive(count_files: usize, file_size: usize) -> ZipResult<Vec<u8>> {
|
|
let data = Vec::new();
|
|
let mut writer = ZipWriter::new(Cursor::new(data));
|
|
let options = SimpleFileOptions::default().compression_method(CompressionMethod::Stored);
|
|
|
|
let mut bytes = vec![0u8; file_size];
|
|
|
|
for i in 0..count_files {
|
|
let name = format!("file_deadbeefdeadbeefdeadbeefdeadbeefdeadbeefdeadbeef_{i}.dat");
|
|
writer.start_file(name, options)?;
|
|
getrandom(&mut bytes).map_err(io::Error::from)?;
|
|
writer.write_all(&bytes)?;
|
|
}
|
|
|
|
Ok(writer.finish()?.into_inner())
|
|
}
|
|
|
|
fn read_metadata(bench: &mut Bencher) {
|
|
let bytes = generate_random_archive(FILE_COUNT, FILE_SIZE).unwrap();
|
|
|
|
bench.iter(|| {
|
|
let archive = ZipArchive::new(Cursor::new(bytes.as_slice())).unwrap();
|
|
archive.len()
|
|
});
|
|
bench.bytes = bytes.len() as u64;
|
|
}
|
|
|
|
const COMMENT_SIZE: usize = 50_000;
|
|
|
|
fn generate_zip32_archive_with_random_comment(comment_length: usize) -> ZipResult<Vec<u8>> {
|
|
let data = Vec::new();
|
|
let mut writer = ZipWriter::new(Cursor::new(data));
|
|
let options = SimpleFileOptions::default().compression_method(CompressionMethod::Stored);
|
|
|
|
let mut bytes = vec![0u8; comment_length];
|
|
getrandom(&mut bytes).unwrap();
|
|
writer.set_raw_comment(bytes.into_boxed_slice());
|
|
|
|
writer.start_file("asdf.txt", options)?;
|
|
writer.write_all(b"asdf")?;
|
|
|
|
Ok(writer.finish()?.into_inner())
|
|
}
|
|
|
|
fn parse_archive_with_comment(bench: &mut Bencher) {
|
|
let bytes = generate_zip32_archive_with_random_comment(COMMENT_SIZE).unwrap();
|
|
|
|
bench.bench_n(1, |_| {
|
|
let archive = ZipArchive::new(Cursor::new(bytes.as_slice())).unwrap();
|
|
let _ = archive.comment().len();
|
|
});
|
|
bench.bytes = bytes.len() as u64;
|
|
}
|
|
|
|
const COMMENT_SIZE_64: usize = 500_000;
|
|
|
|
fn generate_zip64_archive_with_random_comment(comment_length: usize) -> ZipResult<Vec<u8>> {
|
|
let data = Vec::new();
|
|
let mut writer = ZipWriter::new(Cursor::new(data));
|
|
let options = SimpleFileOptions::default()
|
|
.compression_method(CompressionMethod::Stored)
|
|
.large_file(true);
|
|
|
|
let mut bytes = vec![0u8; comment_length];
|
|
getrandom(&mut bytes).unwrap();
|
|
writer.set_raw_comment(bytes.into_boxed_slice());
|
|
|
|
writer.start_file("asdf.txt", options)?;
|
|
writer.write_all(b"asdf")?;
|
|
|
|
Ok(writer.finish()?.into_inner())
|
|
}
|
|
|
|
fn parse_zip64_archive_with_comment(bench: &mut Bencher) {
|
|
let bytes = generate_zip64_archive_with_random_comment(COMMENT_SIZE_64).unwrap();
|
|
|
|
bench.iter(|| {
|
|
let archive = ZipArchive::new(Cursor::new(bytes.as_slice())).unwrap();
|
|
archive.comment().len()
|
|
});
|
|
bench.bytes = bytes.len() as u64;
|
|
}
|
|
|
|
fn parse_stream_archive(bench: &mut Bencher) {
|
|
const STREAM_ZIP_ENTRIES: usize = 5;
|
|
const STREAM_FILE_SIZE: usize = 5;
|
|
|
|
let bytes = generate_random_archive(STREAM_ZIP_ENTRIES, STREAM_FILE_SIZE).unwrap();
|
|
|
|
/* Write to a temporary file path to incur some filesystem overhead from repeated reads */
|
|
let dir = TempDir::with_prefix("stream-bench").unwrap();
|
|
let out = dir.path().join("bench-out.zip");
|
|
fs::write(&out, &bytes).unwrap();
|
|
|
|
bench.iter(|| {
|
|
let mut f = fs::File::open(&out).unwrap();
|
|
while zip::read::read_zipfile_from_stream(&mut f)
|
|
.unwrap()
|
|
.is_some()
|
|
{}
|
|
});
|
|
bench.bytes = bytes.len() as u64;
|
|
}
|
|
|
|
fn parse_large_non_zip(bench: &mut Bencher) {
|
|
const FILE_SIZE: usize = 17_000_000;
|
|
|
|
// Create a large file that doesn't have a zip header (generating random data _might_ make a zip magic
|
|
// number somewhere which is _not_ what we're trying to test).
|
|
let dir = TempDir::with_prefix("large-non-zip-bench").unwrap();
|
|
let file = dir.path().join("zeros");
|
|
let buf = vec![0u8; FILE_SIZE];
|
|
fs::write(&file, &buf).unwrap();
|
|
|
|
bench.iter(|| {
|
|
assert!(zip::ZipArchive::new(std::fs::File::open(&file).unwrap()).is_err());
|
|
})
|
|
}
|
|
|
|
// Register every benchmark above under one group and generate the
// harness entry point (`bencher` supplies `main`).
benchmark_group!(
    benches,
    read_metadata,
    parse_archive_with_comment,
    parse_zip64_archive_with_comment,
    parse_stream_archive,
    parse_large_non_zip,
);
benchmark_main!(benches);
|