perf: Faster cde rejection (#255)

* Use the tempfile crate instead of the tempdir crate (which is deprecated)

https://github.com/rust-lang-deprecated/tempdir?tab=readme-ov-file#deprecation-note

* perf: Add benchmark that measures the rejection speed of a large non-zip file

* perf: Speed up non-zip rejection by increasing END_WINDOW_SIZE

I tested several END_WINDOW_SIZEs across 2 machines:

Machine 1: macOS 15.0.1, aarch64 (apfs /tmp)
512:   test parse_large_non_zip  ... bench:  30,450,608 ns/iter (+/- 673,910)
4096:  test parse_large_non_zip  ... bench:   7,741,366 ns/iter (+/- 521,101)
8192:  test parse_large_non_zip  ... bench:   5,807,443 ns/iter (+/- 546,227)
16384: test parse_large_non_zip  ... bench:   4,794,314 ns/iter (+/- 419,114)
32768: test parse_large_non_zip  ... bench:   4,262,897 ns/iter (+/- 397,582)
65536: test parse_large_non_zip  ... bench:   4,060,847 ns/iter (+/- 280,964)

Machine 2: Debian testing, x86_64 (tmpfs /tmp)
512:   test parse_large_non_zip  ... bench:  65,132,581 ns/iter (+/- 7,429,976)
4096:  test parse_large_non_zip  ... bench:  14,109,503 ns/iter (+/- 2,892,086)
8192:  test parse_large_non_zip  ... bench:   9,942,500 ns/iter (+/- 1,886,063)
16384: test parse_large_non_zip  ... bench:   8,205,851 ns/iter (+/- 2,902,041)
32768: test parse_large_non_zip  ... bench:   7,012,011 ns/iter (+/- 2,222,879)
65536: test parse_large_non_zip  ... bench:   6,577,275 ns/iter (+/- 881,546)

In both cases END_WINDOW_SIZE=8192 performed about 6x better than 512 and >8192
didn't make much of a difference on top of that.

* perf: Speed up non-zip rejection by limiting search for EOCDR.

I benchmarked several search sizes across 2 machines
(these benches are using an 8192 END_WINDOW_SIZE):

Machine 1: macOS 15.0.1, aarch64 (apfs /tmp)
whole file:   test parse_large_non_zip              ... bench:   5,773,801 ns/iter (+/- 411,277)
last 128k:    test parse_large_non_zip              ... bench:      54,402 ns/iter (+/- 4,126)
last 66,000:  test parse_large_non_zip              ... bench:      36,152 ns/iter (+/- 4,293)

Machine 2: Debian testing, x86_64 (tmpfs /tmp)
whole file:   test parse_large_non_zip              ... bench:   9,942,306 ns/iter (+/- 1,963,522)
last 128k:    test parse_large_non_zip              ... bench:      73,604 ns/iter (+/- 16,662)
last 66,000:  test parse_large_non_zip              ... bench:      41,349 ns/iter (+/- 16,812)

As you might expect these significantly increase the rejection speed for
large non-zip files.

66,000 was the number previously used by zip-rs. It was changed to zero in
7a55945743.

128K is what Info-Zip uses[1]. This seems like a reasonable (non-zero)
choice for compatibility reasons.

[1] Info-zip is extremely old and doesn't not have an official git repo to
    link to. However, an unofficial fork can be found here:
    bb0c4755d4/zipfile.c (L4073)

---------

Co-authored-by: Chris Hennick <4961925+Pr0methean@users.noreply.github.com>
This commit is contained in:
David Caldwell 2024-11-19 10:49:35 -08:00 committed by GitHub
parent b8257f81bd
commit 73143a0ad6
Signed by: DevComp
GPG key ID: B5690EEEBB952194
6 changed files with 34 additions and 11 deletions

View file

@ -62,7 +62,7 @@ walkdir = "2.5.0"
time = { workspace = true, features = ["formatting", "macros"] }
anyhow = "1"
clap = { version = "=4.4.18", features = ["derive"] }
tempdir = "0.3.7"
tempfile = "3"
[features]
aes-crypto = ["aes", "constant_time_eq", "hmac", "pbkdf2", "sha1", "rand", "zeroize"]

View file

@ -5,7 +5,7 @@ use std::io::{self, prelude::*, Cursor};
use bencher::Bencher;
use getrandom::getrandom;
use tempdir::TempDir;
use tempfile::TempDir;
use zip::write::SimpleFileOptions;
use zip::{result::ZipResult, CompressionMethod, ZipArchive, ZipWriter};
@ -102,7 +102,7 @@ fn parse_stream_archive(bench: &mut Bencher) {
let bytes = generate_random_archive(STREAM_ZIP_ENTRIES, STREAM_FILE_SIZE).unwrap();
/* Write to a temporary file path to incur some filesystem overhead from repeated reads */
let dir = TempDir::new("stream-bench").unwrap();
let dir = TempDir::with_prefix("stream-bench").unwrap();
let out = dir.path().join("bench-out.zip");
fs::write(&out, &bytes).unwrap();
@ -116,11 +116,27 @@ fn parse_stream_archive(bench: &mut Bencher) {
bench.bytes = bytes.len() as u64;
}
fn parse_large_non_zip(bench: &mut Bencher) {
const FILE_SIZE: usize = 17_000_000;
// Create a large file that doesn't have a zip header (generating random data _might_ make a zip magic
// number somewhere which is _not_ what we're trying to test).
let dir = TempDir::with_prefix("large-non-zip-bench").unwrap();
let file = dir.path().join("zeros");
let buf = vec![0u8; FILE_SIZE];
fs::write(&file, &buf).unwrap();
bench.iter(|| {
assert!(zip::ZipArchive::new(std::fs::File::open(&file).unwrap()).is_err());
})
}
benchmark_group!(
benches,
read_metadata,
parse_archive_with_comment,
parse_zip64_archive_with_comment,
parse_stream_archive,
parse_large_non_zip,
);
benchmark_main!(benches);

View file

@ -1777,7 +1777,7 @@ mod test {
use crate::CompressionMethod::Stored;
use crate::{ZipArchive, ZipWriter};
use std::io::{Cursor, Read, Write};
use tempdir::TempDir;
use tempfile::TempDir;
#[test]
fn invalid_offset() {
@ -1979,7 +1979,7 @@ mod test {
v.extend_from_slice(include_bytes!("../tests/data/symlink.zip"));
let mut reader = ZipArchive::new(Cursor::new(v)).unwrap();
assert!(reader.by_index(0).unwrap().is_symlink());
let tempdir = TempDir::new("test_is_symlink")?;
let tempdir = TempDir::with_prefix("test_is_symlink")?;
reader.extract(&tempdir).unwrap();
assert!(tempdir.path().join("bar").is_symlink());
Ok(())

View file

@ -353,9 +353,16 @@ impl Zip32CentralDirectoryEnd {
return Err(ZipError::InvalidArchive("Invalid zip header"));
}
let search_lower_bound = 0;
// The End Of Central Directory Record should be the last thing in
// the file and so searching the last 65557 bytes of the file should
// be enough. However, not all zips are well-formed and other
// programs may consume zips with extra junk at the end without
// error, so we go back 128K to be compatible with them. 128K is
// arbitrary, but it matches what Info-Zip does.
const EOCDR_SEARCH_SIZE: u64 = 128 * 1024;
let search_lower_bound = file_length.saturating_sub(EOCDR_SEARCH_SIZE);
const END_WINDOW_SIZE: usize = 512;
const END_WINDOW_SIZE: usize = 8192;
/* TODO: use static_assertions!() */
debug_assert!(END_WINDOW_SIZE > mem::size_of::<Magic>());

View file

@ -2,13 +2,13 @@
#[cfg(all(unix, feature = "_deflate-any"))]
fn extract_should_respect_links() {
use std::{fs, io, path::PathBuf, str::FromStr};
use tempdir::TempDir;
use tempfile::TempDir;
use zip::ZipArchive;
let mut v = Vec::new();
v.extend_from_slice(include_bytes!("data/pandoc_soft_links.zip"));
let mut archive = ZipArchive::new(io::Cursor::new(v)).expect("couldn't open test zip file");
let temp_dir = TempDir::new("pandoc_soft_links").unwrap();
let temp_dir = TempDir::with_prefix("pandoc_soft_links").unwrap();
archive.extract(&temp_dir).unwrap();
let symlink_path = temp_dir.path().join("pandoc-3.2-arm64/bin/pandoc-lua");

View file

@ -2,11 +2,11 @@
#[test]
fn repro_old423() -> zip::result::ZipResult<()> {
use std::io;
use tempdir::TempDir;
use tempfile::TempDir;
use zip::ZipArchive;
let mut v = Vec::new();
v.extend_from_slice(include_bytes!("data/lin-ub_iwd-v11.zip"));
let mut archive = ZipArchive::new(io::Cursor::new(v)).expect("couldn't open test zip file");
archive.extract(TempDir::new("repro_old423")?)
archive.extract(TempDir::with_prefix("repro_old423")?)
}