From 68f7f5d4521dc7fa7202a42766e2114f6cffaed8 Mon Sep 17 00:00:00 2001
From: Danny McClanahan <1305167+cosmicexplorer@users.noreply.github.com>
Date: Thu, 7 Sep 2023 01:06:34 -0400
Subject: [PATCH 1/5] add finish_into_readable()

---
 src/read.rs  | 49 ++++++++++++++++++++++++++++++++++++++++---------
 src/write.rs | 44 +++++++++++++++++++++++++++++++++++++++-----
 2 files changed, 79 insertions(+), 14 deletions(-)

diff --git a/src/read.rs b/src/read.rs
index 0a39faef..f94ebaee 100644
--- a/src/read.rs
+++ b/src/read.rs
@@ -51,7 +51,6 @@ pub(crate) mod zip_archive {
         pub(crate) names_map: super::HashMap<Box<str>, usize>,
         pub(super) offset: u64,
         pub(super) dir_start: u64,
-        pub(super) dir_end: u64,
     }
 
     /// ZIP archive reader
@@ -331,6 +330,39 @@ pub(crate) struct CentralDirectoryInfo {
     pub(crate) disk_with_central_directory: u32,
 }
 
+impl<R> ZipArchive<R> {
+    pub(crate) fn from_finalized_writer(
+        files: Vec<ZipFileData>,
+        comment: Vec<u8>,
+        reader: R,
+        central_start: u64,
+    ) -> ZipResult<Self> {
+        if files.is_empty() {
+            return Err(ZipError::InvalidArchive(
+                "attempt to finalize empty zip writer into readable",
+            ));
+        }
+        /* This is where the whole file starts. */
+        let initial_offset = files.first().unwrap().header_start;
+        let names_map: HashMap<Box<str>, usize> = files
+            .iter()
+            .enumerate()
+            .map(|(i, d)| (d.file_name.clone(), i))
+            .collect();
+        let shared = Arc::new(zip_archive::Shared {
+            files: files.into_boxed_slice(),
+            names_map,
+            offset: initial_offset,
+            dir_start: central_start,
+        });
+        Ok(Self {
+            reader,
+            shared,
+            comment: comment.into_boxed_slice().into(),
+        })
+    }
+}
+
 impl<R: Read + Seek> ZipArchive<R> {
     fn get_directory_info_zip32(
         footer: &spec::CentralDirectoryEnd,
@@ -482,11 +514,12 @@ impl<R: Read + Seek> ZipArchive<R> {
         result.and_then(|dir_info| {
             // If the parsed number of files is greater than the offset then
             // something fishy is going on and we shouldn't trust number_of_files.
-            let file_capacity = if dir_info.number_of_files > cde_start_pos as usize {
-                0
-            } else {
-                dir_info.number_of_files
-            };
+            let file_capacity =
+                if dir_info.number_of_files > dir_info.directory_start as usize {
+                    0
+                } else {
+                    dir_info.number_of_files
+                };
             let mut files = Vec::with_capacity(file_capacity);
             let mut names_map = HashMap::with_capacity(file_capacity);
             reader.seek(io::SeekFrom::Start(dir_info.directory_start))?;
@@ -495,7 +528,6 @@ impl<R: Read + Seek> ZipArchive<R> {
                 names_map.insert(file.file_name.clone(), files.len());
                 files.push(file);
             }
-            let dir_end = reader.seek(io::SeekFrom::Start(dir_info.directory_start))?;
             if dir_info.disk_number != dir_info.disk_with_central_directory {
                 unsupported_zip_error("Support for multi-disk files is not implemented")
             } else {
@@ -504,7 +536,6 @@ impl<R: Read + Seek> ZipArchive<R> {
                     names_map,
                     offset: dir_info.archive_offset,
                     dir_start: dir_info.directory_start,
-                    dir_end,
                 })
             }
         })
@@ -524,7 +555,7 @@ impl<R: Read + Seek> ZipArchive<R> {
         }
         let shared = ok_results
             .into_iter()
-            .max_by_key(|shared| shared.dir_end)
+            .max_by_key(|shared| shared.dir_start)
             .unwrap();
         reader.seek(io::SeekFrom::Start(shared.dir_start))?;
         Ok(shared)
diff --git a/src/write.rs b/src/write.rs
index 0051f253..35f9bf46 100644
--- a/src/write.rs
+++ b/src/write.rs
@@ -597,6 +597,39 @@ impl<A: Read + Write + Seek> ZipWriter<A> {
     ) -> ZipResult<()> {
         self.deep_copy_file(&path_to_string(src_path), &path_to_string(dest_path))
     }
+
+    /// Write the zip file into the backing stream, then produce a readable archive of that data.
+    ///
+    /// This method avoids parsing the central directory records at the end of the stream for
+    /// a slight performance improvement over running [`ZipArchive::new()`] on the output of
+    /// [`Self::finish()`].
+    ///
+    ///```
+    /// # fn main() -> Result<(), zip::result::ZipError> {
+    /// use std::io::{Cursor, prelude::*};
+    /// use zip::{ZipArchive, ZipWriter, write::SimpleFileOptions};
+    ///
+    /// let buf = Cursor::new(Vec::new());
+    /// let mut zip = ZipWriter::new(buf);
+    /// let options = SimpleFileOptions::default();
+    /// zip.start_file("a.txt", options)?;
+    /// zip.write_all(b"hello\n")?;
+    ///
+    /// let mut zip = zip.finish_into_readable()?;
+    /// let mut s: String = String::new();
+    /// zip.by_name("a.txt")?.read_to_string(&mut s)?;
+    /// assert_eq!(s, "hello\n");
+    /// # Ok(())
+    /// # }
+    ///```
+    pub fn finish_into_readable(mut self) -> ZipResult<ZipArchive<A>> {
+        let central_start = self.finalize()?;
+        let inner = mem::replace(&mut self.inner, Closed).unwrap();
+        let comment = mem::take(&mut self.comment);
+        let files = mem::take(&mut self.files);
+        let archive = ZipArchive::from_finalized_writer(files, comment, inner, central_start)?;
+        Ok(archive)
+    }
 }
 
 impl<W: Write + Seek> ZipWriter<W> {
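Note: as a sanity check on the docstring's claim, the sketch below (not part of the patch; the `build()` helper is hypothetical scaffolding) confirms that `finish_into_readable()` exposes the same entries as `finish()` followed by `ZipArchive::new()` — the former merely skips re-parsing the central directory:

```
use std::io::{Cursor, Write};
use zip::{result::ZipResult, write::SimpleFileOptions, ZipArchive, ZipWriter};

// Hypothetical helper: writes one entry into an in-memory zip.
fn build() -> ZipResult<ZipWriter<Cursor<Vec<u8>>>> {
    let mut zip = ZipWriter::new(Cursor::new(Vec::new()));
    zip.start_file("a.txt", SimpleFileOptions::default())?;
    zip.write_all(b"hello\n")?;
    Ok(zip)
}

fn main() -> ZipResult<()> {
    // Fast path: reuse the writer's in-memory metadata.
    let fast = build()?.finish_into_readable()?;
    // Slow path: re-parse the central directory from the finished stream.
    let slow = ZipArchive::new(build()?.finish()?)?;
    // Both views should list the same entries (sorted, since name order
    // is not guaranteed by the map-backed iterator).
    let mut a: Vec<&str> = fast.file_names().collect();
    let mut b: Vec<&str> = slow.file_names().collect();
    a.sort_unstable();
    b.sort_unstable();
    assert_eq!(a, b);
    Ok(())
}
```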
From aad5d988d66a33dcdd3b0d64ad2baa88386e6871 Mon Sep 17 00:00:00 2001
From: Danny McClanahan <1305167+cosmicexplorer@users.noreply.github.com>
Date: Thu, 7 Sep 2023 02:10:24 -0400
Subject: [PATCH 2/5] add ZipWriter::merge_archive() method

---
 src/read.rs  | 61 +++++++++++++++++++++++++++++++++++++++++++++++++++
 src/write.rs | 62 ++++++++++++++++++++++++++++++++++++++++++++++++++++
 2 files changed, 123 insertions(+)

diff --git a/src/read.rs b/src/read.rs
index 0a39faef..589baec2 100644
--- a/src/read.rs
+++ b/src/read.rs
@@ -332,6 +332,67 @@ pub(crate) struct CentralDirectoryInfo {
 }
 
 impl<R: Read + Seek> ZipArchive<R> {
+    pub(crate) fn merge_contents<W: Write + io::Seek>(
+        &mut self,
+        mut w: W,
+    ) -> ZipResult<Vec<ZipFileData>> {
+        let mut new_files = self.shared.files.clone();
+        if new_files.is_empty() {
+            return Ok(vec![]);
+        }
+        /* The first file header will probably start at the beginning of the file, but zip doesn't
+         * enforce that, and executable zips like PEX files will have a shebang line so will
+         * definitely be greater than 0.
+         *
+         * assert_eq!(0, new_files[0].header_start); // Avoid this.
+         */
+
+        let new_initial_header_start = w.stream_position()?;
+        /* Push back file header starts for all entries in the covered files. */
+        new_files.iter_mut().try_for_each(|f| {
+            /* This is probably the only really important thing to change. */
+            f.header_start = f.header_start.checked_add(new_initial_header_start).ok_or(
+                ZipError::InvalidArchive("new header start from merge would have been too large"),
+            )?;
+            /* This is only ever used internally to cache metadata lookups (it's not part of the
+             * zip spec), and 0 is the sentinel value. */
+            f.central_header_start = 0;
+            /* This is an atomic variable so it can be updated from another thread in the
+             * implementation (which is good!). */
+            if let Some(old_data_start) = f.data_start.take() {
+                let new_data_start = old_data_start.checked_add(new_initial_header_start).ok_or(
+                    ZipError::InvalidArchive("new data start from merge would have been too large"),
+                )?;
+                f.data_start.get_or_init(|| new_data_start);
+            }
+            Ok::<_, ZipError>(())
+        })?;
+
+        /* Rewind to the beginning of the file.
+         *
+         * NB: we *could* decide to start copying from new_files[0].header_start instead, which
+         * would avoid copying over e.g. any pex shebangs or other file contents that start before
+         * the first zip file entry. However, zip files actually shouldn't care about garbage data
+         * in *between* real entries, since the central directory header records the correct start
+         * location of each, and keeping track of that math is more complicated logic that will
+         * only rarely be used, since most zips that get merged together are likely to be produced
+         * specifically for that purpose (and therefore are unlikely to have a shebang or other
+         * preface). Finally, this preserves any data that might actually be useful.
+         */
+        self.reader.rewind()?;
+        /* Find the end of the file data. */
+        let length_to_read = self.shared.dir_start;
+        /* Produce a Read that reads bytes up until the start of the central directory header.
+         * This "as &mut dyn Read" trick is used elsewhere to avoid having to clone the underlying
+         * handle, which it really shouldn't need to anyway. */
+        let mut limited_raw = (&mut self.reader as &mut dyn Read).take(length_to_read);
+        /* Copy over file data from source archive directly. */
+        io::copy(&mut limited_raw, &mut w)?;
+
+        /* Return the files we've just written to the data stream. */
+        Ok(new_files.into_vec())
+    }
+
     fn get_directory_info_zip32(
         footer: &spec::CentralDirectoryEnd,
         cde_start_pos: u64,
diff --git a/src/write.rs b/src/write.rs
index 0051f253..db326a74 100644
--- a/src/write.rs
+++ b/src/write.rs
@@ -934,6 +934,68 @@ impl<W: Write + Seek> ZipWriter<W> {
         Ok(())
     }
 
+    /* TODO: link to/use Self::finish_into_readable() from https://github.com/zip-rs/zip/pull/400 in
+     * this docstring. */
+    /// Copy over the entire contents of another archive verbatim.
+    ///
+    /// This method extracts file metadata from the `source` archive, then simply performs a single
+    /// big [`io::copy()`](io::copy) to transfer all the actual file contents without any
+    /// decompression or decryption. This is more performant than the equivalent operation of
+    /// calling [`Self::raw_copy_file()`] for each entry from the `source` archive in sequence.
+    ///
+    ///```
+    /// # fn main() -> Result<(), zip::result::ZipError> {
+    /// use std::io::{Cursor, prelude::*};
+    /// use zip::{ZipArchive, ZipWriter, write::SimpleFileOptions};
+    ///
+    /// let buf = Cursor::new(Vec::new());
+    /// let mut zip = ZipWriter::new(buf);
+    /// zip.start_file("a.txt", SimpleFileOptions::default())?;
+    /// zip.write_all(b"hello\n")?;
+    /// let src = ZipArchive::new(zip.finish()?)?;
+    ///
+    /// let buf = Cursor::new(Vec::new());
+    /// let mut zip = ZipWriter::new(buf);
+    /// zip.start_file("b.txt", SimpleFileOptions::default())?;
+    /// zip.write_all(b"hey\n")?;
+    /// let src2 = ZipArchive::new(zip.finish()?)?;
+    ///
+    /// let buf = Cursor::new(Vec::new());
+    /// let mut zip = ZipWriter::new(buf);
+    /// zip.merge_archive(src)?;
+    /// zip.merge_archive(src2)?;
+    /// let mut result = ZipArchive::new(zip.finish()?)?;
+    ///
+    /// let mut s: String = String::new();
+    /// result.by_name("a.txt")?.read_to_string(&mut s)?;
+    /// assert_eq!(s, "hello\n");
+    /// s.clear();
+    /// result.by_name("b.txt")?.read_to_string(&mut s)?;
+    /// assert_eq!(s, "hey\n");
+    /// # Ok(())
+    /// # }
+    ///```
+    pub fn merge_archive<R>(&mut self, mut source: ZipArchive<R>) -> ZipResult<()>
+    where
+        R: Read + io::Seek,
+    {
+        self.finish_file()?;
+
+        /* Ensure we accept the file contents on faith (and avoid overwriting the data).
+         * See raw_copy_file_rename(). */
+        self.writing_to_file = true;
+        self.writing_raw = true;
+
+        let writer = self.inner.get_plain();
+        /* Get the file entries from the source archive. */
+        let new_files = source.merge_contents(writer)?;
+
+        /* These file entries are now ours! */
+        self.files.extend(new_files);
+
+        Ok(())
+    }
+
     fn normalize_options<T: FileOptionExtension>(options: &mut FileOptions<T>) {
         if options.permissions.is_none() {
             options.permissions = Some(0o644);
         }
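Note: the crux of `merge_contents()` above is the offset rebasing — every copied entry keeps its position relative to the start of the source archive, shifted by wherever the target writer's cursor sits when the merge begins. A toy, std-only model of that invariant (the `rebase()` function and the byte offsets are hypothetical illustrations, not code from the patch):

```
/// Toy model of the rebasing done by merge_contents(): each entry's
/// header_start is shifted by the target writer's current position, and
/// overflow is rejected just like the checked_add() calls in the patch.
fn rebase(header_starts: &[u64], new_initial_header_start: u64) -> Result<Vec<u64>, &'static str> {
    header_starts
        .iter()
        .map(|h| {
            h.checked_add(new_initial_header_start)
                .ok_or("new header start from merge would have been too large")
        })
        .collect()
}

fn main() {
    // Source entries at bytes 0 and 512; the target already holds 1024 bytes,
    // so the copied entries land at offsets 1024 and 1536.
    assert_eq!(rebase(&[0, 512], 1024), Ok(vec![1024, 1536]));
    // Overflow mirrors the ZipError::InvalidArchive case in the patch.
    assert!(rebase(&[u64::MAX], 1).is_err());
}
```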
From e42ff64449e43463bb0a670db765c226c4f22691 Mon Sep 17 00:00:00 2001
From: Danny McClanahan <1305167+cosmicexplorer@users.noreply.github.com>
Date: Thu, 7 Sep 2023 02:58:24 -0400
Subject: [PATCH 3/5] add merge_archive benchmarks

---
 Cargo.toml               |   4 ++
 benches/merge_archive.rs | 120 +++++++++++++++++++++++++++++++++++++++
 2 files changed, 124 insertions(+)
 create mode 100644 benches/merge_archive.rs

diff --git a/Cargo.toml b/Cargo.toml
index fbf4c7ae..b68b9555 100644
--- a/Cargo.toml
+++ b/Cargo.toml
@@ -87,3 +87,7 @@ harness = false
 [[bench]]
 name = "read_metadata"
 harness = false
+
+[[bench]]
+name = "merge_archive"
+harness = false
diff --git a/benches/merge_archive.rs b/benches/merge_archive.rs
new file mode 100644
index 00000000..c5cb26c5
--- /dev/null
+++ b/benches/merge_archive.rs
@@ -0,0 +1,120 @@
+use bencher::{benchmark_group, benchmark_main};
+
+use std::io::{Cursor, Read, Seek, Write};
+
+use bencher::Bencher;
+use getrandom::getrandom;
+use zip::{result::ZipResult, write::SimpleFileOptions, ZipArchive, ZipWriter};
+
+fn generate_random_archive(
+    num_entries: usize,
+    entry_size: usize,
+    options: SimpleFileOptions,
+) -> ZipResult<(usize, ZipArchive<Cursor<Vec<u8>>>)> {
+    let buf = Cursor::new(Vec::new());
+    let mut zip = ZipWriter::new(buf);
+
+    let mut bytes = vec![0u8; entry_size];
+    for i in 0..num_entries {
+        let name = format!("random{}.dat", i);
+        zip.start_file(name, options)?;
+        getrandom(&mut bytes).unwrap();
+        zip.write_all(&bytes)?;
+    }
+
+    let buf = zip.finish()?.into_inner();
+    let len = buf.len();
+
+    Ok((len, ZipArchive::new(Cursor::new(buf))?))
+}
+
+fn perform_merge<R: Read + Seek, W: Write + Seek>(
+    src: ZipArchive<R>,
+    mut target: ZipWriter<W>,
+) -> ZipResult<ZipWriter<W>> {
+    target.merge_archive(src)?;
+    Ok(target)
+}
+
+fn perform_raw_copy_file<R: Read + Seek, W: Write + Seek>(
+    mut src: ZipArchive<R>,
+    mut target: ZipWriter<W>,
+) -> ZipResult<ZipWriter<W>> {
+    for i in 0..src.len() {
+        let entry = src.by_index(i)?;
+        target.raw_copy_file(entry)?;
+    }
+    Ok(target)
+}
+
+const NUM_ENTRIES: usize = 100;
+const ENTRY_SIZE: usize = 1024;
+
+fn merge_archive_stored(bench: &mut Bencher) {
+    let options = SimpleFileOptions::default().compression_method(zip::CompressionMethod::Stored);
+    let (len, src) = generate_random_archive(NUM_ENTRIES, ENTRY_SIZE, options).unwrap();
+
+    bench.bytes = len as u64;
+
+    bench.iter(|| {
+        let buf = Cursor::new(Vec::new());
+        let zip = ZipWriter::new(buf);
+        let mut zip = perform_merge(src.clone(), zip).unwrap();
+        let buf = zip.finish().unwrap().into_inner();
+        assert_eq!(buf.len(), len);
+    });
+}
+
+fn merge_archive_compressed(bench: &mut Bencher) {
+    let options = SimpleFileOptions::default().compression_method(zip::CompressionMethod::Deflated);
+    let (len, src) = generate_random_archive(NUM_ENTRIES, ENTRY_SIZE, options).unwrap();
+
+    bench.bytes = len as u64;
+
+    bench.iter(|| {
+        let buf = Cursor::new(Vec::new());
+        let zip = ZipWriter::new(buf);
+        let mut zip = perform_merge(src.clone(), zip).unwrap();
+        let buf = zip.finish().unwrap().into_inner();
+        assert_eq!(buf.len(), len);
+    });
+}
+
+fn merge_archive_raw_copy_file_stored(bench: &mut Bencher) {
+    let options = SimpleFileOptions::default().compression_method(zip::CompressionMethod::Stored);
+    let (len, src) = generate_random_archive(NUM_ENTRIES, ENTRY_SIZE, options).unwrap();
+
+    bench.bytes = len as u64;
+
+    bench.iter(|| {
+        let buf = Cursor::new(Vec::new());
+        let zip = ZipWriter::new(buf);
+        let mut zip = perform_raw_copy_file(src.clone(), zip).unwrap();
+        let buf = zip.finish().unwrap().into_inner();
+        assert_eq!(buf.len(), len);
+    });
+}
+
+fn merge_archive_raw_copy_file_compressed(bench: &mut Bencher) {
+    let options = SimpleFileOptions::default().compression_method(zip::CompressionMethod::Deflated);
+    let (len, src) = generate_random_archive(NUM_ENTRIES, ENTRY_SIZE, options).unwrap();
+
+    bench.bytes = len as u64;
+
+    bench.iter(|| {
+        let buf = Cursor::new(Vec::new());
+        let zip = ZipWriter::new(buf);
+        let mut zip = perform_raw_copy_file(src.clone(), zip).unwrap();
+        let buf = zip.finish().unwrap().into_inner();
+        assert_eq!(buf.len(), len);
+    });
+}
+
+benchmark_group!(
+    benches,
+    merge_archive_stored,
+    merge_archive_compressed,
+    merge_archive_raw_copy_file_stored,
+    merge_archive_raw_copy_file_compressed,
+);
+benchmark_main!(benches);
From 3cfbdfca83cc3a3ca32845460b6f224c6ce272e2 Mon Sep 17 00:00:00 2001
From: Danny McClanahan <1305167+cosmicexplorer@users.noreply.github.com>
Date: Wed, 12 Jul 2023 00:01:23 -0400
Subject: [PATCH 4/5] use num_enum to clean up the System type

---
 Cargo.toml   |  1 +
 src/read.rs  |  4 ++--
 src/types.rs | 29 ++++++++++++-----------------
 3 files changed, 15 insertions(+), 19 deletions(-)

diff --git a/Cargo.toml b/Cargo.toml
index fbf4c7ae..719f3f78 100644
--- a/Cargo.toml
+++ b/Cargo.toml
@@ -30,6 +30,7 @@ constant_time_eq = { version = "0.3.0", optional = true }
 crc32fast = "1.4.0"
 flate2 = { version = "1.0.28", default-features = false, optional = true }
 hmac = { version = "0.12.1", optional = true, features = ["reset"] }
+num_enum = "0.6.1"
 pbkdf2 = { version = "0.12.2", optional = true }
 sha1 = { version = "0.10.6", optional = true }
 time = { workspace = true, optional = true, features = [
diff --git a/src/read.rs b/src/read.rs
index 0a39faef..e0615fca 100644
--- a/src/read.rs
+++ b/src/read.rs
@@ -812,7 +812,7 @@ fn central_header_to_zip_file_inner<R: Read>(
 
     // Construct the result
     let mut result = ZipFileData {
-        system: System::from_u8((version_made_by >> 8) as u8),
+        system: System::from((version_made_by >> 8) as u8),
         version_made_by: version_made_by as u8,
         encrypted,
         using_data_descriptor,
@@ -1178,7 +1178,7 @@ pub fn read_zipfile_from_stream<'a, R: Read>(reader: &'a mut R) -> ZipResult<Option<ZipFile<'a>>> {
 
     let mut result = ZipFileData {
-        system: System::from_u8((version_made_by >> 8) as u8),
+        system: System::from((version_made_by >> 8) as u8),
         version_made_by: version_made_by as u8,
         encrypted,
         using_data_descriptor,
diff --git a/src/types.rs b/src/types.rs
index b4079f6c..7ac8a971 100644
--- a/src/types.rs
+++ b/src/types.rs
@@ -1,5 +1,6 @@
 //! Types that specify what is contained in a ZIP.
 use path::{Component, Path, PathBuf};
+use num_enum::{FromPrimitive, IntoPrimitive};
 use std::path;
 use std::sync::{Arc, OnceLock};
 
@@ -49,25 +50,15 @@ use crate::result::DateTimeRangeError;
 #[cfg(feature = "time")]
 use time::{error::ComponentRange, Date, Month, OffsetDateTime, PrimitiveDateTime, Time};
 
-#[derive(Clone, Copy, Debug, PartialEq, Eq)]
+#[derive(Clone, Copy, Debug, PartialEq, Eq, FromPrimitive, IntoPrimitive)]
+#[repr(u8)]
 pub enum System {
     Dos = 0,
     Unix = 3,
+    #[num_enum(default)]
     Unknown,
 }
 
-impl System {
-    pub const fn from_u8(system: u8) -> System {
-        use self::System::*;
-
-        match system {
-            0 => Dos,
-            3 => Unix,
-            _ => Unknown,
-        }
-    }
-}
-
 /// Representation of a moment in time.
 ///
 /// Zip files use an old format from DOS to store timestamps,
@@ -512,10 +503,14 @@ mod test {
     #[test]
     fn system() {
        use super::System;
-        assert_eq!(System::Dos as u16, 0u16);
-        assert_eq!(System::Unix as u16, 3u16);
-        assert_eq!(System::from_u8(0), System::Dos);
-        assert_eq!(System::from_u8(3), System::Unix);
+        assert_eq!(u8::from(System::Dos), 0u8);
+        assert_eq!(System::Dos as u8, 0u8);
+        assert_eq!(System::Unix as u8, 3u8);
+        assert_eq!(u8::from(System::Unix), 3u8);
+        assert_eq!(System::from(0), System::Dos);
+        assert_eq!(System::from(3), System::Unix);
+        assert_eq!(u8::from(System::Unknown), 4u8);
+        assert_eq!(System::Unknown as u8, 4u8);
     }
 
     #[test]
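Note: with the two derives, the hand-rolled `from_u8()` match collapses into generated `From` impls, and `#[num_enum(default)]` routes every unmapped byte to `Unknown`. A standalone sketch of the same pattern (reproducing the patched enum outside the crate; it assumes the `num_enum = "0.6.1"` dependency added above):

```
// Standalone reproduction of the System cleanup: num_enum generates
// From<u8> for the enum and From<enum> for u8, with the #[num_enum(default)]
// variant absorbing every unmapped discriminant.
use num_enum::{FromPrimitive, IntoPrimitive};

#[derive(Clone, Copy, Debug, PartialEq, Eq, FromPrimitive, IntoPrimitive)]
#[repr(u8)]
enum System {
    Dos = 0,
    Unix = 3,
    #[num_enum(default)]
    Unknown,
}

fn main() {
    assert_eq!(System::from(3u8), System::Unix);
    // 7 has no mapping, so it falls through to the default variant.
    assert_eq!(System::from(7u8), System::Unknown);
    // Unknown converts back to its own discriminant (4, the next implicit
    // value after Unix = 3), not to whatever byte it was parsed from —
    // exactly what the updated test in the patch asserts.
    assert_eq!(u8::from(System::Unknown), 4u8);
}
```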
From d80245352542f0a804826f4b3663cc0bc17d9f6a Mon Sep 17 00:00:00 2001
From: Chris Hennick <4961925+Pr0methean@users.noreply.github.com>
Date: Thu, 2 May 2024 09:35:46 -0700
Subject: [PATCH 5/5] style: rearrange imports to satisfy cargo fmt check

---
 src/types.rs | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/types.rs b/src/types.rs
index 7ac8a971..0a0b8948 100644
--- a/src/types.rs
+++ b/src/types.rs
@@ -1,6 +1,6 @@
 //! Types that specify what is contained in a ZIP.
-use path::{Component, Path, PathBuf};
 use num_enum::{FromPrimitive, IntoPrimitive};
+use path::{Component, Path, PathBuf};
 use std::path;
 use std::sync::{Arc, OnceLock};