diff --git a/Cargo.toml b/Cargo.toml index 00ff9d90..172cd129 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -88,3 +88,7 @@ harness = false [[bench]] name = "read_metadata" harness = false + +[[bench]] +name = "merge_archive" +harness = false diff --git a/benches/merge_archive.rs b/benches/merge_archive.rs new file mode 100644 index 00000000..c5cb26c5 --- /dev/null +++ b/benches/merge_archive.rs @@ -0,0 +1,120 @@ +use bencher::{benchmark_group, benchmark_main}; + +use std::io::{Cursor, Read, Seek, Write}; + +use bencher::Bencher; +use getrandom::getrandom; +use zip::{result::ZipResult, write::SimpleFileOptions, ZipArchive, ZipWriter}; + +fn generate_random_archive( + num_entries: usize, + entry_size: usize, + options: SimpleFileOptions, +) -> ZipResult<(usize, ZipArchive<Cursor<Vec<u8>>>)> { + let buf = Cursor::new(Vec::new()); + let mut zip = ZipWriter::new(buf); + + let mut bytes = vec![0u8; entry_size]; + for i in 0..num_entries { + let name = format!("random{}.dat", i); + zip.start_file(name, options)?; + getrandom(&mut bytes).unwrap(); + zip.write_all(&bytes)?; + } + + let buf = zip.finish()?.into_inner(); + let len = buf.len(); + + Ok((len, ZipArchive::new(Cursor::new(buf))?)) +} + +fn perform_merge<R: Read + Seek, W: Write + Seek>( + src: ZipArchive<R>, + mut target: ZipWriter<W>, +) -> ZipResult<ZipWriter<W>> { + target.merge_archive(src)?; + Ok(target) +} + +fn perform_raw_copy_file<R: Read + Seek, W: Write + Seek>( + mut src: ZipArchive<R>, + mut target: ZipWriter<W>, +) -> ZipResult<ZipWriter<W>> { + for i in 0..src.len() { + let entry = src.by_index(i)?; + target.raw_copy_file(entry)?; + } + Ok(target) +} + +const NUM_ENTRIES: usize = 100; +const ENTRY_SIZE: usize = 1024; + +fn merge_archive_stored(bench: &mut Bencher) { + let options = SimpleFileOptions::default().compression_method(zip::CompressionMethod::Stored); + let (len, src) = generate_random_archive(NUM_ENTRIES, ENTRY_SIZE, options).unwrap(); + + bench.bytes = len as u64; + + bench.iter(|| { + let buf = Cursor::new(Vec::new()); + let zip = ZipWriter::new(buf); + let mut zip = perform_merge(src.clone(), 
zip).unwrap(); + let buf = zip.finish().unwrap().into_inner(); + assert_eq!(buf.len(), len); + }); +} + +fn merge_archive_compressed(bench: &mut Bencher) { + let options = SimpleFileOptions::default().compression_method(zip::CompressionMethod::Deflated); + let (len, src) = generate_random_archive(NUM_ENTRIES, ENTRY_SIZE, options).unwrap(); + + bench.bytes = len as u64; + + bench.iter(|| { + let buf = Cursor::new(Vec::new()); + let zip = ZipWriter::new(buf); + let mut zip = perform_merge(src.clone(), zip).unwrap(); + let buf = zip.finish().unwrap().into_inner(); + assert_eq!(buf.len(), len); + }); +} + +fn merge_archive_raw_copy_file_stored(bench: &mut Bencher) { + let options = SimpleFileOptions::default().compression_method(zip::CompressionMethod::Stored); + let (len, src) = generate_random_archive(NUM_ENTRIES, ENTRY_SIZE, options).unwrap(); + + bench.bytes = len as u64; + + bench.iter(|| { + let buf = Cursor::new(Vec::new()); + let zip = ZipWriter::new(buf); + let mut zip = perform_raw_copy_file(src.clone(), zip).unwrap(); + let buf = zip.finish().unwrap().into_inner(); + assert_eq!(buf.len(), len); + }); +} + +fn merge_archive_raw_copy_file_compressed(bench: &mut Bencher) { + let options = SimpleFileOptions::default().compression_method(zip::CompressionMethod::Deflated); + let (len, src) = generate_random_archive(NUM_ENTRIES, ENTRY_SIZE, options).unwrap(); + + bench.bytes = len as u64; + + bench.iter(|| { + let buf = Cursor::new(Vec::new()); + let zip = ZipWriter::new(buf); + let mut zip = perform_raw_copy_file(src.clone(), zip).unwrap(); + let buf = zip.finish().unwrap().into_inner(); + assert_eq!(buf.len(), len); + }); +} + +benchmark_group!( + benches, + merge_archive_stored, + merge_archive_compressed, + merge_archive_raw_copy_file_stored, + merge_archive_raw_copy_file_compressed, +); +benchmark_main!(benches); diff --git a/src/read.rs b/src/read.rs index d0184941..70b53756 100644 --- a/src/read.rs +++ b/src/read.rs @@ -333,6 +333,67 @@ pub(crate) struct 
CentralDirectoryInfo { } impl<R: Read + Seek> ZipArchive<R> { + pub(crate) fn merge_contents<W: Write + io::Seek>( + &mut self, + mut w: W, + ) -> ZipResult<Vec<ZipFileData>> { + let mut new_files = self.shared.files.clone(); + if new_files.is_empty() { + return Ok(vec![]); + } + /* The first file header will probably start at the beginning of the file, but zip doesn't + * enforce that, and executable zips like PEX files will have a shebang line so will + * definitely be greater than 0. + * + * assert_eq!(0, new_files[0].header_start); // Avoid this. + */ + + let new_initial_header_start = w.stream_position()?; + /* Push back file header starts for all entries in the covered files. */ + new_files.iter_mut().try_for_each(|f| { + /* This is probably the only really important thing to change. */ + f.header_start = f.header_start.checked_add(new_initial_header_start).ok_or( + ZipError::InvalidArchive("new header start from merge would have been too large"), + )?; + /* This is only ever used internally to cache metadata lookups (it's not part of the + * zip spec), and 0 is the sentinel value. */ + f.central_header_start = 0; + /* This is an atomic variable so it can be updated from another thread in the + * implementation (which is good!). */ + if let Some(old_data_start) = f.data_start.take() { + let new_data_start = old_data_start.checked_add(new_initial_header_start).ok_or( + ZipError::InvalidArchive("new data start from merge would have been too large"), + )?; + f.data_start.get_or_init(|| new_data_start); + } + Ok::<_, ZipError>(()) + })?; + + /* Rewind to the beginning of the file. + * + * NB: we *could* decide to start copying from new_files[0].header_start instead, which + * would avoid copying over e.g. any pex shebangs or other file contents that start before + * the first zip file entry. 
However, zip files actually shouldn't care about garbage data + * in *between* real entries, since the central directory header records the correct start + * location of each, and keeping track of that math is more complicated logic that will only + * rarely be used, since most zips that get merged together are likely to be produced + * specifically for that purpose (and therefore are unlikely to have a shebang or other + * preface). Finally, this preserves any data that might actually be useful. + */ + self.reader.rewind()?; + /* Find the end of the file data. */ + let length_to_read = self.shared.dir_start; + /* Produce a Read that reads bytes up until the start of the central directory header. + * This "as &mut dyn Read" trick is used elsewhere to avoid having to clone the underlying + * handle, which it really shouldn't need to anyway. */ + let mut limited_raw = (&mut self.reader as &mut dyn Read).take(length_to_read); + /* Copy over file data from source archive directly. */ + io::copy(&mut limited_raw, &mut w)?; + + /* Return the files we've just written to the data stream. */ + Ok(new_files.into_vec()) + } + fn get_directory_info_zip32( footer: &spec::CentralDirectoryEnd, cde_start_pos: u64, diff --git a/src/write.rs b/src/write.rs index 7179487f..273e5a60 100644 --- a/src/write.rs +++ b/src/write.rs @@ -935,6 +935,68 @@ impl ZipWriter { Ok(()) } + /* TODO: link to/use Self::finish_into_readable() from https://github.com/zip-rs/zip/pull/400 in + * this docstring. */ + /// Copy over the entire contents of another archive verbatim. + /// + /// This method extracts file metadata from the `source` archive, then simply performs a single + /// big [`io::copy()`](io::copy) to transfer all the actual file contents without any + /// decompression or decryption. This is more performant than the equivalent operation of + /// calling [`Self::raw_copy_file()`] for each entry from the `source` archive in sequence. 
+ /// + ///``` + /// # fn main() -> Result<(), zip::result::ZipError> { + /// use std::io::{Cursor, prelude::*}; + /// use zip::{ZipArchive, ZipWriter, write::SimpleFileOptions}; + /// + /// let buf = Cursor::new(Vec::new()); + /// let mut zip = ZipWriter::new(buf); + /// zip.start_file("a.txt", SimpleFileOptions::default())?; + /// zip.write_all(b"hello\n")?; + /// let src = ZipArchive::new(zip.finish()?)?; + /// + /// let buf = Cursor::new(Vec::new()); + /// let mut zip = ZipWriter::new(buf); + /// zip.start_file("b.txt", SimpleFileOptions::default())?; + /// zip.write_all(b"hey\n")?; + /// let src2 = ZipArchive::new(zip.finish()?)?; + /// + /// let buf = Cursor::new(Vec::new()); + /// let mut zip = ZipWriter::new(buf); + /// zip.merge_archive(src)?; + /// zip.merge_archive(src2)?; + /// let mut result = ZipArchive::new(zip.finish()?)?; + /// + /// let mut s: String = String::new(); + /// result.by_name("a.txt")?.read_to_string(&mut s)?; + /// assert_eq!(s, "hello\n"); + /// s.clear(); + /// result.by_name("b.txt")?.read_to_string(&mut s)?; + /// assert_eq!(s, "hey\n"); + /// # Ok(()) + /// # } + ///``` + pub fn merge_archive<R>(&mut self, mut source: ZipArchive<R>) -> ZipResult<()> + where + R: Read + io::Seek, + { + self.finish_file()?; + + /* Ensure we accept the file contents on faith (and avoid overwriting the data). + * See raw_copy_file_rename(). */ + self.writing_to_file = true; + self.writing_raw = true; + + let writer = self.inner.get_plain(); + /* Get the file entries from the source archive. */ + let new_files = source.merge_contents(writer)?; + + /* These file entries are now ours! */ + self.files.extend(new_files); + + Ok(()) + } + fn normalize_options<T: FileOptionExtension>(options: &mut FileOptions<T>) { if options.permissions.is_none() { options.permissions = Some(0o644);