Merge pull request #76 from cosmicexplorer/oldpr401

feat: add ZipWriter::merge_archive() to efficiently copy all entries from a ZipArchive
2024-05-02 20:49:28 +00:00 · 2024-05-02 20:49:28 +00:00 · 033ec7bd46
commit 033ec7bd46
parent c8655d9eda ffea4df58f
4 changed files with 247 additions and 0 deletions
--- a/Cargo.toml
+++ b/Cargo.toml
@ -88,3 +88,7 @@ harness = false
 [[bench]]
 name = "read_metadata"
 harness = false
 [[bench]]
 name = "merge_archive"
 harness = false
--- a/benches/merge_archive.rs
+++ b/benches/merge_archive.rs
@ -0,0 +1,120 @@
 use bencher::{benchmark_group, benchmark_main};
 use std::io::{Cursor, Read, Seek, Write};
 use bencher::Bencher;
 use getrandom::getrandom;
 use zip::{result::ZipResult, write::SimpleFileOptions, ZipArchive, ZipWriter};
 /// Builds an in-memory archive holding `num_entries` files of `entry_size` random bytes each.
 ///
 /// Entries are named `random{i}.dat` and written with the supplied `options`. Returns the byte
 /// length of the finished archive together with a [`ZipArchive`] opened over those bytes.
 fn generate_random_archive(
    num_entries: usize,
    entry_size: usize,
    options: SimpleFileOptions,
 ) -> ZipResult<(usize, ZipArchive<Cursor<Vec<u8>>>)> {
    let mut writer = ZipWriter::new(Cursor::new(Vec::new()));
    // One scratch buffer, refilled with fresh random bytes before each entry is written.
    let mut payload = vec![0u8; entry_size];
    (0..num_entries).try_for_each(|index| -> ZipResult<()> {
        writer.start_file(format!("random{}.dat", index), options)?;
        getrandom(&mut payload).unwrap();
        writer.write_all(&payload)?;
        Ok(())
    })?;
    let archive_bytes = writer.finish()?.into_inner();
    let total_len = archive_bytes.len();
    let archive = ZipArchive::new(Cursor::new(archive_bytes))?;
    Ok((total_len, archive))
 }
 /// Merges all of `src` into `target` via a single `merge_archive` call and hands the writer
 /// back on success.
 fn perform_merge<R: Read + Seek, W: Write + Seek>(
    src: ZipArchive<R>,
    mut target: ZipWriter<W>,
 ) -> ZipResult<ZipWriter<W>> {
    target.merge_archive(src).map(|()| target)
 }
 /// Copies `src` into `target` one entry at a time with `raw_copy_file`, in index order, and
 /// hands the writer back on success. The first failing entry aborts the copy.
 fn perform_raw_copy_file<R: Read + Seek, W: Write + Seek>(
    mut src: ZipArchive<R>,
    mut target: ZipWriter<W>,
 ) -> ZipResult<ZipWriter<W>> {
    (0..src.len()).try_for_each(|index| -> ZipResult<()> {
        let entry = src.by_index(index)?;
        target.raw_copy_file(entry)
    })?;
    Ok(target)
 }
 /// Number of entries written into each generated source archive.
 const NUM_ENTRIES: usize = 100;
 /// Size in bytes of the random payload written for each entry.
 const ENTRY_SIZE: usize = 1024;
 /// Benchmarks `merge_archive` on a source archive whose entries use the Stored method.
 ///
 /// `bench.bytes` is set to the source archive length, and every iteration checks that the
 /// merged output has that same length.
 fn merge_archive_stored(bench: &mut Bencher) {
    let stored = SimpleFileOptions::default().compression_method(zip::CompressionMethod::Stored);
    let (expected_len, archive) =
        generate_random_archive(NUM_ENTRIES, ENTRY_SIZE, stored).unwrap();
    bench.bytes = expected_len as u64;
    bench.iter(|| {
        let target = ZipWriter::new(Cursor::new(Vec::new()));
        let merged = perform_merge(archive.clone(), target)
            .unwrap()
            .finish()
            .unwrap()
            .into_inner();
        assert_eq!(merged.len(), expected_len);
    });
 }
 /// Benchmarks `merge_archive` on a source archive whose entries use the Deflated method.
 ///
 /// `bench.bytes` is set to the source archive length, and every iteration checks that the
 /// merged output has that same length.
 fn merge_archive_compressed(bench: &mut Bencher) {
    let deflated =
        SimpleFileOptions::default().compression_method(zip::CompressionMethod::Deflated);
    let (expected_len, archive) =
        generate_random_archive(NUM_ENTRIES, ENTRY_SIZE, deflated).unwrap();
    bench.bytes = expected_len as u64;
    bench.iter(|| {
        let target = ZipWriter::new(Cursor::new(Vec::new()));
        let merged = perform_merge(archive.clone(), target)
            .unwrap()
            .finish()
            .unwrap()
            .into_inner();
        assert_eq!(merged.len(), expected_len);
    });
 }
 /// Benchmarks the per-entry `raw_copy_file` path on a source archive of Stored entries.
 ///
 /// `bench.bytes` is set to the source archive length, and every iteration checks that the
 /// copied output has that same length.
 fn merge_archive_raw_copy_file_stored(bench: &mut Bencher) {
    let stored = SimpleFileOptions::default().compression_method(zip::CompressionMethod::Stored);
    let (expected_len, archive) =
        generate_random_archive(NUM_ENTRIES, ENTRY_SIZE, stored).unwrap();
    bench.bytes = expected_len as u64;
    bench.iter(|| {
        let target = ZipWriter::new(Cursor::new(Vec::new()));
        let copied = perform_raw_copy_file(archive.clone(), target)
            .unwrap()
            .finish()
            .unwrap()
            .into_inner();
        assert_eq!(copied.len(), expected_len);
    });
 }
 /// Benchmarks the per-entry `raw_copy_file` path on a source archive of Deflated entries.
 ///
 /// `bench.bytes` is set to the source archive length, and every iteration checks that the
 /// copied output has that same length.
 fn merge_archive_raw_copy_file_compressed(bench: &mut Bencher) {
    let deflated =
        SimpleFileOptions::default().compression_method(zip::CompressionMethod::Deflated);
    let (expected_len, archive) =
        generate_random_archive(NUM_ENTRIES, ENTRY_SIZE, deflated).unwrap();
    bench.bytes = expected_len as u64;
    bench.iter(|| {
        let target = ZipWriter::new(Cursor::new(Vec::new()));
        let copied = perform_raw_copy_file(archive.clone(), target)
            .unwrap()
            .finish()
            .unwrap()
            .into_inner();
        assert_eq!(copied.len(), expected_len);
    });
 }
 benchmark_group!(
    benches,
    merge_archive_stored,
    merge_archive_compressed,
    merge_archive_raw_copy_file_stored,
    merge_archive_raw_copy_file_compressed,
 );
 benchmark_main!(benches);
--- a/src/read.rs
+++ b/src/read.rs
@ -333,6 +333,67 @@ pub(crate) struct CentralDirectoryInfo {
 }
 impl<R: Read + Seek> ZipArchive<R> {
    /// Appends this archive's raw entry bytes to `w` and returns the entry metadata rebased onto
    /// the writer.
    ///
    /// Everything from the start of the underlying reader up to `self.shared.dir_start` is copied
    /// verbatim, so nothing is decompressed or decrypted. Each returned `ZipFileData` has its
    /// `header_start` (and `data_start`, when one was already cached) shifted by the position `w`
    /// was at when this method was called. An archive with no entries yields an empty vector and
    /// leaves `w` untouched.
    ///
    /// # Errors
    ///
    /// Returns `ZipError::InvalidArchive` if a shifted offset would overflow, and propagates any
    /// I/O error from querying `w`'s position, rewinding the reader, or copying the bytes.
    pub(crate) fn merge_contents<W: Write + io::Seek>(
        &mut self,
        mut w: W,
    ) -> ZipResult<Vec<ZipFileData>> {
        let mut rebased = self.shared.files.clone();
        if rebased.is_empty() {
            return Ok(Vec::new());
        }
        /* Do not assume the first entry's header sits at offset 0 of the source: zip does not
         * require it, and executable zips such as PEX files carry a shebang line first.
         *
         * Whatever the writer's current position is, that is where source offset 0 will land,
         * so it is the amount every recorded offset has to move by. */
        let shift = w.stream_position()?;
        for entry in rebased.iter_mut() {
            /* The local header offset is the essential field to relocate. */
            entry.header_start = entry.header_start.checked_add(shift).ok_or(
                ZipError::InvalidArchive("new header start from merge would have been too large"),
            )?;
            /* Purely an internal lookup cache (not part of the zip spec); 0 is its sentinel. */
            entry.central_header_start = 0;
            /* If a data offset was already cached, empty the cell and re-initialise it with the
             * relocated value; otherwise leave it unset. */
            if let Some(cached_start) = entry.data_start.take() {
                let moved_start = cached_start.checked_add(shift).ok_or(
                    ZipError::InvalidArchive("new data start from merge would have been too large"),
                )?;
                entry.data_start.get_or_init(|| moved_start);
            }
        }
        /* Copy from the very start of the source rather than from the first entry's header.
         *
         * That carries along any leading bytes (e.g. a PEX shebang), which is harmless: the
         * central directory records where each entry really starts, so extra data between
         * entries is ignored by readers. Skipping the preface would need more offset
         * bookkeeping for a case that should be rare, and copying it preserves data that
         * might be useful. */
        self.reader.rewind()?;
        /* Entry data ends where the central directory begins. */
        let payload_len = self.shared.dir_start;
        /* Borrow the reader as `&mut dyn Read` so it can be length-limited without cloning the
         * underlying handle. */
        let mut payload = (&mut self.reader as &mut dyn Read).take(payload_len);
        /* Transfer the raw bytes in a single pass. */
        io::copy(&mut payload, &mut w)?;
        /* Hand back the metadata describing what was just written. */
        Ok(rebased.into_vec())
    }
    fn get_directory_info_zip32(
        footer: &spec::CentralDirectoryEnd,
        cde_start_pos: u64,
--- a/src/write.rs
+++ b/src/write.rs
@ -935,6 +935,68 @@ impl<W: Write + Seek> ZipWriter<W> {
        Ok(())
    }
    /* TODO: link to/use Self::finish_into_readable() from https://github.com/zip-rs/zip/pull/400 in
     * this docstring. */
    /// Append every entry of `source` to this archive without re-encoding anything.
    ///
    /// Any entry currently being written is finished first. The file metadata is then taken
    /// from `source` and the raw entry bytes are transferred with one bulk
    /// [`io::copy()`](io::copy), skipping decompression and decryption entirely. This is more
    /// performant than calling [`Self::raw_copy_file()`] on each entry of `source` in turn.
    ///
    /// # Errors
    ///
    /// Propagates any error from finishing the in-progress entry, or from reading `source` and
    /// writing its contents to the underlying writer.
    ///
    /// # Examples
    ///
    ///```
    /// # fn main() -> Result<(), zip::result::ZipError> {
    /// use std::io::{Cursor, prelude::*};
    /// use zip::{ZipArchive, ZipWriter, write::SimpleFileOptions};
    ///
    /// let buf = Cursor::new(Vec::new());
    /// let mut zip = ZipWriter::new(buf);
    /// zip.start_file("a.txt", SimpleFileOptions::default())?;
    /// zip.write_all(b"hello\n")?;
    /// let src = ZipArchive::new(zip.finish()?)?;
    ///
    /// let buf = Cursor::new(Vec::new());
    /// let mut zip = ZipWriter::new(buf);
    /// zip.start_file("b.txt", SimpleFileOptions::default())?;
    /// zip.write_all(b"hey\n")?;
    /// let src2 = ZipArchive::new(zip.finish()?)?;
    ///
    /// let buf = Cursor::new(Vec::new());
    /// let mut zip = ZipWriter::new(buf);
    /// zip.merge_archive(src)?;
    /// zip.merge_archive(src2)?;
    /// let mut result = ZipArchive::new(zip.finish()?)?;
    ///
    /// let mut s: String = String::new();
    /// result.by_name("a.txt")?.read_to_string(&mut s)?;
    /// assert_eq!(s, "hello\n");
    /// s.clear();
    /// result.by_name("b.txt")?.read_to_string(&mut s)?;
    /// assert_eq!(s, "hey\n");
    /// # Ok(())
    /// # }
    ///```
    pub fn merge_archive<R>(&mut self, mut source: ZipArchive<R>) -> ZipResult<()>
    where
        R: Read + io::Seek,
    {
        self.finish_file()?;
        /* Mark the writer as mid-entry in raw mode so the copied bytes are accepted on faith
         * and not overwritten. See raw_copy_file_rename(). */
        self.writing_to_file = true;
        self.writing_raw = true;
        /* Stream the source's entry data into the plain writer and collect its metadata. */
        let adopted_entries = source.merge_contents(self.inner.get_plain())?;
        /* From here on those entries belong to this archive. */
        self.files.extend(adopted_entries);
        Ok(())
    }
    fn normalize_options<T: FileOptionExtension>(options: &mut FileOptions<T>) {
        if options.permissions.is_none() {
            options.permissions = Some(0o644);