From aad5d988d66a33dcdd3b0d64ad2baa88386e6871 Mon Sep 17 00:00:00 2001
From: Danny McClanahan <1305167+cosmicexplorer@users.noreply.github.com>
Date: Thu, 7 Sep 2023 02:10:24 -0400
Subject: [PATCH] add ZipWriter::merge_archive() method

---
 src/read.rs  | 61 +++++++++++++++++++++++++++++++++++++++++++++++++++
 src/write.rs | 62 ++++++++++++++++++++++++++++++++++++++++++++++++++++
 2 files changed, 123 insertions(+)

diff --git a/src/read.rs b/src/read.rs
index 0a39faef..589baec2 100644
--- a/src/read.rs
+++ b/src/read.rs
@@ -332,6 +332,67 @@ pub(crate) struct CentralDirectoryInfo {
 }
 
 impl<R: Read + Seek> ZipArchive<R> {
+    pub(crate) fn merge_contents<W: Write + io::Seek>(
+        &mut self,
+        mut w: W,
+    ) -> ZipResult<Vec<ZipFileData>> {
+        let mut new_files = self.shared.files.clone();
+        if new_files.is_empty() {
+            return Ok(vec![]);
+        }
+        /* The first file header will probably start at the beginning of the file, but zip doesn't
+         * enforce that, and executable zips like PEX files will have a shebang line so will
+         * definitely be greater than 0.
+         *
+         * assert_eq!(0, new_files[0].header_start); // Avoid this.
+         */
+
+        let new_initial_header_start = w.stream_position()?;
+        /* Push back file header starts for all entries in the covered files. */
+        new_files.iter_mut().try_for_each(|f| {
+            /* This is probably the only really important thing to change. */
+            f.header_start = f.header_start.checked_add(new_initial_header_start).ok_or(
+                ZipError::InvalidArchive("new header start from merge would have been too large"),
+            )?;
+            /* This is only ever used internally to cache metadata lookups (it's not part of the
+             * zip spec), and 0 is the sentinel value. */
+            f.central_header_start = 0;
+            /* This is an atomic variable so it can be updated from another thread in the
+             * implementation (which is good!). */
+            if let Some(old_data_start) = f.data_start.take() {
+                let new_data_start = old_data_start.checked_add(new_initial_header_start).ok_or(
+                    ZipError::InvalidArchive("new data start from merge would have been too large"),
+                )?;
+                f.data_start.get_or_init(|| new_data_start);
+            }
+            Ok::<_, ZipError>(())
+        })?;
+
+        /* Rewind to the beginning of the file.
+         *
+         * NB: we *could* decide to start copying from new_files[0].header_start instead, which
+         * would avoid copying over e.g. any pex shebangs or other file contents that start before
+         * the first zip file entry. However, zip files actually shouldn't care about garbage data
+         * in *between* real entries, since the central directory header records the correct start
+         * location of each, and keeping track of that math is more complicated logic that will only
+         * rarely be used, since most zips that get merged together are likely to be produced
+         * specifically for that purpose (and therefore are unlikely to have a shebang or other
+         * preface). Finally, this preserves any data that might actually be useful.
+         */
+        self.reader.rewind()?;
+        /* Find the end of the file data. */
+        let length_to_read = self.shared.dir_start;
+        /* Produce a Read that reads bytes up until the start of the central directory header.
+         * This "as &mut dyn Read" trick is used elsewhere to avoid having to clone the underlying
+         * handle, which it really shouldn't need to anyway. */
+        let mut limited_raw = (&mut self.reader as &mut dyn Read).take(length_to_read);
+        /* Copy over file data from source archive directly. */
+        io::copy(&mut limited_raw, &mut w)?;
+
+        /* Return the files we've just written to the data stream. */
+        Ok(new_files.into_vec())
+    }
+
     fn get_directory_info_zip32(
         footer: &spec::CentralDirectoryEnd,
         cde_start_pos: u64,
diff --git a/src/write.rs b/src/write.rs
index 0051f253..db326a74 100644
--- a/src/write.rs
+++ b/src/write.rs
@@ -934,6 +934,68 @@ impl<W: Write + Seek> ZipWriter<W> {
         Ok(())
     }
 
+    /* TODO: link to/use Self::finish_into_readable() from https://github.com/zip-rs/zip/pull/400 in
+     * this docstring. */
+    /// Copy over the entire contents of another archive verbatim.
+    ///
+    /// This method extracts file metadata from the `source` archive, then simply performs a single
+    /// big [`io::copy()`](io::copy) to transfer all the actual file contents without any
+    /// decompression or decryption. This is more performant than the equivalent operation of
+    /// calling [`Self::raw_copy_file()`] for each entry from the `source` archive in sequence.
+    ///
+    ///```
+    /// # fn main() -> Result<(), zip::result::ZipError> {
+    /// use std::io::{Cursor, prelude::*};
+    /// use zip::{ZipArchive, ZipWriter, write::SimpleFileOptions};
+    ///
+    /// let buf = Cursor::new(Vec::new());
+    /// let mut zip = ZipWriter::new(buf);
+    /// zip.start_file("a.txt", SimpleFileOptions::default())?;
+    /// zip.write_all(b"hello\n")?;
+    /// let src = ZipArchive::new(zip.finish()?)?;
+    ///
+    /// let buf = Cursor::new(Vec::new());
+    /// let mut zip = ZipWriter::new(buf);
+    /// zip.start_file("b.txt", SimpleFileOptions::default())?;
+    /// zip.write_all(b"hey\n")?;
+    /// let src2 = ZipArchive::new(zip.finish()?)?;
+    ///
+    /// let buf = Cursor::new(Vec::new());
+    /// let mut zip = ZipWriter::new(buf);
+    /// zip.merge_archive(src)?;
+    /// zip.merge_archive(src2)?;
+    /// let mut result = ZipArchive::new(zip.finish()?)?;
+    ///
+    /// let mut s: String = String::new();
+    /// result.by_name("a.txt")?.read_to_string(&mut s)?;
+    /// assert_eq!(s, "hello\n");
+    /// s.clear();
+    /// result.by_name("b.txt")?.read_to_string(&mut s)?;
+    /// assert_eq!(s, "hey\n");
+    /// # Ok(())
+    /// # }
+    ///```
+    pub fn merge_archive<R>(&mut self, mut source: ZipArchive<R>) -> ZipResult<()>
+    where
+        R: Read + io::Seek,
+    {
+        self.finish_file()?;
+
+        /* Ensure we accept the file contents on faith (and avoid overwriting the data).
+         * See raw_copy_file_rename(). */
+        self.writing_to_file = true;
+        self.writing_raw = true;
+
+        let writer = self.inner.get_plain();
+        /* Get the file entries from the source archive. */
+        let new_files = source.merge_contents(writer)?;
+
+        /* These file entries are now ours! */
+        self.files.extend(new_files);
+
+        Ok(())
+    }
+
     fn normalize_options<T: FileOptionExtension>(options: &mut FileOptions<T>) {
         if options.permissions.is_none() {
             options.permissions = Some(0o644);