From 49d5060e875e43f52f9a9d5d103d9d6ecb4f7d2a Mon Sep 17 00:00:00 2001 From: Alex Kocharin Date: Thu, 9 Jun 2022 21:01:31 +0300 Subject: [PATCH] Speed up deflation for level 0 (storing). https://github.com/madler/zlib/commit/9dc5a8585f429109ef1948ab71b6b71bfa7181e2 --- lib/zlib/deflate.js | 324 +++++++++++++++++++++++++++++--------------- 1 file changed, 211 insertions(+), 113 deletions(-) diff --git a/lib/zlib/deflate.js b/lib/zlib/deflate.js index a797765..de91a8c 100644 --- a/lib/zlib/deflate.js +++ b/lib/zlib/deflate.js @@ -98,6 +98,34 @@ const zero = (buf) => { let len = buf.length; while (--len >= 0) { buf[len] = 0; } }; +/* =========================================================================== + * Slide the hash table when sliding the window down (could be avoided with 32 + * bit values at the expense of memory usage). We slide even when level == 0 to + * keep the hash table consistent if we switch back to level > 0 later. + */ +const slide_hash = (s) => { + let n, m; + let p; + let wsize = s.w_size; + + n = s.hash_size; + p = n; + do { + m = s.head[--p]; + s.head[p] = (m >= wsize ? m - wsize : 0); + } while (--n); + n = wsize; +//#ifndef FASTEST + p = n; + do { + m = s.prev[--p]; + s.prev[p] = (m >= wsize ? m - wsize : 0); + /* If n is not on any hash chain, prev[n] is garbage but + * its value will never be used. + */ + } while (--n); +//#endif +}; /* eslint-disable new-cap */ let HASH_ZLIB = (s, prev, data) => ((prev << s.hash_shift) ^ data) & s.hash_mask; @@ -106,11 +134,12 @@ let HASH_ZLIB = (s, prev, data) => ((prev << s.hash_shift) ^ data) & s.hash_mask //let HASH_FAST = (s, prev, data) => ((prev << 8) + (prev >> 8) + (data << 4)) & s.hash_mask; let HASH = HASH_ZLIB; + /* ========================================================================= - * Flush as much pending output as possible. 
All deflate() output goes - * through this function so some applications may wish to modify it - * to avoid allocating a large strm->output buffer and copying into it. - * (See also read_buf()). + * Flush as much pending output as possible. All deflate() output, except for + * some deflate_stored() output, goes through this function so some + * applications may wish to modify it to avoid allocating a large + * strm->next_out buffer and copying into it. (See also read_buf()). */ const flush_pending = (strm) => { const s = strm.state; @@ -319,7 +348,7 @@ const longest_match = (s, cur_match) => { const fill_window = (s) => { const _w_size = s.w_size; - let p, n, m, more, str; + let n, more, str; //Assert(s->lookahead < MIN_LOOKAHEAD, "already enough lookahead"); @@ -346,38 +375,12 @@ const fill_window = (s) => { */ if (s.strstart >= _w_size + (_w_size - MIN_LOOKAHEAD)) { - s.window.set(s.window.subarray(_w_size, _w_size + _w_size), 0); + s.window.set(s.window.subarray(_w_size, _w_size + _w_size - more), 0); s.match_start -= _w_size; s.strstart -= _w_size; /* we now have strstart >= MAX_DIST */ s.block_start -= _w_size; - - /* Slide the hash table (could be avoided with 32 bit values - at the expense of memory usage). We slide even when level == 0 - to keep the hash table consistent if we switch back to level > 0 - later. (Using level 0 permanently is not an optimal usage of - zlib, so we don't care about this pathological case.) - */ - - n = s.hash_size; - p = n; - - do { - m = s.head[--p]; - s.head[p] = (m >= _w_size ? m - _w_size : 0); - } while (--n); - - n = _w_size; - p = n; - - do { - m = s.prev[--p]; - s.prev[p] = (m >= _w_size ? m - _w_size : 0); - /* If n is not on any hash chain, prev[n] is garbage but - * its value will never be used. 
- */ - } while (--n); - + slide_hash(s); more += _w_size; } if (s.strm.avail_in === 0) { @@ -469,104 +472,199 @@ const fill_window = (s) => { /* =========================================================================== * Copy without compression as much as possible from the input stream, return * the current block state. - * This function does not insert new strings in the dictionary since - * uncompressible data is probably not useful. This function is used - * only for the level=0 compression option. - * NOTE: this function should be optimized to avoid extra copying from - * window to pending_buf. + * + * In case deflateParams() is used to later switch to a non-zero compression + * level, s->matches (otherwise unused when storing) keeps track of the number + * of hash table slides to perform. If s->matches is 1, then one hash table + * slide will be done when switching. If s->matches is 2, the maximum value + * allowed here, then the hash table will be cleared, since two or more slides + * is the same as a clear. + * + * deflate_stored() is written to minimize the number of times an input byte is + * copied. It is most efficient with large input and output buffers, which + * maximizes the opportunities to have a single copy from next_in to next_out. */ const deflate_stored = (s, flush) => { - /* Stored blocks are limited to 0xffff bytes, pending_buf is limited - * to pending_buf_size, and each stored block has a 5 byte header: + /* Smallest worthy block size when not flushing or finishing. By default + * this is 32K. This can be as small as 507 bytes for memLevel == 1. For + * large input and output buffers, the stored block size will be larger. */ - let max_block_size = 0xffff; + let min_block = s.pending_buf_size - 5 > s.w_size ? 
s.w_size : s.pending_buf_size - 5; - if (max_block_size > s.pending_buf_size - 5) { - max_block_size = s.pending_buf_size - 5; - } - - /* Copy as much as possible from input to output: */ + /* Copy as many min_block or larger stored blocks directly to next_out as + * possible. If flushing, copy the remaining available input to next_out as + * stored blocks, if there is enough space. + */ + let len, left, have, last; + let used = s.strm.avail_in; for (;;) { - /* Fill the window as much as possible: */ - if (s.lookahead <= 1) { - - //Assert(s->strstart < s->w_size+MAX_DIST(s) || - // s->block_start >= (long)s->w_size, "slide too late"); -// if (!(s.strstart < s.w_size + (s.w_size - MIN_LOOKAHEAD) || -// s.block_start >= s.w_size)) { -// throw new Error("slide too late"); -// } - - fill_window(s); - if (s.lookahead === 0 && flush === Z_NO_FLUSH) { - return BS_NEED_MORE; - } - - if (s.lookahead === 0) { - break; - } - /* flush the current block */ - } - //Assert(s->block_start >= 0L, "block gone"); -// if (s.block_start < 0) throw new Error("block gone"); - - s.strstart += s.lookahead; - s.lookahead = 0; - - /* Emit a stored block if pending_buf will be full: */ - const max_start = max_block_size + s.block_start; - - if (s.strstart === 0 || s.strstart >= max_start) { - /* strstart == 0 is possible when wraparound on 16-bit machine */ - s.lookahead = s.strstart - max_start; - s.strstart = max_start; - /*** FLUSH_BLOCK(s, 0); ***/ - flush_block_only(s, false); - if (s.strm.avail_out === 0) { - return BS_NEED_MORE; - } - /***/ - - - } - /* Flush if we may have to slide, otherwise block_start may become - * negative and the data will be gone: + /* Set len to the maximum size block that we can copy directly with the + * available input data and output space. Set left to how much of that + * would be copied from what's left in the window. 
*/ - if (s.strstart - s.block_start >= (s.w_size - MIN_LOOKAHEAD)) { - /*** FLUSH_BLOCK(s, 0); ***/ - flush_block_only(s, false); - if (s.strm.avail_out === 0) { - return BS_NEED_MORE; - } - /***/ + len = 65535/* MAX_STORED */; /* maximum deflate stored block length */ + have = (s.bi_valid + 42) >> 3; /* number of header bytes */ + /* maximum stored block length that will fit in avail_out: */ + have = s.strm.avail_out > have ? s.strm.avail_out - have : 0; + left = s.strstart - s.block_start; /* bytes left in window */ + if (len > left + s.strm.avail_in) { + len = left + s.strm.avail_in; /* limit len to the input */ + } + if (len > have) { + len = have; /* limit len to the output */ + } + if (left > len) { + left = len; /* limit window pull to len */ + } + + /* If the stored block would be less than min_block in length, or if + * unable to copy all of the available input when flushing, then try + * copying to the window and the pending buffer instead. Also don't + * write an empty block when flushing -- deflate() does that. + */ + if (len < min_block && (len === 0 || flush === Z_NO_FLUSH || + len - left !== s.strm.avail_in)) { + break; + } + + /* Make a dummy stored block in pending to get the header bytes, + * including any pending bits. This also updates the debugging counts. + */ + last = flush === Z_FINISH && len - left === s.strm.avail_in ? 1 : 0; + _tr_stored_block(s, 0, 0, last); + + /* Replace the lengths in the dummy stored block with len. */ + s.pending_buf[s.pending - 4] = len; + s.pending_buf[s.pending - 3] = len >> 8; + s.pending_buf[s.pending - 2] = ~len; + s.pending_buf[s.pending - 1] = ~len >> 8; + + /* Write the stored block header bytes. */ + flush_pending(s.strm); + + /* Update debugging counts for the data about to be copied. */ +//#ifdef ZLIB_DEBUG +// s->compressed_len += len << 3; +// s->bits_sent += len << 3; +//#endif + + /* Copy uncompressed bytes from the window to next_out. 
*/ + if (left) { + //zmemcpy(s->strm->next_out, s->window + s->block_start, left); + s.strm.output.set(s.window.subarray(s.block_start, s.block_start + left), s.strm.next_out); + s.strm.next_out += left; + s.strm.avail_out -= left; + s.strm.total_out += left; + s.block_start += left; + len -= left; + } + + /* Copy uncompressed bytes directly from next_in to next_out, updating + * the check value. + */ + if (len) { + read_buf(s.strm, s.strm.output, s.strm.next_out, len); + s.strm.next_out += len; + s.strm.avail_out -= len; + s.strm.total_out += len; } } - s.insert = 0; + /* Update the sliding window with the last s->w_size bytes of the copied + * data, or append all of the copied data to the existing window if less + * than s->w_size bytes were copied. Also update the number of bytes to + * insert in the hash tables, in the event that deflateParams() switches to + * a non-zero compression level. + */ + used -= s.strm.avail_in; /* number of input bytes directly copied */ + if (used) { + /* If any input was used, then no unused input remains in the window, + * therefore s->block_start == s->strstart. + */ + if (used >= s.w_size) { /* supplant the previous history */ + s.matches = 2; /* clear hash */ + //zmemcpy(s->window, s->strm->next_in - s->w_size, s->w_size); + s.window.set(s.strm.input.subarray(s.strm.next_in - s.w_size, s.strm.next_in), 0); + s.strstart = s.w_size; + } + else { + if (s.window_size - s.strstart <= used) { + /* Slide the window down. */ + s.strstart -= s.w_size; + //zmemcpy(s->window, s->window + s->w_size, s->strstart); + s.window.set(s.window.subarray(s.w_size, s.w_size + s.strstart), 0); + if (s.matches < 2) { + s.matches++; /* add a pending slide_hash() */ + } + } + //zmemcpy(s->window + s->strstart, s->strm->next_in - used, used); + s.window.set(s.strm.input.subarray(s.strm.next_in - used, s.strm.next_in), s.strstart); + s.strstart += used; + } + s.block_start = s.strstart; + s.insert += used > s.w_size - s.insert ? 
s.w_size - s.insert : used; + } - if (flush === Z_FINISH) { - /*** FLUSH_BLOCK(s, 1); ***/ - flush_block_only(s, true); - if (s.strm.avail_out === 0) { + /* If flushing or finishing and all input has been consumed, then done. If + * the code above couldn't write a complete block to next_out, then the + * code following this won't be able to either. + */ + if (flush !== Z_NO_FLUSH && s.strm.avail_in === 0 && + s.strstart === s.block_start) { + return flush === Z_FINISH ? BS_FINISH_DONE : BS_BLOCK_DONE; + } + + /* Fill the window with any remaining input. */ + have = s.window_size - s.strstart - 1; + if (s.strm.avail_in > have && s.block_start >= s.w_size) { + /* Slide the window down. */ + s.block_start -= s.w_size; + s.strstart -= s.w_size; + //zmemcpy(s->window, s->window + s->w_size, s->strstart); + s.window.set(s.window.subarray(s.w_size, s.w_size + s.strstart), 0); + if (s.matches < 2) { + s.matches++; /* add a pending slide_hash() */ + } + have += s.w_size; /* more space now */ + } + if (have > s.strm.avail_in) { + have = s.strm.avail_in; + } + if (have) { + read_buf(s.strm, s.window, s.strstart, have); + s.strstart += have; + } + + /* There was not enough avail_out to write a complete worthy or flushed + * stored block to next_out. Write a stored block to pending instead, if we + * have enough input for a worthy block, or if flushing and there is enough + * room for the remaining input as a stored block in the pending buffer. + */ + have = (s.bi_valid + 42) >> 3; /* number of header bytes */ + /* maximum stored block length that will fit in pending: */ + have = s.pending_buf_size - have > 65535/* MAX_STORED */ ? 65535/* MAX_STORED */ : s.pending_buf_size - have; + min_block = have > s.w_size ? s.w_size : have; + left = s.strstart - s.block_start; + if (left >= min_block || + (left && flush !== Z_NO_FLUSH && s.strm.avail_in === 0 && + left <= have)) { + len = left > have ? have : left; + last = flush === Z_FINISH && s.strm.avail_in === 0 && + len === left ? 
1 : 0; + _tr_stored_block(s, s.block_start, len, last); + s.block_start += len; + flush_pending(s.strm); + if (last) { return BS_FINISH_STARTED; } - /***/ - return BS_FINISH_DONE; - } - - if (s.strstart > s.block_start) { - /*** FLUSH_BLOCK(s, 0); ***/ - flush_block_only(s, false); - if (s.strm.avail_out === 0) { - return BS_NEED_MORE; - } - /***/ } + /* We've done all we can with the available input and output. */ return BS_NEED_MORE; }; + /* =========================================================================== * Compress as much as possible from the input stream, return the current * block state.