diff --git a/lib/inflate.js b/lib/inflate.js index 358cc33..3320c10 100644 --- a/lib/inflate.js +++ b/lib/inflate.js @@ -211,20 +211,7 @@ Inflate.prototype.push = function(data, mode) { if (this.options.to === 'string') { - // realign size to utf8 char border & move tail to start of buffer - next_out_utf8_index = strm.next_out_index - 6; - if (next_out_utf8_index < 0) { next_out_utf8_index = 0; } - - tail = strings.utf8tail(strm.next_out[next_out_utf8_index]); - while (next_out_utf8_index + tail <= strm.next_out_index) { - next_out_utf8_index += tail; - tail = strings.utf8tail(strm.next_out[next_out_utf8_index]); - } - - // shit happened - broken tail. then take it all. - if (next_out_utf8_index === 0) { - next_out_utf8_index = strm.next_out_index; - } + next_out_utf8_index = strings.utf8border(strm.next_out, strm.next_out_index); tail = strm.next_out_index - next_out_utf8_index; utf8str = strings.buf2string(strm.next_out, next_out_utf8_index); diff --git a/lib/utils/strings.js b/lib/utils/strings.js index c5bfba6..dc8b44b 100644 --- a/lib/utils/strings.js +++ b/lib/utils/strings.js @@ -172,7 +172,29 @@ exports.buf2string = function (buf, max) { }; -// calculate tail size of utf8 char by current byte value -exports.utf8tail = function(code) { - return _utf8len[code]; +// Calculate max possible position in utf8 buffer, +// that will not break sequence. If that's not possible +// - (very small limits) return max size as is. +// +// buf[] - utf8 bytes array +// max - length limit (mandatory); +exports.utf8border = function(buf, max) { + var pos; + + max = max || buf.length; + if (max > buf.length) { max = buf.length; } + + // go back from last position, until start of sequence found + pos = max-1; + while (pos >= 0 && (buf[pos] & 0xC0) === 0x80) { pos--; } + + // Fuckup - very small and broken sequence, + // return max, because we should return something anyway. + if (pos < 0) { return max; } + + // If we came to start of buffer - that means vuffer is too small, + // return max too. + if (pos === 0) { return max; } + + return (pos + _utf8len[buf[pos]] > max) ? pos : max; }; diff --git a/test/strings.js b/test/strings.js index d905f40..39ed57f 100644 --- a/test/strings.js +++ b/test/strings.js @@ -40,11 +40,40 @@ function a2utf16(arr) { describe('Encode/Decode', function () { - var utf16sample = a2utf16([0x1f3b5, 'abcd', 0x266a, 0x35, 0xe800, 0x10ffff, 0x0fffff]); + // Create sample, that contains all types of utf8 (1-4byte) after conversion + var utf16sample = a2utf16([0x1f3b5, 'a', 0x266a, 0x35, 0xe800, 0x10ffff, 0x0fffff]); + // use node Buffer internal conversion as "done right" var utf8sample = new Uint8Array(new Buffer(utf16sample)); - console.log(utf16sample, utf16sample.length); - console.log(new Buffer(utf16sample)); + it('utf-8 border detect', function () { + var ub = strings.utf8border; + assert.equal(ub(utf8sample, 1), 1); + assert.equal(ub(utf8sample, 2), 2); + assert.equal(ub(utf8sample, 3), 3); + assert.equal(ub(utf8sample, 4), 4); + + assert.equal(ub(utf8sample, 5), 5); + + assert.equal(ub(utf8sample, 6), 5); + assert.equal(ub(utf8sample, 7), 5); + assert.equal(ub(utf8sample, 8), 8); + + assert.equal(ub(utf8sample, 9), 9); + + assert.equal(ub(utf8sample, 10), 9); + assert.equal(ub(utf8sample, 11), 9); + assert.equal(ub(utf8sample, 12), 12); + + assert.equal(ub(utf8sample, 13), 12); + assert.equal(ub(utf8sample, 14), 12); + assert.equal(ub(utf8sample, 15), 12); + assert.equal(ub(utf8sample, 16), 16); + + assert.equal(ub(utf8sample, 17), 16); + assert.equal(ub(utf8sample, 18), 16); + assert.equal(ub(utf8sample, 19), 16); + assert.equal(ub(utf8sample, 20), 20); + }); it('Encode string to utf8 buf', function () { assert.ok(cmp(