Moved utf8 border detect to utils, fixed bugs & added tests

This commit is contained in:
Vitaly Puzrin 2014-04-15 00:22:37 +04:00
parent 4d3acc2961
commit 2cd0309b62
3 changed files with 58 additions and 20 deletions

View file

@ -211,20 +211,7 @@ Inflate.prototype.push = function(data, mode) {
if (this.options.to === 'string') {
// realign size to utf8 char border & move tail to start of buffer
next_out_utf8_index = strm.next_out_index - 6;
if (next_out_utf8_index < 0) { next_out_utf8_index = 0; }
tail = strings.utf8tail(strm.next_out[next_out_utf8_index]);
while (next_out_utf8_index + tail <= strm.next_out_index) {
next_out_utf8_index += tail;
tail = strings.utf8tail(strm.next_out[next_out_utf8_index]);
}
// Something went wrong - broken tail. Then take it all.
if (next_out_utf8_index === 0) {
next_out_utf8_index = strm.next_out_index;
}
next_out_utf8_index = strings.utf8border(strm.next_out, strm.next_out_index);
tail = strm.next_out_index - next_out_utf8_index;
utf8str = strings.buf2string(strm.next_out, next_out_utf8_index);

View file

@ -172,7 +172,29 @@ exports.buf2string = function (buf, max) {
};
// calculate tail size of utf8 char by current byte value
exports.utf8tail = function(code) {
return _utf8len[code];
// Calculate max possible position in utf8 buffer,
// that will not break sequence. If that's not possible
// - (very small limits) return max size as is.
//
// buf[] - utf8 bytes array
// max - length limit (mandatory);
exports.utf8border = function(buf, max) {
var pos;
max = max || buf.length;
if (max > buf.length) { max = buf.length; }
// go back from last position, until start of sequence found
pos = max-1;
while (pos >= 0 && (buf[pos] & 0xC0) === 0x80) { pos--; }
// Fuckup - very small and broken sequence,
// return max, because we should return something anyway.
if (pos < 0) { return max; }
// If we came to start of buffer - that means vuffer is too small,
// return max too.
if (pos === 0) { return max; }
return (pos + _utf8len[buf[pos]] > max) ? pos : max;
};

View file

@ -40,11 +40,40 @@ function a2utf16(arr) {
describe('Encode/Decode', function () {
var utf16sample = a2utf16([0x1f3b5, 'abcd', 0x266a, 0x35, 0xe800, 0x10ffff, 0x0fffff]);
// Create a sample that contains all utf8 sequence lengths (1-4 bytes) after conversion
var utf16sample = a2utf16([0x1f3b5, 'a', 0x266a, 0x35, 0xe800, 0x10ffff, 0x0fffff]);
// use node Buffer internal conversion as "done right"
var utf8sample = new Uint8Array(new Buffer(utf16sample));
console.log(utf16sample, utf16sample.length);
console.log(new Buffer(utf16sample));
it('utf-8 border detect', function () {
  var detectBorder = strings.utf8border;

  // expectedBorders[i] is the border expected for a limit of (i + 1) bytes.
  // Limits 1..3 fall inside the very first sequence, so the limit itself
  // is expected back; after that the result snaps to the last complete
  // sequence end (4, 5, 8, 9, 12, 16, 20).
  var expectedBorders = [
    1, 2, 3, 4,
    5,
    5, 5, 8,
    9,
    9, 9, 12,
    12, 12, 12, 16,
    16, 16, 16, 20
  ];

  expectedBorders.forEach(function (border, idx) {
    assert.equal(detectBorder(utf8sample, idx + 1), border);
  });
});
it('Encode string to utf8 buf', function () {
assert.ok(cmp(