From 554ffaf908d807fc9f0f6cae9bb95593f9457323 Mon Sep 17 00:00:00 2001 From: Vitaly Puzrin Date: Mon, 14 Apr 2014 00:46:30 +0400 Subject: [PATCH] Optimized utf8 decoder --- lib/utils/strings.js | 111 ++++++++++++++++++++++++++++++++----------- 1 file changed, 82 insertions(+), 29 deletions(-) diff --git a/lib/utils/strings.js b/lib/utils/strings.js index b1a8363..317dc72 100644 --- a/lib/utils/strings.js +++ b/lib/utils/strings.js @@ -10,6 +10,13 @@ var STR_APPLY_OK = true; try { String.fromCharCode.apply(null, [0]); } catch(__) { STR_APPLY_OK = false; } +// Table with utf8 lengths +var utf8len = new utils.Buf8(256); +for (var i=0; i<256; i++) { + utf8len[i] = (i >= 252 ? 6 : i >= 248 ? 5 : i >= 240 ? 4 : i >= 224 ? 3 : i >= 192 ? 2 : 1); +} + + // convert string to array (typed, when possible) // src: https://developer.mozilla.org/en-US/docs/Web/JavaScript/Base64_encoding_and_decoding exports.string2buf = function (str) { @@ -68,34 +75,6 @@ exports.string2buf = function (str) { }; -// convert array to string -// src: https://developer.mozilla.org/en-US/docs/Web/JavaScript/Base64_encoding_and_decoding -exports.buf2string = function (buf, max) { - var str = ''; - - for (var part, len = max || buf.length, i = 0; i < len; i++) { - part = buf[i]; - str += String.fromCharCode( - part > 251 && part < 254 && i + 5 < len ? /* six bytes */ - /* (part - 252 << 32) is not possible in ECMAScript! So...: */ - (part - 252) * 1073741824 + (buf[++i] - 128 << 24) + (buf[++i] - 128 << 18) + (buf[++i] - 128 << 12) + (buf[++i] - 128 << 6) + buf[++i] - 128 - : part > 247 && part < 252 && i + 4 < len ? /* five bytes */ - (part - 248 << 24) + (buf[++i] - 128 << 18) + (buf[++i] - 128 << 12) + (buf[++i] - 128 << 6) + buf[++i] - 128 - : part > 239 && part < 248 && i + 3 < len ? /* four bytes */ - (part - 240 << 18) + (buf[++i] - 128 << 12) + (buf[++i] - 128 << 6) + buf[++i] - 128 - : part > 223 && part < 240 && i + 2 < len ? /* three bytes */ - (part - 224 << 12) + (buf[++i] - 128 << 6) + buf[++i] - 128 - : part > 191 && part < 224 && i + 1 < len ? /* two bytes */ - (part - 192 << 6) + buf[++i] - 128 - : /* part < 127 ? */ /* one byte */ - part - ); - } - - return str; -}; - - // Convert byte array to binary string exports.buf2binstring = function(buf) { // use fallback for big arrays to avoid stack overflow @@ -121,7 +100,81 @@ exports.binstring2buf = function(str) { }; +// convert array to string +// src: https://developer.mozilla.org/en-US/docs/Web/JavaScript/Base64_encoding_and_decoding +exports.buf2string = function (buf, max) { + /*jshint nonstandard:true*/ + // That's not as fast as via String.fromCharCode.appy + /*return decodeURIComponent(escape(exports.buf2binstring( + (buf.length === max) ? + buf + : + buf.subarray ? buf.subarray(0, max) : buf.slice(0, max) + )));*/ + + var str = '', i, out, part, char_len; + var len = max || buf.length; + var out_len = 0; + var utf16buf; + + // Calculate output length + for (i=0; i < len;) { + i += utf8len[buf[i]]; + out_len++; + } + + utf16buf = new utils.Buf32(out_len); + + for (out=0, i=0; i len) { + utf16buf[out++] = 0x7f; + break; + } + switch (char_len) { + case 1: + utf16buf[out++] = part; + break; + case 2: + utf16buf[out++] = ((part & 0x1f) << 6) | (buf[++i] & 0x7f); + break; + case 3: + utf16buf[out++] = ((part & 0x0f) << 12) | ((buf[++i] & 0x3f) << 6) | (buf[++i] & 0x3f); + break; + case 4: + utf16buf[out++] = ((part & 0x07) << 18) | ((buf[++i] & 0x3f) << 12) | ((buf[++i] & 0x3f) << 6) + (buf[++i] & 0x3f); + break; + // 5 & 6 bytes uticodes not supported in UTF16 (JS), so fill with dummy symbol + case 5: + i += 4; + utf16buf[out++] = 0x7f; + //utf16buf[out++] = (part - 248 << 24) + (buf[++i] - 128 << 18) + (buf[++i] - 128 << 12) + (buf[++i] - 128 << 6) + buf[++i] - 128; + break; + case 6: + i += 5; + utf16buf[out++] = 0x7f; + // (part - 252 << 32) is not possible in ECMAScript! So...: + //utf16buf[out++] = (part - 252) * 1073741824 + (buf[++i] - 128 << 24) + (buf[++i] - 128 << 18) + (buf[++i] - 128 << 12) + (buf[++i] - 128 << 6) + buf[++i] - 128; + break; + } + } + + if (STR_APPLY_OK) { + return String.fromCharCode.apply(null, utf16buf); + } + + // Fallback, when String.fromCharCode.apply not available + for (i=0, len=utf16buf.length; i= 252 ? 6 : code >= 248 ? 5 : code >= 240 ? 4 : code >= 224 ? 3 : code >= 192 ? 2 : 1; + return utf8len[code]; };