From 0b3bc9aa8da69c96770912afbd316abc2893f2d9 Mon Sep 17 00:00:00 2001
From: Jesse Tane
Date: Mon, 10 Aug 2015 22:56:19 -0400
Subject: [PATCH] do utf8 -> unicode -> utf16 the way node does

---
Hand-edited after export: utf8Slice now also decodes an encoded U+FFFD as a
single character, rejects UTF-8 encoded surrogates, and builds the result
string in bounded chunks. Hunk headers and the diffstat were refreshed; the
"index" blob ids below still describe the original post-image.

 index.js | 101 +++++++++++++++++++++++++++++++++++++++++++---------
 1 file changed, 84 insertions(+), 17 deletions(-)

diff --git a/index.js b/index.js
index 6c6be99..b9a511b 100644
--- a/index.js
+++ b/index.js
@@ -623,20 +623,95 @@ function base64Slice (buf, start, end) {
 }
 
+// Decode the UTF-8 bytes in buf[start, end) into a JavaScript (UTF-16) string.
+// Each byte that does not begin a well-formed sequence yields one U+FFFD.
 function utf8Slice (buf, start, end) {
-  var res = ''
-  var tmp = ''
   end = Math.min(buf.length, end)
-
-  for (var i = start; i < end; i++) {
-    if (buf[i] <= 0x7F) {
-      res += decodeUtf8Char(tmp) + String.fromCharCode(buf[i])
-      tmp = ''
+  var firstByte, secondByte, thirdByte, fourthByte, bytesPerSequence, tempCodePoint, codePoint, res = []
+  var i = start
+
+  for (; i < end; i += bytesPerSequence) {
+    firstByte = buf[i]
+    // null means "nothing valid decoded"; 0xFFFD cannot be the sentinel because
+    // a correctly encoded U+FFFD (EF BF BD) must be consumed as one 3 byte sequence
+    codePoint = null
+
+    // the lead byte announces how many bytes the sequence should have
+    if (firstByte > 0xEF) {
+      bytesPerSequence = 4
+    } else if (firstByte > 0xDF) {
+      bytesPerSequence = 3
+    } else if (firstByte > 0xBF) {
+      bytesPerSequence = 2
     } else {
-      tmp += '%' + buf[i].toString(16)
+      bytesPerSequence = 1
     }
+
+    // a sequence cut off by the end of the slice is invalid
+    if (i + bytesPerSequence <= end) {
+      switch (bytesPerSequence) {
+        case 1:
+          // 0x80..0xBF are continuation bytes and cannot start a sequence
+          if (firstByte < 0x80) {
+            codePoint = firstByte
+          }
+          break
+        case 2:
+          secondByte = buf[i + 1]
+          if ((secondByte & 0xC0) === 0x80) {
+            tempCodePoint = (firstByte & 0x1F) << 0x6 | (secondByte & 0x3F)
+            // reject overlong encodings
+            if (tempCodePoint > 0x7F) {
+              codePoint = tempCodePoint
+            }
+          }
+          break
+        case 3:
+          secondByte = buf[i + 1]
+          thirdByte = buf[i + 2]
+          if ((secondByte & 0xC0) === 0x80 && (thirdByte & 0xC0) === 0x80) {
+            tempCodePoint = (firstByte & 0xF) << 0xC | (secondByte & 0x3F) << 0x6 | (thirdByte & 0x3F)
+            // reject overlong encodings and utf8 encoded surrogates (U+D800..U+DFFF)
+            if (tempCodePoint > 0x7FF && (tempCodePoint < 0xD800 || tempCodePoint > 0xDFFF)) {
+              codePoint = tempCodePoint
+            }
+          }
+          break
+        case 4:
+          secondByte = buf[i + 1]
+          thirdByte = buf[i + 2]
+          fourthByte = buf[i + 3]
+          if ((secondByte & 0xC0) === 0x80 && (thirdByte & 0xC0) === 0x80 && (fourthByte & 0xC0) === 0x80) {
+            tempCodePoint = (firstByte & 0xF) << 0x12 | (secondByte & 0x3F) << 0xC | (thirdByte & 0x3F) << 0x6 | (fourthByte & 0x3F)
+            // reject overlong encodings and anything above U+10FFFF
+            if (tempCodePoint > 0xFFFF && tempCodePoint < 0x110000) {
+              codePoint = tempCodePoint
+            }
+          }
+      }
+    }
+
+    if (codePoint === null) {
+      // we did not generate a valid codePoint so insert a
+      // replacement char (U+FFFD) and advance only 1 byte
+      codePoint = 0xFFFD
+      bytesPerSequence = 1
+    } else if (codePoint > 0xFFFF) {
+      // encode to utf16 (surrogate pair dance)
+      codePoint -= 0x10000
+      res.push(codePoint >>> 10 & 0x3FF | 0xD800)
+      codePoint = 0xDC00 | codePoint & 0x3FF
+    }
+
+    res.push(codePoint)
   }
 
-  return res + decodeUtf8Char(tmp)
+  // String.fromCharCode.apply passes every code unit as a separate argument and
+  // engines limit the argument count, so build the string in bounded chunks
+  var str = ''
+  for (var j = 0; j < res.length; j += 0x1000) {
+    str += String.fromCharCode.apply(String, res.slice(j, j + 0x1000))
+  }
+  return str
 }
 
 function asciiSlice (buf, start, end) {
@@ -1453,11 +1528,3 @@ function blitBuffer (src, dst, offset, length) {
   }
   return i
 }
-
-function decodeUtf8Char (str) {
-  try {
-    return decodeURIComponent(str)
-  } catch (err) {
-    return String.fromCharCode(0xFFFD) // UTF 8 invalid char
-  }
-}
-- 
2.34.1