From db9fbd98cc2c89dcb34673becc32cdd930a41295 Mon Sep 17 00:00:00 2001 From: Jesse Tane Date: Mon, 15 Dec 2014 19:17:57 -0500 Subject: [PATCH] do not encode partial or invalid code points to utf8 --- index.js | 106 +++++++++++++++++++++++++++++++++++++++++++++++-------- 1 file changed, 92 insertions(+), 14 deletions(-) diff --git a/index.js b/index.js index 1a7144a..f01d6ac 100644 --- a/index.js +++ b/index.js @@ -341,7 +341,7 @@ function hexWrite (buf, string, offset, length) { } function utf8Write (buf, string, offset, length) { - var charsWritten = blitBuffer(utf8ToBytes(string), buf, offset, length) + var charsWritten = blitBuffer(utf8ToBytes(string, buf.length - offset), buf, offset, length) return charsWritten } @@ -1171,22 +1171,100 @@ function toHex (n) { return n.toString(16) } -function utf8ToBytes (str) { - var byteArray = [] - for (var i = 0; i < str.length; i++) { - var b = str.charCodeAt(i) - if (b <= 0x7F) { - byteArray.push(b) - } else { - var start = i - if (b >= 0xD800 && b <= 0xDFFF) i++ - var h = encodeURIComponent(str.slice(start, i+1)).substr(1).split('%') - for (var j = 0; j < h.length; j++) { - byteArray.push(parseInt(h[j], 16)) +function utf8ToBytes(string, units) { + var codePoint, length = string.length + var leadSurrogate = null + var units = units || Infinity + var bytes = [] + var i = 0 + + for (; i 0xD7FF && codePoint < 0xE000) { + + // last char was a lead + if (leadSurrogate) { + + // 2 leads in a row + if (codePoint < 0xDC00) { + if ((units -= 3) > -1) bytes.push(0xEF, 0xBF, 0xBD) + leadSurrogate = codePoint + continue + } + + // valid surrogate pair + else { + codePoint = leadSurrogate - 0xD800 << 10 | codePoint - 0xDC00 | 0x10000 + leadSurrogate = null + } + } + + // no lead yet + else { + + // unexpected trail + if (codePoint > 0xDBFF) { + if ((units -= 3) > -1) bytes.push(0xEF, 0xBF, 0xBD) + continue + } + + // unpaired lead + else if (i + 1 === length) { + if ((units -= 3) > -1) bytes.push(0xEF, 0xBF, 0xBD) + continue + } + + // valid lead + else { + leadSurrogate = codePoint + continue + } } } + + // valid bmp char, but last char was a lead + else if (leadSurrogate) { + if ((units -= 3) > -1) bytes.push(0xEF, 0xBF, 0xBD) + leadSurrogate = null + } + + // encode utf8 + if (codePoint < 0x80) { + if ((units -= 1) < 0) break + bytes.push(codePoint) + } + else if (codePoint < 0x800) { + if ((units -= 2) < 0) break + bytes.push( + codePoint >> 0x6 | 0xC0, + codePoint & 0x3F | 0x80 + ); + } + else if (codePoint < 0x10000) { + if ((units -= 3) < 0) break + bytes.push( + codePoint >> 0xC | 0xE0, + codePoint >> 0x6 & 0x3F | 0x80, + codePoint & 0x3F | 0x80 + ); + } + else if (codePoint < 0x200000) { + if ((units -= 4) < 0) break + bytes.push( + codePoint >> 0x12 | 0xF0, + codePoint >> 0xC & 0x3F | 0x80, + codePoint >> 0x6 & 0x3F | 0x80, + codePoint & 0x3F | 0x80 + ); + } + else { + throw new Error('Invalid code point') + } } - return byteArray + + return bytes } function asciiToBytes (str) { -- 2.34.1