FileReader.readAsBinaryString()返回UTF-8编码的二进制字符串。如何获取一系列Unicode转义序列(\uxxxx)形式的数据?
发布于 2015-03-25 16:31:29
FileReader.readAsBinaryString()已弃用-请改用readAsArrayBuffer()。这允许您使用以下两种方法之一将输入字符串转换为转义的unicode字符:
方法1
此方法使用ArrayBuffer,并在其上创建Uint8Array视图。在下面的演示中,假定缓冲区已预先加载(这里提供了一些虚拟数据)。
// Render one byte as exactly two lowercase hex digits; values below 0x10
// get a leading zero.
function toString(v) {
  var hex = v.toString(16);
  if (hex.length === 1) {
    return "0" + hex;
  }
  return hex;
}

// Demo input: two 16-bit units stored high byte first (big-endian format).
var buffer = new Uint8Array([0x20, 0xac, 0x2b, 0x08]);
var pos = 0;
var txt = "";

// Walk the buffer two bytes per step; each pair becomes one \uXXXX escape
// with the first byte supplying the high two hex digits.
for (; pos < buffer.length; pos += 2) {
  txt += "\\u" + toString(buffer[pos]) + toString(buffer[pos + 1]);
}
// Show the escaped text in the page by assigning it to `out`.
// `out` presumably resolves to the <output id="out"> element through the
// browser's id-to-global lookup — confirm against the host page.
// NOTE(review): the demo's HTML markup is fused onto the end of this statement's line.
out.innerHTML = txt;<output id="out"></output>
如果数据是小端序的(大多数主流系统都是如此),您可以在这里使用Uint16Array,这样每个字符只需将一个值转换为字符串,而不是两个。或者使用DataView,这样就可以按指定的字节序读取。这样做可能更快,也可能不会(浏览器会进行字节交换,但我们一次操作就读取16位,位数检查也合并到了toString方法中):
// Demo input: two 16-bit units stored high byte first (big-endian format).
var data = new Uint8Array([0x20, 0xac, 0x2b, 0x08]),
    view = new DataView(data.buffer), // use a view on the ArrayBuffer
    pos = 0, txt = "";

// Read the buffer one 16-bit unit at a time and build the escaped string.
while (pos < view.byteLength) {
  // Second argument of getUint16: false = big-endian, true = little-endian.
  txt += "\\u" + toString(view.getUint16(pos, false));
  pos += 2;
}

// Format a 16-bit value as exactly four lowercase hex digits.
// v: number in the range 0..0xFFFF (what getUint16 returns).
// Returns the hex string left-padded with zeros to four characters.
function toString(v) {
  var s = v.toString(16);
  // Pad for every possible length (1, 2 or 3 digits); handling only the
  // 2- and 3-digit cases would turn a value such as 0x0005 into the
  // malformed escape "\u5".
  return "0000".slice(s.length) + s;
}
// Show the escaped text in the page by assigning it to `out`.
// `out` presumably resolves to the <output id="out"> element through the
// browser's id-to-global lookup — confirm against the host page.
// NOTE(review): the demo's HTML markup is fused onto the end of this statement's line.
out.innerHTML = txt;<output id="out"></output>
方法2
它使用新的TextDecoder应用程序接口来解析输入缓冲区-这里也假设它是一个ArrayBuffer。
然后将escape与replace一起使用。这是一种快速转换的方法,但escape()同样已被弃用。不过,它短期内不会被移除,所以如果你愿意冒险,它可能是一个选择——无论如何,我还是把它列在这里:
// Demo bytes in big-endian format; in real use the data is assumed to have
// been loaded into an ArrayBuffer.
var buffer = new Uint8Array([0x20, 0xac, 0x2b, 0x08]);

// "utf-16be" selects big-endian UTF-16 (plain "utf-16" defaults to little-endian).
var td = new TextDecoder("utf-16be");

// Decode the raw bytes into an ordinary JavaScript string.
var txt = td.decode(buffer);
// escape() is deprecated but is not expected to disappear for a while.
// Replacing every "%" in its result with "\" turns "%uXXXX" into "\uXXXX".
// NOTE(review): per the escape() specification, code units below 0x100 come
// out as "%XX" (and ASCII letters/digits are left unescaped), so this shortcut
// only yields \uXXXX for characters above U+00FF — confirm that fits the input.
out.innerHTML = escape(txt).replace(/%/g, "\\");
// or use the same last step as in method 1, just showing an alternative way
//=> "\u20AC\u2B08"<output id="out"></output>
注释:你可能已经注意到了,我在示例中把字节顺序指定为大端序(big-endian)。通常,从网络读取文件或二进制数据时使用big-endian(也称为网络字节序)。如果数据恰好是little-endian格式,则需要交换字节顺序:
对于方法1,您可以执行以下操作:
// Little-endian input stores the low byte first, so for every two-byte unit
// emit buffer[pos + 1] (high byte) before buffer[pos] (low byte).
for (; pos < buffer.length; pos += 2) {
  txt += "\\u" + toString(buffer[pos + 1]) + toString(buffer[pos]);
}
// Or simply use a Uint16Array together with the modified toString method mentioned above.
对于方法2,您可以简单地为utf-16指定小端版本:
// "utf-16" without a suffix decodes as little-endian (the default byte order).
// Note: at the time of this answer (2015) TextDecoder was not yet a stable API.
var td = new TextDecoder("utf-16");
发布于 2015-03-24 02:25:52
我想我可能已经为你找到了一个解决方案。
在this site上,我发现了一个将文本转换为Unicode表示法的开源项目。
我编辑了与您的项目相关的函数,并创建了一个处理打开文件的小函数。
/*
Copyright (C) 2007 Richard Ishida ishida@w3.org
This program is free software; you can redistribute it and/or modify it under the terms
of the GNU General Public License as published by the Free Software Foundation; either
version 2 of the License, or (at your option) any later version as long as you point to
http://rishida.net/ in your code.
This program is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY;
without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
See the GNU General Public License for more details. http://www.gnu.org/licenses/gpl.html
*/
// Convert a number to its uppercase hexadecimal representation, without
// prefix or zero padding.
// textString: despite the name, the callers in this file pass a number.
function dec2hex(textString) {
  var value = textString + 0;
  var hex = value.toString(16);
  return hex.toUpperCase();
}
// Converts a string of characters to backslash-u escape notation, one token
// per character, with the tokens separated by single spaces.
//
// textString: string, the string to convert
// preserve:   string enum [ascii, latin1]; 'ascii' keeps code units <= 127 as
//             literal characters, 'latin1' keeps code units <= 255; any other
//             value converts every character
// pad:        boolean, if true, hex numbers shorter than four digits are
//             left-padded with zeros
//
// Returns the converted string. A valid surrogate pair is combined into one
// code point and written in the braces form \u{X...}, so that every converted
// token carries a \u prefix. Malformed surrogate sequences are reported inline
// as 'Error in convertChar2CP: ...!' text rather than thrown.
function convertCharStr2Unicode(textString, preserve, pad) {
  var haut = 0; // pending high surrogate, 0 when there is none
  var CPstring = '';
  for (var i = 0; i < textString.length; i++) {
    var b = textString.charCodeAt(i);
    if (haut !== 0) {
      if (0xDC00 <= b && b <= 0xDFFF) {
        // High + low surrogate: rebuild the supplementary code point.
        var codePoint = 0x10000 + ((haut - 0xD800) << 10) + (b - 0xDC00);
        CPstring += '\\u{' + dec2hex(codePoint) + '} ';
        haut = 0;
        continue;
      }
      // The pending high surrogate was not followed by a low surrogate;
      // report it and fall through to process b normally.
      CPstring += 'Error in convertChar2CP: surrogate out of range ' + dec2hex(haut) + '!';
      haut = 0;
    }
    if (0xD800 <= b && b <= 0xDBFF) {
      // Remember the high surrogate until the next code unit is seen.
      haut = b;
    } else if (b <= 127 && preserve == 'ascii') {
      CPstring += textString.charAt(i) + ' ';
    } else if (b <= 255 && preserve == 'latin1') {
      CPstring += textString.charAt(i) + ' ';
    } else {
      // Declared locally: an undeclared assignment would create a global and
      // throws a ReferenceError in strict mode / ES modules.
      var cp = dec2hex(b);
      if (pad) {
        while (cp.length < 4) {
          cp = '0' + cp;
        }
      }
      CPstring += '\\u' + cp + ' ';
    }
  }
  if (haut !== 0) {
    // The input ended on a high surrogate; report it instead of dropping it.
    CPstring += 'Error in convertChar2CP: surrogate out of range ' + dec2hex(haut) + '!';
  }
  // Remove only the trailing separator, never the last character of a message.
  if (CPstring.charAt(CPstring.length - 1) === ' ') {
    CPstring = CPstring.substring(0, CPstring.length - 1);
  }
  return CPstring;
}
// Demo (with file handling and text input):
发布于 2015-03-24 05:30:44
此函数保留可打印的ASCII字符,并将其余所有字符转换为\uHHHH转义序列。
// Escape a string so that only printable ASCII stays literal.
// str: string to escape.
// Returns a string in which code units 32..126 are copied unchanged and every
// other UTF-16 code unit is written as \uhhhh (four lowercase hex digits).
// A literal backslash is printable ASCII and is therefore copied unchanged.
function ascii(str) {
  var s = "";
  for (var i = 0, len = str.length; i < len; i++) {
    var n = str.charCodeAt(i);
    if (n >= 32 && n <= 126) {
      // printable ASCII
      s += str.charAt(i);
    } else {
      // unicode escape everything else; charCodeAt yields at most four hex
      // digits, so left-pad with zeros to exactly four
      var hex = n.toString(16);
      s += "\\u" + "0000".slice(hex.length) + hex;
    }
  }
  return s;
}
// You may want to keep text formatting characters as they are, or encode them as \n, \r, \t. This can be helpful if the text needs to be edited.
// Sample input: five symbols outside the ASCII range, each followed by a line feed.
var x = "♫\n☆\n⚛\n☯\n⚓\n";
// Print the raw, unescaped text.
console.log(x);
// Escape a string so that only printable ASCII stays literal, with optional
// special handling of tab, line feed and carriage return.
// str:        string to escape
// formatting: when exactly true, tab / line feed / carriage return are not
//             turned into \uhhhh escapes
// convert:    when exactly true (and formatting is true), those three
//             characters are written as the two-character sequences
//             \t, \n and \r; otherwise they are copied unchanged
// Returns a string in which code units 32..126 are copied unchanged and every
// remaining UTF-16 code unit is written as \uhhhh (four lowercase hex digits).
function ascii(str, formatting, convert) {
  var s = "";
  var TAB = 9, LF = 10, CR = 13;
  for (var i = 0, len = str.length; i < len; i++) {
    var n = str.charCodeAt(i);
    if (n >= 32 && n <= 126) {
      // printable ASCII
      s += str.charAt(i);
    } else if (formatting === true && (n === TAB || n === LF || n === CR)) {
      if (convert === true) {
        s += n === TAB ? "\\t" : n === LF ? "\\n" : "\\r";
      } else {
        s += str.charAt(i);
      }
    } else {
      // unicode escape everything else; charCodeAt yields at most four hex
      // digits, so left-pad with zeros to exactly four
      var hex = n.toString(16);
      s += "\\u" + "0000".slice(hex.length) + hex;
    }
  }
  return s;
}
// Escape everything outside printable ASCII, line feeds included.
console.log(ascii(x));
// Leave tab / line feed / carriage return as literal characters.
console.log(ascii(x, true));
// Write tab / line feed / carriage return as \t, \n, \r.
console.log(ascii(x, true, true));输出:
♫
☆
⚛
☯
⚓
\u266b\u000a\u2606\u000a\u269b\u000a\u262f\u000a\u2693\u000a
\u266b
\u2606
\u269b
\u262f
\u2693
\u266b\n\u2606\n\u269b\n\u262f\n\u2693\n
https://stackoverflow.com/questions/29007389
复制相似问题