Add option "charset" to decode file names
Add an option "charset" to support double-byte encodings, which fixes garbled (mojibake) Chinese text in decoded file names.
// Default load options, merged with whatever the caller supplied.
// NOTE(review): presumably utils.extend only fills in keys the caller did
// not set — confirm. If it does, defaulting charset to 'gbk' here (instead
// of leaving it unset) silently switches EVERY caller to GBK file-name
// decoding, which is not backward compatible; consider defaulting to null
// so the charset path in handleUTF8 stays opt-in.
options = utils.extend(options || {}, {
base64: false,
checkCRC32: false,
optimizedBinaryString: false,
createFolders: false,
decodeFileName: utf8.utf8decode,
charset:'gbk' // NOTE(review): see comment above — 'gbk' as a default is risky
});
decode:function(u8,i){
i = i||0;
var charset = this.loadOptions.charset;
if(charset&&charset!='utf8'){
for(;i<u8.byteLength;i++){
if(u8[i]>127){
//not a ascii
var utf8 = false;
var k=0;
for(var j=1;j<u8[i].toString(2).split('0')[0].length;j++){
if(u8[i+j]>>6==2){
//10xxxxxx
k+=1;
}
}
if(k>0&&k==j-1&&u8[i+j]>>6!=2){
if(k==1&&charset=='gbk'){
//double byte
//some gbk will erro
//return this.decode(u8,j);
}else{
utf8 = true;
}
}
if(utf8===false)return new TextDecoder(charset).decode(u8);
break;
}
}
}
return new TextDecoder().decode(u8);
},
handleUTF8: function() {
var charset = this.loadOptions.charset,
utf8decode = utf8.utf8decode,
decode = this.loadOptions.decodeFileName||utf8decode;
if(charset&&'TextDecoder' in window&&'Uint8Array' in window){
this.fileNameStr = this.decode(this.fileName);
this.fileCommentStr = this.decode(this.fileComment);
}else if(this.useUTF8()){
this.fileNameStr = utf8decode(this.fileName);
this.fileCommentStr = utf8decode(this.fileComment);
}else{
var decodeParamType = support.uint8array ? "uint8array" : "array";
var upath = this.findExtraFieldUnicodePath();
if (upath !== null) {
this.fileNameStr = upath;
} else {
// ASCII text or unsupported code page
var fileNameByteArray = utils.transformTo(decodeParamType, this.fileName);
this.fileNameStr = decode(fileNameByteArray);
}
var ucomment = this.findExtraFieldUnicodeComment();
if (ucomment !== null) {
this.fileCommentStr = ucomment;
} else {
// ASCII text or unsupported code page
var commentByteArray = utils.transformTo(decodeParamType, this.fileComment);
this.fileCommentStr = decode(commentByteArray);
}
}
},
Thanks for the PR! Does this only work for gbk encoding, or also others?
And a few thoughts/comments:
- Could you merge `main` and fix the linting errors?
- Could you add comments to the new code to explain what is going on?
- I think it might be best if `utf8.js` was renamed to maybe `charset.js`, and `decode` was moved there.
- Could you add some tests for this new code?
Thanks for the PR! Does this only work for `gbk` encoding, or also others? And a few thoughts/comments:
- Could you merge `main` and fix the linting errors?
- Could you add comments to the new code to explain what is going on?
- I think it might be best if `utf8.js` was renamed to maybe `charset.js`, and `decode` was moved there.
- Could you add some tests for this new code?
-
By coincidence, I compressed a list of file names encoded with gbk and with utf8, and both were successfully decoded!
-
I didn't test other ANSI code pages such as Japanese and Korean. However, it is certainly at least more convenient than the original utf8-only decoding, for example with emoji.
-
There may be a problem with handling ANSI double-byte sequences, because UTF-8 also uses multi-byte sequences. See `k==1 && charset=='gbk'` in `decode`: it means that if gbk is specified, all two-byte sequences are decoded as gbk; otherwise the scan continues until it is certain the data is not UTF-8.
-
I think `async("text")` decodes faster this way, and UTF-8 files will not be garbled.