UTF16 LE (without BOM) is not detected as UTF16 (not supported yet)

Open Mosch0512 opened this issue 2 years ago • 2 comments

For the provided file the encoding is not detected as UTF16 LE even though it is.

detector.Encoding = ASCIIEncoding.ASCIIEncodingSealed
 BodyName = {string} "us-ascii"
 CodePage = {int} 20127
 DecoderFallback = DecoderReplacementFallback
 EncoderFallback = EncoderReplacementFallback
 EncodingName = {string} "US-ASCII"
 HeaderName = {string} "us-ascii"
 IsBrowserDisplay = {bool} false
 IsBrowserSave = {bool} false
 IsMailNewsDisplay = {bool} true
 IsMailNewsSave = {bool} true
 IsReadOnly = {bool} true
 IsSingleByte = {bool} true
 IsUTF8CodePage = {bool} false
 Preamble = {ReadOnlySpan<byte>} System.ReadOnlySpan<Byte>[0]
 WebName = {string} "us-ascii"
 WindowsCodePage = {int} 1252
 _codePage = {int} 20127
 _dataItem = CodePageDataItem
 _isReadOnly = {bool} true
 decoderFallback = DecoderReplacementFallback
 encoderFallback = EncoderReplacementFallback

As a workaround, I am now using this code:

// Read the whole file once so the same bytes feed both the heuristic and the detector.
byte[] byteArray = File.ReadAllBytes(filePath);

// Count the zero bytes at odd offsets (1, 3, 5, ...). In UTF-16 LE those are the
// high bytes of each code unit, which are 0x00 for ASCII/Latin-1 range characters.
int zeroBytesCount = 0;
for (int i = 1; i < byteArray.Length; i += 2)
{
	if (byteArray[i] == 0)
	{
		zeroBytesCount++;
	}
}

// Default that is kept whenever nothing better can be determined.
Encoding encoding = Encoding.UTF8;

// An empty file carries no evidence either way; without this guard the threshold
// check below would be "0 >= 0" and wrongly classify it as UTF-16.
if (byteArray.Length > 0)
{
	// If zero high bytes make up at least 40% of all bytes, the data is most likely UTF-16 LE.
	if (zeroBytesCount >= byteArray.Length * 0.4)
	{
		encoding = Encoding.Unicode;
	}
	else
	{
		// Detected (and its Encoding) may be null when the detector finds no usable match;
		// keep the UTF-8 default in that case instead of throwing or ending up with null.
		DetectionDetail detected = CharsetDetector.DetectFromBytes(byteArray).Detected;
		encoding = detected?.Encoding ?? encoding;
	}
}

Nov 23 '23 08:11 Mosch0512