UTF-unknown
UTF-unknown copied to clipboard
UTF16 LE (without BOM) is not detected as UTF16 (not supported yet)
For the provided file, the encoding is not detected as UTF-16 LE even though the file is encoded as UTF-16 LE (without a BOM).
detector.Encoding = ASCIIEncoding.ASCIIEncodingSealed
BodyName = {string} "us-ascii"
CodePage = {int} 20127
DecoderFallback = DecoderReplacementFallback
EncoderFallback = EncoderReplacementFallback
EncodingName = {string} "US-ASCII"
HeaderName = {string} "us-ascii"
IsBrowserDisplay = {bool} false
IsBrowserSave = {bool} false
IsMailNewsDisplay = {bool} true
IsMailNewsSave = {bool} true
IsReadOnly = {bool} true
IsSingleByte = {bool} true
IsUTF8CodePage = {bool} false
Preamble = {ReadOnlySpan<byte>} System.ReadOnlySpan<Byte>[0]
WebName = {string} "us-ascii"
WindowsCodePage = {int} 1252
_codePage = {int} 20127
_dataItem = CodePageDataItem
_isReadOnly = {bool} true
decoderFallback = DecoderReplacementFallback
encoderFallback = EncoderReplacementFallback
As a workaround, I am now using this code:
// Workaround: pick an Encoding for the file at filePath, using a zero-byte heuristic for
// BOM-less UTF-16 LE first and falling back to CharsetDetector otherwise.
byte[] byteArray = File.ReadAllBytes(filePath);

// Count the zero bytes found at odd indices (1, 3, 5, ...). In UTF-16 LE the high byte of
// each code unit sits at an odd index and is 0 for ASCII/Latin-1 characters.
int zeroBytesCount = 0;
for (int i = 1; i < byteArray.Length; i += 2)
{
    if (byteArray[i] == 0)
    {
        zeroBytesCount++;
    }
}

// Default that is kept whenever nothing better can be determined.
Encoding encoding = Encoding.UTF8;

// An empty file carries no evidence at all: without this guard the threshold test below
// would evaluate 0 >= 0 and wrongly classify the empty file as UTF-16.
if (byteArray.Length > 0)
{
    // If the odd-index zero bytes make up at least 40% of all bytes (roughly 80% of the
    // odd-index bytes), the content is most likely UTF-16 LE.
    if (zeroBytesCount >= byteArray.Length * 0.4)
    {
        encoding = Encoding.Unicode;
    }
    else
    {
        DetectionDetail detector = CharsetDetector.DetectFromBytes(byteArray).Detected;

        // NOTE(review): Detected (and its Encoding) is presumably null when the detector
        // finds no usable candidate - confirm against the UTF-unknown documentation.
        // Keep the UTF-8 default in that case instead of throwing or storing null.
        if (detector != null && detector.Encoding != null)
        {
            encoding = detector.Encoding;
        }
    }
}