NetTopologySuite.IO.ShapeFile

Possible bug in class DbaseFileHeader's encoding handling?

Open RenZachary opened this issue 3 years ago • 10 comments

The attribute column headers and values of a shapefile I am reading contain Chinese characters. QGIS displays them correctly. When I read the column headers with either of the following two approaches, the headers come back garbled, but the values are read correctly:

var shp = new NetTopologySuite.IO.ShapeFile.Extended.ShapeDataReader(shpPath);
foreach (var f in shp.ReadByMBRFilter(shp.ShapefileBounds))
{
    // the attribute names (keys) of f.Attributes here are not correct
}
                

or


using (var rd = new ShapefileDataReader(readFile, factory, readEncoding))
{
    readHeader = rd.DbaseHeader;

    string[] fieldNames = new string[readHeader.NumFields];
    features = new List<Feature>(readHeader.NumRecords);

    for (int i = 0; i < fieldNames.Length; i++)
    {
        // rd.GetName(i + 1) here is not correct
        fieldNames[i] = rd.GetName(i + 1);
    }
    // ...
}

I even tried every available encoding:

// test all encodings
foreach (var encodingInfo in Encoding.GetEncodings())
{
    using (var rd = new ShapefileDataReader(readFile, factory, encodingInfo.GetEncoding()))
    {
        readHeader = rd.DbaseHeader;

        string[] fieldNames = new string[readHeader.NumFields];
        features = new List<Feature>(readHeader.NumRecords);

        for (int i = 0; i < fieldNames.Length; i++)
        {
            // rd.GetName(i + 1) here is not correct
            fieldNames[i] = rd.GetName(i + 1);
        }
        // ...
    }
}
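To rule out bad data in the file itself, the raw field-name bytes can be decoded straight from the .dbf with a chosen code page (a minimal diagnostic sketch, not library code; ReadRawFieldNames is a hypothetical helper that assumes the standard dBASE III layout the header reader below also uses: a 32-byte file header followed by 32-byte field descriptors whose first 11 bytes hold the name):

using System.IO;
using System.Text;

static string[] ReadRawFieldNames(string dbfPath, Encoding encoding)
{
    using (var reader = new BinaryReader(File.OpenRead(dbfPath)))
    {
        // the header length at bytes 8-9 tells us how many 32-byte field descriptors follow
        reader.BaseStream.Seek(8, SeekOrigin.Begin);
        short headerLength = reader.ReadInt16();
        int numFields = (headerLength - 32 - 1) / 32;

        var names = new string[numFields];
        for (int i = 0; i < numFields; i++)
        {
            // field descriptors start at byte 32; the first 11 bytes are the zero-padded name
            reader.BaseStream.Seek(32 + i * 32, SeekOrigin.Begin);
            byte[] raw = reader.ReadBytes(11);
            names[i] = encoding.GetString(raw).TrimEnd('\0');
        }
        return names;
    }
}

// e.g. var names = ReadRawFieldNames(dbfPath, Encoding.GetEncoding(936));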

None of these produced correct field names, which suggests the header is always decoded with an encoding detected from the DBF language driver byte rather than the one passed in. So I rewrote the class DbaseFileHeader, changing the method public void ReadHeader(BinaryReader reader, string filename) to:

public void ReadHeader(BinaryReader reader, string filename)
{
    // type of reader.
    _fileType = reader.ReadByte();
    if (_fileType != 0x03)
        throw new NotSupportedException("Unsupported DBF reader Type " + _fileType);

    // parse the update date information.
    int year = reader.ReadByte();
    int month = reader.ReadByte();
    int day = reader.ReadByte();
    _updateDate = new DateTime(year + 1900, month, day);

    // read the number of records.
    _numRecords = reader.ReadInt32();

    // read the length of the header structure.
    _headerLength = reader.ReadInt16();

    // read the length of a record
    _recordLength = reader.ReadInt16();

    // skip the reserved bytes in the header.
    //in.skipBytes(20);
    byte[] data = reader.ReadBytes(20);
    byte lcid = data[29 - 12]; // byte 29 of the file is the language driver ID (LDID); the 20-byte block above starts at file offset 12
    
    // CHANGED: originally the encoding was detected from the language driver mark:
    //_encoding = DetectEncodingFromMark(lcid, filename);
    // use the encoding supplied to this instance instead:
    _encoding = this.Encoding;

    //Replace reader with one with correct encoding..
    reader = new BinaryReader(reader.BaseStream, _encoding);
    // calculate the number of Fields in the header
    _numFields = (_headerLength - FileDescriptorSize - 1) / FileDescriptorSize;

    // read all of the header records
    _fieldDescriptions = new DbaseFieldDescriptor[_numFields];
    for (int i = 0; i < _numFields; i++)
    {
        _fieldDescriptions[i] = new DbaseFieldDescriptor();

        // read the field name				
        byte[] buffer = reader.ReadBytes(11);
        // NOTE: only this _encoding.GetString method is available in Silverlight
        String name = _encoding.GetString(buffer, 0, buffer.Length);
        int nullPoint = name.IndexOf((char)0);
        if (nullPoint != -1)
            name = name.Substring(0, nullPoint);
        _fieldDescriptions[i].Name = name;

        // read the field type
        _fieldDescriptions[i].DbaseType = (char)reader.ReadByte();

        // read the field data address, offset from the start of the record.
        _fieldDescriptions[i].DataAddress = reader.ReadInt32();

        // read the field length in bytes
        int tempLength = reader.ReadByte();
        if (tempLength < 0) tempLength = tempLength + 256;
        _fieldDescriptions[i].Length = tempLength;

        // read the field decimal count in bytes
        _fieldDescriptions[i].DecimalCount = reader.ReadByte();

        // read the reserved bytes.
        //reader.skipBytes(14);
        reader.ReadBytes(14);
    }

    // Last byte is a marker for the end of the field definitions.
    // Trond Benum: This fails for some presumably valid test shapefiles, so I have commented it out.
    byte lastByte = reader.ReadBytes(1)[0];
    // if (lastByte != 0x0d)
    //   throw new ShapefileException("DBase Header is not terminated");

    // Assure we are at the end of the header!
    if (reader.BaseStream.Position != _headerLength)
        reader.BaseStream.Seek(_headerLength, SeekOrigin.Begin);
}

In fact, all I changed was the line _encoding = DetectEncodingFromMark(lcid, filename); to _encoding = this.Encoding;. With that change, I can read the header correctly by supplying an encoding with code page 936 (GBK):

var dbf = shpPath.Substring(0, shpPath.LastIndexOf(".shp")) + ".dbf";
using (var stream = new FileStream(dbf, FileMode.Open, FileAccess.Read, FileShare.Read))
using (var fileReader = new BinaryReader(stream, Encoding.GetEncoding(936)))
{
    var header = new DbaseFileHeaderEx(Encoding.GetEncoding(936));
    // read the header with the modified ReadHeader
    header.ReadHeader(fileReader, dbf);
}
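
Note that on .NET Core / .NET 5+, Encoding.GetEncoding(936) throws unless the code-pages encoding provider has been registered first (a minimal sketch, assuming the System.Text.Encoding.CodePages package is referenced):

using System.Text;

// register once at startup so that Encoding.GetEncoding(936) (GBK) resolves
Encoding.RegisterProvider(CodePagesEncodingProvider.Instance);
var gbk = Encoding.GetEncoding(936);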

RenZachary · Dec 14 '20 02:12