parquet-dotnet
parquet-dotnet copied to clipboard
Reading Parquet files with Parquet.Net is slower than reading them from Python
Version: All
Runtime Version: All
OS: Windows
Expected behavior
Reading Parquet files with this package should take the same or less time than reading them from Python.
Actual behavior
Reading from Python is considerably faster than reading with the Parquet.Net package.
Code snippet reproducing the behavior
From C#:
/// <summary>
/// Reads the selected columns from every row group of a Parquet file.
/// </summary>
/// <param name="path">Path of the Parquet file to open for reading.</param>
/// <param name="yColIndex">
/// Positions, within the array returned by <c>Schema.GetDataFields()</c>, of the
/// columns to read. Each selected column's data is cast to <c>double[]</c>.
/// </param>
/// <returns>
/// The total number of values read across all selected columns and row groups.
/// NOTE(review): the original snippet declared a <c>long</c> return type but had
/// no return statement; confirm whether a value count or something else
/// (e.g. an elapsed time) was intended.
/// </returns>
/// <exception cref="ArgumentNullException"><paramref name="yColIndex"/> is null.</exception>
/// <exception cref="InvalidCastException">A selected column's data is not a <c>double[]</c>.</exception>
public long ReadYColumnsV1(string path, int[] yColIndex)
{
    if (yColIndex == null)
    {
        throw new ArgumentNullException(nameof(yColIndex));
    }

    // Holds one array per (row group, column) pair, in read order.
    List<double[]> dataValues = new List<double[]>();
    long totalValues = 0;

    using (Stream fileStream = File.OpenRead(path))
    using (var parquetReader = new ParquetReader(fileStream))
    {
        DataField[] dataFields = parquetReader.Schema.GetDataFields();
        for (int currentRowGroup = 0; currentRowGroup < parquetReader.RowGroupCount; currentRowGroup++)
        {
            using (ParquetRowGroupReader groupReader = parquetReader.OpenRowGroupReader(currentRowGroup))
            {
                for (int i = 0; i < yColIndex.Length; i++)
                {
                    // Reuse the reader already opened for this row group. Opening a
                    // fresh reader per column repeats the row-group setup for every
                    // column and leaves those extra readers undisposed.
                    var dataColumn = groupReader.ReadColumn(dataFields[yColIndex[i]]);
                    double[] columnValues = (double[])dataColumn.Data;
                    dataValues.Add(columnValues);
                    totalValues += columnValues.Length;
                }
            }
        }
    }

    return totalValues;
}
From python:
def read_column_data_v1(file_path, file_name, columns):
    """Read the given columns from each row group of a Parquet file.

    The file location is built as ``<file_path>\\<file_name>.parquet``
    (backslash separator). Each row group is read in turn via
    ``pq.ParquetFile.read_row_group``; the result of each read is discarded
    and the function returns ``None``.

    Args:
        file_path: Directory containing the file.
        file_name: File name without the ``.parquet`` extension.
        columns: Column selection passed through to ``read_row_group``.
    """
    parquet_path = f"{file_path}\\{file_name}.parquet"
    parquet_file = pq.ParquetFile(parquet_path)
    row_group_count = parquet_file.metadata.num_row_groups
    for group_index in range(row_group_count):
        # The read result is intentionally not kept; only the read is exercised.
        parquet_file.read_row_group(group_index, columns)