Problems with reader and deep schemas
Here is an example of a schema that is three levels deep. Shredding and materializing a single record works fine; however, writing a parquet file and reading it back results in an error:
const parquet = require('parquetjs');

var schema = new parquet.ParquetSchema({
  a: {
    fields: {
      b: {
        fields: {
          c: {
            fields: {
              d: {type: 'UTF8'}
            }
          }
        }
      }
    }
  }
});

let rec = {a: {b: {c: {d: 'this is a test'}}}};
async function main() {
  // shred & materialize:
  console.log('shred & materialize:');
  let buf = {};
  parquet.ParquetShredder.shredRecord(schema, rec, buf);
  console.log(parquet.ParquetShredder.materializeRecords(schema, buf));

  // writer and reader
  console.log('writer & reader:');
  const writer = await parquet.ParquetWriter.openFile(schema, 'test.parquet');
  await writer.appendRow(rec);
  await writer.close();

  let reader = await parquet.ParquetReader.openFile('test.parquet');
  let cursor = reader.getCursor();
  let record = null;
  while (record = await cursor.next()) {
    console.log(record);
  }
  await reader.close();
}

main().then(console.log, console.log);
Output is:
shred & materialize:
[ { a: { b: [Object] } } ]
writer & reader:
TypeError: Cannot read property 'rLevelMax' of undefined
at ParquetEnvelopeReader.readColumnChunk (/home/zjonsson/git/parquetjs/lib/reader.js:344:24)
at <anonymous>
The problem seems to be with how the reader reconstructs the schema from the parquet file.
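One way to confirm that the file itself is written out correctly would be to dump the raw SchemaElement list from the footer. This is an untested sketch; it assumes reader.metadata (which the ParquetReader constructor appears to store) exposes the decoded footer:

let reader = await parquet.ParquetReader.openFile('test.parquet');
// The footer stores the schema as a flat list; if a, b and c all report
// num_children: 1 here, the writer is fine and the flattening happens in
// the reader's schema reconstruction.
console.log(reader.metadata.schema.map(el => ({
  name: el.name,
  numChildren: el.num_children,
  type: el.type
})));
await reader.close();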
If I log the original fields from the schema (i.e. console.log(schema.fields)), I get:
{
  "a": {
    "name": "a",
    "path": ["a"],
    "repetitionType": "REQUIRED",
    "rLevelMax": 0,
    "dLevelMax": 0,
    "isNested": true,
    "fieldCount": 1,
    "fields": {
      "b": {
        "name": "b",
        "path": ["a", "b"],
        "repetitionType": "REQUIRED",
        "rLevelMax": 0,
        "dLevelMax": 0,
        "isNested": true,
        "fieldCount": 1,
        "fields": {
          "c": {
            "name": "c",
            "path": ["a", "b", "c"],
            "repetitionType": "REQUIRED",
            "rLevelMax": 0,
            "dLevelMax": 0,
            "isNested": true,
            "fieldCount": 1,
            "fields": {
              "d": {
                "name": "d",
                "primitiveType": "BYTE_ARRAY",
                "originalType": "UTF8",
                "path": ["a", "b", "c", "d"],
                "repetitionType": "REQUIRED",
                "encoding": "PLAIN",
                "compression": "UNCOMPRESSED",
                "rLevelMax": 0,
                "dLevelMax": 0
                ...
However, if I look at the schema created by the reader (i.e. console.log(reader.schema.fields)), I get:
{
  "a": {
    "name": "a",
    "path": ["a"],
    "repetitionType": "REQUIRED",
    "rLevelMax": 0,
    "dLevelMax": 0,
    "isNested": true,
    "fieldCount": 1,
    "fields": {
      "b": {
        "name": "b",
        "path": ["a", "b"],
        "repetitionType": "REQUIRED",
        "rLevelMax": 0,
        "dLevelMax": 0,
        "isNested": true,
        "fieldCount": 0,
        "fields": {}
      }
    }
  },
  "c": {
    "name": "c",
    "path": ["c"],
    "repetitionType": "REQUIRED",
    "rLevelMax": 0,
    "dLevelMax": 0,
    "isNested": true,
    "fieldCount": 1,
    "fields": {
      "d": {
        "name": "d",
        "primitiveType": "BYTE_ARRAY",
        "originalType": "UTF8",
        "path": ["c", "d"],
        "repetitionType": "REQUIRED",
        "encoding": "PLAIN",
        "compression": "UNCOMPRESSED",
        "rLevelMax": 0,
        "dLevelMax": 0
      }
    }
  }
}
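Note that b comes back with fieldCount: 0 while c and d are promoted to the root. That is the pattern you would get if the decoder walks the flat SchemaElement list from the footer without consuming each group's children recursively. As an illustration only (not the actual parquetjs code), a correct reconstruction of the nested tree from the flat pre-order list looks roughly like this:

function decodeSchema(elements) {
  // elements is the flat, pre-order SchemaElement list from the footer;
  // each group element carries num_children. Illustrative sketch only.
  let pos = 1; // skip the root element
  function readFields(count) {
    const fields = {};
    for (let i = 0; i < count; i++) {
      const el = elements[pos++];
      fields[el.name] = el.num_children > 0
        ? {fields: readFields(el.num_children)} // group: recurse into its children
        : {type: el.type};                      // leaf column (type mapping omitted)
    }
    return fields;
  }
  return readFields(elements[0].num_children);
}

Any bookkeeping error in how the child count is consumed would re-attach the deeper fields at the wrong level, which is exactly what the dump above shows.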
Same here for me.
As a note, this error only happens when you read the whole row with cursor.next(). If you pass in the columns you want, the error doesn't happen.
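For reference, a sketch of that workaround, assuming getCursor accepts a list of column paths (as shown in the parquetjs README) and that the nested leaf is addressed as ['a', 'b', 'c', 'd']:

let reader = await parquet.ParquetReader.openFile('test.parquet');
// Select the leaf column explicitly instead of materializing the whole row.
let cursor = reader.getCursor([['a', 'b', 'c', 'd']]);
let record = null;
while (record = await cursor.next()) {
  console.log(record);
}
await reader.close();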