parquet-go icon indicating copy to clipboard operation
parquet-go copied to clipboard

floor cannot read back empty list: "sub-group list or bag not found"

Open ignaskukenys opened this issue 2 years ago • 0 comments

Describe the bug: When using the floor struct writer/reader, an empty slice field (e.g. `[]string{}`) written as a list cannot be read back; reading fails with the error "sub-group list or bag not found".

Unit test to reproduce: The existing test in floor/reader_test.go can be modified as follows:

func TestReadWriteSlice(t *testing.T) {
	_ = os.Mkdir("files", 0755)

	sd, err := parquetschema.ParseSchemaDefinition(
		`message test_msg {
			required group foo (LIST) {
				repeated group list {
					required binary element (STRING);
				}
			}
		}`)
	require.NoError(t, err, "parsing schema definition failed")

	t.Logf("schema definition: %s", spew.Sdump(sd))

	hlWriter, err := NewFileWriter(
		"files/list.parquet",
		goparquet.WithCompressionCodec(parquet.CompressionCodec_SNAPPY),
		goparquet.WithCreator("floor-unittest"),
		goparquet.WithSchemaDefinition(sd),
	)
	require.NoError(t, err)

	type testMsg struct {
		Foo []string
	}

	testData := []testMsg{
		{Foo: []string{}},  // Note: empty slice
	}

	for _, tt := range testData {
		require.NoError(t, hlWriter.Write(tt))
	}
	require.NoError(t, hlWriter.Close())

	hlReader, err := NewFileReader("files/list.parquet")
	require.NoError(t, err)

	count := 0

	var result []testMsg

	for hlReader.Next() {
		var msg testMsg

		require.NoError(t, hlReader.Scan(&msg), "%d. Scan failed", count)
		t.Logf("%d. data = %#v", count, hlReader.data)

		result = append(result, msg)

		count++
	}

	require.NoError(t, hlReader.Err(), "hlReader returned an error")
	t.Logf("count = %d", count)

	for idx, elem := range result {
		require.Equal(t, testData[idx], elem, "%d. read result doesn't match expected data")
	}

	require.NoError(t, hlReader.Close())
}

parquet-go specific details

  • Version 0.11.0

ignaskukenys avatar Jun 23 '22 19:06 ignaskukenys