parquet-carpet
parquet-carpet copied to clipboard
CompressionCodec does not seem to be working properly
/**
 * Builds a list of {@code n} freshly created {@link MyRecord} instances.
 *
 * <p>Each record is populated, in order, from {@code random.nextInt()},
 * {@code random.nextLong()}, {@code random.nextBoolean()} and two calls to
 * {@code generateRandomString(4)}.
 *
 * @param n number of records to create; also used as the list's initial capacity
 * @return the list holding the generated records, in creation order
 */
public List<MyRecord> generateRecords(int n) {
    List<MyRecord> generated = new ArrayList<>(n);
    for (int remaining = n; remaining > 0; remaining--) {
        // Arguments are evaluated left to right, so the sequence of calls on
        // `random` and `generateRandomString` is the same for every record.
        generated.add(new MyRecord(
                random.nextInt(),
                random.nextLong(),
                random.nextBoolean(),
                generateRandomString(4),
                generateRandomString(4)));
    }
    return generated;
}
/**
 * Generates 10,000,000 records and writes them to {@code my_file.parquet}
 * through a {@code CarpetWriter} configured with overwrite mode and the
 * {@code UNCOMPRESSED} codec.
 *
 * @param args command-line arguments (unused)
 * @throws Exception if record generation or writing the file fails
 */
public static void main(String[] args) throws Exception {
    MyRecordGenerator generator = new MyRecordGenerator(5);
    List<MyRecord> records = generator.generateRecords(10000000);

    // Resources are closed in reverse declaration order: the writer first,
    // then the underlying file stream.
    try (OutputStream fileStream = new FileOutputStream("my_file.parquet");
            CarpetWriter parquetWriter = new CarpetWriter.Builder(fileStream, MyRecord.class)
                    .withWriteMode(ParquetFileWriter.Mode.OVERWRITE)
                    .withCompressionCodec(CompressionCodecName.UNCOMPRESSED)
                    .build()) {
        parquetWriter.write(records);
    }
}
arthur@arthur:~/parquet-carpet-attempt$ ls -l my_file.parquet
-rw-rw-r-- 1 arthur arthur 156779418 Sep 18 17:31 my_file.parquet
If I run the same code but with CompressionCodecName.GZIP instead of CompressionCodecName.UNCOMPRESSED, I get a very similar file size:
arthur@arthur:~/parquet-carpet-attempt$ ls -l my_file.parquet
-rw-rw-r-- 1 arthur arthur 156633097 Sep 18 17:32 my_file.parquet
Inspecting the file shows space saved ~0%:
arthur@arthur:~/parquet-carpet-attempt$ pipx run parquet-tools inspect my_file.parquet
############ file meta data ############
created_by: parquet-mr version 1.14.2 (build e7937382e7894f4780c90eb6f896c163cad4cd93)
num_columns: 5
num_rows: 10000000
num_row_groups: 2
format_version: 1.0
serialized_size: 1500
############ Columns ############
int32
int64
some_boolean
byte_array
flba
############ Column(int32) ############
name: int32
path: int32
max_definition_level: 0
max_repetition_level: 0
physical_type: INT32
logical_type: None
converted_type (legacy): NONE
compression: GZIP (space_saved: -0%)
############ Column(int64) ############
name: int64
path: int64
max_definition_level: 0
max_repetition_level: 0
physical_type: INT64
logical_type: None
converted_type (legacy): NONE
compression: GZIP (space_saved: -0%)
############ Column(some_boolean) ############
name: some_boolean
path: some_boolean
max_definition_level: 0
max_repetition_level: 0
physical_type: BOOLEAN
logical_type: None
converted_type (legacy): NONE
compression: GZIP (space_saved: -1%)
############ Column(byte_array) ############
name: byte_array
path: byte_array
max_definition_level: 1
max_repetition_level: 0
physical_type: BYTE_ARRAY
logical_type: String
converted_type (legacy): UTF8
compression: GZIP (space_saved: 0%)
############ Column(flba) ############
name: flba
path: flba
max_definition_level: 1
max_repetition_level: 0
physical_type: BYTE_ARRAY
logical_type: String
converted_type (legacy): UTF8
compression: GZIP (space_saved: 0%)