arrow2
arrow2 copied to clipboard
arrow2 cannot read IPC files compressed by the official arrow crate
I am not quite sure why, but compressed IPC files created by arrow
are not readable by arrow2.
Please see reproduction:
[dependencies]
arrow2 = { version = "0.18.0", features = ["io_ipc", "io_ipc_compression"]}
arrow-schema = "50.0.0"
arrow-array = "50.0.0"
arrow-ipc = { version = "50.0.0", features = ["lz4", "zstd"]}
use std::{
sync::Arc,
fs::File
};
/// Tries to read the first record batch of the IPC file at `filepath` with the
/// `arrow_ipc` reader and prints whether that read succeeded.
///
/// Only the first item yielded by the reader is inspected; if the reader yields
/// nothing, nothing is printed.
///
/// # Panics
///
/// Panics if the file cannot be opened or if the reader cannot be constructed.
fn check_using_arrow(filepath: &str) {
    let source = File::open(filepath).unwrap();
    let mut batches = arrow_ipc::reader::FileReader::try_new(&source, None).unwrap();

    match batches.next() {
        Some(Ok(_)) => println!("✅ {:?} Recognized by Arrow", filepath),
        Some(Err(e)) => println!("❌ {:?} Not recognized by Arrow: {:?}", filepath, e),
        // No batch at all: stay silent.
        None => {}
    }
}
/// Tries to read the first chunk of the IPC file at `filepath` with the
/// `arrow2` reader and prints whether that read succeeded.
///
/// Only the first item yielded by the reader is inspected; if the reader yields
/// nothing, nothing is printed.
///
/// # Panics
///
/// Panics if the file cannot be opened or if its metadata cannot be read.
fn check_using_arrow2(filepath: &str) {
    // Open the file once and use the same handle for both the metadata and the
    // data, instead of opening it a second time just to read the metadata.
    let mut file = File::open(filepath).unwrap();
    let metadata = arrow2::io::ipc::read::read_file_metadata(&mut file).unwrap();
    let mut reader = arrow2::io::ipc::read::FileReader::new(file, metadata, None, None);

    match reader.next() {
        Some(Ok(_)) => println!("✅ {:?} Recognized by Arrow2", filepath),
        Some(Err(e)) => println!("❌ {:?} Not recognized by Arrow2: {:?}", filepath, e),
        // No chunk at all: stay silent.
        None => {}
    }
}
/// Writes a one-column, one-row IPC file to the path `file` using the
/// `arrow_ipc` writer, optionally compressed with `compression`.
///
/// The single column is named `"col"` and holds the `Float64` value `1.0`.
///
/// # Panics
///
/// Panics if the record batch or write options cannot be built, if the file
/// cannot be created, or if writing or finishing the file fails.
fn create_new_file_using_arrow(file: &str, compression: Option<arrow_ipc::CompressionType>) {
    let values: arrow_array::ArrayRef = Arc::new(arrow_array::Float64Array::from(vec![1.0]));
    let batch = arrow_array::RecordBatch::try_from_iter(vec![("col", values)]).unwrap();

    let mut output = File::create(file).unwrap();

    // NOTE(review): `8` and `false` are presumably the alignment and the
    // legacy-format flag of `IpcWriteOptions::try_new`; confirm against the
    // arrow-ipc documentation.
    let base_options =
        arrow_ipc::writer::IpcWriteOptions::try_new(8, false, arrow_ipc::MetadataVersion::V5)
            .unwrap();
    let write_options = base_options.try_with_compression(compression).unwrap();

    let mut ipc_writer = arrow_ipc::writer::FileWriter::try_new_with_options(
        &mut output,
        &batch.schema(),
        write_options,
    )
    .unwrap();
    ipc_writer.write(&batch).unwrap();
    ipc_writer.finish().unwrap();
}
/// Writes a one-column, one-row IPC file to the path `file` using the `arrow2`
/// writer, optionally compressed with `compression`.
///
/// The single column is a non-nullable `Float64` field named `"col"` holding
/// the value `1.0`.
///
/// # Panics
///
/// Panics if the file cannot be created, if the writer cannot be constructed,
/// or if writing or finishing the file fails.
fn create_new_file_using_arrow2(file: &str, compression: Option<arrow2::io::ipc::write::Compression>) {
    use arrow2::datatypes::{DataType, Field, Schema};
    use arrow2::io::ipc::write::{FileWriter, WriteOptions};

    let schema = Schema::from(vec![Field::new("col".to_owned(), DataType::Float64, false)]);
    let mut output = File::create(file).unwrap();
    let write_options = WriteOptions { compression };

    let column = arrow2::array::PrimitiveArray::from_vec(vec![1.0_f64]);
    let columns: Vec<Box<dyn arrow2::array::Array>> = vec![Box::new(column)];
    let chunk = arrow2::chunk::Chunk::new(columns);

    let mut ipc_writer = FileWriter::try_new(&mut output, schema, None, write_options).unwrap();
    ipc_writer.write(&chunk, None).unwrap();
    ipc_writer.finish().unwrap();
}
/// Runs the reproduction: for each compression setting (none, ZSTD, LZ4 frame)
/// writes an IPC file with the `arrow_ipc` writer, then tries to read it back
/// with both the `arrow_ipc` reader and the `arrow2` reader.
fn main() {
    // (output path, compression passed to the arrow writer)
    let cases = [
        ("./created_by_arrow_without_compression.ipc", None),
        (
            "./created_by_arrow_with_zstd.ipc",
            Some(arrow_ipc::CompressionType::ZSTD),
        ),
        (
            "./created_by_arrow_with_lz4.ipc",
            Some(arrow_ipc::CompressionType::LZ4_FRAME),
        ),
    ];

    for (path, compression) in cases {
        create_new_file_using_arrow(path, compression);
        check_using_arrow(path);
        check_using_arrow2(path);
    }
}
yields:
✅ "./created_by_arrow_without_compression.ipc" Recognized by Arrow
✅ "./created_by_arrow_without_compression.ipc" Recognized by Arrow2
✅ "./created_by_arrow_with_zstd.ipc" Recognized by Arrow
❌ "./created_by_arrow_with_zstd.ipc" Not recognized by Arrow2: Io(Custom { kind: Other, error: "Unknown frame descriptor" })
✅ "./created_by_arrow_with_lz4.ipc" Recognized by Arrow
❌ "./created_by_arrow_with_lz4.ipc" Not recognized by Arrow2: Io(Custom { kind: Other, error: LZ4Error("ERROR_frameType_unknown") })
As you can see, whenever IPC files are compressed using arrow, arrow2 fails to read them.