polars
polars copied to clipboard
Reading nested struct panics with `OutOfSpec` error
What language are you using?
Rust
Which feature gates did you use?
"polars-io", "parquet", "lazy", "dtype-struct"
Have you tried latest version of polars?
- [yes]
What version of polars are you using?
Latest, master branch.
What operating system are you using polars on?
macOS Monterey 12.3.1
What language version are you using
$ rustc --version
rustc 1.64.0-nightly (495b21669 2022-07-03)
$ cargo --version
cargo 1.64.0-nightly (dbff32b27 2022-06-24)
Describe your bug.
Reading nested struct panics with OutOfSpec error.
What are the steps to reproduce the behavior?
Given the attached parquet file with only 2 rows: nested_struct_OutOfSpec.snappy.parquet.zip
Running the following code:
let file_location = "nested_struct_OutOfSpec.snappy.parquet".to_string();
let df = LazyFrame::scan_parquet(
file_location,
ScanArgsParquet::default())
.unwrap()
.select([all()])
.collect()
.unwrap();
dbg!(df);
Results in this panic error:
thread 'main' panicked at 'called `Result::unwrap()` on an `Err` value: OutOfSpec("The children
DataTypes of a StructArray must equal the children data types.\n However, the
values 1 has a length of 11, which is different from values 0, 2.")',
/.../.cargo/git/checkouts/arrow2-945af624853845da/eeddfac/src/array/struct_/mod.rs:118:52
What is the actual behavior?
The result is a panic error with this output:
thread 'main' panicked at 'called `Result::unwrap()` on an `Err` value: OutOfSpec("The children
DataTypes of a StructArray must equal the children data types.\n However, the
values 1 has a length of 11, which is different from values 0, 2.")',
/.../.cargo/git/checkouts/arrow2-945af624853845da/eeddfac/src/array/struct_/mod.rs:118:52
stack backtrace:
0: rust_begin_unwind
at /rustc/7b46aa594c4bdc507fbd904b6777ca30c37a9209/library/std/src/panicking.rs:584:5
1: core::panicking::panic_fmt
at /rustc/7b46aa594c4bdc507fbd904b6777ca30c37a9209/library/core/src/panicking.rs:142:14
2: core::result::unwrap_failed
at /rustc/7b46aa594c4bdc507fbd904b6777ca30c37a9209/library/core/src/result.rs:1805:5
3: core::result::Result<T,E>::unwrap
at /rustc/7b46aa594c4bdc507fbd904b6777ca30c37a9209/library/core/src/result.rs:1098:23
4: arrow2::array::struct_::StructArray::new
at /.../.cargo/git/checkouts/arrow2-945af624853845da/eeddfac/src/array/struct_/mod.rs:118:9
5: arrow2::array::struct_::StructArray::from_data
at /.../.cargo/git/checkouts/arrow2-945af624853845da/eeddfac/src/array/struct_/mod.rs:127:9
6: <arrow2::io::parquet::read::deserialize::struct_::StructIterator as core::iter::traits::iterator::Iterator>::next
at /.../.cargo/git/checkouts/arrow2-945af624853845da/eeddfac/src/io/parquet/read/deserialize/struct_.rs:50:22
7: <alloc::boxed::Box<I,A> as core::iter::traits::iterator::Iterator>::next
at /rustc/7b46aa594c4bdc507fbd904b6777ca30c37a9209/library/alloc/src/boxed.rs:1868:9
8: <arrow2::io::parquet::read::deserialize::struct_::StructIterator as core::iter::traits::iterator::Iterator>::next::{{closure}}
at /.../.cargo/git/checkouts/arrow2-945af624853845da/eeddfac/src/io/parquet/read/deserialize/struct_.rs:26:25
9: core::iter::adapters::map::map_fold::{{closure}}
at /rustc/7b46aa594c4bdc507fbd904b6777ca30c37a9209/library/core/src/iter/adapters/map.rs:84:28
10: core::iter::traits::iterator::Iterator::fold
at /rustc/7b46aa594c4bdc507fbd904b6777ca30c37a9209/library/core/src/iter/traits/iterator.rs:2414:21
11: <core::iter::adapters::map::Map<I,F> as core::iter::traits::iterator::Iterator>::fold
at /rustc/7b46aa594c4bdc507fbd904b6777ca30c37a9209/library/core/src/iter/adapters/map.rs:124:9
12: core::iter::traits::iterator::Iterator::for_each
at /rustc/7b46aa594c4bdc507fbd904b6777ca30c37a9209/library/core/src/iter/traits/iterator.rs:831:9
13: <alloc::vec::Vec<T,A> as alloc::vec::spec_extend::SpecExtend<T,I>>::spec_extend
at /rustc/7b46aa594c4bdc507fbd904b6777ca30c37a9209/library/alloc/src/vec/spec_extend.rs:40:17
14: <alloc::vec::Vec<T> as alloc::vec::spec_from_iter_nested::SpecFromIterNested<T,I>>::from_iter
at /rustc/7b46aa594c4bdc507fbd904b6777ca30c37a9209/library/alloc/src/vec/spec_from_iter_nested.rs:62:9
15: <alloc::vec::Vec<T> as alloc::vec::spec_from_iter::SpecFromIter<T,I>>::from_iter
at /rustc/7b46aa594c4bdc507fbd904b6777ca30c37a9209/library/alloc/src/vec/spec_from_iter.rs:33:9
16: <alloc::vec::Vec<T> as core::iter::traits::collect::FromIterator<T>>::from_iter
at /rustc/7b46aa594c4bdc507fbd904b6777ca30c37a9209/library/alloc/src/vec/mod.rs:2648:9
17: core::iter::traits::iterator::Iterator::collect
at /rustc/7b46aa594c4bdc507fbd904b6777ca30c37a9209/library/core/src/iter/traits/iterator.rs:1836:9
18: <arrow2::io::parquet::read::deserialize::struct_::StructIterator as core::iter::traits::iterator::Iterator>::next
at /.../.cargo/git/checkouts/arrow2-945af624853845da/eeddfac/src/io/parquet/read/deserialize/struct_.rs:23:22
19: <alloc::boxed::Box<I,A> as core::iter::traits::iterator::Iterator>::next
at /rustc/7b46aa594c4bdc507fbd904b6777ca30c37a9209/library/alloc/src/boxed.rs:1868:9
20: <core::iter::adapters::map::Map<I,F> as core::iter::traits::iterator::Iterator>::next
at /rustc/7b46aa594c4bdc507fbd904b6777ca30c37a9209/library/core/src/iter/adapters/map.rs:103:9
21: <alloc::boxed::Box<I,A> as core::iter::traits::iterator::Iterator>::next
at /rustc/7b46aa594c4bdc507fbd904b6777ca30c37a9209/library/alloc/src/boxed.rs:1868:9
22: core::iter::traits::iterator::Iterator::try_fold
at /rustc/7b46aa594c4bdc507fbd904b6777ca30c37a9209/library/core/src/iter/traits/iterator.rs:2237:29
23: <core::iter::adapters::GenericShunt<I,R> as core::iter::traits::iterator::Iterator>::try_fold
at /rustc/7b46aa594c4bdc507fbd904b6777ca30c37a9209/library/core/src/iter/adapters/mod.rs:191:9
24: core::iter::traits::iterator::Iterator::try_for_each
at /rustc/7b46aa594c4bdc507fbd904b6777ca30c37a9209/library/core/src/iter/adapters/mod.rs:174:9
25: <core::iter::adapters::GenericShunt<I,R> as core::iter::traits::iterator::Iterator>::next
at /rustc/7b46aa594c4bdc507fbd904b6777ca30c37a9209/library/core/src/iter/adapters/mod.rs:174:9
26: <alloc::vec::Vec<T> as alloc::vec::spec_from_iter_nested::SpecFromIterNested<T,I>>::from_iter
at /rustc/7b46aa594c4bdc507fbd904b6777ca30c37a9209/library/alloc/src/vec/spec_from_iter_nested.rs:26:32
27: <alloc::vec::Vec<T> as alloc::vec::spec_from_iter::SpecFromIter<T,I>>::from_iter
at /rustc/7b46aa594c4bdc507fbd904b6777ca30c37a9209/library/alloc/src/vec/spec_from_iter.rs:33:9
28: <alloc::vec::Vec<T> as core::iter::traits::collect::FromIterator<T>>::from_iter
at /rustc/7b46aa594c4bdc507fbd904b6777ca30c37a9209/library/alloc/src/vec/mod.rs:2648:9
29: core::iter::traits::iterator::Iterator::collect
at /rustc/7b46aa594c4bdc507fbd904b6777ca30c37a9209/library/core/src/result.rs:2092:49
30: <core::result::Result<V,E> as core::iter::traits::collect::FromIterator<core::result::Result<A,E>>>::from_iter::{{closure}}
at /rustc/7b46aa594c4bdc507fbd904b6777ca30c37a9209/library/core/src/result.rs:2092:49
31: core::iter::adapters::try_process
at /rustc/7b46aa594c4bdc507fbd904b6777ca30c37a9209/library/core/src/iter/adapters/mod.rs:160:17
32: <core::result::Result<V,E> as core::iter::traits::collect::FromIterator<core::result::Result<A,E>>>::from_iter
at /rustc/7b46aa594c4bdc507fbd904b6777ca30c37a9209/library/core/src/result.rs:2092:9
33: core::iter::traits::iterator::Iterator::collect
at /rustc/7b46aa594c4bdc507fbd904b6777ca30c37a9209/library/core/src/iter/traits/iterator.rs:1836:9
34: polars_io::parquet::read_impl::array_iter_to_series
at /.../github/polars/polars/polars-io/src/parquet/read_impl.rs:47:17
35: polars_io::parquet::read_impl::column_idx_to_series
at /.../github/polars/polars/polars-io/src/parquet/read_impl.rs:36:9
36: polars_io::parquet::read_impl::rg_to_dfs::{{closure}}
at /.../github/polars/polars/polars-io/src/parquet/read_impl.rs:126:21
37: core::iter::adapters::map::map_try_fold::{{closure}}
at /rustc/7b46aa594c4bdc507fbd904b6777ca30c37a9209/library/core/src/iter/adapters/map.rs:91:28
38: core::iter::traits::iterator::Iterator::try_fold
at /rustc/7b46aa594c4bdc507fbd904b6777ca30c37a9209/library/core/src/iter/traits/iterator.rs:2238:21
39: <core::iter::adapters::map::Map<I,F> as core::iter::traits::iterator::Iterator>::try_fold
at /rustc/7b46aa594c4bdc507fbd904b6777ca30c37a9209/library/core/src/iter/adapters/map.rs:117:9
40: <core::iter::adapters::GenericShunt<I,R> as core::iter::traits::iterator::Iterator>::try_fold
at /rustc/7b46aa594c4bdc507fbd904b6777ca30c37a9209/library/core/src/iter/adapters/mod.rs:191:9
41: core::iter::traits::iterator::Iterator::try_for_each
at /rustc/7b46aa594c4bdc507fbd904b6777ca30c37a9209/library/core/src/iter/adapters/mod.rs:174:9
42: <core::iter::adapters::GenericShunt<I,R> as core::iter::traits::iterator::Iterator>::next
at /rustc/7b46aa594c4bdc507fbd904b6777ca30c37a9209/library/core/src/iter/adapters/mod.rs:174:9
43: <alloc::vec::Vec<T> as alloc::vec::spec_from_iter_nested::SpecFromIterNested<T,I>>::from_iter
at /rustc/7b46aa594c4bdc507fbd904b6777ca30c37a9209/library/alloc/src/vec/spec_from_iter_nested.rs:26:32
44: <alloc::vec::Vec<T> as alloc::vec::spec_from_iter::SpecFromIter<T,I>>::from_iter
at /rustc/7b46aa594c4bdc507fbd904b6777ca30c37a9209/library/alloc/src/vec/spec_from_iter.rs:33:9
45: <alloc::vec::Vec<T> as core::iter::traits::collect::FromIterator<T>>::from_iter
at /rustc/7b46aa594c4bdc507fbd904b6777ca30c37a9209/library/alloc/src/vec/mod.rs:2648:9
46: core::iter::traits::iterator::Iterator::collect
at /rustc/7b46aa594c4bdc507fbd904b6777ca30c37a9209/library/core/src/result.rs:2092:49
47: <core::result::Result<V,E> as core::iter::traits::collect::FromIterator<core::result::Result<A,E>>>::from_iter::{{closure}}
at /rustc/7b46aa594c4bdc507fbd904b6777ca30c37a9209/library/core/src/result.rs:2092:49
48: core::iter::adapters::try_process
at /rustc/7b46aa594c4bdc507fbd904b6777ca30c37a9209/library/core/src/iter/adapters/mod.rs:160:17
49: <core::result::Result<V,E> as core::iter::traits::collect::FromIterator<core::result::Result<A,E>>>::from_iter
at /rustc/7b46aa594c4bdc507fbd904b6777ca30c37a9209/library/core/src/result.rs:2092:9
50: core::iter::traits::iterator::Iterator::collect
at /rustc/7b46aa594c4bdc507fbd904b6777ca30c37a9209/library/core/src/iter/traits/iterator.rs:1836:9
51: polars_io::parquet::read_impl::rg_to_dfs
at /.../github/polars/polars/polars-io/src/parquet/read_impl.rs:123:13
52: polars_io::parquet::read_impl::read_parquet
at /.../github/polars/polars/polars-io/src/parquet/read_impl.rs:249:63
53: polars_io::parquet::read::ParquetReader<R>::_finish_with_scan_ops
at /.../github/polars/polars/polars-io/src/parquet/read.rs:60:9
54: polars_lazy::physical_plan::executors::scan::parquet::ParquetExec::read
at /.../github/polars/polars/polars-lazy/src/physical_plan/executors/scan/parquet.rs:39:9
55: <polars_lazy::physical_plan::executors::scan::parquet::ParquetExec as polars_lazy::physical_plan::Executor>::execute::{{closure}}
at /.../github/polars/polars/polars-lazy/src/physical_plan/executors/scan/parquet.rs:61:68
56: polars_lazy::physical_plan::file_cache::FileCache::read
at /.../github/polars/polars/polars-lazy/src/physical_plan/file_cache.rs:40:13
57: <polars_lazy::physical_plan::executors::scan::parquet::ParquetExec as polars_lazy::physical_plan::Executor>::execute
at /.../github/polars/polars/polars-lazy/src/physical_plan/executors/scan/parquet.rs:59:9
58: <polars_lazy::physical_plan::executors::udf::UdfExec as polars_lazy::physical_plan::Executor>::execute
at /.../github/polars/polars/polars-lazy/src/physical_plan/executors/udf.rs:12:18
59: polars_lazy::frame::LazyFrame::collect
at /.../github/polars/polars/polars-lazy/src/frame/mod.rs:718:19
60: gyrfalcon::main
at ./src/main.rs:21:14
61: core::ops::function::FnOnce::call_once
at /rustc/7b46aa594c4bdc507fbd904b6777ca30c37a9209/library/core/src/ops/function.rs:248:5
note: Some details are omitted, run with `RUST_BACKTRACE=full` for a verbose backtrace.
What is the expected behavior?
The parquet file should have been correctly loaded.
The parquet-tools util shows it properly. Also, Apache Spark reads and processes it correctly.
@jorgecarleitao: I don't think that all the cases are covered in the current arrow2 implementation.
I would reopen the previous #3892 ticket but I cannot.
cc: @ritchie46
@ritchie46, @jorgecarleitao: Any ETA on having this fix pulled from arrow2 into here?
It already is.
Let me pull the master and try the test again.
@jorgecarleitao, I did run some tests and I did find another case with OutOfSpec error. Here is the error:
thread 'main' panicked at 'called `Result::unwrap()` on an `Err` value:
OutOfSpec("The children DataTypes of a StructArray must equal the children data
types.\n However, the values at index 1 have a length of 114072,
which is different from values at index 0, 630.")',
/.../.cargo/git/checkouts/arrow2-945af624853845da/da64106/src/array/struct_/mod.rs:118:52
stack backtrace:
0: rust_begin_unwind
at /rustc/7b46aa594c4bdc507fbd904b6777ca30c37a9209/library/std/src/panicking.rs:584:5
1: core::panicking::panic_fmt
at /rustc/7b46aa594c4bdc507fbd904b6777ca30c37a9209/library/core/src/panicking.rs:142:14
2: core::result::unwrap_failed
at /rustc/7b46aa594c4bdc507fbd904b6777ca30c37a9209/library/core/src/result.rs:1805:5
3: arrow2::array::struct_::StructArray::new
4: polars_core::chunked_array::logical::struct_::StructChunked::update_chunks
5: polars_core::series::implementations::struct_::<impl polars_core::series::series_trait::SeriesTrait
for polars_core::series::implementations::SeriesWrap<polars_core::
chunked_array::logical::struct_::StructChunked>>::append
6: polars_core::series::implementations::struct_::<impl polars_core::series::series_trait::SeriesTrait
for polars_core::series::implementations::SeriesWrap<polars_core::
chunked_array::logical::struct_::StructChunked>>::append
7: polars_core::series::implementations::struct_::<impl polars_core::series::series_trait::SeriesTrait
for polars_core::series::implementations::SeriesWrap<polars_core::
chunked_array::logical::struct_::StructChunked>>::append
...
Maybe this can help in some way until I am able to create a slimmer parquet file. The current file that produces this error is about 110Mb.
@ritchie46, @jorgecarleitao: I managed to print out the conflicting data structures. This is how they are looking...
Values at index 0:
LargeUtf8Array[3490050010715265545, 2061035645983490919, 8001251476546823717, ...]
Values at index 1:
StructArray[{code: 3245164418740504690}, {code: 3245164418740504690}, ...]
The first line (the one with index 0) contains 630 strings formed out of 19 digits.
The second line contains code: 3245164418740504690 for 114072 times.
The fields are:
[
Field {
name: "id", data_type: LargeUtf8, is_nullable: true, metadata: {}
},
Field {
name: "namespace", data_type: Struct(
[
Field {
name: "code", data_type: LargeUtf8, is_nullable: true, metadata: {}
}
]
), is_nullable: true, metadata: {}
},
Field {
name: "primary", data_type: Boolean, is_nullable: true, metadata: {}
}
]
I don't think the data is the culprit, because there is no issue in Spark.
I think there is an issue with that code appearing 114072 times there. It should not look like that.
Hey @andrei-ionescu. Thanks again for the patience and for the report - it is very useful 🙇. Sorry for the late reply, I am on vacation with limited access to the internet.
Just to make sure I understood the last comment: "index 0" and "index 1" represent the column index, "line" represents the row number, and the issue is that the columns have a different number of rows.
Are you able to create a (mock) file with e.g. pandas or pyarrow that reproduces the problem?
@jorgecarleitao: Here is the file — part-00003-a422a23f-e65a-4cab-9bd0-6e877a8f7337-c000.snappy.parquet.zip — about 72Mb zipped, 117Mb parquet. I could not make it any slimmer.
@jorgecarleitao, @ritchie46: Is this cherry picked in polars?
@jorgecarleitao, @ritchie46: I just tried latest arrow2 + latest polars (both straight from the git repo) + the file above and I still see the same OutOfSpec error.
Am I missing something?
thread 'main' panicked at 'called `Result::unwrap()` on an `Err` value:
OutOfSpec("The children DataTypes of a StructArray must equal the children data
types.\n However, the values at index 1 have a length of 114072,
which is different from values at index 0, 630.")',
/.../arrow2/src/array/struct_/mod.rs:118:52
stack backtrace:
0: rust_begin_unwind
at /rustc/7b46aa594c4bdc507fbd904b6777ca30c37a9209/library/std/src/panicking.rs:584:5
1: core::panicking::panic_fmt
at /rustc/7b46aa594c4bdc507fbd904b6777ca30c37a9209/library/core/src/panicking.rs:142:14
2: core::result::unwrap_failed
at /rustc/7b46aa594c4bdc507fbd904b6777ca30c37a9209/library/core/src/result.rs:1805:5
3: core::result::Result<T,E>::unwrap
at /rustc/7b46aa594c4bdc507fbd904b6777ca30c37a9209/library/core/src/result.rs:1098:23
4: arrow2::array::struct_::StructArray::new
at /.../arrow2/src/array/struct_/mod.rs:118:9
5: polars_core::chunked_array::logical::struct_::StructChunked::update_chunks
at /.../polars/polars/polars-core/src/chunked_array/logical/struct_/mod.rs:76:32
6: polars_core::series::implementations::struct_::<impl polars_core::series::series_trait::SeriesTrait for polars_core::series::implementations::SeriesWrap<polars_core::chunked_array::logical::struct_::StructChunked>>::append
at /.../polars/polars/polars-core/src/series/implementations/struct_.rs:128:9
7: polars_core::series::Series::append
at /.../polars/polars/polars-core/src/series/mod.rs:210:9
8: polars_core::series::implementations::struct_::<impl polars_core::series::series_trait::SeriesTrait for polars_core::series::implementations::SeriesWrap<polars_core::chunked_array::logical::struct_::StructChunked>>::append
at /.../polars/polars/polars-core/src/series/implementations/struct_.rs:126:13
9: polars_core::series::Series::append
at /.../polars/polars/polars-core/src/series/mod.rs:210:9
10: polars_core::series::implementations::struct_::<impl polars_core::series::series_trait::SeriesTrait for polars_core::series::implementations::SeriesWrap<polars_core::chunked_array::logical::struct_::StructChunked>>::append
at /.../polars/polars/polars-core/src/series/implementations/struct_.rs:126:13
11: polars_core::series::Series::append
at /.../polars/polars/polars-core/src/series/mod.rs:210:9
12: polars_core::frame::DataFrame::vstack_mut::{{closure}}
at /.../polars/polars/polars-core/src/frame/mod.rs:908:17
13: core::iter::traits::iterator::Iterator::try_for_each::call::{{closure}}
at /rustc/7b46aa594c4bdc507fbd904b6777ca30c37a9209/library/core/src/iter/traits/iterator.rs:2296:26
14: core::iter::traits::iterator::Iterator::try_fold
at /rustc/7b46aa594c4bdc507fbd904b6777ca30c37a9209/library/core/src/iter/traits/iterator.rs:2238:21
15: core::iter::traits::iterator::Iterator::try_for_each
at /rustc/7b46aa594c4bdc507fbd904b6777ca30c37a9209/library/core/src/iter/traits/iterator.rs:2299:9
16: polars_core::frame::DataFrame::vstack_mut
at /.../polars/polars/polars-core/src/frame/mod.rs:903:9
17: polars_core::utils::accumulate_dataframes_vertical
at /.../polars/polars/polars-core/src/utils/mod.rs:813:9
18: polars_io::parquet::read_impl::read_parquet
at /.../polars/polars/polars-io/src/parquet/read_impl.rs:289:22
19: polars_io::parquet::read::ParquetReader<R>::_finish_with_scan_ops
at /.../polars/polars/polars-io/src/parquet/read.rs:61:9
20: polars_lazy::physical_plan::executors::scan::parquet::ParquetExec::read
at /.../polars/polars/polars-lazy/src/physical_plan/executors/scan/parquet.rs:39:9
21: <polars_lazy::physical_plan::executors::scan::parquet::ParquetExec as polars_lazy::physical_plan::Executor>::execute::{{closure}}
at /.../polars/polars/polars-lazy/src/physical_plan/executors/scan/parquet.rs:62:68
22: polars_lazy::physical_plan::file_cache::FileCache::read
at /.../polars/polars/polars-lazy/src/physical_plan/file_cache.rs:40:13
23: <polars_lazy::physical_plan::executors::scan::parquet::ParquetExec as polars_lazy::physical_plan::Executor>::execute
at /.../polars/polars/polars-lazy/src/physical_plan/executors/scan/parquet.rs:60:9
24: <polars_lazy::physical_plan::executors::udf::UdfExec as polars_lazy::physical_plan::Executor>::execute
at /.../polars/polars/polars-lazy/src/physical_plan/executors/udf.rs:12:18
25: polars_lazy::frame::LazyFrame::collect
at /.../polars/polars/polars-lazy/src/frame/mod.rs:720:19
26: gyrfalcon::main
at ./src/main.rs:21:14
27: core::ops::function::FnOnce::call_once
at /rustc/7b46aa594c4bdc507fbd904b6777ca30c37a9209/library/core/src/ops/function.rs:248:5
No, that was on me. The fix was insufficient - I believe https://github.com/jorgecarleitao/arrow2/pull/1188 fixes this. Your file is a really good fuzzy test.
@jorgecarleitao: I'm glad that it's helpful.
@jorgecarleitao, I just tested/checked the code changes you merged with the https://github.com/jorgecarleitao/arrow2/pull/1188 and I can still see the issue. I also can validate that the error message now is the new one you changed in the PR: The children must have an equal number of values.
thread 'main' panicked at 'called `Result::unwrap()` on an `Err` value:
OutOfSpec("The children must have an equal number of values.\n
However, the values at index 1 have a length of 114072, which is different
from values at index 0, 630.")',
/.../arrow2/src/array/struct_/mod.rs:118:52
stack backtrace:
0: rust_begin_unwind
at /rustc/6dbae3ad19309bb541d9e76638e6aa4b5449f29a/library/std/src/panicking.rs:584:5
1: core::panicking::panic_fmt
at /rustc/6dbae3ad19309bb541d9e76638e6aa4b5449f29a/library/core/src/panicking.rs:142:14
2: core::result::unwrap_failed
at /rustc/6dbae3ad19309bb541d9e76638e6aa4b5449f29a/library/core/src/result.rs:1814:5
3: core::result::Result<T,E>::unwrap
at /rustc/6dbae3ad19309bb541d9e76638e6aa4b5449f29a/library/core/src/result.rs:1107:23
4: arrow2::array::struct_::StructArray::new
at /.../arrow2/src/array/struct_/mod.rs:118:9
5: polars_core::chunked_array::logical::struct_::StructChunked::update_chunks
at /.../polars/polars/polars-core/src/chunked_array/logical/struct_/mod.rs:76:32
6: polars_core::series::implementations::struct_::<impl polars_core::series::series_trait::SeriesTrait for polars_core::series::implementations::SeriesWrap<polars_core::chunked_array::logical::struct_::StructChunked>>::append
at /.../polars/polars/polars-core/src/series/implementations/struct_.rs:128:9
7: polars_core::series::Series::append
at /.../polars/polars/polars-core/src/series/mod.rs:210:9
8: polars_core::series::implementations::struct_::<impl polars_core::series::series_trait::SeriesTrait for polars_core::series::implementations::SeriesWrap<polars_core::chunked_array::logical::struct_::StructChunked>>::append
at /.../polars/polars/polars-core/src/series/implementations/struct_.rs:126:13
9: polars_core::series::Series::append
at /.../polars/polars/polars-core/src/series/mod.rs:210:9
10: polars_core::series::implementations::struct_::<impl polars_core::series::series_trait::SeriesTrait for polars_core::series::implementations::SeriesWrap<polars_core::chunked_array::logical::struct_::StructChunked>>::append
at /.../polars/polars/polars-core/src/series/implementations/struct_.rs:126:13
11: polars_core::series::Series::append
at /.../polars/polars/polars-core/src/series/mod.rs:210:9
12: polars_core::frame::DataFrame::vstack_mut::{{closure}}
at /.../polars/polars/polars-core/src/frame/mod.rs:908:17
13: core::iter::traits::iterator::Iterator::try_for_each::call::{{closure}}
at /rustc/6dbae3ad19309bb541d9e76638e6aa4b5449f29a/library/core/src/iter/traits/iterator.rs:2296:26
14: core::iter::traits::iterator::Iterator::try_fold
at /rustc/6dbae3ad19309bb541d9e76638e6aa4b5449f29a/library/core/src/iter/traits/iterator.rs:2238:21
15: core::iter::traits::iterator::Iterator::try_for_each
at /rustc/6dbae3ad19309bb541d9e76638e6aa4b5449f29a/library/core/src/iter/traits/iterator.rs:2299:9
16: polars_core::frame::DataFrame::vstack_mut
at /.../polars/polars/polars-core/src/frame/mod.rs:903:9
17: polars_core::utils::accumulate_dataframes_vertical
at /.../polars/polars/polars-core/src/utils/mod.rs:813:9
18: polars_io::parquet::read_impl::read_parquet
at /.../polars/polars/polars-io/src/parquet/read_impl.rs:289:22
19: polars_io::parquet::read::ParquetReader<R>::_finish_with_scan_ops
at /.../polars/polars/polars-io/src/parquet/read.rs:61:9
20: polars_lazy::physical_plan::executors::scan::parquet::ParquetExec::read
at /.../polars/polars/polars-lazy/src/physical_plan/executors/scan/parquet.rs:39:9
21: <polars_lazy::physical_plan::executors::scan::parquet::ParquetExec as polars_lazy::physical_plan::Executor>::execute::{{closure}}
at /.../polars/polars/polars-lazy/src/physical_plan/executors/scan/parquet.rs:62:68
22: polars_lazy::physical_plan::file_cache::FileCache::read
at /.../polars/polars/polars-lazy/src/physical_plan/file_cache.rs:40:13
23: <polars_lazy::physical_plan::executors::scan::parquet::ParquetExec as polars_lazy::physical_plan::Executor>::execute
at /.../polars/polars/polars-lazy/src/physical_plan/executors/scan/parquet.rs:60:9
24: <polars_lazy::physical_plan::executors::udf::UdfExec as polars_lazy::physical_plan::Executor>::execute
at /.../polars/polars/polars-lazy/src/physical_plan/executors/udf.rs:12:18
25: polars_lazy::frame::LazyFrame::collect
at /.../polars/polars/polars-lazy/src/frame/mod.rs:720:19
26: gyrfalcon::main
at ./src/main.rs:21:14
27: core::ops::function::FnOnce::call_once
at /rustc/6dbae3ad19309bb541d9e76638e6aa4b5449f29a/library/core/src/ops/function.rs:248:5
Strange - I can read the file you posted here with
# in arrow2
cargo run --release --example parquet_read --features io_parquet,io_parquet_compression,io_print -- part-00003-a422a23f-e65a-4cab-9bd0-6e877a8f7337-c000.snappy.parquet
Changing limit and chunk_size of the reader does not impact this. I also tried using the parallel reader.
The error still comes from arrow2. Can it be the way Polars uses the arrow2 API?
The error still comes from
arrow2. Can it be the way Polars uses the arrow2 API?
I don't think the fix was already in the polars branch.
I'm building the example from git with updated dependencies in Polars to reference the latest arrow2.
It may be something in the update_chunks in polars-core/src/chunked_array/logical/struct_/mod.rs (mod.rs#L76-L80). It does a
StructArray::new(
ArrowDataType::Struct(new_fields.clone()),
field_arrays,
None,
)
Maybe there is something wrong with the params received from polars.
Is there an update on this? I am curious whether something else is required here, as this is an important use case.
Is there an update on this? I am curious whether something else is required here, as this is an important use case.
If you can run it in arrow, I expect this is something on our side. I will look into this
I can also read the file on latest master:
>>> pl.read_parquet("nested_struct_OutOfSpec.snappy.parquet")
shape: (2, 1)
┌─────────────────────────────────────┐
│ dim │
│ --- │
│ struct[4] │
╞═════════════════════════════════════╡
│ {{null,null,null,null,null,null,... │
├╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┤
│ {{null,null,null,"2gYhOc2Edy8GBw... │
└─────────────────────────────────────┘
Thanks for the fix upstream @jorgecarleitao.
@andrei-ionescu we are close to a crates.io release. You can already point to latest master to have your fix working, but it will also work on crates.io soon. :)
I will close this now.
@ritchie46
- Did you try it with the other file: part-00003-a422a23f-e65a-4cab-9bd0-6e877a8f7337-c000.snappy.parquet.zip? The first one works but not this one.
- How can I re-open this ticket as it is not resolved?
The issue occurs when appending structs of different chunk sizes.
MWE:
s = pl.Series([{'_experience': {'aaid': {'id': '7759804769753743647',
'namespace': {'code': '3245164418740504690'},
'primary': True},
'mcid': {'id': None, 'namespace': {'code': None}, 'primary': None}}},
{'_experience': {'aaid': {'id': '8337071409830986729',
'namespace': {'code': '3245164418740504690'},
'primary': False},
'mcid': {'id': '6495617396286731444',
'namespace': {'code': '3624253825458969727'},
'primary': True}}},
{'_experience': {'aaid': {'id': '5948492535810675291',
'namespace': {'code': '3245164418740504690'},
'primary': True},
'mcid': {'id': None, 'namespace': {'code': None}, 'primary': None}}}])
s.append(s[:2])
stacktrace
thread 'PanicException Traceback (most recent call last) Input In [23], in <cell line: 16>() 1 s = pl.Series([{'_experience': {'aaid': {'id': '7759804769753743647', 2 'namespace': {'code': '3245164418740504690'}, 3 'primary': True}, (...) 13 'primary': True}, 14 'mcid': {'id': None, 'namespace': {'code': None}, 'primary': None}}}]) ---> 16 s.append(s[:2])
File ~/code/polars/py-polars/polars/internals/series.py:1410, in Series.append(self, other, append_chunks) 1408 try: 1409 if append_chunks: -> 1410 self._s.append(other._s) 1411 else: 1412 self._s.extend(other._s)
PanicException: called Result::unwrap() on an Err value: OutOfSpec("The children must have an equal number of values.\n However, the values at index 1 have a length of 3, which is different from values at index 0, 2.")
python-split_1649141344976/work/Objects/call.c:396 27: _PyObject_VectorcallTstate at /opt/conda/conda-bld/python-split_1649141344976/work/Include/cpython/abstract.h:118 28: PyObject_Vectorcall at /opt/conda/conda-bld/python-split_1649141344976/work/Include/cpython/abstract.h:127 29: call_function at /opt/conda/conda-bld/python-split_1649141344976/work/Python/ceval.c:5077 30: _PyEval_EvalFrameDefault at /opt/conda/conda-bld/python-split_1649141344976/work/Python/ceval.c:3506 31: _PyEval_EvalFrame at /opt/conda/conda-bld/python-split_1649141344976/work/Include/internal/pycore_ceval.h:40 32: _PyEval_EvalCode at /opt/conda/conda-bld/python-split_1649141344976/work/Python/ceval.c:4329 33: _PyEval_EvalCodeWithName at /opt/conda/conda-bld/python-split_1649141344976/work/Python/ceval.c:4361 34: PyEval_EvalCodeEx at /opt/conda/conda-bld/python-split_1649141344976/work/Python/ceval.c:4377 35: PyEval_EvalCode at /opt/conda/conda-bld/python-split_1649141344976/work/Python/ceval.c:828 36: builtin_exec_impl.isra.17 at /opt/conda/conda-bld/python-split_1649141344976/work/Python/bltinmodule.c:1026 37: builtin_exec at /opt/conda/conda-bld/python-split_1649141344976/work/Python/clinic/bltinmodule.c.h:396 38: cfunction_vectorcall_FASTCALL at /opt/conda/conda-bld/python-split_1649141344976/work/Objects/methodobject.c:430 39: _PyObject_VectorcallTstate at /opt/conda/conda-bld/python-split_1649141344976/work/Include/cpython/abstract.h:118 40: PyObject_Vectorcall at /opt/conda/conda-bld/python-split_1649141344976/work/Include/cpython/abstract.h:127 41: call_function at /opt/conda/conda-bld/python-split_1649141344976/work/Python/ceval.c:5077 42: _PyEval_EvalFrameDefault at /opt/conda/conda-bld/python-split_1649141344976/work/Python/ceval.c:3520 43: _PyEval_EvalFrame at /opt/conda/conda-bld/python-split_1649141344976/work/Include/internal/pycore_ceval.h:40 44: gen_send_ex at /opt/conda/conda-bld/python-split_1649141344976/work/Objects/genobject.c:215 45: _PyEval_EvalFrameDefault at 
/opt/conda/conda-bld/python-split_1649141344976/work/Python/ceval.c:2202 46: _PyEval_EvalFrame at /opt/conda/conda-bld/python-split_1649141344976/work/Include/internal/pycore_ceval.h:40 47: gen_send_ex at /opt/conda/conda-bld/python-split_1649141344976/work/Objects/genobject.c:215 48: _PyEval_EvalFrameDefault at /opt/conda/conda-bld/python-split_1649141344976/work/Python/ceval.c:2202 49: _PyEval_EvalFrame at /opt/conda/conda-bld/python-split_1649141344976/work/Include/internal/pycore_ceval.h:40 50: gen_send_ex at /opt/conda/conda-bld/python-split_1649141344976/work/Objects/genobject.c:215 51: method_vectorcall_O at /opt/conda/conda-bld/python-split_1649141344976/work/Objects/descrobject.c:464 52: _PyObject_VectorcallTstate at /opt/conda/conda-bld/python-split_1649141344976/work/Include/cpython/abstract.h:118 53: PyObject_Vectorcall at /opt/conda/conda-bld/python-split_1649141344976/work/Include/cpython/abstract.h:127 54: call_function at /opt/conda/conda-bld/python-split_1649141344976/work/Python/ceval.c:5077 55: _PyEval_EvalFrameDefault at /opt/conda/conda-bld/python-split_1649141344976/work/Python/ceval.c:3506 56: _PyEval_EvalFrame at /opt/conda/conda-bld/python-split_1649141344976/work/Include/internal/pycore_ceval.h:40 57: function_code_fastcall at /opt/conda/conda-bld/python-split_1649141344976/work/Objects/call.c:330 58: _PyFunction_Vectorcall at /opt/conda/conda-bld/python-split_1649141344976/work/Objects/call.c:367 59: _PyObject_VectorcallTstate at /opt/conda/conda-bld/python-split_1649141344976/work/Include/cpython/abstract.h:118 60: PyObject_Vectorcall at /opt/conda/conda-bld/python-split_1649141344976/work/Include/cpython/abstract.h:127 61: call_function at /opt/conda/conda-bld/python-split_1649141344976/work/Python/ceval.c:5077 62: _PyEval_EvalFrameDefault at /opt/conda/conda-bld/python-split_1649141344976/work/Python/ceval.c:3520 63: _PyEval_EvalFrame at /opt/conda/conda-bld/python-split_1649141344976/work/Include/internal/pycore_ceval.h:40 64: 
function_code_fastcall at /opt/conda/conda-bld/python-split_1649141344976/work/Objects/call.c:330 65: _PyFunction_Vectorcall at /opt/conda/conda-bld/python-split_1649141344976/work/Objects/call.c:367 66: _PyObject_VectorcallTstate at /opt/conda/conda-bld/python-split_1649141344976/work/Include/cpython/abstract.h:118 67: PyObject_Vectorcall at /opt/conda/conda-bld/python-split_1649141344976/work/Include/cpython/abstract.h:127 68: call_function at /opt/conda/conda-bld/python-split_1649141344976/work/Python/ceval.c:5077 69: _PyEval_EvalFrameDefault at /opt/conda/conda-bld/python-split_1649141344976/work/Python/ceval.c:3506 70: _PyEval_EvalFrame at /opt/conda/conda-bld/python-split_1649141344976/work/Include/internal/pycore_ceval.h:40 71: _PyEval_EvalCode at /opt/conda/conda-bld/python-split_1649141344976/work/Python/ceval.c:4329 72: _PyFunction_Vectorcall at /opt/conda/conda-bld/python-split_1649141344976/work/Objects/call.c:396 73: _PyObject_VectorcallTstate at /opt/conda/conda-bld/python-split_1649141344976/work/Include/cpython/abstract.h:118 74: method_vectorcall at /opt/conda/conda-bld/python-split_1649141344976/work/Objects/classobject.c:53 75: PyVectorcall_Call at /opt/conda/conda-bld/python-split_1649141344976/work/Objects/call.c:243 76: _PyObject_Call at /opt/conda/conda-bld/python-split_1649141344976/work/Objects/call.c:266 77: PyObject_Call at /opt/conda/conda-bld/python-split_1649141344976/work/Objects/call.c:293 78: do_call_core at /opt/conda/conda-bld/python-split_1649141344976/work/Python/ceval.c:5125 79: _PyEval_EvalFrameDefault at /opt/conda/conda-bld/python-split_1649141344976/work/Python/ceval.c:3582 80: _PyEval_EvalFrame at /opt/conda/conda-bld/python-split_1649141344976/work/Include/internal/pycore_ceval.h:40 81: _PyEval_EvalCode at /opt/conda/conda-bld/python-split_1649141344976/work/Python/ceval.c:4329 82: _PyFunction_Vectorcall at /opt/conda/conda-bld/python-split_1649141344976/work/Objects/call.c:396 83: _PyObject_VectorcallTstate at 
/opt/conda/conda-bld/python-split_1649141344976/work/Include/cpython/abstract.h:118 84: method_vectorcall at /opt/conda/conda-bld/python-split_1649141344976/work/Objects/classobject.c:53 85: _PyObject_VectorcallTstate at /opt/conda/conda-bld/python-split_1649141344976/work/Include/cpython/abstract.h:118 86: PyObject_Vectorcall at /opt/conda/conda-bld/python-split_1649141344976/work/Include/cpython/abstract.h:127 87: call_function at /opt/conda/conda-bld/python-split_1649141344976/work/Python/ceval.c:5077 88: _PyEval_EvalFrameDefault at /opt/conda/conda-bld/python-split_1649141344976/work/Python/ceval.c:3537 89: _PyEval_EvalFrame at /opt/conda/conda-bld/python-split_1649141344976/work/Include/internal/pycore_ceval.h:40 90: gen_send_ex at /opt/conda/conda-bld/python-split_1649141344976/work/Objects/genobject.c:215 91: _PyEval_EvalFrameDefault at /opt/conda/conda-bld/python-split_1649141344976/work/Python/ceval.c:2202 92: _PyEval_EvalFrame at /opt/conda/conda-bld/python-split_1649141344976/work/Include/internal/pycore_ceval.h:40 93: gen_send_ex at /opt/conda/conda-bld/python-split_1649141344976/work/Objects/genobject.c:215 94: _PyEval_EvalFrameDefault at /opt/conda/conda-bld/python-split_1649141344976/work/Python/ceval.c:2202 95: _PyEval_EvalFrame at /opt/conda/conda-bld/python-split_1649141344976/work/Include/internal/pycore_ceval.h:40 96: gen_send_ex at /opt/conda/conda-bld/python-split_1649141344976/work/Objects/genobject.c:215 97: _PyEval_EvalFrameDefault at /opt/conda/conda-bld/python-split_1649141344976/work/Python/ceval.c:2202 98: _PyEval_EvalFrame at /opt/conda/conda-bld/python-split_1649141344976/work/Include/internal/pycore_ceval.h:40 99: gen_send_ex at /opt/conda/conda-bld/python-split_1649141344976/work/Objects/genobject.c:215 100: _PyEval_EvalFrameDefault at /opt/conda/conda-bld/python-split_1649141344976/work/Python/ceval.c:2202 101: _PyEval_EvalFrame at /opt/conda/conda-bld/python-split_1649141344976/work/Include/internal/pycore_ceval.h:40 102: 
gen_send_ex at /opt/conda/conda-bld/python-split_1649141344976/work/Objects/genobject.c:215 103: task_step_impl at /usr/local/src/conda/python-3.9.12/Modules/_asynciomodule.c:2669 104: task_step at /usr/local/src/conda/python-3.9.12/Modules/_asynciomodule.c:2969 105: task_wakeup at /usr/local/src/conda/python-3.9.12/Modules/_asynciomodule.c:3018 106: TaskWakeupMethWrapper_call at /usr/local/src/conda/python-3.9.12/Modules/_asynciomodule.c:1882 107: _PyObject_MakeTpCall at /opt/conda/conda-bld/python-split_1649141344976/work/Objects/call.c:191 108: context_run at /opt/conda/conda-bld/python-split_1649141344976/work/Python/context.c:649 109: cfunction_vectorcall_FASTCALL_KEYWORDS at /opt/conda/conda-bld/python-split_1649141344976/work/Objects/methodobject.c:446 110: PyVectorcall_Call at /opt/conda/conda-bld/python-split_1649141344976/work/Objects/call.c:231 111: _PyObject_Call at /opt/conda/conda-bld/python-split_1649141344976/work/Objects/call.c:266 112: PyObject_Call at /opt/conda/conda-bld/python-split_1649141344976/work/Objects/call.c:293 113: do_call_core at /opt/conda/conda-bld/python-split_1649141344976/work/Python/ceval.c:5097 114: _PyEval_EvalFrameDefault at /opt/conda/conda-bld/python-split_1649141344976/work/Python/ceval.c:3582 115: _PyEval_EvalFrame at /opt/conda/conda-bld/python-split_1649141344976/work/Include/internal/pycore_ceval.h:40 116: function_code_fastcall at /opt/conda/conda-bld/python-split_1649141344976/work/Objects/call.c:330 117: _PyFunction_Vectorcall at /opt/conda/conda-bld/python-split_1649141344976/work/Objects/call.c:367 118: _PyObject_VectorcallTstate
at /opt/conda/conda-bld/python-split_1649141344976/work/Include/cpython/abstract.h:118
119: PyObject_Vectorcall
at /opt/conda/conda-bld/python-split_1649141344976/work/Include/cpython/abstract.h:127
120: call_function
at /opt/conda/conda-bld/python-split_1649141344976/work/Python/ceval.c:5077
121: _PyEval_EvalFrameDefault
at /opt/conda/conda-bld/python-split_1649141344976/work/Python/ceval.c:3506
122: _PyEval_EvalFrame
at /opt/conda/conda-bld/python-split_1649141344976/work/Include/internal/pycore_ceval.h:40
123: function_code_fastcall
at /opt/conda/conda-bld/python-split_1649141344976/work/Objects/call.c:330
124: _PyFunction_Vectorcall
at /opt/conda/conda-bld/python-split_1649141344976/work/Objects/call.c:367
125: _PyObject_VectorcallTstate
at /opt/conda/conda-bld/python-split_1649141344976/work/Include/cpython/abstract.h:118
126: PyObject_Vectorcall
at /opt/conda/conda-bld/python-split_1649141344976/work/Include/cpython/abstract.h:127
127: call_function
at /opt/conda/conda-bld/python-split_1649141344976/work/Python/ceval.c:5077
128: _PyEval_EvalFrameDefault
at /opt/conda/conda-bld/python-split_1649141344976/work/Python/ceval.c:3506
129: _PyEval_EvalFrame
at /opt/conda/conda-bld/python-split_1649141344976/work/Include/internal/pycore_ceval.h:40
130: function_code_fastcall
at /opt/conda/conda-bld/python-split_1649141344976/work/Objects/call.c:330
131: _PyFunction_Vectorcall
at /opt/conda/conda-bld/python-split_1649141344976/work/Objects/call.c:367
132: _PyObject_VectorcallTstate
at /opt/conda/conda-bld/python-split_1649141344976/work/Include/cpython/abstract.h:118
133: PyObject_Vectorcall
at /opt/conda/conda-bld/python-split_1649141344976/work/Include/cpython/abstract.h:127
134: call_function
at /opt/conda/conda-bld/python-split_1649141344976/work/Python/ceval.c:5077
135: _PyEval_EvalFrameDefault
at /opt/conda/conda-bld/python-split_1649141344976/work/Python/ceval.c:3506
136: _PyEval_EvalFrame
at /opt/conda/conda-bld/python-split_1649141344976/work/Include/internal/pycore_ceval.h:40
137: function_code_fastcall
at /opt/conda/conda-bld/python-split_1649141344976/work/Objects/call.c:330
138: _PyFunction_Vectorcall
at /opt/conda/conda-bld/python-split_1649141344976/work/Objects/call.c:367
139: _PyObject_VectorcallTstate
at /opt/conda/conda-bld/python-split_1649141344976/work/Include/cpython/abstract.h:118
140: PyObject_Vectorcall
at /opt/conda/conda-bld/python-split_1649141344976/work/Include/cpython/abstract.h:127
141: call_function
at /opt/conda/conda-bld/python-split_1649141344976/work/Python/ceval.c:5077
142: _PyEval_EvalFrameDefault
at /opt/conda/conda-bld/python-split_1649141344976/work/Python/ceval.c:3506
143: _PyEval_EvalFrame
at /opt/conda/conda-bld/python-split_1649141344976/work/Include/internal/pycore_ceval.h:40
144: function_code_fastcall
at /opt/conda/conda-bld/python-split_1649141344976/work/Objects/call.c:330
145: _PyFunction_Vectorcall
at /opt/conda/conda-bld/python-split_1649141344976/work/Objects/call.c:367
146: _PyObject_VectorcallTstate
at /opt/conda/conda-bld/python-split_1649141344976/work/Include/cpython/abstract.h:118
147: PyObject_Vectorcall
at /opt/conda/conda-bld/python-split_1649141344976/work/Include/cpython/abstract.h:127
148: call_function
at /opt/conda/conda-bld/python-split_1649141344976/work/Python/ceval.c:5077
149: _PyEval_EvalFrameDefault
at /opt/conda/conda-bld/python-split_1649141344976/work/Python/ceval.c:3506
150: _PyEval_EvalFrame
at /opt/conda/conda-bld/python-split_1649141344976/work/Include/internal/pycore_ceval.h:40
151: _PyEval_EvalCode
at /opt/conda/conda-bld/python-split_1649141344976/work/Python/ceval.c:4329
152: _PyFunction_Vectorcall
at /opt/conda/conda-bld/python-split_1649141344976/work/Objects/call.c:396
153: _PyObject_VectorcallTstate
at /opt/conda/conda-bld/python-split_1649141344976/work/Include/cpython/abstract.h:118
154: method_vectorcall
at /opt/conda/conda-bld/python-split_1649141344976/work/Objects/classobject.c:53
155: _PyObject_VectorcallTstate
at /opt/conda/conda-bld/python-split_1649141344976/work/Include/cpython/abstract.h:118
156: PyObject_Vectorcall
at /opt/conda/conda-bld/python-split_1649141344976/work/Include/cpython/abstract.h:127
157: call_function
at /opt/conda/conda-bld/python-split_1649141344976/work/Python/ceval.c:5077
158: _PyEval_EvalFrameDefault
at /opt/conda/conda-bld/python-split_1649141344976/work/Python/ceval.c:3489
159: _PyEval_EvalFrame
at /opt/conda/conda-bld/python-split_1649141344976/work/Include/internal/pycore_ceval.h:40
160: _PyEval_EvalCode
at /opt/conda/conda-bld/python-split_1649141344976/work/Python/ceval.c:4329
161: _PyEval_EvalCodeWithName
at /opt/conda/conda-bld/python-split_1649141344976/work/Python/ceval.c:4361
162: PyEval_EvalCodeEx
at /opt/conda/conda-bld/python-split_1649141344976/work/Python/ceval.c:4377
163: PyEval_EvalCode
at /opt/conda/conda-bld/python-split_1649141344976/work/Python/ceval.c:828
164: builtin_exec_impl.isra.17
at /opt/conda/conda-bld/python-split_1649141344976/work/Python/bltinmodule.c:1026
165: builtin_exec
at /opt/conda/conda-bld/python-split_1649141344976/work/Python/clinic/bltinmodule.c.h:396
166: cfunction_vectorcall_FASTCALL
at /opt/conda/conda-bld/python-split_1649141344976/work/Objects/methodobject.c:430
167: _PyObject_VectorcallTstate
at /opt/conda/conda-bld/python-split_1649141344976/work/Include/cpython/abstract.h:118
168: PyObject_Vectorcall
at /opt/conda/conda-bld/python-split_1649141344976/work/Include/cpython/abstract.h:127
169: call_function
at /opt/conda/conda-bld/python-split_1649141344976/work/Python/ceval.c:5077
170: _PyEval_EvalFrameDefault
at /opt/conda/conda-bld/python-split_1649141344976/work/Python/ceval.c:3520
171: _PyEval_EvalFrame
at /opt/conda/conda-bld/python-split_1649141344976/work/Include/internal/pycore_ceval.h:40
172: _PyEval_EvalCode
at /opt/conda/conda-bld/python-split_1649141344976/work/Python/ceval.c:4329
173: _PyFunction_Vectorcall
at /opt/conda/conda-bld/python-split_1649141344976/work/Objects/call.c:396
174: _PyObject_VectorcallTstate
at /opt/conda/conda-bld/python-split_1649141344976/work/Include/cpython/abstract.h:118
175: PyObject_Vectorcall
at /opt/conda/conda-bld/python-split_1649141344976/work/Include/cpython/abstract.h:127
176: call_function
at /opt/conda/conda-bld/python-split_1649141344976/work/Python/ceval.c:5077
177: _PyEval_EvalFrameDefault
at /opt/conda/conda-bld/python-split_1649141344976/work/Python/ceval.c:3520
178: _PyEval_EvalFrame
at /opt/conda/conda-bld/python-split_1649141344976/work/Include/internal/pycore_ceval.h:40
179: _PyEval_EvalCode
at /opt/conda/conda-bld/python-split_1649141344976/work/Python/ceval.c:4329
180: _PyFunction_Vectorcall
at /opt/conda/conda-bld/python-split_1649141344976/work/Objects/call.c:396
181: PyVectorcall_Call
at /opt/conda/conda-bld/python-split_1649141344976/work/Objects/call.c:231
182: _PyObject_Call
at /opt/conda/conda-bld/python-split_1649141344976/work/Objects/call.c:266
183: pymain_run_module
at /opt/conda/conda-bld/python-split_1649141344976/work/Modules/main.c:297
184: pymain_run_python
at /opt/conda/conda-bld/python-split_1649141344976/work/Modules/main.c:598
185: Py_RunMain
at /opt/conda/conda-bld/python-split_1649141344976/work/Modules/main.c:683
186: Py_BytesMain
at /opt/conda/conda-bld/python-split_1649141344976/work/Modules/main.c:1129
187: __libc_start_main
at /build/glibc-SzIz7B/glibc-2.31/csu/../csu/libc-start.c:308:16
note: Some details are omitted, run with RUST_BACKTRACE=full for a verbose backtrace.
#4217 fixes the issue. Note that we still cannot read the file because it contains a map datatype, which is not supported by polars.
@ritchie46, thanks for looking into this.
- Is there an issue opened for supporting the map datatype? Is there a plan to add map support in Polars?
- Parquet format and Apache Spark do support map field types and it seems that arrow2 and parquet2 are supporting map field types. @jorgecarleitao, am I right saying this?
- How can I help adding this map support?
Polars will not add the map dtype. Its benefits do not outweigh the extra complexity. Maybe we can investigate conversion of maps to struct. But I will have to explore that.
With #4226 we can read the entire file. The map dtype will be converted to its physical type which is supported by polars.
@ritchie46, @jorgecarleitao: We need to re-open this one more time.
With the code given below and the previous file — part-00003-a422a23f-e65a-4cab-9bd0-6e877a8f7337-c000.snappy.parquet.zip — I get the OutOfSpec error again.
let df = LazyFrame::scan_parquet(
file_location,
ScanArgsParquet::default())
.unwrap()
.filter(
col("timestamp").cast(DataType::Datetime(TimeUnit::Nanoseconds, None))
.gt(datetime(DatetimeArgs {
year: lit(2022),
month: lit(1),
day: lit(1),
hour: None,
minute: None,
second: None,
millisecond: None
}))
)
.select([
count().alias("monthcount"),
col("timestamp"),
])
.collect()
.unwrap();
dbg!(df);
When I remove the filter, it does not panic.
Here is the panic error:
thread 'thread '<unnamed><unnamed>' panicked at '' panicked at
'called `Result::unwrap()` on an `Err` value:
OutOfSpec("The children must have an equal number of values.\n
However, the values at index 1 have a length of 1, which is different
from values at index 0, 2.")
called `Result::unwrap()` on an `Err` value: OutOfSpec("The children
must have an equal number of values.\n
However, the values at index 1 have a length of 1, which is different
from values at index 0, 2.")', ',
/.../.cargo/git/checkouts/arrow2-8a2ad61d97265680/8604cb7/src/array/struct_/mod.rs
/.../.cargo/git/checkouts/arrow2-8a2ad61d97265680/8604cb7/src/array/struct_/mod.rs::118118::5252
stack backtrace:
0: rust_begin_unwind
at /rustc/f9cba63746d0fff816250b2ba7b706b5d4dcf000/library/std/src/panicking.rs:584:5
1: core::panicking::panic_fmt
at /rustc/f9cba63746d0fff816250b2ba7b706b5d4dcf000/library/core/src/panicking.rs:142:14
2: core::result::unwrap_failed
at /rustc/f9cba63746d0fff816250b2ba7b706b5d4dcf000/library/core/src/result.rs:1814:5
3: <arrow2::io::parquet::read::statistics::struct_::DynMutableStructArray as arrow2::array::MutableArray>::as_box
4: <arrow2::io::parquet::read::statistics::list::DynMutableListArray as arrow2::array::MutableArray>::as_box
5: <arrow2::io::parquet::read::statistics::Statistics as core::convert::From<arrow2::io::parquet::read::statistics::MutableStatistics>>::from
6: arrow2::io::parquet::read::statistics::deserialize
7: polars_io::parquet::predicates::read_this_row_group
8: core::ops::function::impls::<impl core::ops::function::FnOnce<A> for &mut F>::call_once
9: <alloc::vec::Vec<T,A> as alloc::vec::spec_extend::SpecExtend<T,I>>::spec_extend
10: <rayon::iter::map::MapFolder<C,F> as rayon::iter::plumbing::Folder<T>>::consume_iter
11: rayon::iter::plumbing::bridge_producer_consumer::helper
12: <rayon_core::job::StackJob<L,F,R> as rayon_core::job::Job>::execute
13: rayon_core::registry::WorkerThread::wait_until_cold
14: rayon_core::registry::ThreadBuilder::run
note: Some details are omitted, run with `RUST_BACKTRACE=full` for a verbose backtrace.
stack backtrace:
0: rust_begin_unwind
at /rustc/f9cba63746d0fff816250b2ba7b706b5d4dcf000/library/std/src/panicking.rs:584:5
1: core::panicking::panic_fmt
at /rustc/f9cba63746d0fff816250b2ba7b706b5d4dcf000/library/core/src/panicking.rs:142:14
2: core::result::unwrap_failed
at /rustc/f9cba63746d0fff816250b2ba7b706b5d4dcf000/library/core/src/result.rs:1814:5
3: <arrow2::io::parquet::read::statistics::struct_::DynMutableStructArray as arrow2::array::MutableArray>::as_box
4: <arrow2::io::parquet::read::statistics::list::DynMutableListArray as arrow2::array::MutableArray>::as_box
5: <arrow2::io::parquet::read::statistics::Statistics as core::convert::From<arrow2::io::parquet::read::statistics::MutableStatistics>>::from
6: arrow2::io::parquet::read::statistics::deserialize
7: polars_io::parquet::predicates::read_this_row_group
8: core::ops::function::impls::<impl core::ops::function::FnOnce<A> for &mut F>::call_once
9: <alloc::vec::Vec<T,A> as alloc::vec::spec_extend::SpecExtend<T,I>>::spec_extend
10: <rayon::iter::map::MapFolder<C,F> as rayon::iter::plumbing::Folder<T>>::consume_iter
11: rayon::iter::plumbing::bridge_producer_consumer::helper
12: <core::panic::unwind_safe::AssertUnwindSafe<F> as core::ops::function::FnOnce<()>>::call_once
13: <rayon_core::job::StackJob<L,F,R> as rayon_core::job::Job>::execute
14: rayon_core::registry::WorkerThread::wait_until_cold
15: rayon_core::registry::ThreadBuilder::run
Could you have another look?
@ritchie46, @jorgecarleitao Any updates on this?
@andrei-ionescu found another issue, opened it upstream https://github.com/jorgecarleitao/arrow2/issues/1239.