polars
polars copied to clipboard
Fails to read a JSON Lines file with a struct column
Polars version checks
-
[X] I have checked that this issue has not already been reported.
-
[X] I have confirmed this bug exists on the latest version of Polars.
Issue description
Error when reading jsonline with a struct column.
Reproducible example
use polars::prelude::*;
use std::io::Cursor;
// Minimal reproduction: parse two JSON Lines records from an in-memory buffer.
fn main() {
// The nested `struct` object carries three keys in the first record but only
// `int_list` in the second, and the top-level keys also differ between records.
let jsonlines = r#"
{"struct": {"int_list": [1, 2, 3], "float": 5.0, "str_list": ["a", "b", "c"]}, "int_opt": null, "float_list": [1.1, 2.2]}
{"struct": {"int_list": [4, 5, 6]}, "float": 4.0}
"#;
let cursor = Cursor::new(jsonlines);
// On polars 0.25.1 the reported failure is an "index out of bounds" panic raised
// while reading, not the `expect` message below.
let df = JsonLineReader::new(cursor)
.finish()
.expect("Failed to read jsonlines");
}
Error details:
thread '<unnamed>' panicked at 'index out of bounds: the len is 1 but the index is 1', /Users/xxx/.cargo/registry/src/github.com-1ecc6299db9ec823/polars-core-0.25.1/src/series/any_value.rs:180:46
stack backtrace:
0: rust_begin_unwind
at /rustc/897e37553bba8b42751c67658967889d11ecd120/library/std/src/panicking.rs:584:5
1: core::panicking::panic_fmt
at /rustc/897e37553bba8b42751c67658967889d11ecd120/library/core/src/panicking.rs:142:14
2: core::panicking::panic_bounds_check
at /rustc/897e37553bba8b42751c67658967889d11ecd120/library/core/src/panicking.rs:84:5
3: <usize as core::slice::index::SliceIndex<[T]>>::index
at /rustc/897e37553bba8b42751c67658967889d11ecd120/library/core/src/slice/index.rs:250:10
4: core::slice::index::<impl core::ops::index::Index<I> for [T]>::index
at /rustc/897e37553bba8b42751c67658967889d11ecd120/library/core/src/slice/index.rs:18:9
5: <alloc::vec::Vec<T,A> as core::ops::index::Index<I>>::index
at /rustc/897e37553bba8b42751c67658967889d11ecd120/library/alloc/src/vec/mod.rs:2628:9
6: polars_core::series::any_value::<impl polars_core::series::Series>::from_any_values_and_dtype
at /Users/xxx/.cargo/registry/src/github.com-1ecc6299db9ec823/polars-core-0.25.1/src/series/any_value.rs:180:46
7: polars_core::series::any_value::<impl polars_core::series::Series>::from_any_values
at /Users/xxx/.cargo/registry/src/github.com-1ecc6299db9ec823/polars-core-0.25.1/src/series/any_value.rs:218:17
8: polars_core::series::any_value::<impl polars_core::named_from::NamedFrom<T,[polars_core::datatypes::AnyValue]> for polars_core::series::Series>::new
at /Users/xxx/.cargo/registry/src/github.com-1ecc6299db9ec823/polars-core-0.25.1/src/series/any_value.rs:111:9
9: polars_io::ndjson_core::buffer::Buffer::into_series
at /Users/xxx/.cargo/registry/src/github.com-1ecc6299db9ec823/polars-io-0.25.1/src/ndjson_core/buffer.rs:89:42
10: polars_io::ndjson_core::ndjson::CoreJsonReader::parse_json::{{closure}}::{{closure}}::{{closure}}
at /Users/xxx/.cargo/registry/src/github.com-1ecc6299db9ec823/polars-io-0.25.1/src/ndjson_core/ndjson.rs:240:40
11: core::iter::adapters::map::map_try_fold::{{closure}}
at /rustc/897e37553bba8b42751c67658967889d11ecd120/library/core/src/iter/adapters/map.rs:91:28
12: core::iter::traits::iterator::Iterator::try_fold
at /rustc/897e37553bba8b42751c67658967889d11ecd120/library/core/src/iter/traits/iterator.rs:2238:21
13: <core::iter::adapters::map::Map<I,F> as core::iter::traits::iterator::Iterator>::try_fold
at /rustc/897e37553bba8b42751c67658967889d11ecd120/library/core/src/iter/adapters/map.rs:117:9
14: <core::iter::adapters::GenericShunt<I,R> as core::iter::traits::iterator::Iterator>::try_fold
at /rustc/897e37553bba8b42751c67658967889d11ecd120/library/core/src/iter/adapters/mod.rs:195:9
15: core::iter::traits::iterator::Iterator::try_for_each
at /rustc/897e37553bba8b42751c67658967889d11ecd120/library/core/src/iter/traits/iterator.rs:2299:9
16: <core::iter::adapters::GenericShunt<I,R> as core::iter::traits::iterator::Iterator>::next
at /rustc/897e37553bba8b42751c67658967889d11ecd120/library/core/src/iter/adapters/mod.rs:178:9
17: <alloc::vec::Vec<T> as alloc::vec::spec_from_iter_nested::SpecFromIterNested<T,I>>::from_iter
at /rustc/897e37553bba8b42751c67658967889d11ecd120/library/alloc/src/vec/spec_from_iter_nested.rs:26:32
18: <alloc::vec::Vec<T> as alloc::vec::spec_from_iter::SpecFromIter<T,I>>::from_iter
at /rustc/897e37553bba8b42751c67658967889d11ecd120/library/alloc/src/vec/spec_from_iter.rs:33:9
19: <alloc::vec::Vec<T> as core::iter::traits::collect::FromIterator<T>>::from_iter
at /rustc/897e37553bba8b42751c67658967889d11ecd120/library/alloc/src/vec/mod.rs:2649:9
20: core::iter::traits::iterator::Iterator::collect
at /rustc/897e37553bba8b42751c67658967889d11ecd120/library/core/src/iter/traits/iterator.rs:1836:9
21: <core::result::Result<V,E> as core::iter::traits::collect::FromIterator<core::result::Result<A,E>>>::from_iter::{{closure}}
at /rustc/897e37553bba8b42751c67658967889d11ecd120/library/core/src/result.rs:2072:49
22: core::iter::adapters::try_process
at /rustc/897e37553bba8b42751c67658967889d11ecd120/library/core/src/iter/adapters/mod.rs:164:17
23: <core::result::Result<V,E> as core::iter::traits::collect::FromIterator<core::result::Result<A,E>>>::from_iter
at /rustc/897e37553bba8b42751c67658967889d11ecd120/library/core/src/result.rs:2072:9
24: core::iter::traits::iterator::Iterator::collect
at /rustc/897e37553bba8b42751c67658967889d11ecd120/library/core/src/iter/traits/iterator.rs:1836:9
25: polars_io::ndjson_core::ndjson::CoreJsonReader::parse_json::{{closure}}::{{closure}}
at /Users/xxx/.cargo/registry/src/github.com-1ecc6299db9ec823/polars-io-0.25.1/src/ndjson_core/ndjson.rs:238:25
26: core::ops::function::impls::<impl core::ops::function::FnMut<A> for &F>::call_mut
at /rustc/897e37553bba8b42751c67658967889d11ecd120/library/core/src/ops/function.rs:270:13
27: core::ops::function::impls::<impl core::ops::function::FnOnce<A> for &mut F>::call_once
at /rustc/897e37553bba8b42751c67658967889d11ecd120/library/core/src/ops/function.rs:306:13
28: core::option::Option<T>::map
at /rustc/897e37553bba8b42751c67658967889d11ecd120/library/core/src/option.rs:929:29
29: <core::iter::adapters::map::Map<I,F> as core::iter::traits::iterator::Iterator>::next
at /rustc/897e37553bba8b42751c67658967889d11ecd120/library/core/src/iter/adapters/map.rs:103:9
30: <core::iter::adapters::map::Map<I,F> as core::iter::traits::iterator::Iterator>::next
at /rustc/897e37553bba8b42751c67658967889d11ecd120/library/core/src/iter/adapters/map.rs:103:9
31: <core::iter::adapters::take_while::TakeWhile<I,P> as core::iter::traits::iterator::Iterator>::next
at /rustc/897e37553bba8b42751c67658967889d11ecd120/library/core/src/iter/adapters/take_while.rs:46:21
32: <core::iter::adapters::map::Map<I,F> as core::iter::traits::iterator::Iterator>::next
at /rustc/897e37553bba8b42751c67658967889d11ecd120/library/core/src/iter/adapters/map.rs:103:9
33: alloc::vec::Vec<T,A>::extend_desugared
at /rustc/897e37553bba8b42751c67658967889d11ecd120/library/alloc/src/vec/mod.rs:2749:35
34: <alloc::vec::Vec<T,A> as alloc::vec::spec_extend::SpecExtend<T,I>>::spec_extend
at /rustc/897e37553bba8b42751c67658967889d11ecd120/library/alloc/src/vec/spec_extend.rs:18:9
35: <alloc::vec::Vec<T,A> as core::iter::traits::collect::Extend<T>>::extend
at /rustc/897e37553bba8b42751c67658967889d11ecd120/library/alloc/src/vec/mod.rs:2723:9
36: <rayon::iter::extend::ListVecFolder<T> as rayon::iter::plumbing::Folder<T>>::consume_iter
at /Users/xxx/.cargo/registry/src/github.com-1ecc6299db9ec823/rayon-1.6.1/src/iter/extend.rs:73:9
37: <rayon::iter::while_some::WhileSomeFolder<C> as rayon::iter::plumbing::Folder<core::option::Option<T>>>::consume_iter
at /Users/xxx/.cargo/registry/src/github.com-1ecc6299db9ec823/rayon-1.6.1/src/iter/while_some.rs:139:21
38: <rayon::iter::map::MapFolder<C,F> as rayon::iter::plumbing::Folder<T>>::consume_iter
at /Users/xxx/.cargo/registry/src/github.com-1ecc6299db9ec823/rayon-1.6.1/src/iter/map.rs:248:21
39: <rayon::iter::map::MapFolder<C,F> as rayon::iter::plumbing::Folder<T>>::consume_iter
at /Users/xxx/.cargo/registry/src/github.com-1ecc6299db9ec823/rayon-1.6.1/src/iter/map.rs:248:21
40: rayon::iter::plumbing::Producer::fold_with
at /Users/xxx/.cargo/registry/src/github.com-1ecc6299db9ec823/rayon-1.6.1/src/iter/plumbing/mod.rs:110:9
41: rayon::iter::plumbing::bridge_producer_consumer::helper
at /Users/xxx/.cargo/registry/src/github.com-1ecc6299db9ec823/rayon-1.6.1/src/iter/plumbing/mod.rs:438:13
42: rayon::iter::plumbing::bridge_producer_consumer
at /Users/xxx/.cargo/registry/src/github.com-1ecc6299db9ec823/rayon-1.6.1/src/iter/plumbing/mod.rs:397:12
43: <rayon::iter::plumbing::bridge::Callback<C> as rayon::iter::plumbing::ProducerCallback<I>>::callback
at /Users/xxx/.cargo/registry/src/github.com-1ecc6299db9ec823/rayon-1.6.1/src/iter/plumbing/mod.rs:373:13
44: <rayon::vec::Drain<T> as rayon::iter::IndexedParallelIterator>::with_producer
at /Users/xxx/.cargo/registry/src/github.com-1ecc6299db9ec823/rayon-1.6.1/src/vec.rs:147:13
45: <rayon::vec::IntoIter<T> as rayon::iter::IndexedParallelIterator>::with_producer
at /Users/xxx/.cargo/registry/src/github.com-1ecc6299db9ec823/rayon-1.6.1/src/vec.rs:83:9
46: rayon::iter::plumbing::bridge
at /Users/xxx/.cargo/registry/src/github.com-1ecc6299db9ec823/rayon-1.6.1/src/iter/plumbing/mod.rs:357:12
47: <rayon::vec::IntoIter<T> as rayon::iter::ParallelIterator>::drive_unindexed
at /Users/xxx/.cargo/registry/src/github.com-1ecc6299db9ec823/rayon-1.6.1/src/vec.rs:58:9
48: <rayon::iter::map::Map<I,F> as rayon::iter::ParallelIterator>::drive_unindexed
at /Users/xxx/.cargo/registry/src/github.com-1ecc6299db9ec823/rayon-1.6.1/src/iter/map.rs:49:9
49: <rayon::iter::map::Map<I,F> as rayon::iter::ParallelIterator>::drive_unindexed
at /Users/xxx/.cargo/registry/src/github.com-1ecc6299db9ec823/rayon-1.6.1/src/iter/map.rs:49:9
50: <rayon::iter::while_some::WhileSome<I> as rayon::iter::ParallelIterator>::drive_unindexed
at /Users/xxx/.cargo/registry/src/github.com-1ecc6299db9ec823/rayon-1.6.1/src/iter/while_some.rs:44:9
51: rayon::iter::extend::<impl rayon::iter::ParallelExtend<T> for alloc::vec::Vec<T>>::par_extend
at /Users/xxx/.cargo/registry/src/github.com-1ecc6299db9ec823/rayon-1.6.1/src/iter/extend.rs:576:28
52: rayon::iter::from_par_iter::collect_extended
at /Users/xxx/.cargo/registry/src/github.com-1ecc6299db9ec823/rayon-1.6.1/src/iter/from_par_iter.rs:17:5
53: rayon::iter::from_par_iter::<impl rayon::iter::FromParallelIterator<T> for alloc::vec::Vec<T>>::from_par_iter
at /Users/xxx/.cargo/registry/src/github.com-1ecc6299db9ec823/rayon-1.6.1/src/iter/from_par_iter.rs:30:9
54: rayon::iter::ParallelIterator::collect
at /Users/xxx/.cargo/registry/src/github.com-1ecc6299db9ec823/rayon-1.6.1/src/iter/mod.rs:2048:9
55: rayon::result::<impl rayon::iter::FromParallelIterator<core::result::Result<T,E>> for core::result::Result<C,E>>::from_par_iter
at /Users/xxx/.cargo/registry/src/github.com-1ecc6299db9ec823/rayon-1.6.1/src/result.rs:121:26
56: rayon::iter::ParallelIterator::collect
at /Users/xxx/.cargo/registry/src/github.com-1ecc6299db9ec823/rayon-1.6.1/src/iter/mod.rs:2048:9
57: polars_io::ndjson_core::ndjson::CoreJsonReader::parse_json::{{closure}}
at /Users/xxx/.cargo/registry/src/github.com-1ecc6299db9ec823/polars-io-0.25.1/src/ndjson_core/ndjson.rs:231:13
58: rayon_core::thread_pool::ThreadPool::install::{{closure}}
at /Users/xxx/.cargo/registry/src/github.com-1ecc6299db9ec823/rayon-core-1.10.1/src/thread_pool/mod.rs:110:40
59: rayon_core::registry::Registry::in_worker_cold::{{closure}}::{{closure}}
at /Users/xxx/.cargo/registry/src/github.com-1ecc6299db9ec823/rayon-core-1.10.1/src/registry.rs:506:21
60: rayon_core::job::JobResult<T>::call::{{closure}}
at /Users/xxx/.cargo/registry/src/github.com-1ecc6299db9ec823/rayon-core-1.10.1/src/job.rs:212:41
61: <core::panic::unwind_safe::AssertUnwindSafe<F> as core::ops::function::FnOnce<()>>::call_once
at /rustc/897e37553bba8b42751c67658967889d11ecd120/library/core/src/panic/unwind_safe.rs:271:9
62: std::panicking::try::do_call
at /rustc/897e37553bba8b42751c67658967889d11ecd120/library/std/src/panicking.rs:492:40
63: ___rust_try
64: std::panicking::try
at /rustc/897e37553bba8b42751c67658967889d11ecd120/library/std/src/panicking.rs:456:19
65: std::panic::catch_unwind
at /rustc/897e37553bba8b42751c67658967889d11ecd120/library/std/src/panic.rs:137:14
66: rayon_core::unwind::halt_unwinding
at /Users/xxx/.cargo/registry/src/github.com-1ecc6299db9ec823/rayon-core-1.10.1/src/unwind.rs:17:5
67: rayon_core::job::JobResult<T>::call
at /Users/xxx/.cargo/registry/src/github.com-1ecc6299db9ec823/rayon-core-1.10.1/src/job.rs:212:15
68: <rayon_core::job::StackJob<L,F,R> as rayon_core::job::Job>::execute
at /Users/xxx/.cargo/registry/src/github.com-1ecc6299db9ec823/rayon-core-1.10.1/src/job.rs:114:32
69: rayon_core::job::JobRef::execute
at /Users/xxx/.cargo/registry/src/github.com-1ecc6299db9ec823/rayon-core-1.10.1/src/job.rs:58:9
70: rayon_core::registry::WorkerThread::execute
at /Users/xxx/.cargo/registry/src/github.com-1ecc6299db9ec823/rayon-core-1.10.1/src/registry.rs:804:9
71: rayon_core::registry::WorkerThread::wait_until_cold
at /Users/xxx/.cargo/registry/src/github.com-1ecc6299db9ec823/rayon-core-1.10.1/src/registry.rs:781:17
72: rayon_core::registry::WorkerThread::wait_until
at /Users/xxx/.cargo/registry/src/github.com-1ecc6299db9ec823/rayon-core-1.10.1/src/registry.rs:755:13
73: rayon_core::registry::main_loop
at /Users/xxx/.cargo/registry/src/github.com-1ecc6299db9ec823/rayon-core-1.10.1/src/registry.rs:889:5
74: rayon_core::registry::ThreadBuilder::run
at /Users/xxx/.cargo/registry/src/github.com-1ecc6299db9ec823/rayon-core-1.10.1/src/registry.rs:53:18
75: <rayon_core::registry::DefaultSpawn as rayon_core::registry::ThreadSpawn>::spawn::{{closure}}
at /Users/xxx/.cargo/registry/src/github.com-1ecc6299db9ec823/rayon-core-1.10.1/src/registry.rs:98:20
note: Some details are omitted, run with `RUST_BACKTRACE=full` for a verbose backtrace.
Expected behavior
The example above should not panic
Installed versions
Cargo.toml:
[dependencies]
polars = { version = "0.25.1", features = [
"dtype-full",
"json",
] }
Rust version: 1.65.0 MacOS Monterey 12.6.1
The issue is from:
DataType::Struct(fields) => {
// the fields of the struct
let mut series_fields = Vec::with_capacity(fields.len());
for (i, field) in fields.iter().enumerate() {
let mut field_avs = Vec::with_capacity(av.len());
for av in av.iter() {
match av {
AnyValue::StructOwned(payload) => {
for (l, r) in fields.iter().zip(payload.1.iter()) {
if l.name() != r.name() {
return Err(PolarsError::ComputeError(
"struct orders must remain the same".into(),
));
}
}
let av_val = payload.0.[i].clone(); <------- Here
field_avs.push(av_val)
}
_ => field_avs.push(AnyValue::Null),
}
}
series_fields.push(Series::new(field.name(), &field_avs))
}
return Ok(StructChunked::new(name, &series_fields)
.unwrap()
.into_series());
}
I will make a PR to fix this
So this is actually partially fixed on main. If I run the same code pointing to the latest commit, I get the following error instead:
thread 'main' panicked at 'Failed to read jsonlines: SchemaMisMatch(Owned("cannot vstack: because column datatypes (dtypes) in the two DataFrames do not match for left.name='struct' with left.dtype=struct[3] != right.dtype=struct[1] with right.name='struct'"))', src/main.rs:13:10
So it parses it correctly, but it has some issues with the schema. We could likely leverage the code in the python read_dicts for this.
This issue seems too complex for me to fix by myself. I hope that you can help me, @universalmind303. Here are some test cases with results at commit a9d25281fcef6b275e2196fb0daa7fbe45c77ba4:
Test case 1:
Input:
{"struct": {"int_list": [4, 5, 6]}}
{"struct": {"int_list": [1, 2, 3], "float": 5.0, "str_list": ["a", "b", "c"]}, "int_opt": null, "float_list": [1.1, 2.2]}
Expected:
+------------------------------+---------+-----------------------+
| struct(struct[3]) | int_opt | float_list(list[f64]) |
+------------------------------+---------+-----------------------+
| {[4,5,6], null, null} | null | null |
| {[1,2,3], 5.0,["a","b","c"]} | null | [1.1,2.2] |
+------------------------------+---------+-----------------------+
Actual:
┌─────────────┬────────────┐
│ struct ┆ float_list │
│ --- ┆ --- │
│ struct[1] ┆ list[f64] │
╞═════════════╪════════════╡
│ {[4, 5, 6]} ┆ null │
├╌╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌┤
│ {[1, 2, 3]} ┆ [1.1, 2.2] │
└─────────────┴────────────┘
Test case 2:
Data:
{"struct": {"int_list": [1, 2, 3], "float": 5.0, "str_list": ["a", "b", "c"]}, "int_opt": null, "float_list": [1.1, 2.2]}
{"struct": {"int_list": [4, 5, 6]}}
Expected:
+------------------------------+---------+-----------------------+
| struct(struct[3]) | int_opt | float_list(list[f64]) |
+------------------------------+---------+-----------------------+
| {[1,2,3], 5.0,["a","b","c"]} | null | [1.1,2.2] |
| {[4,5,6], null, null} | null | null |
+------------------------------+---------+-----------------------+
Actual:
thread 'main' panicked at 'Failed to read jsonlines: SchemaMisMatch(Owned("cannot vstack: because column datatypes (dtypes) in the two DataFrames do not match for left.name='struct' with left.dtype=struct[3] != right.dtype=struct[1] with right.name='struct'"))', src/main.rs:19:10
stack backtrace:
0: rust_begin_unwind
at /rustc/897e37553bba8b42751c67658967889d11ecd120/library/std/src/panicking.rs:584:5
1: core::panicking::panic_fmt
at /rustc/897e37553bba8b42751c67658967889d11ecd120/library/core/src/panicking.rs:142:14
2: core::result::unwrap_failed
at /rustc/897e37553bba8b42751c67658967889d11ecd120/library/core/src/result.rs:1785:5
3: core::result::Result<T,E>::expect
at /rustc/897e37553bba8b42751c67658967889d11ecd120/library/core/src/result.rs:1064:23
4: polars_ndjson_reader_issue::main
at ./src/main.rs:16:14
5: core::ops::function::FnOnce::call_once
at /rustc/897e37553bba8b42751c67658967889d11ecd120/library/core/src/ops/function.rs:248:5
note: Some details are omitted, run with `RUST_BACKTRACE=full` for a verbose backtrace.
Test case 3:
Data:
{"struct": {"int_list": [4, 5, 6]}}
{"struct": {"int_list": [1, 2, 3], "float": 5.0, "str_list": ["a", "b", "c"]}, "int_opt": null, "float_list": [1.1, 2.2]}
{"struct": {"int_list": [4, 5, 6]}}
Expected:
+------------------------------+---------+-----------------------+
| struct(struct[3]) | int_opt | float_list(list[f64]) |
+------------------------------+---------+-----------------------+
| {[4,5,6], null, null} | null | null |
| {[1,2,3], 5.0,["a","b","c"]} | null | [1.1,2.2] |
| {[4,5,6], null, null} | null | null |
+------------------------------+---------+-----------------------+
Actual:
`target/debug/polars-ndjson-reader-issue`
thread 'main' panicked at 'Failed to read jsonlines: SchemaMisMatch(Owned("cannot vstack: because column datatypes (dtypes) in the two DataFrames do not match for left.name='struct' with left.dtype=struct[1] != right.dtype=struct[3] with right.name='struct'"))', src/main.rs:19:10
stack backtrace:
0: rust_begin_unwind
at /rustc/897e37553bba8b42751c67658967889d11ecd120/library/std/src/panicking.rs:584:5
1: core::panicking::panic_fmt
at /rustc/897e37553bba8b42751c67658967889d11ecd120/library/core/src/panicking.rs:142:14
2: core::result::unwrap_failed
at /rustc/897e37553bba8b42751c67658967889d11ecd120/library/core/src/result.rs:1785:5
3: core::result::Result<T,E>::expect
at /rustc/897e37553bba8b42751c67658967889d11ecd120/library/core/src/result.rs:1064:23
4: polars_ndjson_reader_issue::main
at ./src/main.rs:16:14
5: core::ops::function::FnOnce::call_once
at /rustc/897e37553bba8b42751c67658967889d11ecd120/library/core/src/ops/function.rs:248:5
Errors still occur in version 0.26.1
I'm having the same/similar issue with the Python library as well. I'm not sure if I should open a new issue, so I'm putting it here:
test.json file contents:
{"key1":"value1", "key2": "value2", "key3": {"k1": 2, "k3": "value5", "k10": 5}}
{"key1":"value5", "key2": "value4", "key3": {"k1": 2, "k5": "value5", "k10": 4}}
{"key1":"value6", "key3": {"k1": 5, "k3": "value5"}}
df = pl.read_ndjson("test.json")
Error:
PanicException: called `Result::unwrap()` on an `Err` value: ComputeError(Borrowed("struct orders must remain the same"))
Sorry, but are there any updates on this issue, @universalmind303? If you can point me to the branch for this issue, I may be able to help by providing some test cases.
Some updates: I've been refactoring the JSON logic to use the same code as the row reader, so you should see the same results as when using dicts.
One thing that I noticed, though, in your first example:
{"struct": {"int_list": [1, 2, 3], "float": 5.0, "str_list": ["a", "b", "c"]}, "int_opt": null, "float_list": [1.1, 2.2]}
{"struct": {"int_list": [4, 5, 6]}}
Due to how arrow infers the schemas for null types, the int_opt field would only show up if there is at least one row where int_opt is not null, such as:
{"struct": {"int_list": [1, 2, 3], "float": 5.0, "str_list": ["a", "b", "c"]}, "int_opt": null, "float_list": [1.1, 2.2]}
{"struct": {"int_list": [4, 5, 6]}}
{"struct": {"int_list": [4, 5, 6]}, "int_opt": 1}
You can see that logic here: https://github.com/jorgecarleitao/arrow2/blob/main/src/io/ndjson/read/file.rs#L119