polars icon indicating copy to clipboard operation
polars copied to clipboard

Fail to read jsonline file with struct column

Open therealhieu opened this issue 2 years ago • 6 comments
trafficstars

Polars version checks

  • [X] I have checked that this issue has not already been reported.

  • [X] I have confirmed this bug exists on the latest version of Polars.

Issue description

Reading JSON Lines input with a struct column panics when the struct's fields differ between rows.

Reproducible example

use polars::prelude::*;
use std::io::Cursor;

// Minimal reproduction: read two JSON Lines rows whose `struct` objects do not
// contain the same set of keys.
fn main() {
    // Row 1: `struct` has three keys (`int_list`, `float`, `str_list`).
    // Row 2: `struct` has only `int_list`; its `float` key sits at the top
    // level of the row, not inside `struct`.
    let jsonlines = r#"
    {"struct": {"int_list": [1, 2, 3], "float": 5.0, "str_list": ["a", "b", "c"]}, "int_opt": null, "float_list": [1.1, 2.2]}
    {"struct": {"int_list": [4, 5, 6]}, "float": 4.0}
    "#;
    // Wrap the in-memory string in a `Cursor` and hand it to the reader.
    let cursor = Cursor::new(jsonlines);

    // With polars 0.25.1 this is where the report fails: `finish()` panics with
    // "index out of bounds" (see "Error details" below) rather than returning
    // an `Err`, so the `expect` message is never reached.
    let df = JsonLineReader::new(cursor)
        .finish()
        .expect("Failed to read jsonlines");
}

Error details:

thread '<unnamed>' panicked at 'index out of bounds: the len is 1 but the index is 1', /Users/xxx/.cargo/registry/src/github.com-1ecc6299db9ec823/polars-core-0.25.1/src/series/any_value.rs:180:46
stack backtrace:
   0: rust_begin_unwind
             at /rustc/897e37553bba8b42751c67658967889d11ecd120/library/std/src/panicking.rs:584:5
   1: core::panicking::panic_fmt
             at /rustc/897e37553bba8b42751c67658967889d11ecd120/library/core/src/panicking.rs:142:14
   2: core::panicking::panic_bounds_check
             at /rustc/897e37553bba8b42751c67658967889d11ecd120/library/core/src/panicking.rs:84:5
   3: <usize as core::slice::index::SliceIndex<[T]>>::index
             at /rustc/897e37553bba8b42751c67658967889d11ecd120/library/core/src/slice/index.rs:250:10
   4: core::slice::index::<impl core::ops::index::Index<I> for [T]>::index
             at /rustc/897e37553bba8b42751c67658967889d11ecd120/library/core/src/slice/index.rs:18:9
   5: <alloc::vec::Vec<T,A> as core::ops::index::Index<I>>::index
             at /rustc/897e37553bba8b42751c67658967889d11ecd120/library/alloc/src/vec/mod.rs:2628:9
   6: polars_core::series::any_value::<impl polars_core::series::Series>::from_any_values_and_dtype
             at /Users/xxx/.cargo/registry/src/github.com-1ecc6299db9ec823/polars-core-0.25.1/src/series/any_value.rs:180:46
   7: polars_core::series::any_value::<impl polars_core::series::Series>::from_any_values
             at /Users/xxx/.cargo/registry/src/github.com-1ecc6299db9ec823/polars-core-0.25.1/src/series/any_value.rs:218:17
   8: polars_core::series::any_value::<impl polars_core::named_from::NamedFrom<T,[polars_core::datatypes::AnyValue]> for polars_core::series::Series>::new
             at /Users/xxx/.cargo/registry/src/github.com-1ecc6299db9ec823/polars-core-0.25.1/src/series/any_value.rs:111:9
   9: polars_io::ndjson_core::buffer::Buffer::into_series
             at /Users/xxx/.cargo/registry/src/github.com-1ecc6299db9ec823/polars-io-0.25.1/src/ndjson_core/buffer.rs:89:42
  10: polars_io::ndjson_core::ndjson::CoreJsonReader::parse_json::{{closure}}::{{closure}}::{{closure}}
             at /Users/xxx/.cargo/registry/src/github.com-1ecc6299db9ec823/polars-io-0.25.1/src/ndjson_core/ndjson.rs:240:40
  11: core::iter::adapters::map::map_try_fold::{{closure}}
             at /rustc/897e37553bba8b42751c67658967889d11ecd120/library/core/src/iter/adapters/map.rs:91:28
  12: core::iter::traits::iterator::Iterator::try_fold
             at /rustc/897e37553bba8b42751c67658967889d11ecd120/library/core/src/iter/traits/iterator.rs:2238:21
  13: <core::iter::adapters::map::Map<I,F> as core::iter::traits::iterator::Iterator>::try_fold
             at /rustc/897e37553bba8b42751c67658967889d11ecd120/library/core/src/iter/adapters/map.rs:117:9
  14: <core::iter::adapters::GenericShunt<I,R> as core::iter::traits::iterator::Iterator>::try_fold
             at /rustc/897e37553bba8b42751c67658967889d11ecd120/library/core/src/iter/adapters/mod.rs:195:9
  15: core::iter::traits::iterator::Iterator::try_for_each
             at /rustc/897e37553bba8b42751c67658967889d11ecd120/library/core/src/iter/traits/iterator.rs:2299:9
  16: <core::iter::adapters::GenericShunt<I,R> as core::iter::traits::iterator::Iterator>::next
             at /rustc/897e37553bba8b42751c67658967889d11ecd120/library/core/src/iter/adapters/mod.rs:178:9
  17: <alloc::vec::Vec<T> as alloc::vec::spec_from_iter_nested::SpecFromIterNested<T,I>>::from_iter
             at /rustc/897e37553bba8b42751c67658967889d11ecd120/library/alloc/src/vec/spec_from_iter_nested.rs:26:32
  18: <alloc::vec::Vec<T> as alloc::vec::spec_from_iter::SpecFromIter<T,I>>::from_iter
             at /rustc/897e37553bba8b42751c67658967889d11ecd120/library/alloc/src/vec/spec_from_iter.rs:33:9
  19: <alloc::vec::Vec<T> as core::iter::traits::collect::FromIterator<T>>::from_iter
             at /rustc/897e37553bba8b42751c67658967889d11ecd120/library/alloc/src/vec/mod.rs:2649:9
  20: core::iter::traits::iterator::Iterator::collect
             at /rustc/897e37553bba8b42751c67658967889d11ecd120/library/core/src/iter/traits/iterator.rs:1836:9
  21: <core::result::Result<V,E> as core::iter::traits::collect::FromIterator<core::result::Result<A,E>>>::from_iter::{{closure}}
             at /rustc/897e37553bba8b42751c67658967889d11ecd120/library/core/src/result.rs:2072:49
  22: core::iter::adapters::try_process
             at /rustc/897e37553bba8b42751c67658967889d11ecd120/library/core/src/iter/adapters/mod.rs:164:17
  23: <core::result::Result<V,E> as core::iter::traits::collect::FromIterator<core::result::Result<A,E>>>::from_iter
             at /rustc/897e37553bba8b42751c67658967889d11ecd120/library/core/src/result.rs:2072:9
  24: core::iter::traits::iterator::Iterator::collect
             at /rustc/897e37553bba8b42751c67658967889d11ecd120/library/core/src/iter/traits/iterator.rs:1836:9
  25: polars_io::ndjson_core::ndjson::CoreJsonReader::parse_json::{{closure}}::{{closure}}
             at /Users/xxx/.cargo/registry/src/github.com-1ecc6299db9ec823/polars-io-0.25.1/src/ndjson_core/ndjson.rs:238:25
  26: core::ops::function::impls::<impl core::ops::function::FnMut<A> for &F>::call_mut
             at /rustc/897e37553bba8b42751c67658967889d11ecd120/library/core/src/ops/function.rs:270:13
  27: core::ops::function::impls::<impl core::ops::function::FnOnce<A> for &mut F>::call_once
             at /rustc/897e37553bba8b42751c67658967889d11ecd120/library/core/src/ops/function.rs:306:13
  28: core::option::Option<T>::map
             at /rustc/897e37553bba8b42751c67658967889d11ecd120/library/core/src/option.rs:929:29
  29: <core::iter::adapters::map::Map<I,F> as core::iter::traits::iterator::Iterator>::next
             at /rustc/897e37553bba8b42751c67658967889d11ecd120/library/core/src/iter/adapters/map.rs:103:9
  30: <core::iter::adapters::map::Map<I,F> as core::iter::traits::iterator::Iterator>::next
             at /rustc/897e37553bba8b42751c67658967889d11ecd120/library/core/src/iter/adapters/map.rs:103:9
  31: <core::iter::adapters::take_while::TakeWhile<I,P> as core::iter::traits::iterator::Iterator>::next
             at /rustc/897e37553bba8b42751c67658967889d11ecd120/library/core/src/iter/adapters/take_while.rs:46:21
  32: <core::iter::adapters::map::Map<I,F> as core::iter::traits::iterator::Iterator>::next
             at /rustc/897e37553bba8b42751c67658967889d11ecd120/library/core/src/iter/adapters/map.rs:103:9
  33: alloc::vec::Vec<T,A>::extend_desugared
             at /rustc/897e37553bba8b42751c67658967889d11ecd120/library/alloc/src/vec/mod.rs:2749:35
  34: <alloc::vec::Vec<T,A> as alloc::vec::spec_extend::SpecExtend<T,I>>::spec_extend
             at /rustc/897e37553bba8b42751c67658967889d11ecd120/library/alloc/src/vec/spec_extend.rs:18:9
  35: <alloc::vec::Vec<T,A> as core::iter::traits::collect::Extend<T>>::extend
             at /rustc/897e37553bba8b42751c67658967889d11ecd120/library/alloc/src/vec/mod.rs:2723:9
  36: <rayon::iter::extend::ListVecFolder<T> as rayon::iter::plumbing::Folder<T>>::consume_iter
             at /Users/xxx/.cargo/registry/src/github.com-1ecc6299db9ec823/rayon-1.6.1/src/iter/extend.rs:73:9
  37: <rayon::iter::while_some::WhileSomeFolder<C> as rayon::iter::plumbing::Folder<core::option::Option<T>>>::consume_iter
             at /Users/xxx/.cargo/registry/src/github.com-1ecc6299db9ec823/rayon-1.6.1/src/iter/while_some.rs:139:21
  38: <rayon::iter::map::MapFolder<C,F> as rayon::iter::plumbing::Folder<T>>::consume_iter
             at /Users/xxx/.cargo/registry/src/github.com-1ecc6299db9ec823/rayon-1.6.1/src/iter/map.rs:248:21
  39: <rayon::iter::map::MapFolder<C,F> as rayon::iter::plumbing::Folder<T>>::consume_iter
             at /Users/xxx/.cargo/registry/src/github.com-1ecc6299db9ec823/rayon-1.6.1/src/iter/map.rs:248:21
  40: rayon::iter::plumbing::Producer::fold_with
             at /Users/xxx/.cargo/registry/src/github.com-1ecc6299db9ec823/rayon-1.6.1/src/iter/plumbing/mod.rs:110:9
  41: rayon::iter::plumbing::bridge_producer_consumer::helper
             at /Users/xxx/.cargo/registry/src/github.com-1ecc6299db9ec823/rayon-1.6.1/src/iter/plumbing/mod.rs:438:13
  42: rayon::iter::plumbing::bridge_producer_consumer
             at /Users/xxx/.cargo/registry/src/github.com-1ecc6299db9ec823/rayon-1.6.1/src/iter/plumbing/mod.rs:397:12
  43: <rayon::iter::plumbing::bridge::Callback<C> as rayon::iter::plumbing::ProducerCallback<I>>::callback
             at /Users/xxx/.cargo/registry/src/github.com-1ecc6299db9ec823/rayon-1.6.1/src/iter/plumbing/mod.rs:373:13
  44: <rayon::vec::Drain<T> as rayon::iter::IndexedParallelIterator>::with_producer
             at /Users/xxx/.cargo/registry/src/github.com-1ecc6299db9ec823/rayon-1.6.1/src/vec.rs:147:13
  45: <rayon::vec::IntoIter<T> as rayon::iter::IndexedParallelIterator>::with_producer
             at /Users/xxx/.cargo/registry/src/github.com-1ecc6299db9ec823/rayon-1.6.1/src/vec.rs:83:9
  46: rayon::iter::plumbing::bridge
             at /Users/xxx/.cargo/registry/src/github.com-1ecc6299db9ec823/rayon-1.6.1/src/iter/plumbing/mod.rs:357:12
  47: <rayon::vec::IntoIter<T> as rayon::iter::ParallelIterator>::drive_unindexed
             at /Users/xxx/.cargo/registry/src/github.com-1ecc6299db9ec823/rayon-1.6.1/src/vec.rs:58:9
  48: <rayon::iter::map::Map<I,F> as rayon::iter::ParallelIterator>::drive_unindexed
             at /Users/xxx/.cargo/registry/src/github.com-1ecc6299db9ec823/rayon-1.6.1/src/iter/map.rs:49:9
  49: <rayon::iter::map::Map<I,F> as rayon::iter::ParallelIterator>::drive_unindexed
             at /Users/xxx/.cargo/registry/src/github.com-1ecc6299db9ec823/rayon-1.6.1/src/iter/map.rs:49:9
  50: <rayon::iter::while_some::WhileSome<I> as rayon::iter::ParallelIterator>::drive_unindexed
             at /Users/xxx/.cargo/registry/src/github.com-1ecc6299db9ec823/rayon-1.6.1/src/iter/while_some.rs:44:9
  51: rayon::iter::extend::<impl rayon::iter::ParallelExtend<T> for alloc::vec::Vec<T>>::par_extend
             at /Users/xxx/.cargo/registry/src/github.com-1ecc6299db9ec823/rayon-1.6.1/src/iter/extend.rs:576:28
  52: rayon::iter::from_par_iter::collect_extended
             at /Users/xxx/.cargo/registry/src/github.com-1ecc6299db9ec823/rayon-1.6.1/src/iter/from_par_iter.rs:17:5
  53: rayon::iter::from_par_iter::<impl rayon::iter::FromParallelIterator<T> for alloc::vec::Vec<T>>::from_par_iter
             at /Users/xxx/.cargo/registry/src/github.com-1ecc6299db9ec823/rayon-1.6.1/src/iter/from_par_iter.rs:30:9
  54: rayon::iter::ParallelIterator::collect
             at /Users/xxx/.cargo/registry/src/github.com-1ecc6299db9ec823/rayon-1.6.1/src/iter/mod.rs:2048:9
  55: rayon::result::<impl rayon::iter::FromParallelIterator<core::result::Result<T,E>> for core::result::Result<C,E>>::from_par_iter
             at /Users/xxx/.cargo/registry/src/github.com-1ecc6299db9ec823/rayon-1.6.1/src/result.rs:121:26
  56: rayon::iter::ParallelIterator::collect
             at /Users/xxx/.cargo/registry/src/github.com-1ecc6299db9ec823/rayon-1.6.1/src/iter/mod.rs:2048:9
  57: polars_io::ndjson_core::ndjson::CoreJsonReader::parse_json::{{closure}}
             at /Users/xxx/.cargo/registry/src/github.com-1ecc6299db9ec823/polars-io-0.25.1/src/ndjson_core/ndjson.rs:231:13
  58: rayon_core::thread_pool::ThreadPool::install::{{closure}}
             at /Users/xxx/.cargo/registry/src/github.com-1ecc6299db9ec823/rayon-core-1.10.1/src/thread_pool/mod.rs:110:40
  59: rayon_core::registry::Registry::in_worker_cold::{{closure}}::{{closure}}
             at /Users/xxx/.cargo/registry/src/github.com-1ecc6299db9ec823/rayon-core-1.10.1/src/registry.rs:506:21
  60: rayon_core::job::JobResult<T>::call::{{closure}}
             at /Users/xxx/.cargo/registry/src/github.com-1ecc6299db9ec823/rayon-core-1.10.1/src/job.rs:212:41
  61: <core::panic::unwind_safe::AssertUnwindSafe<F> as core::ops::function::FnOnce<()>>::call_once
             at /rustc/897e37553bba8b42751c67658967889d11ecd120/library/core/src/panic/unwind_safe.rs:271:9
  62: std::panicking::try::do_call
             at /rustc/897e37553bba8b42751c67658967889d11ecd120/library/std/src/panicking.rs:492:40
  63: ___rust_try
  64: std::panicking::try
             at /rustc/897e37553bba8b42751c67658967889d11ecd120/library/std/src/panicking.rs:456:19
  65: std::panic::catch_unwind
             at /rustc/897e37553bba8b42751c67658967889d11ecd120/library/std/src/panic.rs:137:14
  66: rayon_core::unwind::halt_unwinding
             at /Users/xxx/.cargo/registry/src/github.com-1ecc6299db9ec823/rayon-core-1.10.1/src/unwind.rs:17:5
  67: rayon_core::job::JobResult<T>::call
             at /Users/xxx/.cargo/registry/src/github.com-1ecc6299db9ec823/rayon-core-1.10.1/src/job.rs:212:15
  68: <rayon_core::job::StackJob<L,F,R> as rayon_core::job::Job>::execute
             at /Users/xxx/.cargo/registry/src/github.com-1ecc6299db9ec823/rayon-core-1.10.1/src/job.rs:114:32
  69: rayon_core::job::JobRef::execute
             at /Users/xxx/.cargo/registry/src/github.com-1ecc6299db9ec823/rayon-core-1.10.1/src/job.rs:58:9
  70: rayon_core::registry::WorkerThread::execute
             at /Users/xxx/.cargo/registry/src/github.com-1ecc6299db9ec823/rayon-core-1.10.1/src/registry.rs:804:9
  71: rayon_core::registry::WorkerThread::wait_until_cold
             at /Users/xxx/.cargo/registry/src/github.com-1ecc6299db9ec823/rayon-core-1.10.1/src/registry.rs:781:17
  72: rayon_core::registry::WorkerThread::wait_until
             at /Users/xxx/.cargo/registry/src/github.com-1ecc6299db9ec823/rayon-core-1.10.1/src/registry.rs:755:13
  73: rayon_core::registry::main_loop
             at /Users/xxx/.cargo/registry/src/github.com-1ecc6299db9ec823/rayon-core-1.10.1/src/registry.rs:889:5
  74: rayon_core::registry::ThreadBuilder::run
             at /Users/xxx/.cargo/registry/src/github.com-1ecc6299db9ec823/rayon-core-1.10.1/src/registry.rs:53:18
  75: <rayon_core::registry::DefaultSpawn as rayon_core::registry::ThreadSpawn>::spawn::{{closure}}
             at /Users/xxx/.cargo/registry/src/github.com-1ecc6299db9ec823/rayon-core-1.10.1/src/registry.rs:98:20
note: Some details are omitted, run with `RUST_BACKTRACE=full` for a verbose backtrace.

Expected behavior

The example above should not panic; struct fields that are missing from a row should be filled with null.

Installed versions

Cargo.toml:

[dependencies]
    polars = { version = "0.25.1", features = [
        "dtype-full",
        "json",
    ] }

Rust version: 1.65.0 MacOS Monterey 12.6.1

therealhieu avatar Dec 21 '22 12:12 therealhieu

The issue is from:

            DataType::Struct(fields) => {
                // the fields of the struct
                let mut series_fields = Vec::with_capacity(fields.len());
                for (i, field) in fields.iter().enumerate() {
                    let mut field_avs = Vec::with_capacity(av.len());

                    for av in av.iter() {
                        match av {
                            AnyValue::StructOwned(payload) => {
                                for (l, r) in fields.iter().zip(payload.1.iter()) {
                                    if l.name() != r.name() {
                                        return Err(PolarsError::ComputeError(
                                            "struct orders must remain the same".into(),
                                        ));
                                    }
                                }

                                let av_val = payload.0[i].clone(); // <------- Here (index out of bounds)
                                field_avs.push(av_val)
                            }
                            _ => field_avs.push(AnyValue::Null),
                        }
                    }
                    series_fields.push(Series::new(field.name(), &field_avs))
                }
                return Ok(StructChunked::new(name, &series_fields)
                    .unwrap()
                    .into_series());
            }

I will make a PR to fix this

therealhieu avatar Dec 21 '22 13:12 therealhieu

So this is actually partially fixed on main. If I run the same code pointing to the latest commit, I get the following error instead:

thread 'main' panicked at 'Failed to read jsonlines: SchemaMisMatch(Owned("cannot vstack: because column datatypes (dtypes) in the two DataFrames do not match for left.name='struct' with left.dtype=struct[3] != right.dtype=struct[1] with right.name='struct'"))', src/main.rs:13:10

So it parses it correctly, but it has some issues with the schema. We could likely leverage the code in the python read_dicts for this.

universalmind303 avatar Dec 21 '22 20:12 universalmind303

This issue seems too complex for me to fix by myself. I hope you can help me, @universalmind303. Here are some test cases with their results at commit a9d25281fcef6b275e2196fb0daa7fbe45c77ba4:

Test case 1:

Input:

{"struct": {"int_list": [4, 5, 6]}}
{"struct": {"int_list": [1, 2, 3], "float": 5.0, "str_list": ["a", "b", "c"]}, "int_opt": null, "float_list": [1.1, 2.2]}

Expected:

+------------------------------+---------+-----------------------+
|      struct(struct[3])       | int_opt | float_list(list[f64]) |
+------------------------------+---------+-----------------------+
| {[4,5,6], null, null}        | null    | null                  |
| {[1,2,3], 5.0,["a","b","c"]} | null    | [1.1,2.2]             |
+------------------------------+---------+-----------------------+

Actual:

┌─────────────┬────────────┐
│ struct      ┆ float_list │
│ ---         ┆ ---        │
│ struct[1]   ┆ list[f64]  │
╞═════════════╪════════════╡
│ {[4, 5, 6]} ┆ null       │
├╌╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌┤
│ {[1, 2, 3]} ┆ [1.1, 2.2] │
└─────────────┴────────────┘

Test case 2:

Data:

{"struct": {"int_list": [1, 2, 3], "float": 5.0, "str_list": ["a", "b", "c"]}, "int_opt": null, "float_list": [1.1, 2.2]}
{"struct": {"int_list": [4, 5, 6]}}

Expected:

+------------------------------+---------+-----------------------+
|      struct(struct[3])       | int_opt | float_list(list[f64]) |
+------------------------------+---------+-----------------------+
| {[1,2,3], 5.0,["a","b","c"]} | null    | [1.1,2.2]             |
| {[4,5,6], null, null}        | null    | null                  |
+------------------------------+---------+-----------------------+

Actual:

thread 'main' panicked at 'Failed to read jsonlines: SchemaMisMatch(Owned("cannot vstack: because column datatypes (dtypes) in the two DataFrames do not match for left.name='struct' with left.dtype=struct[3] != right.dtype=struct[1] with right.name='struct'"))', src/main.rs:19:10
stack backtrace:
   0: rust_begin_unwind
             at /rustc/897e37553bba8b42751c67658967889d11ecd120/library/std/src/panicking.rs:584:5
   1: core::panicking::panic_fmt
             at /rustc/897e37553bba8b42751c67658967889d11ecd120/library/core/src/panicking.rs:142:14
   2: core::result::unwrap_failed
             at /rustc/897e37553bba8b42751c67658967889d11ecd120/library/core/src/result.rs:1785:5
   3: core::result::Result<T,E>::expect
             at /rustc/897e37553bba8b42751c67658967889d11ecd120/library/core/src/result.rs:1064:23
   4: polars_ndjson_reader_issue::main
             at ./src/main.rs:16:14
   5: core::ops::function::FnOnce::call_once
             at /rustc/897e37553bba8b42751c67658967889d11ecd120/library/core/src/ops/function.rs:248:5
note: Some details are omitted, run with `RUST_BACKTRACE=full` for a verbose backtrace.

Test case 3:

Data:

{"struct": {"int_list": [4, 5, 6]}}
{"struct": {"int_list": [1, 2, 3], "float": 5.0, "str_list": ["a", "b", "c"]}, "int_opt": null, "float_list": [1.1, 2.2]}
{"struct": {"int_list": [4, 5, 6]}}

Expected:

+------------------------------+---------+-----------------------+
|      struct(struct[3])       | int_opt | float_list(list[f64]) |
+------------------------------+---------+-----------------------+
| {[4,5,6], null, null}        | null    | null                  |
| {[1,2,3], 5.0,["a","b","c"]} | null    | [1.1,2.2]             |
| {[4,5,6], null, null}        | null    | null                  |
+------------------------------+---------+-----------------------+

Actual:

`target/debug/polars-ndjson-reader-issue`
thread 'main' panicked at 'Failed to read jsonlines: SchemaMisMatch(Owned("cannot vstack: because column datatypes (dtypes) in the two DataFrames do not match for left.name='struct' with left.dtype=struct[1] != right.dtype=struct[3] with right.name='struct'"))', src/main.rs:19:10
stack backtrace:
   0: rust_begin_unwind
             at /rustc/897e37553bba8b42751c67658967889d11ecd120/library/std/src/panicking.rs:584:5
   1: core::panicking::panic_fmt
             at /rustc/897e37553bba8b42751c67658967889d11ecd120/library/core/src/panicking.rs:142:14
   2: core::result::unwrap_failed
             at /rustc/897e37553bba8b42751c67658967889d11ecd120/library/core/src/result.rs:1785:5
   3: core::result::Result<T,E>::expect
             at /rustc/897e37553bba8b42751c67658967889d11ecd120/library/core/src/result.rs:1064:23
   4: polars_ndjson_reader_issue::main
             at ./src/main.rs:16:14
   5: core::ops::function::FnOnce::call_once
             at /rustc/897e37553bba8b42751c67658967889d11ecd120/library/core/src/ops/function.rs:248:5

therealhieu avatar Dec 22 '22 07:12 therealhieu

Errors still occur in version 0.26.1

therealhieu avatar Dec 22 '22 13:12 therealhieu

I'm having the same (or a similar) issue with the Python library as well. I'm not sure if I should open a new issue, so I'm putting it here. Contents of test.json:

{"key1":"value1", "key2": "value2", "key3": {"k1": 2, "k3": "value5", "k10": 5}}
{"key1":"value5", "key2": "value4", "key3": {"k1": 2, "k5": "value5", "k10": 4}}
{"key1":"value6", "key3": {"k1": 5, "k3": "value5"}}
df = pl.read_ndjson("test.json")

Error: PanicException: called `Result::unwrap()` on an `Err` value: ComputeError(Borrowed("struct orders must remain the same"))

Cyb3r-Monk avatar Jan 02 '23 19:01 Cyb3r-Monk

Sorry, but are there any updates on this issue, @universalmind303? Can you point me to the branch for this issue? I may be able to help by providing some test cases.

therealhieu avatar Jan 04 '23 06:01 therealhieu

Some updates: I've been refactoring the JSON logic to use the same code as the row reader, so you should see the same results as when using dicts.

One thing that I noticed, though, in your first example:

{"struct": {"int_list": [1, 2, 3], "float": 5.0, "str_list": ["a", "b", "c"]}, "int_opt": null, "float_list": [1.1, 2.2]}
{"struct": {"int_list": [4, 5, 6]}}

Due to how arrow infers schemas for null types, the int_opt field would only show up if at least one row has a non-null int_opt value, such as:

{"struct": {"int_list": [1, 2, 3], "float": 5.0, "str_list": ["a", "b", "c"]}, "int_opt": null, "float_list": [1.1, 2.2]}
{"struct": {"int_list": [4, 5, 6]}}
{"struct": {"int_list": [4, 5, 6]}, "int_opt": 1}

You can see that logic here: https://github.com/jorgecarleitao/arrow2/blob/main/src/io/ndjson/read/file.rs#L119

universalmind303 avatar Jan 04 '23 19:01 universalmind303