awkward icon indicating copy to clipboard operation
awkward copied to clipboard

Segfault in pyarrow observed in #1619

Open jpivarski opened this issue 2 years ago • 13 comments

Version of Awkward Array

agoose77/fix-pyarrow-empty-field (branch)

Description and code to reproduce

In #1619, @agoose77 saw the following segfault:

>>> import awkward as ak
>>> array = ak._v2.Array(
...         [
...             {"": {"x": 1, "y": 1.1}},
...             {"": {"x": 2, "y": 2.2}},
...             {"": {"x": 3, "y": 3.3}},
...             {"": {"x": 4, "y": 4.4}},
...             {"": {"x": 5, "y": 5.5}},
...             {"": {"x": 6, "y": 6.6}},
...             {"": {"x": 7, "y": 7.7}},
...             {"": {"x": 8, "y": 8.8}},
...             {"": {"x": 9, "y": 9.9}},
...         ]
...     )
>>> ak._v2.to_parquet(array, "tmp.parquet")   # note: to_parquet should return None, not FileMetaData
<pyarrow._parquet.FileMetaData object at 0x7f8239f93b80>
  created_by: parquet-cpp-arrow version 9.0.0
  num_columns: 2
  num_rows: 9
  num_row_groups: 1
  format_version: 1.0
  serialized_size: 0

>>> ak._v2.from_parquet("tmp.parquet")   # read all columns: okay
<Array [{x: 1, y: 1.1}, ..., {x: 9, ...}] type='9 * {x: int64, y: float64}'>

>>> ak._v2.from_parquet("tmp.parquet", columns=["x"])   # read one column: bad
Segmentation fault (core dumped)

Moreover, this segfault does not occur with nested records:

>>> import awkward as ak
>>> array = ak._v2.Array(
...         [
...             {"x": 1, "y": 1.1},
...             {"x": 2, "y": 2.2},
...             {"x": 3, "y": 3.3},
...             {"x": 4, "y": 4.4},
...             {"x": 5, "y": 5.5},
...             {"x": 6, "y": 6.6},
...             {"x": 7, "y": 7.7},
...             {"x": 8, "y": 8.8},
...             {"x": 9, "y": 9.9},
...         ]
...     )
>>> ak._v2.to_parquet(array, "tmp.parquet")
<pyarrow._parquet.FileMetaData object at 0x7f7e10ddfd10>
  created_by: parquet-cpp-arrow version 9.0.0
  num_columns: 2
  num_rows: 9
  num_row_groups: 1
  format_version: 1.0
  serialized_size: 0

>>> ak._v2.from_parquet("tmp.parquet")
<Array [{x: 1, y: 1.1}, ..., {x: 9, ...}] type='9 * {x: int64, y: float64}'>

>>> ak._v2.from_parquet("tmp.parquet", columns=["x"])
<Array [{x: 1}, {x: 2}, {...}, ..., {x: 8}, {x: 9}] type='9 * {x: int64}'>

It's the nested records, not the empty-string field name, that's the source of trouble:

>>> import awkward as ak
>>> array = ak._v2.Array(
...         [
...             {"z": {"x": 1, "y": 1.1}},
...             {"z": {"x": 2, "y": 2.2}},
...             {"z": {"x": 3, "y": 3.3}},
...             {"z": {"x": 4, "y": 4.4}},
...             {"z": {"x": 5, "y": 5.5}},
...             {"z": {"x": 6, "y": 6.6}},
...             {"z": {"x": 7, "y": 7.7}},
...             {"z": {"x": 8, "y": 8.8}},
...             {"z": {"x": 9, "y": 9.9}},
...         ]
...     )
>>> ak._v2.to_parquet(array, "tmp.parquet")
<pyarrow._parquet.FileMetaData object at 0x7fe1f831e9f0>
  created_by: parquet-cpp-arrow version 9.0.0
  num_columns: 2
  num_rows: 9
  num_row_groups: 1
  format_version: 1.0
  serialized_size: 0

>>> ak._v2.from_parquet("tmp.parquet")
<Array [{z: {x: 1, y: 1.1}}, {...}, ..., {...}] type='9 * {z: {x: int64, y:...'>

>>> ak._v2.from_parquet("tmp.parquet", columns=["x"])
<Array [{}, {}, {}, {}, {}, {}, {}, {}, {}] type='9 * {}'>

>>> ak._v2.from_parquet("tmp.parquet", columns=["z.x"])
Segmentation fault (core dumped)

The first step in to_parquet is to_arrow_table (specifically a pyarrow.Table to go to Parquet). However, something's already wrong at this stage:

>>> import awkward as ak
>>> array = ak._v2.Array(
...         [
...             {"z": {"x": 1, "y": 1.1}},
...             {"z": {"x": 2, "y": 2.2}},
...             {"z": {"x": 3, "y": 3.3}},
...             {"z": {"x": 4, "y": 4.4}},
...             {"z": {"x": 5, "y": 5.5}},
...             {"z": {"x": 6, "y": 6.6}},
...             {"z": {"x": 7, "y": 7.7}},
...             {"z": {"x": 8, "y": 8.8}},
...             {"z": {"x": 9, "y": 9.9}},
...         ]
...     )

>>> ak.to_arrow_table(array)
pyarrow.Table
: large_list<item: string not null> not null
  child 0, item: string not null
----
: [[["z"],["z"],...,["z"],["z"]]]

>>> ak.to_arrow_table(array).to_pylist()
[{'': ['z']},
 {'': ['z']},
 {'': ['z']},
 {'': ['z']},
 {'': ['z']},
 {'': ['z']},
 {'': ['z']},
 {'': ['z']},
 {'': ['z']}]

If I build this construction using only pyarrow, it would be

>>> import pyarrow as pa
>>> array = pa.array(
...         [
...             {"z": {"x": 1, "y": 1.1}},
...             {"z": {"x": 2, "y": 2.2}},
...             {"z": {"x": 3, "y": 3.3}},
...             {"z": {"x": 4, "y": 4.4}},
...             {"z": {"x": 5, "y": 5.5}},
...             {"z": {"x": 6, "y": 6.6}},
...             {"z": {"x": 7, "y": 7.7}},
...             {"z": {"x": 8, "y": 8.8}},
...             {"z": {"x": 9, "y": 9.9}},
...         ]
...     )
>>> array.to_pylist()
[{'z': {'x': 1, 'y': 1.1}},
 {'z': {'x': 2, 'y': 2.2}},
 {'z': {'x': 3, 'y': 3.3}},
 {'z': {'x': 4, 'y': 4.4}},
 {'z': {'x': 5, 'y': 5.5}},
 {'z': {'x': 6, 'y': 6.6}},
 {'z': {'x': 7, 'y': 7.7}},
 {'z': {'x': 8, 'y': 8.8}},
 {'z': {'x': 9, 'y': 9.9}}]
>>> pa.Table.from_arrays([array], names=[""])
pyarrow.Table
: struct<z: struct<x: int64, y: double>>
  child 0, z: struct<x: int64, y: double>
      child 0, x: int64
      child 1, y: double
----
: [
  -- is_valid: all not null
  -- child 0 type: struct<x: int64, y: double>
    -- is_valid: all not null
    -- child 0 type: int64
[1,2,3,4,5,6,7,8,9]
    -- child 1 type: double
[1.1,2.2,3.3,4.4,5.5,6.6,7.7,8.8,9.9]]
>>> pa.Table.from_arrays([array], names=[""]).to_pylist()
[{'': {'z': {'x': 1, 'y': 1.1}}},
 {'': {'z': {'x': 2, 'y': 2.2}}},
 {'': {'z': {'x': 3, 'y': 3.3}}},
 {'': {'z': {'x': 4, 'y': 4.4}}},
 {'': {'z': {'x': 5, 'y': 5.5}}},
 {'': {'z': {'x': 6, 'y': 6.6}}},
 {'': {'z': {'x': 7, 'y': 7.7}}},
 {'': {'z': {'x': 8, 'y': 8.8}}},
 {'': {'z': {'x': 9, 'y': 9.9}}}]

So evidently, we're building it wrong. This is the first step toward enlightenment.

jpivarski avatar Aug 25 '22 22:08 jpivarski

Reproducible in main.

jpivarski avatar Aug 25 '22 23:08 jpivarski

Ah yes, my test was a bit daft when I first wrote it (and we probably should error if the user tries to create such a file, as we reserve the root "" field).

agoose77 avatar Aug 26 '22 10:08 agoose77

we probably should error if the user tries to create such a file, as we reserve the root "" field

Users creating fields with empty names is not a case I had considered. Maybe we'd still be okay because the first one (only) is reserved for the pyarrow.Table and we'd always have the right off-by-one. But if there are ambiguous cases, then we could forbid it.

jpivarski avatar Aug 26 '22 15:08 jpivarski

In the above, where I thought that ak.to_arrow_table was wrong, it's because I didn't include the ._v2. The v1 function was misinterpreting the v2 array and making a mess of it. So—never mind the conclusion that to_arrow_table is at fault. As far as I can see, it isn't.

jpivarski avatar Aug 26 '22 17:08 jpivarski

>>> array.show(type=True)
type: 9 * {
    z: {
        x: int64,
        y: float64
    }
}
[{z: {x: 1, y: 1.1}},
 {z: {x: 2, y: 2.2}},
 {z: {x: 3, y: 3.3}},
 {z: {x: 4, y: 4.4}},
 {z: {x: 5, y: 5.5}},
 {z: {x: 6, y: 6.6}},
 {z: {x: 7, y: 7.7}},
 {z: {x: 8, y: 8.8}},
 {z: {x: 9, y: 9.9}}]
>>> ak._v2.from_arrow(ak._v2.to_arrow_table(array)).show(type=True)
type: 9 * {
    z: {
        x: int64,
        y: float64
    }
}
[{z: {x: 1, y: 1.1}},
 {z: {x: 2, y: 2.2}},
 {z: {x: 3, y: 3.3}},
 {z: {x: 4, y: 4.4}},
 {z: {x: 5, y: 5.5}},
 {z: {x: 6, y: 6.6}},
 {z: {x: 7, y: 7.7}},
 {z: {x: 8, y: 8.8}},
 {z: {x: 9, y: 9.9}}]

jpivarski avatar Aug 26 '22 17:08 jpivarski

Constructing the Parquet data with pyarrow alone, I can't reproduce the segfault.

Without ExtensionArrays:

import numpy as np
import pyarrow as pa
import pyarrow.parquet as pq

# Leaf arrays built directly from raw buffers. The first buffer slot is the
# validity bitmap; None means "all values valid".
one = pa.Array.from_buffers(
    pa.int64(),
    3,
    [None, pa.py_buffer(np.array([10, 20, 30], dtype=np.int64))],
)
two = pa.Array.from_buffers(
    pa.float64(),
    3,
    [None, pa.py_buffer(np.array([1.1, 2.2, 3.3], dtype=np.float64))],
)
# One level of struct nesting: {"one": int64, "two": float64}, all non-nullable.
shallow = pa.Array.from_buffers(
    pa.struct([
        pa.field("one", one.type, False),
        pa.field("two", two.type, False),
    ]),
    3,
    [None],
    children=[one, two],
)
assert shallow.to_pylist() == [
    {"one": 10, "two": 1.1},
    {"one": 20, "two": 2.2},
    {"one": 30, "two": 3.3},
]

# Round-trip the single-level struct through Parquet; reading the whole table
# back succeeds.
table = pa.Table.from_arrays([shallow], names=["column"])
pq.write_table(table, "shallow.parquet")
table2 = pq.read_table("shallow.parquet")
assert table2.to_pylist() == [
    {"column": {"one": 10, "two": 1.1}},
    {"column": {"one": 20, "two": 2.2}},
    {"column": {"one": 30, "two": 3.3}},
]

# Second level of nesting: wrap the shallow struct in another struct under
# the field name "nest".
deep = pa.Array.from_buffers(
    pa.struct([
        pa.field("nest", shallow.type, False),
    ]),
    3,
    [None],
    children=[shallow],
)
assert deep.to_pylist() == [
    {"nest": {"one": 10, "two": 1.1}},
    {"nest": {"one": 20, "two": 2.2}},
    {"nest": {"one": 30, "two": 3.3}},
]

# Round-trip the doubly nested struct through Parquet; full-table reads also
# succeed here — no segfault without extension types.
table = pa.Table.from_arrays([deep], names=["column"])
pq.write_table(table, "deep.parquet")
table2 = pq.read_table("deep.parquet")
assert table2.to_pylist() == [
    {"column": {"nest": {"one": 10, "two": 1.1}}},
    {"column": {"nest": {"one": 20, "two": 2.2}}},
    {"column": {"nest": {"one": 30, "two": 3.3}}},
]

With ExtensionArrays:

import json

# Minimal pyarrow ExtensionType that attaches an arbitrary JSON-serializable
# annotation to any storage type — a stand-in for Awkward's own extension
# type, registered under the name "my:app".
class AnnotatedType(pa.ExtensionType):
    def __init__(self, storage_type, annotation):
        # Stash the annotation before calling the base constructor, which
        # triggers __arrow_ext_serialize__.
        self.annotation = annotation
        super().__init__(storage_type, "my:app")
    def __arrow_ext_serialize__(self):
        # Persist the annotation as JSON bytes in the Arrow type metadata.
        return json.dumps(self.annotation).encode()
    @classmethod
    def __arrow_ext_deserialize__(cls, storage_type, serialized):
        annotation = json.loads(serialized.decode())
        return cls(storage_type, annotation)
    @property
    def num_buffers(self):
        # Delegate to the storage type so Array.from_buffers works on it.
        return self.storage_type.num_buffers
    @property
    def num_fields(self):
        return self.storage_type.num_fields

# Register once (with a throwaway null storage type) so "my:app" metadata
# can be recognized when files are read back.
pa.register_extension_type(AnnotatedType(pa.null(), None))

# Same construction as before, but every level is wrapped in AnnotatedType.
one = pa.Array.from_buffers(
    AnnotatedType(pa.int64(), {"annotated": 1}),
    3,
    [None, pa.py_buffer(np.array([10, 20, 30], dtype=np.int64))],
)
two = pa.Array.from_buffers(
    AnnotatedType(pa.float64(), {"annotated": 2}),
    3,
    [None, pa.py_buffer(np.array([1.1, 2.2, 3.3], dtype=np.float64))],
)
# Annotated single-level struct over the two annotated leaves.
shallow = pa.Array.from_buffers(
    AnnotatedType(
        pa.struct([
            pa.field("one", one.type, False),
            pa.field("two", two.type, False),
        ]),
        {"annotated": 3},
    ),
    3,
    [None],
    children=[one, two],
)
assert shallow.to_pylist() == [
    {"one": 10, "two": 1.1},
    {"one": 20, "two": 2.2},
    {"one": 30, "two": 3.3},
]

# Full-table Parquet round-trip of the annotated single-level struct succeeds.
table = pa.Table.from_arrays([shallow], names=["column"])
pq.write_table(table, "shallow_annotated.parquet")
table2 = pq.read_table("shallow_annotated.parquet")
assert table2.to_pylist() == [
    {"column": {"one": 10, "two": 1.1}},
    {"column": {"one": 20, "two": 2.2}},
    {"column": {"one": 30, "two": 3.3}},
]

# Two levels of annotated struct nesting.
deep = pa.Array.from_buffers(
    AnnotatedType(
        pa.struct([
            pa.field("nest", shallow.type, False),
        ]),
        {"annotated": 4},
    ),
    3,
    [None],
    children=[shallow],
)
assert deep.to_pylist() == [
    {"nest": {"one": 10, "two": 1.1}},
    {"nest": {"one": 20, "two": 2.2}},
    {"nest": {"one": 30, "two": 3.3}},
]

# Full-table Parquet round-trip of the doubly nested annotated struct also
# succeeds — the segfault only appears later, with column selection.
table = pa.Table.from_arrays([deep], names=["column"])
pq.write_table(table, "deep_annotated.parquet")
table2 = pq.read_table("deep_annotated.parquet")
assert table2.to_pylist() == [
    {"column": {"nest": {"one": 10, "two": 1.1}}},
    {"column": {"nest": {"one": 20, "two": 2.2}}},
    {"column": {"nest": {"one": 30, "two": 3.3}}},
]

jpivarski avatar Aug 26 '22 18:08 jpivarski

Actually, that wasn't quite the right test; this is:

>>> import pyarrow.parquet as pq
>>> pq.ParquetFile("shallow.parquet").read_row_groups([0], ["column.one"])
pyarrow.Table
column: struct<one: int64 not null>
  child 0, one: int64 not null
----
column: [
  -- is_valid: all not null
  -- child 0 type: int64
[10,20,30]]
>>> pq.ParquetFile("shallow.parquet").read_row_groups([0], ["column.two"])
pyarrow.Table
column: struct<two: double not null>
  child 0, two: double not null
----
column: [
  -- is_valid: all not null
  -- child 0 type: double
[1.1,2.2,3.3]]
>>> pq.ParquetFile("shallow_annotated.parquet").read_row_groups([0], ["column.one"])
pyarrow.Table
column: struct<one: int64 not null>
  child 0, one: int64 not null
----
column: [
  -- is_valid: all not null
  -- child 0 type: int64
[10,20,30]]
>>> pq.ParquetFile("shallow_annotated.parquet").read_row_groups([0], ["column.two"])
pyarrow.Table
column: struct<two: double not null>
  child 0, two: double not null
----
column: [
  -- is_valid: all not null
  -- child 0 type: double
[1.1,2.2,3.3]]
>>> pq.ParquetFile("deep.parquet").read_row_groups([0], ["column.nest.one"])
pyarrow.Table
column: struct<nest: struct<one: int64 not null> not null>
  child 0, nest: struct<one: int64 not null> not null
      child 0, one: int64 not null
----
column: [
  -- is_valid: all not null
  -- child 0 type: struct<one: int64 not null>
    -- is_valid: all not null
    -- child 0 type: int64
[10,20,30]]
>>> pq.ParquetFile("deep.parquet").read_row_groups([0], ["column.nest.two"])
pyarrow.Table
column: struct<nest: struct<two: double not null> not null>
  child 0, nest: struct<two: double not null> not null
      child 0, two: double not null
----
column: [
  -- is_valid: all not null
  -- child 0 type: struct<two: double not null>
    -- is_valid: all not null
    -- child 0 type: double
[1.1,2.2,3.3]]
>>> pq.ParquetFile("deep_annotated.parquet").read_row_groups([0], ["column.nest.one"])
pyarrow.Table
column: struct<nest: struct<one: int64 not null> not null>
  child 0, nest: struct<one: int64 not null> not null
      child 0, one: int64 not null
----
column: [
  -- is_valid: all not null
  -- child 0 type: struct<one: int64 not null>
    -- is_valid: all not null
    -- child 0 type: int64
[10,20,30]]
>>> pq.ParquetFile("deep_annotated.parquet").read_row_groups([0], ["column.nest.two"])
pyarrow.Table
column: struct<nest: struct<two: double not null> not null>
  child 0, nest: struct<two: double not null> not null
      child 0, two: double not null
----
column: [
  -- is_valid: all not null
  -- child 0 type: struct<two: double not null>
    -- is_valid: all not null
    -- child 0 type: double
[1.1,2.2,3.3]]

No segfaults, even when we select a column.

jpivarski avatar Aug 26 '22 18:08 jpivarski

If I create a tmp.parquet that would segfault in ak._v2.from_parquet,

>>> ak._v2.from_parquet("tmp.parquet", columns=["z.x"])
Segmentation fault (core dumped)

the same does not segfault when being read directly by pyarrow (i.e. it doesn't seem to be a problem with the file itself):

>>> pq.ParquetFile("tmp.parquet").read_row_groups([0], ["z.x"])
pyarrow.Table
z: struct<x: int64 not null> not null
  child 0, x: int64 not null
----
z: [
  -- is_valid: all not null
  -- child 0 type: int64
[1,2]]
>>> pq.ParquetFile("tmp.parquet").read_row_groups([0], ["z.y"])
pyarrow.Table
z: struct<y: double not null> not null
  child 0, y: double not null
----
z: [
  -- is_valid: all not null
  -- child 0 type: double
[1.1,2.2]]

jpivarski avatar Aug 26 '22 18:08 jpivarski

@jpivarski is our code producing the same columns with content.columns?

agoose77 avatar Aug 26 '22 19:08 agoose77

So far, it looks like Awkward is preparing Arrow arrays in the same format as the all-pyarrow method. Though there must be some difference.

jpivarski avatar Aug 26 '22 19:08 jpivarski

Reading back this tmp.parquet file (produced by Awkward), with pure pyarrow causes the segfault, if we register our mock ExtensionType with the name "awkward":

>>> import pyarrow.parquet as pq
>>> import pyarrow as pa
>>> import json
>>> 
>>> class AnnotatedType(pa.ExtensionType):
...     def __init__(self, storage_type, annotation):
...         self.annotation = annotation
...         super().__init__(storage_type, "awkward")
...     def __arrow_ext_serialize__(self):
...         return json.dumps(self.annotation).encode()
...     @classmethod
...     def __arrow_ext_deserialize__(cls, storage_type, serialized):
...         annotation = json.loads(serialized.decode())
...         return cls(storage_type, annotation)
...     @property
...     def num_buffers(self):
...         return self.storage_type.num_buffers
...     @property
...     def num_fields(self):
...         return self.storage_type.num_fields
... 
>>> pa.register_extension_type(AnnotatedType(pa.null(), None))
>>> pq.ParquetFile("tmp.parquet").read_row_groups([0])
pyarrow.Table
z: extension<awkward<AnnotatedType>> not null
----
z: [  -- is_valid: all not null
  -- child 0 type: extension<awkward<AnnotatedType>>
[1,2]
  -- child 1 type: extension<awkward<AnnotatedType>>
[1.1,2.2]]
>>> pq.ParquetFile("tmp.parquet").read_row_groups([0], ["z.x"])
Segmentation fault (core dumped)

jpivarski avatar Aug 26 '22 19:08 jpivarski

Okay, I have a minimally reproducing example that doesn't use Awkward. The reason I missed it before is because https://github.com/scikit-hep/awkward/issues/1635#issuecomment-1228804025 didn't load the extension type. The reason we need two levels of nested records in Awkward is because the first is interpreted as the Table (when we have a RecordArray, it becomes a Table).

In this minimally reproducing example, there is only one level of pa.StructArray (not pa.Table!), and that's all you need.

import numpy as np
import pyarrow as pa
import pyarrow.parquet as pq

# Leaf arrays built directly from raw buffers (first buffer slot is the
# validity bitmap; None means all valid).
one = pa.Array.from_buffers(
    pa.int64(),
    3,
    [None, pa.py_buffer(np.array([10, 20, 30], dtype=np.int64))],
)
two = pa.Array.from_buffers(
    pa.float64(),
    3,
    [None, pa.py_buffer(np.array([1.1, 2.2, 3.3], dtype=np.float64))],
)
# A single struct level is all that's needed for the minimal reproducer.
record = pa.Array.from_buffers(
    pa.struct([
        pa.field("one", one.type, False),
        pa.field("two", two.type, False),
    ]),
    3,
    [None],
    children=[one, two],
)
assert record.to_pylist() == [
    {"one": 10, "two": 1.1},
    {"one": 20, "two": 2.2},
    {"one": 30, "two": 3.3},
]

# Control case: with plain (non-extension) types, reading back a single
# selected column ("column.one") works fine.
table = pa.Table.from_arrays([record], names=["column"])
pq.write_table(table, "record.parquet")
table2 = pq.ParquetFile("record.parquet").read_row_groups([0], ["column.one"])
assert table2.to_pylist() == [
    {"column": {"one": 10}},
    {"column": {"one": 20}},
    {"column": {"one": 30}},
]

################################# now add an ExtensionType

import json

# Same mock ExtensionType as above, with a print in the deserializer so each
# deserialization is traced (see the printed output below: the leaves come
# back, then the struct, then the crash).
class AnnotatedType(pa.ExtensionType):
    def __init__(self, storage_type, annotation):
        # Store the annotation before the base constructor serializes it.
        self.annotation = annotation
        super().__init__(storage_type, "my:app")
    def __arrow_ext_serialize__(self):
        return json.dumps(self.annotation).encode()
    @classmethod
    def __arrow_ext_deserialize__(cls, storage_type, serialized):
        annotation = json.loads(serialized.decode())
        print(storage_type, annotation)
        return cls(storage_type, annotation)
    @property
    def num_buffers(self):
        # Delegate to the storage type so Array.from_buffers works on it.
        return self.storage_type.num_buffers
    @property
    def num_fields(self):
        return self.storage_type.num_fields

# Registration is required: the segfault only reproduces when the extension
# type is actually deserialized on read.
pa.register_extension_type(AnnotatedType(pa.null(), None))

# Rebuild the same record, now with AnnotatedType at every level.
one = pa.Array.from_buffers(
    AnnotatedType(pa.int64(), {"annotated": "one"}),
    3,
    [None, pa.py_buffer(np.array([10, 20, 30], dtype=np.int64))],
)
two = pa.Array.from_buffers(
    AnnotatedType(pa.float64(), {"annotated": "two"}),
    3,
    [None, pa.py_buffer(np.array([1.1, 2.2, 3.3], dtype=np.float64))],
)
record = pa.Array.from_buffers(
    AnnotatedType(
        pa.struct([
            pa.field("one", one.type, False),
            pa.field("two", two.type, False),
        ]),
        {"annotated": "record"},
    ),
    3,
    [None],
    children=[one, two],
)
assert record.to_pylist() == [
    {"one": 10, "two": 1.1},
    {"one": 20, "two": 2.2},
    {"one": 30, "two": 3.3},
]

# Writing succeeds; the crash happens on the column-selecting read below.
table = pa.Table.from_arrays([record], names=["column"])
pq.write_table(table, "record_annotated.parquet")

print("before segfault")

# Selecting a subset column of an extension-typed struct is what segfaults
# (reported upstream as ARROW-17539).
table2 = pq.ParquetFile("record_annotated.parquet").read_row_groups([0], ["column.one"])

print("after segfault")

assert table2.to_pylist() == [
    {"column": {"one": 10}},
    {"column": {"one": 20}},
    {"column": {"one": 30}},
]

The print-out is

before segfault
int64 {'annotated': 'one'}
double {'annotated': 'two'}
int64 {'annotated': 'one'}
double {'annotated': 'two'}
struct<one: extension<my:app<AnnotatedType>> not null, two: extension<my:app<AnnotatedType>> not null> {'annotated': 'record'}
Segmentation fault (core dumped)

I'm going to report it.

jpivarski avatar Aug 26 '22 20:08 jpivarski

Okay! This has been reported as ARROW-17539 and since there's no action to do here (Awkward doesn't need to be changed, as far as we know), I'll close this issue. Watch the Arrow JIRA (as I am) to see how this gets resolved.

jpivarski avatar Aug 26 '22 20:08 jpivarski