ibis
ibis copied to clipboard
bug[duckdb-geospatial]: read_parquet defaults silently to pyarrow reading geometry as binary
EDIT: SEE NEW COMMENTS.
When we read a Parquet file that was written with ibis's to_parquet and that contains a geometry column, the geometry column is read back as binary. This is a bug on our end: it does not happen when the file is written with plain DuckDB.
Reproducer
import ibis
from ibis import _
con = ibis.get_backend()
url = "s3://overturemaps-us-west-2/release/2024-07-22.0/theme=base/type=infrastructure"
t = con.read_parquet(url, table_name="infra")
expr = t.filter(_.bbox.xmin > -77.119795,
_.bbox.xmax < -76.909366,
_.bbox.ymin > 38.791631,
_.bbox.ymax < 38.995968
)
con.to_parquet(expr, "infra_ibis.parquet")
t2 = con.read_parquet("infra_ibis.parquet")
ibis.options.interactive = True
t2.head(3).select("geometry")
┏━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━┓
┃ geometry ┃
┡━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━┩
│ binary │
├────────────────────────────────────────────────────────────┤
│ b'\x00\x00\x00\x00\x01\xc0SGY\xb5\xe9[y@Ceo\xef|$>' │
│ b'\x00\x00\x00\x00\x01\xc0SGY\xb3\xd0|\x85@Cet$\x95\n\xbf' │
│ b'\x00\x00\x00\x00\x01\xc0SGX+:@n@Cev\x88\x1c\x99\x94' │
└────────────────────────────────────────────────────────────┘