datafusion
datafusion copied to clipboard
Support for filtered arrow datasets
It seems that DataFusion does not support working with filtered pyarrow datasets, even though other engines (DuckDB and Polars in the example below) do:
from pathlib import Path
import shutil
import polars as pl
from deltalake import DeltaTable
import pyarrow.dataset as ds
import duckdb
from datafusion import SessionContext
# Reproduction script: build a small partitioned Delta table, expose it as a
# filtered pyarrow dataset, then query that dataset with DuckDB, Polars and
# DataFusion in turn.

# Remove any previous output directory; errors during removal are ignored.
DIR = Path("data/records")
shutil.rmtree(DIR, ignore_errors=True)

# Six rows, two for each project_id in {1, 2, 3}, written as a Delta table
# partitioned by project_id.
df = pl.DataFrame({'project_id': [1, 2, 3, 1, 2, 3], 'x': [1, 2, 3, 1, 2, 3]})
df.write_delta(DIR, delta_write_options={'partition_by': ['project_id'], 'engine': 'rust'})

# Open the table as a pyarrow dataset and attach a filter keeping only
# project_id 1 and 2.
table = DeltaTable(DIR)
dataset = table.to_pyarrow_dataset()
filtered = dataset.filter(ds.field('project_id').isin([1, 2]))

# --- DuckDB ---
# NOTE(review): `db` is not referenced again in this script.
db = duckdb.arrow(filtered)
con = duckdb.connect()
# The SQL names the local variable `filtered` directly; presumably DuckDB
# resolves it from the Python scope -- do not rename the variable.
df = pl.from_arrow(con.execute("SELECT * FROM filtered").arrow())
print(df)
# project_id = 3 is excluded by the dataset filter; the reported output for
# this query is an empty frame.
df = pl.from_arrow(con.execute("SELECT * FROM filtered WHERE project_id = 3").arrow())
print(df)

# --- Polars ---
# Lazily scan the filtered dataset, then materialize it.
df = pl.scan_pyarrow_dataset(filtered)
print(df.collect())

# --- DataFusion ---
# Register the same filtered dataset as table "records" and query it.
ctx = SessionContext()
ctx.register_dataset("records", filtered)
df = ctx.sql("SELECT * FROM records")
# Per the reported traceback, this print is where the failure surfaces:
# ValueError('Retrieving fragments of a filtered or projected dataset is not
# allowed. Remove the filtering.')
print(df)
shape: (4, 2)
┌────────────┬─────┐
│ project_id ┆ x │
│ --- ┆ --- │
│ i64 ┆ i64 │
╞════════════╪═════╡
│ 1 ┆ 1 │
│ 1 ┆ 1 │
│ 2 ┆ 2 │
│ 2 ┆ 2 │
└────────────┴─────┘
shape: (0, 2)
┌────────────┬─────┐
│ project_id ┆ x │
│ --- ┆ --- │
│ i64 ┆ i64 │
╞════════════╪═════╡
└────────────┴─────┘
shape: (4, 2)
┌────────────┬─────┐
│ project_id ┆ x │
│ --- ┆ --- │
│ i64 ┆ i64 │
╞════════════╪═════╡
│ 1 ┆ 1 │
│ 1 ┆ 1 │
│ 2 ┆ 2 │
│ 2 ┆ 2 │
└────────────┴─────┘
Traceback (most recent call last):
File "/Users/adriangb/GitHub/platform/test.py", line 39, in <module>
print(df)
Exception: External error: Python error PyErr { type: <class 'ValueError'>, value: ValueError('Retrieving fragments of a filtered or projected dataset is not allowed. Remove the filtering.'), traceback: Some(<traceback object at 0x134a09680>) }
Thanks for the report @adriangb
I am not familiar with this feature, but it appears the error comes from pyarrow itself:
https://github.com/apache/arrow/blob/304650145689291eb87db5dd58f7b9776bdfaacf/python/pyarrow/_dataset.pyx#L226-L247
Perhaps datafusion-python needs to strip off the filtered options or something 🤔