polars
polars copied to clipboard
Spurious pytest failure
Polars version checks
-
[X] I have checked that this issue has not already been reported.
-
[X] I have confirmed this bug exists on the latest version of Polars.
Issue description
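I believe that the parallelization led to a race condition on a file. The failing test writes to the fixed path `/tmp/tmp.pq`, yet the error below reports a frame with columns "name" and "amount" instead of "a" and "b", so another test running in parallel presumably overwrote the file between the write and the read.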
@stinodego I think we must ensure that all tests that create or consume a given file end up on the same worker.
@pytest.mark.xfail(sys.platform == "win32", reason="Does not work on Windows")
def test_parquet_struct_categorical() -> None:
df = pl.DataFrame(
[
pl.Series("a", ["bob"], pl.Categorical),
pl.Series("b", ["foo"], pl.Categorical),
]
)
df.write_parquet("/tmp/tmp.pq")
with pl.StringCache():
> out = pl.read_parquet("/tmp/tmp.pq").select(pl.col("b").value_counts())
tests/unit/io/test_lazy_parquet.py:207:
_ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _
polars/internals/dataframe/frame.py:5603: in select
self.lazy().select(exprs).collect(no_optimization=True)._df
_ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _
self = <polars.LazyFrame object at 0x7F8EBD9BBDF0>
def collect(
self,
*,
type_coercion: bool = True,
predicate_pushdown: bool = True,
projection_pushdown: bool = True,
simplify_expression: bool = True,
no_optimization: bool = False,
slice_pushdown: bool = True,
common_subplan_elimination: bool = True,
streaming: bool = False,
) -> pli.DataFrame:
"""
Collect into a DataFrame.
Note: use :func:`fetch` if you want to run your query on the first `n` rows
only. This can be a huge time saver in debugging queries.
Parameters
----------
type_coercion
Do type coercion optimization.
predicate_pushdown
Do predicate pushdown optimization.
projection_pushdown
Do projection pushdown optimization.
simplify_expression
Run simplify expressions optimization.
no_optimization
Turn off (certain) optimizations.
slice_pushdown
Slice pushdown optimization.
common_subplan_elimination
Will try to cache branching subplans that occur on self-joins or unions.
streaming
Run parts of the query in a streaming fashion (this is in an alpha state)
Returns
-------
DataFrame
Examples
--------
>>> df = pl.DataFrame(
... {
... "a": ["a", "b", "a", "b", "b", "c"],
... "b": [1, 2, 3, 4, 5, 6],
... "c": [6, 5, 4, 3, 2, 1],
... }
... ).lazy()
>>> df.groupby("a", maintain_order=True).agg(pl.all().sum()).collect()
shape: (3, 3)
┌─────┬─────┬─────┐
│ a ┆ b ┆ c │
│ --- ┆ --- ┆ --- │
│ str ┆ i64 ┆ i64 │
╞═════╪═════╪═════╡
│ a ┆ 4 ┆ 10 │
│ b ┆ 11 ┆ 10 │
│ c ┆ 6 ┆ 1 │
└─────┴─────┴─────┘
"""
if no_optimization:
predicate_pushdown = False
projection_pushdown = False
slice_pushdown = False
common_subplan_elimination = False
if streaming:
common_subplan_elimination = False
ldf = self._ldf.optimization_toggle(
type_coercion,
predicate_pushdown,
projection_pushdown,
simplify_expression,
slice_pushdown,
common_subplan_elimination,
streaming,
)
> return pli.wrap_df(ldf.collect())
E exceptions.NotFoundError: b
E
E > Error originated just after operation: ' DF ["name", "amount"]; PROJECT */2 COLUMNS; SELECTION: "None"
E '
E This operation could not be added to the plan.
polars/internals/lazyframe/frame.py:1147: NotFoundError
Reproducible example
None
Expected behavior
Run tests successfully.
Installed versions
~