polars: Lazy schema dtype error
Polars version checks

- [X] I have checked that this issue has not already been reported.
- [X] I have confirmed this bug exists on the latest version of Polars.
Issue description
Similar to the bug I reported previously, #6643.
Reproducible example
import polars as pl
# sqrt
df = pl.DataFrame({"x": pl.Series(values=[], dtype=pl.Float32)})
correct_result = df.select(pl.col("x").sqrt()).select(pl.col(pl.Float32))
lazy_result = df.lazy().select(pl.col("x").sqrt()).select(pl.col(pl.Float32)).collect()
print(correct_result.shape == lazy_result.shape) # False
# diff
df = pl.DataFrame({"x": pl.Series(values=[], dtype=pl.UInt8)})
correct_result = df.select(pl.col("x").diff()).select(pl.col(pl.Int16))
lazy_result = df.lazy().select(pl.col("x").diff()).select(pl.col(pl.Int16)).collect()
print(correct_result.shape == lazy_result.shape) # False
Expected behavior
Lazy and eager give the same result.
Installed versions
---Version info---
Polars: 0.16.4
Index type: UInt32
Platform: Windows-10-10.0.19044-SP0
Python: 3.10.8 | packaged by conda-forge | (main, Nov 24 2022, 14:07:00) [MSC v.1916 64 bit (AMD64)]
---Optional dependencies---
pyarrow: 8.0.0
pandas: 1.5.2
numpy: 1.22.3
fsspec: <not installed>
connectorx: <not installed>
xlsx2csv: <not installed>
deltalake: <not installed>
matplotlib: <not installed>
Here are some more problematic expressions from the Series.computation list:
import polars as pl
from polars.datatypes import NUMERIC_DTYPES
x = pl.col("x")
funcs = [
x.dot(x),
x.entropy(),
x.rolling_mean(1),
x.rolling_quantile(1),
x.rolling_skew(1), # Simplifies to "rolling_apply_float()"
x.rolling_std(1),
x.rolling_var(1),
]
for func in funcs:
bad_dtypes = []
for dtype in NUMERIC_DTYPES:
df = pl.DataFrame({"x": pl.Series(values=[1, 2, 3], dtype=dtype)})
result_eager = df.select(func)
dtype_eager = result_eager.get_column("x").dtype
result_lazy = df.lazy().select(func).select(pl.col(dtype_eager)).collect()
if not result_lazy.frame_equal(result_eager):
bad_dtypes.append(dtype)
print(func, bad_dtypes)
col("x").dot([col("x")]) [UInt16, UInt8, Int16, Int8]
col("x").rolling_mean() [UInt64, UInt16, UInt32, Int64, UInt8, Int32, Int16, Int8]
col("x").rolling_quantile() [UInt64, UInt16, UInt32, Int64, UInt8, Int32, Int16, Int8]
col("x").rolling_apply_float() [Float64, UInt64, UInt16, UInt32, Int64, UInt8, Int32, Int16, Int8, Float32]
col("x").rolling_std() [UInt64, UInt16, UInt32, Int64, UInt8, Int32, Int16, Int8]
col("x").rolling_var() [UInt64, UInt16, UInt32, Int64, UInt8, Int32, Int16, Int8]
A few more from Series.aggregation. Note that product() panics for certain dtypes.
import polars as pl
from polars.datatypes import NUMERIC_DTYPES
x = pl.col("x")
funcs = [
x.arg_max(),
x.arg_min(),
x.max(),
x.mean(),
x.median(),
x.min(),
x.mode(),
x.nan_max(),
x.nan_min(),
# x.product(), # Panic with Int32, UInt32, or UInt64
x.quantile(0.5),
x.std(),
x.sum(),
x.var()
]
for func in funcs:
bad_dtypes = []
for dtype in NUMERIC_DTYPES:
df = pl.DataFrame({
"x": pl.Series(values=[1, 2, 3] * 2, dtype=dtype),
"y": pl.Series(values=["a"] * 3 + ["b"] * 3)
})
try:
result_eager = df.select(func.over("y")).select("x")
dtype_eager = result_eager["x"].dtype
result_lazy = df.lazy().select(func.over("y")).select(pl.col(dtype_eager)).collect()
if not result_eager.frame_equal(result_lazy):
bad_dtypes.append(dtype)
except:
bad_dtypes.append(f"({dtype} -> Exception)")
if len(bad_dtypes) > 0:
print(func, bad_dtypes)
col("x").median() [Float32]
col("x").mode() [UInt32, Int16, '(Float64 -> Exception)', UInt64, Int8, '(Float32 -> Exception)', Int64, Int32]
Same test for Series.arr
import polars as pl
from polars.datatypes import NUMERIC_DTYPES
x = pl.col("x")
funcs = [
x.arr.arg_max(),
x.arr.arg_min(),
x.arr.concat(x),
# x.arr.contains(3),
x.arr.diff(1),
# x.arr.eval(),
x.arr.explode(),
x.arr.first(),
x.arr.get(0),
x.arr.head(2),
# x.arr.join("_"),
x.arr.last(),
x.arr.lengths(),
x.arr.max(),
x.arr.mean(),
x.arr.min(),
x.arr.reverse(),
x.arr.shift(1),
x.arr.slice(0),
x.arr.sort(),
x.arr.sum(),
x.arr.tail(2),
x.arr.take([0]),
x.arr.to_struct(),
x.arr.unique(),
]
for func in funcs:
bad_dtypes = []
for dtype in NUMERIC_DTYPES:
df = pl.DataFrame({"x": pl.Series(values=[[1, 2, 3]], dtype=pl.List(dtype))})
result_eager = df.select(func)
dtype_eager = result_eager["x"].dtype
result_lazy = df.lazy().select(func).select(pl.col(dtype_eager)).collect()
if not result_eager.frame_equal(result_lazy):
bad_dtypes.append(dtype)
if len(bad_dtypes) > 0:
print(func, bad_dtypes)
col("x").arr.diff() [UInt16, UInt64, UInt32, UInt8]
col("x").arr.sum() [UInt16, Int8, Int16, UInt8]
col("x").arr.to_struct() [UInt16, Int32, Float32, Int8, Int64, Int16, UInt64, Float64, UInt32, UInt8]
Similar problem with Series.bin.encode
import polars as pl
df = pl.DataFrame({"x": [b"a", b"b", b"c"]})
expr = pl.col("x").bin.encode("hex")
result = df.select(expr)
dtype = result["x"].dtype # utf8
result_lazy = df.lazy().select(expr).select(pl.col(dtype)).collect()
assert result.shape != result_lazy.shape
This diff is a good example of what needs to be done per function: move towards the proper FunctionExpr and ensure the schema reports the correct dtypes.
I will give it a try.
I will give it a try.
Cool! Could you try to do only a few functions at a time? This keeps the PRs small.
I have added a checklist to keep track of the progress (please correct me if I'm missing something):
- [ ] col("x").dot([col("x")]) [UInt16, UInt8, Int16, Int8]
- [ ] col("x").rolling_mean() [UInt64, UInt16, UInt32, Int64, UInt8, Int32, Int16, Int8]
- [ ] col("x").rolling_quantile() [UInt64, UInt16, UInt32, Int64, UInt8, Int32, Int16, Int8]
- [ ] col("x").rolling_apply_float() [Float64, UInt64, UInt16, UInt32, Int64, UInt8, Int32, Int16, Int8, Float32]
- [ ] col("x").rolling_std() [UInt64, UInt16, UInt32, Int64, UInt8, Int32, Int16, Int8]
- [ ] col("x").rolling_var() [UInt64, UInt16, UInt32, Int64, UInt8, Int32, Int16, Int8]
- [ ] col("x").median() [Float32]
- [ ] col("x").mode() [UInt32, Int16, '(Float64 -> Exception)', UInt64, Int8, '(Float32 -> Exception)', Int64, Int32]
- [ ] col("x").product() ['(Int32 -> Exception)', '(UInt32 -> Exception)', '(UInt64 -> Exception)']
- [ ] col("x").arr.diff() [UInt16, UInt64, UInt32, UInt8]
- [ ] col("x").arr.sum() [UInt16, Int8, Int16, UInt8]
- [ ] col("x").arr.to_struct() [UInt16, Int32, Float32, Int8, Int64, Int16, UInt64, Float64, UInt32, UInt8]
- [ ] col("x").bin.encode("hex") [utf8]
@ritchie46 is this still relevant?
I think this should be small enough for me to start on as a first issue. However, the codebase has changed quite a bit since the example you provided, so I'm a bit in the dark — if you could give me some pointers on where to start and what the desired types are, that would be great.
The reproducible example in the original issue has been fixed. There have been other mentions in this issue, but since it is quite outdated, I think it's better to close this as it's not clear what is still relevant. If there are still unresolved problems here, please open a new issue!