polars
polars copied to clipboard
New Schema error updating 0.20.15 -> 0.20.16
Checks
- [X] I have checked that this issue has not already been reported.
- [X] I have confirmed this bug exists on the latest version of Polars.
Reproducible example
RAW = {
'histo': [
[{'name': 'NA', 'NA': 100.0}],
[{'name': 1, 'cat_pop': 46.0}, {'name': 'NA', 'NA': 54.0}]]}
pl_histo = pl.DataFrame(RAW)
This throws the following error on 0.20.16:
name = 'histo', values = [[{'NA': 100.0, 'name': 'NA'}], [{'cat_pop': 46.0, 'name': 1}, {'NA': 54.0, 'name': 'NA'}]]
dtype = None
def sequence_to_pyseries(
name: str,
values: Sequence[Any],
dtype: PolarsDataType | None = None,
*,
strict: bool = True,
nan_to_null: bool = False,
) -> PySeries:
"""Construct a PySeries from a sequence."""
python_dtype: type | None = None
if isinstance(values, range):
return range_to_series(name, values, dtype=dtype)._s
# empty sequence
if not values and dtype is None:
# if dtype for empty sequence could be guessed
# (e.g comparisons between self and other), default to Null
dtype = Null
# lists defer to subsequent handling; identify nested type
elif dtype == List:
python_dtype = list
# infer temporal type handling
py_temporal_types = {date, datetime, timedelta, time}
pl_temporal_types = {Date, Datetime, Duration, Time}
value = get_first_non_none(values)
if value is not None:
if (
dataclasses.is_dataclass(value)
or is_pydantic_model(value)
or is_namedtuple(value.__class__)
) and dtype != Object:
return pl.DataFrame(values).to_struct(name)._s
elif isinstance(value, range) and dtype is None:
values = [range_to_series("", v) for v in values]
else:
# for temporal dtypes:
# * if the values are integer, we take the physical branch.
# * if the values are python types, take the temporal branch.
# * if the values are ISO-8601 strings, init then convert via strptime.
# * if the values are floats/other dtypes, this is an error.
if dtype in py_temporal_types and isinstance(value, int):
dtype = py_type_to_dtype(dtype) # construct from integer
elif (
dtype in pl_temporal_types or type(dtype) in pl_temporal_types
) and not isinstance(value, int):
python_dtype = dtype_to_py_type(dtype) # type: ignore[arg-type]
# physical branch
# flat data
if (
dtype is not None
and dtype not in (List, Struct, Unknown)
and is_polars_dtype(dtype)
and (python_dtype is None)
):
constructor = polars_type_to_constructor(dtype)
pyseries = _construct_series_with_fallbacks(
constructor, name, values, dtype, strict=strict
)
if dtype in (Date, Datetime, Duration, Time, Categorical, Boolean, Enum):
if pyseries.dtype() != dtype:
pyseries = pyseries.cast(dtype, strict=strict)
return pyseries
elif dtype == Struct:
struct_schema = dtype.to_schema() if isinstance(dtype, Struct) else None
empty = {} # type: ignore[var-annotated]
return plc.sequence_to_pydf(
data=[(empty if v is None else v) for v in values],
schema=struct_schema,
orient="row",
).to_struct(name)
else:
if python_dtype is None:
if value is None:
constructor = polars_type_to_constructor(Null)
return constructor(name, values, strict)
# generic default dtype
python_dtype = type(value)
# temporal branch
if python_dtype in py_temporal_types:
if dtype is None:
dtype = py_type_to_dtype(python_dtype) # construct from integer
elif dtype in py_temporal_types:
dtype = py_type_to_dtype(dtype)
values_dtype = (
None
if value is None
else py_type_to_dtype(type(value), raise_unmatched=False)
)
if values_dtype is not None and values_dtype.is_float():
msg = f"'float' object cannot be interpreted as a {python_dtype.__name__!r}"
raise TypeError(
# we do not accept float values as temporal; if this is
# required, the caller should explicitly cast to int first.
msg
)
# We use the AnyValue builder to create the datetime array
# We store the values internally as UTC and set the timezone
py_series = PySeries.new_from_any_values(name, values, strict)
time_unit = getattr(dtype, "time_unit", None)
time_zone = getattr(dtype, "time_zone", None)
if time_unit is None or values_dtype == Date:
s = wrap_s(py_series)
else:
s = wrap_s(py_series).dt.cast_time_unit(time_unit)
if (values_dtype == Date) & (dtype == Datetime):
return (
s.cast(Datetime(time_unit or "us"))
.dt.replace_time_zone(time_zone)
._s
)
if (dtype == Datetime) and (
value.tzinfo is not None or time_zone is not None
):
values_tz = str(value.tzinfo) if value.tzinfo is not None else None
dtype_tz = dtype.time_zone # type: ignore[union-attr]
if values_tz is not None and (
dtype_tz is not None and dtype_tz != "UTC"
):
msg = (
"time-zone-aware datetimes are converted to UTC"
"\n\nPlease either drop the time zone from the dtype, or set it to 'UTC'."
" To convert to a different time zone, please use `.dt.convert_time_zone`."
)
raise ValueError(msg)
if values_tz != "UTC" and dtype_tz is None:
warnings.warn(
"Constructing a Series with time-zone-aware "
"datetimes results in a Series with UTC time zone. "
"To silence this warning, you can filter "
"warnings of class TimeZoneAwareConstructorWarning, or "
"set 'UTC' as the time zone of your datatype.",
TimeZoneAwareConstructorWarning,
stacklevel=find_stacklevel(),
)
return s.dt.replace_time_zone(dtype_tz or "UTC")._s
return s._s
elif (
_check_for_numpy(value)
and isinstance(value, np.ndarray)
and len(value.shape) == 1
):
n_elems = len(value)
if all(len(v) == n_elems for v in values):
# can take (much) faster path if all lists are the same length
return numpy_to_pyseries(
name,
np.vstack(values),
strict=strict,
nan_to_null=nan_to_null,
)
else:
return PySeries.new_series_list(
name,
[
numpy_to_pyseries("", v, strict=strict, nan_to_null=nan_to_null)
for v in values
],
strict,
)
elif python_dtype in (list, tuple):
if dtype is None:
> return PySeries.new_from_any_values(name, values, strict=strict)
E polars.exceptions.SchemaError: unexpected value while building Series of type String; found value of type Int64: 1
My test suite discovered this bug
Log output
No response
Issue description
I didn't expect a minor update to cause a new error like this.
Expected behavior
I would have expected a deprecation message before the behavior changed.
Installed versions
>>> pl.show_versions()
pl.show_versions()
--------Version info---------
Polars: 0.20.16
Index type: UInt32
Platform: macOS-13.6.1-arm64-arm-64bit
Python: 3.8.18 (default, Sep 11 2023, 08:17:16)
[Clang 14.0.6 ]
----Optional dependencies----
adbc_driver_manager: <not installed>
cloudpickle: <not installed>
connectorx: <not installed>
deltalake: <not installed>
fastexcel: <not installed>
fsspec: <not installed>
gevent: <not installed>
hvplot: <not installed>
matplotlib: <not installed>
numpy: 1.24.3
openpyxl: <not installed>
pandas: 2.0.3
pyarrow: 14.0.2
pydantic: <not installed>
pyiceberg: <not installed>
pyxlsb: <not installed>
sqlalchemy: <not installed>
xlsx2csv: <not installed>
xlsxwriter: <not installed>
It's because 'name' contains mixed types (str/int).
The DataFrame constructor was made strict by default, so you need to disable strict mode by passing `strict=False`:
pl.DataFrame(RAW, strict=False)
# shape: (2, 1)
# ┌─────────────────────────────────────┐
# │ histo │
# │ --- │
# │ list[struct[3]] │
# ╞═════════════════════════════════════╡
# │ [{"NA",null,100.0}] │
# │ [{"1",46.0,null}, {"NA",null,54.0}] │ # 1 is now "1"
# └─────────────────────────────────────┘
This was a breaking change that was introduced to fix a related bug. It's a bit unfortunate. On main you get the following error that is hopefully a bit more helpful:
TypeError: unexpected value while building Series of type String; found value of type Int64: 1
Hint: Try setting `strict=False` to allow passing data with mixed types.
In the next release we will probably restore the support for mixed types, and then break it again in the 1.0.0 release.
That error message on main is great, exactly what I was going to ask for.
I've decided not to temporarily restore the old behavior - the next breaking release is out soon enough and I can better spend my time improving other things. Setting strict=False will fix the issue and prepare you for the next release at the same time.
So I will close this for now.