pandera
pandera copied to clipboard
NaT coercion fails when entire column is NaT
Describe the bug
Schemas with coerced, nullable dates choke when the entire column consists of NaT values.
- [x] I have checked that this issue has not already been reported.
- [x] I have confirmed this bug exists on the latest version of pandera.
- [ ] (optional) I have confirmed this bug exists on the master branch of pandera.
Code Sample, a copy-pastable example
import datetime
import pandas as pd
import pandera as pa
class Foo(pa.SchemaModel):
date: pa.typing.Series[datetime.date] = pa.Field(nullable=True)
class Config:
coerce = True
pandas_datetimes = pd.bdate_range("2023-01-01-", "2023-01-31").tolist()
df = pd.DataFrame(pandas_datetimes, columns=["date"])
Foo.validate(df) # OK
df = pd.DataFrame(pandas_datetimes + [None], columns=["date"])
Foo.validate(df) # OK
df = pd.DataFrame(pandas_datetimes + [pd.NaT], columns=["date"])
Foo.validate(df) # OK
df = pd.DataFrame([pd.NaT] * 2, columns=["date"])
Foo.validate(df) # raises TypeError
Expected behavior
Should just return a column of NaTs.
Desktop (please complete the following information):
- OS: Ubuntu
hi @ludaavics can you provide the stack trace of the error?
TypeError Traceback (most recent call last)
Cell In [16], line 21
19 Foo.validate(df) # OK
20 df = pd.DataFrame([pd.NaT] * 2, columns=["date"])
---> 21 Foo.validate(df) # raises TypeError
File ~/miniconda3/envs/dev/lib/python3.9/site-packages/pandera/model.py:264, in SchemaModel.validate(cls, check_obj, head, tail, sample, random_state, lazy, inplace)
249 @classmethod
250 @docstring_substitution(validate_doc=DataFrameSchema.validate.__doc__)
251 def validate(
(...)
259 inplace: bool = False,
260 ) -> DataFrameBase[TSchemaModel]:
261 """%(validate_doc)s"""
262 return cast(
263 DataFrameBase[TSchemaModel],
--> 264 cls.to_schema().validate(
265 check_obj, head, tail, sample, random_state, lazy, inplace
266 ),
267 )
File ~/miniconda3/envs/dev/lib/python3.9/site-packages/pandera/schemas.py:534, in DataFrameSchema.validate(self, check_obj, head, tail, sample, random_state, lazy, inplace)
521 check_obj = check_obj.map_partitions(
522 self._validate,
523 head=head,
(...)
529 meta=check_obj,
530 )
532 return check_obj.pandera.add_schema(self)
--> 534 return self._validate(
535 check_obj=check_obj,
536 head=head,
537 tail=tail,
538 sample=sample,
539 random_state=random_state,
540 lazy=lazy,
541 inplace=inplace,
542 )
File ~/miniconda3/envs/dev/lib/python3.9/site-packages/pandera/schemas.py:724, in DataFrameSchema._validate(self, check_obj, head, tail, sample, random_state, lazy, inplace)
722 for schema_component in schema_components:
723 try:
--> 724 result = schema_component(
725 df_to_validate,
726 lazy=lazy,
727 # don't make a copy of the data
728 inplace=True,
729 )
730 check_results.append(check_utils.is_table(result))
731 except errors.SchemaError as err:
File ~/miniconda3/envs/dev/lib/python3.9/site-packages/pandera/schemas.py:2138, in SeriesSchemaBase.__call__(self, check_obj, head, tail, sample, random_state, lazy, inplace)
2127 def __call__(
2128 self,
2129 check_obj: Union[pd.DataFrame, pd.Series],
(...)
2135 inplace: bool = False,
2136 ) -> Union[pd.DataFrame, pd.Series]:
2137 """Alias for ``validate`` method."""
-> 2138 return self.validate(
2139 check_obj, head, tail, sample, random_state, lazy, inplace
2140 )
File ~/miniconda3/envs/dev/lib/python3.9/site-packages/pandera/schema_components.py:223, in Column.validate(self, check_obj, head, tail, sample, random_state, lazy, inplace)
219 validate_column(
220 check_obj[column_name].iloc[:, [i]], column_name
221 )
222 else:
--> 223 validate_column(check_obj, column_name)
225 return check_obj
File ~/miniconda3/envs/dev/lib/python3.9/site-packages/pandera/schema_components.py:196, in Column.validate.<locals>.validate_column(check_obj, column_name)
195 def validate_column(check_obj, column_name):
--> 196 super(Column, copy(self).set_name(column_name)).validate(
197 check_obj,
198 head,
199 tail,
200 sample,
201 random_state,
202 lazy,
203 inplace=inplace,
204 )
File ~/miniconda3/envs/dev/lib/python3.9/site-packages/pandera/schemas.py:2057, in SeriesSchemaBase.validate(self, check_obj, head, tail, sample, random_state, lazy, inplace)
2052 msg = (
2053 f"expected series '{series.name}' to have type {self._dtype}, "
2054 + f"got {series.dtype}"
2055 )
2056 elif not isinstance(check_output, bool):
-> 2057 _, failure_cases = check_utils.prepare_series_check_output(
2058 series,
2059 pd.Series(list(check_output))
2060 if not isinstance(check_output, pd.Series)
2061 else check_output,
2062 )
2063 failure_cases = reshape_failure_cases(failure_cases)
2064 msg = (
2065 f"expected series '{series.name}' to have type {self._dtype}:\n"
2066 f"failure cases:\n{failure_cases}"
2067 )
File ~/miniconda3/envs/dev/lib/python3.9/site-packages/pandera/check_utils.py:113, in prepare_series_check_output(check_obj, check_output, ignore_na, n_failure_cases)
107 isna = (
108 check_obj.isna().all(axis="columns")
109 if isinstance(check_obj, pd.DataFrame)
110 else check_obj.isna()
111 )
112 try:
--> 113 check_output = check_output | isna
114 except AttributeError:
115 # convert check_output to numpy for modin compatibility
116 check_output = check_output.to_numpy() | isna
File ~/miniconda3/envs/dev/lib/python3.9/site-packages/pandas/core/ops/common.py:70, in _unpack_zerodim_and_defer.<locals>.new_method(self, other)
66 return NotImplemented
68 other = item_from_zerodim(other)
---> 70 return method(self, other)
File ~/miniconda3/envs/dev/lib/python3.9/site-packages/pandas/core/arraylike.py:78, in OpsMixin.__or__(self, other)
76 @unpack_zerodim_and_defer("__or__")
77 def __or__(self, other):
---> 78 return self._logical_method(other, operator.or_)
File ~/miniconda3/envs/dev/lib/python3.9/site-packages/pandas/core/series.py:5634, in Series._logical_method(self, other, op)
5631 lvalues = self._values
5632 rvalues = extract_array(other, extract_numpy=True, extract_range=True)
-> 5634 res_values = ops.logical_op(lvalues, rvalues, op)
5635 return self._construct_result(res_values, name=res_name)
File ~/miniconda3/envs/dev/lib/python3.9/site-packages/pandas/core/ops/array_ops.py:376, in logical_op(left, right, op)
372 rvalues = right
374 if should_extension_dispatch(lvalues, rvalues):
375 # Call the method on lvalues
--> 376 res_values = op(lvalues, rvalues)
378 else:
379 if isinstance(rvalues, np.ndarray):
File ~/miniconda3/envs/dev/lib/python3.9/site-packages/pandas/core/arrays/datetimelike.py:1752, in TimelikeOps.__array_ufunc__(self, ufunc, method, *inputs, **kwargs)
1744 if (
1745 ufunc in [np.isnan, np.isinf, np.isfinite]
1746 and len(inputs) == 1
1747 and inputs[0] is self
1748 ):
1749 # numpy 1.18 changed isinf and isnan to not raise on dt64/td64
1750 return getattr(ufunc, method)(self._ndarray, **kwargs)
-> 1752 return super().__array_ufunc__(ufunc, method, *inputs, **kwargs)
File ~/miniconda3/envs/dev/lib/python3.9/site-packages/pandas/core/arrays/base.py:1629, in ExtensionArray.__array_ufunc__(self, ufunc, method, *inputs, **kwargs)
1626 if result is not NotImplemented:
1627 return result
-> 1629 return arraylike.default_array_ufunc(self, ufunc, method, *inputs, **kwargs)
File ~/miniconda3/envs/dev/lib/python3.9/site-packages/pandas/core/arraylike.py:490, in default_array_ufunc(self, ufunc, method, *inputs, **kwargs)
486 raise NotImplementedError
488 new_inputs = [x if x is not self else np.asarray(x) for x in inputs]
--> 490 return getattr(ufunc, method)(*new_inputs, **kwargs)
TypeError: ufunc 'bitwise_or' not supported for the input types, and the inputs could not be safely coerced to any supported types according to the casting rule ''safe''
@cosmicBboy any thoughts on this ?
This is a bigger issue when we use a Date column with coerce=True, since a column of all invalid dates will get coerced to pd.NaT but then the entire schema fails validation. Here is an example:
```python
import pandas as pd
import pandera as pa
import pandera.typing as pt
import numpy as np

class DateOutput(pa.SchemaModel):
    date: pt.Series[pa.dtypes.Date] = pa.Field(nullable=True, coerce=True)

df = pd.DataFrame({"date": ["2021-08-01", np.nan, pd.NaT]})
df_nat = pd.DataFrame({"date": [np.nan, pd.NaT]})
print(DateOutput.validate(df))  # valid with pd.NaT
print(DateOutput.validate(df_nat))  # returns pd.NaT not a valid date
```
@ludaavics, I ran into this same issue. The problem was using `date` instead of `datetime`. This minor fix works:
import datetime
import pandas as pd
import pandera as pa
class Foo(pa.SchemaModel):
date: pa.typing.Series[datetime.datetime] = pa.Field(nullable=True)
class Config:
coerce = True
df = pd.DataFrame([pd.NaT] * 2, columns=["date"])
Foo.validate(df) # works
@lih7, `pa.dtypes.Date` appears to be undocumented. @cosmicBboy, what are your thoughts on adding some sort of warning in the docstring of this class?
As far as I know, pandas doesn't have first-class support for `datetime.date`, and it will be represented as an `object`.
+1 to adding documentation for `pa.dtypes.Date`. Re: the warning, what kind of warning message did you have in mind, @riziles?
In VS Code, if I type `from pandera.dtypes import D`, then `Date` is the first autocomplete suggestion. If I hover over it, I can see the docstring "Semantic representation of a date data type."
Maybe just add "Note: not supported by pandas"?