ibis
ibis copied to clipboard
bug: Date.to_pandas() errors, having trouble repro-ing
What happened?
In my app I am running into this. On ibis main, I can't repro. I'm guessing this has something to do with the combo of other libraries I have installed, eg pandas and duckdb. I figure this is worth pointing out to you because others might also have this incompatible version of a 3rd party lib installed, so it would be great if
- ibis added a version constraint to avoid that version
- or (better?) we added some compatibility wrapper to just make it work
Do you have any tips on what you think the cause could be, or which libs I should start bisecting to try to pin down the cause?
import ibis
ibis.date("2019-01-01").to_pandas()
---------------------------------------------------------------------------
ValueError Traceback (most recent call last)
Cell In[1], line 3
1 import ibis
----> 3 ibis.date("2019-01-01").to_pandas()
File ~/code/scg/atlas/.venv/lib/python3.11/site-packages/ibis/expr/types/generic.py:1202, in Value.to_pandas(self, **kwargs)
1180 def to_pandas(self, **kwargs) -> pd.Series:
1181 """Convert a column expression to a pandas Series or scalar object.
1182
1183 Parameters
(...)
1200 [5 rows x 8 columns]
1201 """
-> 1202 return self.execute(**kwargs)
File ~/code/scg/atlas/.venv/lib/python3.11/site-packages/ibis/expr/types/core.py:322, in Expr.execute(self, limit, timecontext, params, **kwargs)
295 def execute(
296 self,
297 limit: int | str | None = "default",
(...)
300 **kwargs: Any,
301 ):
302 """Execute an expression against its backend if one exists.
303
304 Parameters
(...)
320 Keyword arguments
321 """
--> 322 return self._find_backend(use_default=True).execute(
323 self, limit=limit, timecontext=timecontext, params=params, **kwargs
324 )
File ~/code/scg/atlas/.venv/lib/python3.11/site-packages/ibis/backends/base/sql/__init__.py:343, in BaseSQLBackend.execute(self, expr, params, limit, **kwargs)
340 schema = expr.as_table().schema()
342 with self._safe_raw_sql(sql, **kwargs) as cursor:
--> 343 result = self.fetch_from_cursor(cursor, schema)
345 return expr.__pandas_result__(result)
File ~/code/scg/atlas/.venv/lib/python3.11/site-packages/ibis/backends/duckdb/__init__.py:1201, in Backend.fetch_from_cursor(self, cursor, schema)
1183 table = cursor.cursor.fetch_arrow_table()
1185 df = pd.DataFrame(
1186 {
1187 name: (
(...)
1199 }
1200 )
-> 1201 df = PandasData.convert_table(df, schema)
1202 if not df.empty and geospatial_supported:
1203 return self._to_geodataframe(df, schema)
File ~/code/scg/atlas/.venv/lib/python3.11/site-packages/ibis/formats/pandas.py:118, in PandasData.convert_table(cls, df, schema)
113 raise ValueError(
114 "schema column count does not match input data column count"
115 )
117 for (name, series), dtype in zip(df.items(), schema.types):
--> 118 df[name] = cls.convert_column(series, dtype)
120 # return data with the schema's columns which may be different than the
121 # input columns
122 df.columns = schema.names
File ~/code/scg/atlas/.venv/lib/python3.11/site-packages/ibis/formats/pandas.py:135, in PandasData.convert_column(cls, obj, dtype)
132 method_name = f"convert_{dtype.__class__.__name__}"
133 convert_method = getattr(cls, method_name, cls.convert_default)
--> 135 result = convert_method(obj, dtype, pandas_type)
136 assert not isinstance(result, np.ndarray), f"{convert_method} -> {type(result)}"
137 return result
File ~/code/scg/atlas/.venv/lib/python3.11/site-packages/ibis/formats/pandas.py:201, in PandasData.convert_Date(cls, s, dtype, pandas_type)
199 s = s.dt.tz_convert("UTC").dt.tz_localize(None)
200 try:
--> 201 return s.astype(pandas_type).dt.date
202 except (TypeError, pd._libs.tslibs.OutOfBoundsDatetime):
204 def try_date(v):
File ~/code/scg/atlas/.venv/lib/python3.11/site-packages/pandas/core/generic.py:6637, in NDFrame.astype(self, dtype, copy, errors)
6631 results = [
6632 ser.astype(dtype, copy=copy, errors=errors) for _, ser in self.items()
6633 ]
6635 else:
6636 # else, only a single dtype is given
-> 6637 new_data = self._mgr.astype(dtype=dtype, copy=copy, errors=errors)
6638 res = self._constructor_from_mgr(new_data, axes=new_data.axes)
6639 return res.__finalize__(self, method="astype")
File ~/code/scg/atlas/.venv/lib/python3.11/site-packages/pandas/core/internals/managers.py:431, in BaseBlockManager.astype(self, dtype, copy, errors)
428 elif using_copy_on_write():
429 copy = False
--> 431 return self.apply(
432 "astype",
433 dtype=dtype,
434 copy=copy,
435 errors=errors,
436 using_cow=using_copy_on_write(),
437 )
File ~/code/scg/atlas/.venv/lib/python3.11/site-packages/pandas/core/internals/managers.py:364, in BaseBlockManager.apply(self, f, align_keys, **kwargs)
362 applied = b.apply(f, **kwargs)
363 else:
--> 364 applied = getattr(b, f)(**kwargs)
365 result_blocks = extend_blocks(applied, result_blocks)
367 out = type(self).from_blocks(result_blocks, self.axes)
File ~/code/scg/atlas/.venv/lib/python3.11/site-packages/pandas/core/internals/blocks.py:758, in Block.astype(self, dtype, copy, errors, using_cow, squeeze)
755 raise ValueError("Can not squeeze with more than one column.")
756 values = values[0, :] # type: ignore[call-overload]
--> 758 new_values = astype_array_safe(values, dtype, copy=copy, errors=errors)
760 new_values = maybe_coerce_values(new_values)
762 refs = None
File ~/code/scg/atlas/.venv/lib/python3.11/site-packages/pandas/core/dtypes/astype.py:237, in astype_array_safe(values, dtype, copy, errors)
234 dtype = dtype.numpy_dtype
236 try:
--> 237 new_values = astype_array(values, dtype, copy=copy)
238 except (ValueError, TypeError):
239 # e.g. _astype_nansafe can fail on object-dtype of strings
240 # trying to convert to float
241 if errors == "ignore":
File ~/code/scg/atlas/.venv/lib/python3.11/site-packages/pandas/core/dtypes/astype.py:182, in astype_array(values, dtype, copy)
179 values = values.astype(dtype, copy=copy)
181 else:
--> 182 values = _astype_nansafe(values, dtype, copy=copy)
184 # in pandas we don't store numpy str dtypes, so convert to object
185 if isinstance(dtype, np.dtype) and issubclass(values.dtype.type, str):
File ~/code/scg/atlas/.venv/lib/python3.11/site-packages/pandas/core/dtypes/astype.py:110, in _astype_nansafe(arr, dtype, copy, skipna)
107 if lib.is_np_dtype(dtype, "M"):
108 from pandas.core.arrays import DatetimeArray
--> 110 dta = DatetimeArray._from_sequence(arr, dtype=dtype)
111 return dta._ndarray
113 elif lib.is_np_dtype(dtype, "m"):
File ~/code/scg/atlas/.venv/lib/python3.11/site-packages/pandas/core/arrays/datetimes.py:327, in DatetimeArray._from_sequence(cls, scalars, dtype, copy)
325 @classmethod
326 def _from_sequence(cls, scalars, *, dtype=None, copy: bool = False):
--> 327 return cls._from_sequence_not_strict(scalars, dtype=dtype, copy=copy)
File ~/code/scg/atlas/.venv/lib/python3.11/site-packages/pandas/core/arrays/datetimes.py:354, in DatetimeArray._from_sequence_not_strict(cls, data, dtype, copy, tz, freq, dayfirst, yearfirst, ambiguous)
351 else:
352 tz = timezones.maybe_get_tz(tz)
--> 354 dtype = _validate_dt64_dtype(dtype)
355 # if dtype has an embedded tz, capture it
356 tz = _validate_tz_from_dtype(dtype, tz, explicit_tz_none)
File ~/code/scg/atlas/.venv/lib/python3.11/site-packages/pandas/core/arrays/datetimes.py:2550, in _validate_dt64_dtype(dtype)
2544 raise ValueError(msg)
2546 if (
2547 isinstance(dtype, np.dtype)
2548 and (dtype.kind != "M" or not is_supported_dtype(dtype))
2549 ) or not isinstance(dtype, (np.dtype, DatetimeTZDtype)):
-> 2550 raise ValueError(
2551 f"Unexpected value for 'dtype': '{dtype}'. "
2552 "Must be 'datetime64[s]', 'datetime64[ms]', 'datetime64[us]', "
2553 "'datetime64[ns]' or DatetimeTZDtype'."
2554 )
2556 if getattr(dtype, "tz", None):
2557 # https://github.com/pandas-dev/pandas/issues/18595
2558 # Ensure that we have a standard timezone for pytz objects.
2559 # Without this, things like adding an array of timedeltas and
2560 # a tz-aware Timestamp (with a tz specific to its datetime) will
2561 # be incorrect(ish?) for the array as a whole
2562 dtype = cast(DatetimeTZDtype, dtype)
ValueError: Unexpected value for 'dtype': 'datetime64[D]'. Must be 'datetime64[s]', 'datetime64[ms]', 'datetime64[us]', 'datetime64[ns]' or DatetimeTZDtype'.
What version of ibis are you using?
My app, using 8.0.0.dev210 (released Jan 27), has this bug.
In the ibis repo, I can't repro using commits from Jan 27th. Is there a way to see the exact commit SHA that went into 8.0.0.dev210 on PyPI?
Full output of pip freeze
in my app:
altair==5.2.0 anyio==4.2.0 appnope==0.1.3 asttokens==2.4.1 attrs==23.1.0 beautifulsoup4==4.12.2 cachetools==5.3.2 certifi==2023.11.17 chardet==5.2.0 charset-normalizer==3.3.2 click==8.1.7 click-default-group==1.2.4 colorama==0.4.6 comm==0.2.1 contourpy==1.2.0 cycler==0.12.1 debugpy==1.8.0 decorator==5.1.1 distlib==0.3.8 duckdb==0.9.3.dev1045+g9c91b3a329 executing==2.0.1 fastjsonschema==2.19.1 filelock==3.13.1 fonttools==4.45.1 gdown==4.7.1 h11==0.14.0 humanize==4.9.0 idna==3.6 igraph==0.11.3 ipykernel==6.28.0 ipython==8.19.0 ipyvue==1.10.1 ipyvuetify==1.8.10 ipywidgets==8.1.1 jedi==0.19.1 Jinja2==3.1.2 jsonschema==4.20.0 jsonschema-specifications==2023.12.1 jupyter_client==8.6.0 jupyter_core==5.7.0 jupyterlab-widgets==3.0.9 kiwisolver==1.4.5 Markdown==3.5.1 markdown-it-py==3.0.0 MarkupSafe==2.1.3 matplotlib==3.8.2 matplotlib-inline==0.1.6 mdurl==0.1.2 nbformat==5.9.2 nest-asyncio==1.5.8 networkx==3.2.1 numpy==1.26.2 optree==0.10.0 packaging==23.2 pandas==2.2.0 parso==0.8.3 pexpect==4.9.0 Pillow==10.1.0 platformdirs==4.1.0 pluggy==1.3.0 prompt-toolkit==3.0.43 protobuf==4.25.2 psutil==5.9.7 psycopg2==2.9.9 ptyprocess==0.7.0 pure-eval==0.2.2 pyarrow==15.0.0 Pygments==2.17.2 pymdown-extensions==10.7 pyparsing==3.1.1 pyproject-api==1.6.1 PySocks==1.7.1 pytask==0.4.2 python-dateutil==2.8.2 pytz==2023.3.post1 PyYAML==6.0.1 pyzmq==25.1.2 reacton==1.8.2 referencing==0.32.1 requests==2.31.0 rich==13.7.0 rich-click==1.7.3 rpds-py==0.16.2 six==1.16.0 sniffio==1.3.0 solara @ git+https://github.com/NickCrews/solara@b4f7eee9d1292dd69eaa32b31f91901545d22a17 soupsieve==2.5 SQLAlchemy==2.0.23 stack-data==0.6.3 starlette==0.34.0 texttable==1.7.0 tomli==2.0.1 toolz==0.12.0 tornado==6.4 tox==4.12.1 tqdm==4.66.1 traitlets==5.14.1 typing_extensions==4.8.0 tzdata==2023.4 urllib3==2.1.0 uvicorn==0.25.0 vegafusion==1.6.1 vegafusion-python-embed==1.6.1 virtualenv==20.25.0 vl-convert-python==1.2.2 watchdog==3.0.0 watchfiles==0.21.0 wcwidth==0.2.13 websockets==12.0 widgetsnbextension==4.0.9
What backend(s) are you using, if any?
duckdb and pandas
Relevant log output
No response
Code of Conduct
- [X] I agree to follow this project's Code of Conduct
This looks to be caused by pandas==2.2.0, which breaks a bunch of timestamp-related functionality. We have a bot PR (#8056) that I am slowly working through to try to get pandas 2.2.0 working.
It's getting more and more difficult for us to preserve compatibility with pandas 1.x, and apparently even between 2.1 and 2.2 there were some disruptive changes.
It seems like there was a bunch of churn in supported datetime64 units:
- 1.x: only datetime64[ns] supported
- 2.1.x: additional units, including datetime64[D], supported
- 2.2.x: datetime64[D] no longer supported
I'm not sure how to create a compatibility layer for that off the top of my head.
Here's the commit SHA for the 210th commit since the last release, which should correspond to the prerelease build you have from PyPI:
~/g/i/ibis git rev-list 7.2.0..HEAD --count 14 12:33
210
~/g/i/ibis git rev-parse --short HEAD 15 12:33
0f4366743
@gforsyth ahh, thanks for the explanation of how those prerelease numbers work! Now in the future I can find the exact SHA myself. PS, would it be possible to include the SHA into the build, eg ibis.__sha__
or something? Not sure if there is some convention around that, or if there is already an extension for build tooling that does this.
@cpcloud I noticed that I had pandas 2.2.x in my environment and you had 2.1.x in ibis, but I dismissed that as a cause because semantic versioning says it shouldn't break. Buttttttt, we all know how much to trust semantic versioning 😉 I ran python -m pip install pandas==2.1.4
in my app's environment, and got no error. Thanks for the quick unblocking!
2.2.x: datetime64[D] no longer supported
Do you know if this was an explicit choice, or a mistake — did they accidentally leave this unit out of that conversion/verification logic? I would love to see their reasoning for not supporting it. I would really like them to support it — how else are we supposed to represent dates in pandas?
2.2.x: datetime64[D] no longer supported
FWIW: this affected me too in https://github.com/googleapis/python-bigquery-dataframes/pull/492
stack trace
___________________ test_remote_function_stringify_with_ibis ___________________ [gw1] linux -- Python 3.11.6 /tmpfs/src/github/python-bigquery-dataframes/.nox/e2e/bin/python session =scalars_table_id = 'bigframes-load-testing.bigframes_testing.scalars_269e578a0cb35c2ee0eedfef3d91d3fc' ibis_client = dataset_id = 'bigframes-load-testing.bigframes_tests_system_20240322001149_109284_dataset_id' bq_cf_connection = 'bigframes-rf-conn' @pytest.mark.flaky(retries=2, delay=120) def test_remote_function_stringify_with_ibis( session, scalars_table_id, ibis_client, dataset_id, bq_cf_connection, ): try: @session.remote_function( [int], str, dataset_id, bq_cf_connection, reuse=False, ) def stringify(x): return f"I got {x}" project_id, dataset_name, table_name = scalars_table_id.split(".") if not ibis_client.dataset: ibis_client.dataset = dataset_name col_name = "int64_col" table = ibis_client.tables[table_name] table = table.filter(table[col_name].notnull()).order_by("rowindex").head(10) > pandas_df_orig = table.execute() [tests/system/large/test_remote_function.py:197](https://cs.corp.google.com/piper///depot/google3/tests/system/large/test_remote_function.py?l=197): _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ [.nox/e2e/lib/python3.11/site-packages/ibis/expr/types/core.py:324](https://cs.corp.google.com/piper///depot/google3/.nox/e2e/lib/python3.11/site-packages/ibis/expr/types/core.py?l=324): in execute return self._find_backend(use_default=True).execute( [.nox/e2e/lib/python3.11/site-packages/ibis/backends/bigquery/__init__.py:698](https://cs.corp.google.com/piper///depot/google3/.nox/e2e/lib/python3.11/site-packages/ibis/backends/bigquery/__init__.py?l=698): in execute result = self.fetch_from_cursor(cursor, expr.as_table().schema()) [.nox/e2e/lib/python3.11/site-packages/ibis/backends/bigquery/__init__.py:707](https://cs.corp.google.com/piper///depot/google3/.nox/e2e/lib/python3.11/site-packages/ibis/backends/bigquery/__init__.py?l=707): 
in fetch_from_cursor return PandasData.convert_table(df, schema) [.nox/e2e/lib/python3.11/site-packages/ibis/formats/pandas.py:118](https://cs.corp.google.com/piper///depot/google3/.nox/e2e/lib/python3.11/site-packages/ibis/formats/pandas.py?l=118): in convert_table df[name] = cls.convert_column(series, dtype) [.nox/e2e/lib/python3.11/site-packages/ibis/formats/pandas.py:135](https://cs.corp.google.com/piper///depot/google3/.nox/e2e/lib/python3.11/site-packages/ibis/formats/pandas.py?l=135): in convert_column result = convert_method(obj, dtype, pandas_type) [.nox/e2e/lib/python3.11/site-packages/ibis/formats/pandas.py:201](https://cs.corp.google.com/piper///depot/google3/.nox/e2e/lib/python3.11/site-packages/ibis/formats/pandas.py?l=201): in convert_Date return s.astype(pandas_type).dt.date [.nox/e2e/lib/python3.11/site-packages/pandas/core/generic.py:6640](https://cs.corp.google.com/piper///depot/google3/.nox/e2e/lib/python3.11/site-packages/pandas/core/generic.py?l=6640): in astype new_data = self._mgr.astype(dtype=dtype, copy=copy, errors=errors) [.nox/e2e/lib/python3.11/site-packages/pandas/core/internals/managers.py:430](https://cs.corp.google.com/piper///depot/google3/.nox/e2e/lib/python3.11/site-packages/pandas/core/internals/managers.py?l=430): in astype return self.apply( [.nox/e2e/lib/python3.11/site-packages/pandas/core/internals/managers.py:363](https://cs.corp.google.com/piper///depot/google3/.nox/e2e/lib/python3.11/site-packages/pandas/core/internals/managers.py?l=363): in apply applied = getattr(b, f)(**kwargs) [.nox/e2e/lib/python3.11/site-packages/pandas/core/internals/blocks.py:758](https://cs.corp.google.com/piper///depot/google3/.nox/e2e/lib/python3.11/site-packages/pandas/core/internals/blocks.py?l=758): in astype new_values = astype_array_safe(values, dtype, copy=copy, errors=errors) 
[.nox/e2e/lib/python3.11/site-packages/pandas/core/dtypes/astype.py:237](https://cs.corp.google.com/piper///depot/google3/.nox/e2e/lib/python3.11/site-packages/pandas/core/dtypes/astype.py?l=237): in astype_array_safe new_values = astype_array(values, dtype, copy=copy) [.nox/e2e/lib/python3.11/site-packages/pandas/core/dtypes/astype.py:182](https://cs.corp.google.com/piper///depot/google3/.nox/e2e/lib/python3.11/site-packages/pandas/core/dtypes/astype.py?l=182): in astype_array values = _astype_nansafe(values, dtype, copy=copy) [.nox/e2e/lib/python3.11/site-packages/pandas/core/dtypes/astype.py:110](https://cs.corp.google.com/piper///depot/google3/.nox/e2e/lib/python3.11/site-packages/pandas/core/dtypes/astype.py?l=110): in _astype_nansafe dta = DatetimeArray._from_sequence(arr, dtype=dtype) [.nox/e2e/lib/python3.11/site-packages/pandas/core/arrays/datetimes.py:327](https://cs.corp.google.com/piper///depot/google3/.nox/e2e/lib/python3.11/site-packages/pandas/core/arrays/datetimes.py?l=327): in _from_sequence return cls._from_sequence_not_strict(scalars, dtype=dtype, copy=copy) [.nox/e2e/lib/python3.11/site-packages/pandas/core/arrays/datetimes.py:354](https://cs.corp.google.com/piper///depot/google3/.nox/e2e/lib/python3.11/site-packages/pandas/core/arrays/datetimes.py?l=354): in _from_sequence_not_strict dtype = _validate_dt64_dtype(dtype) _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ dtype = dtype(' raise ValueError( f"Unexpected value for 'dtype': '{dtype}'. " "Must be 'datetime64[s]', 'datetime64[ms]', 'datetime64[us]', " "'datetime64[ns]' or DatetimeTZDtype'." ) E ValueError: Unexpected value for 'dtype': 'datetime64[D]'. Must be 'datetime64[s]', 'datetime64[ms]', 'datetime64[us]', 'datetime64[ns]' or DatetimeTZDtype'. 
[.nox/e2e/lib/python3.11/site-packages/pandas/core/arrays/datetimes.py:2550](https://cs.corp.google.com/piper///depot/google3/.nox/e2e/lib/python3.11/site-packages/pandas/core/arrays/datetimes.py?l=2550): ValueError =============================== warnings summary ===============================
I'm working around it by replacing table.execute()
with sql = table.compile() ; pandas_df_orig = bigquery_client.query(sql).to_dataframe()
which does the conversion to pandas in a different way.
Fixed by #8758.