bug: Date.to_pandas() errors, having trouble repro-ing

1 year ago • 4 comments

What happened?

In my app I am running into this. On ibis main, I can't repro. I'm guessing this has something to do with the combo of other libraries I have installed, eg pandas and duckdb. I figure this is worth pointing out to you because others might also have this incompatible version of a 3rd party lib installed, so it would be great if

  1. ibis added a version constraint to avoid that version
  2. or (better?) we added some compatibility wrapper to just make it work

Do you have any tips on what the cause could be? Which libs should I start bisecting to try to pin it down?

import ibis

ibis.date("2019-01-01").to_pandas()

ValueError                                Traceback (most recent call last)
Cell In[1], line 3
      1 import ibis
----> 3 ibis.date("2019-01-01").to_pandas()

File ~/code/scg/atlas/.venv/lib/python3.11/site-packages/ibis/expr/types/generic.py:1202, in Value.to_pandas(self, **kwargs)
   1180 def to_pandas(self, **kwargs) -> pd.Series:
   1181     """Convert a column expression to a pandas Series or scalar object.
   1183     Parameters
   1200     [5 rows x 8 columns]
   1201     """
-> 1202     return self.execute(**kwargs)

File ~/code/scg/atlas/.venv/lib/python3.11/site-packages/ibis/expr/types/core.py:322, in Expr.execute(self, limit, timecontext, params, **kwargs)
    295 def execute(
    296     self,
    297     limit: int | str | None = "default",
    300     **kwargs: Any,
    301 ):
    302     """Execute an expression against its backend if one exists.
    304     Parameters
    320         Keyword arguments
    321     """
--> 322     return self._find_backend(use_default=True).execute(
    323         self, limit=limit, timecontext=timecontext, params=params, **kwargs
    324     )

File ~/code/scg/atlas/.venv/lib/python3.11/site-packages/ibis/backends/base/sql/__init__.py:343, in BaseSQLBackend.execute(self, expr, params, limit, **kwargs)
    340 schema = expr.as_table().schema()
    342 with self._safe_raw_sql(sql, **kwargs) as cursor:
--> 343     result = self.fetch_from_cursor(cursor, schema)
    345 return expr.__pandas_result__(result)

File ~/code/scg/atlas/.venv/lib/python3.11/site-packages/ibis/backends/duckdb/__init__.py:1201, in Backend.fetch_from_cursor(self, cursor, schema)
   1183 table = cursor.cursor.fetch_arrow_table()
   1185 df = pd.DataFrame(
   1186     {
   1187         name: (
   1199     }
   1200 )
-> 1201 df = PandasData.convert_table(df, schema)
   1202 if not df.empty and geospatial_supported:
   1203     return self._to_geodataframe(df, schema)

File ~/code/scg/atlas/.venv/lib/python3.11/site-packages/ibis/formats/pandas.py:118, in PandasData.convert_table(cls, df, schema)
    113     raise ValueError(
    114         "schema column count does not match input data column count"
    115     )
    117 for (name, series), dtype in zip(df.items(), schema.types):
--> 118     df[name] = cls.convert_column(series, dtype)
    120 # return data with the schema's columns which may be different than the
    121 # input columns
    122 df.columns = schema.names

File ~/code/scg/atlas/.venv/lib/python3.11/site-packages/ibis/formats/pandas.py:135, in PandasData.convert_column(cls, obj, dtype)
    132 method_name = f"convert_{dtype.__class__.__name__}"
    133 convert_method = getattr(cls, method_name, cls.convert_default)
--> 135 result = convert_method(obj, dtype, pandas_type)
    136 assert not isinstance(result, np.ndarray), f"{convert_method} -> {type(result)}"
    137 return result

File ~/code/scg/atlas/.venv/lib/python3.11/site-packages/ibis/formats/pandas.py:201, in PandasData.convert_Date(cls, s, dtype, pandas_type)
    199     s = s.dt.tz_convert("UTC").dt.tz_localize(None)
    200 try:
--> 201     return s.astype(pandas_type).dt.date
    202 except (TypeError, pd._libs.tslibs.OutOfBoundsDatetime):
    204     def try_date(v):

File ~/code/scg/atlas/.venv/lib/python3.11/site-packages/pandas/core/generic.py:6637, in NDFrame.astype(self, dtype, copy, errors)
   6631     results = [
   6632         ser.astype(dtype, copy=copy, errors=errors) for _, ser in self.items()
   6633     ]
   6635 else:
   6636     # else, only a single dtype is given
-> 6637     new_data = self._mgr.astype(dtype=dtype, copy=copy, errors=errors)
   6638     res = self._constructor_from_mgr(new_data, axes=new_data.axes)
   6639     return res.__finalize__(self, method="astype")

File ~/code/scg/atlas/.venv/lib/python3.11/site-packages/pandas/core/internals/managers.py:431, in BaseBlockManager.astype(self, dtype, copy, errors)
    428 elif using_copy_on_write():
    429     copy = False
--> 431 return self.apply(
    432     "astype",
    433     dtype=dtype,
    434     copy=copy,
    435     errors=errors,
    436     using_cow=using_copy_on_write(),
    437 )

File ~/code/scg/atlas/.venv/lib/python3.11/site-packages/pandas/core/internals/managers.py:364, in BaseBlockManager.apply(self, f, align_keys, **kwargs)
    362         applied = b.apply(f, **kwargs)
    363     else:
--> 364         applied = getattr(b, f)(**kwargs)
    365     result_blocks = extend_blocks(applied, result_blocks)
    367 out = type(self).from_blocks(result_blocks, self.axes)

File ~/code/scg/atlas/.venv/lib/python3.11/site-packages/pandas/core/internals/blocks.py:758, in Block.astype(self, dtype, copy, errors, using_cow, squeeze)
    755         raise ValueError("Can not squeeze with more than one column.")
    756     values = values[0, :]  # type: ignore[call-overload]
--> 758 new_values = astype_array_safe(values, dtype, copy=copy, errors=errors)
    760 new_values = maybe_coerce_values(new_values)
    762 refs = None

File ~/code/scg/atlas/.venv/lib/python3.11/site-packages/pandas/core/dtypes/astype.py:237, in astype_array_safe(values, dtype, copy, errors)
    234     dtype = dtype.numpy_dtype
    236 try:
--> 237     new_values = astype_array(values, dtype, copy=copy)
    238 except (ValueError, TypeError):
    239     # e.g. _astype_nansafe can fail on object-dtype of strings
    240     #  trying to convert to float
    241     if errors == "ignore":

File ~/code/scg/atlas/.venv/lib/python3.11/site-packages/pandas/core/dtypes/astype.py:182, in astype_array(values, dtype, copy)
    179     values = values.astype(dtype, copy=copy)
    181 else:
--> 182     values = _astype_nansafe(values, dtype, copy=copy)
    184 # in pandas we don't store numpy str dtypes, so convert to object
    185 if isinstance(dtype, np.dtype) and issubclass(values.dtype.type, str):

File ~/code/scg/atlas/.venv/lib/python3.11/site-packages/pandas/core/dtypes/astype.py:110, in _astype_nansafe(arr, dtype, copy, skipna)
    107 if lib.is_np_dtype(dtype, "M"):
    108     from pandas.core.arrays import DatetimeArray
--> 110     dta = DatetimeArray._from_sequence(arr, dtype=dtype)
    111     return dta._ndarray
    113 elif lib.is_np_dtype(dtype, "m"):

File ~/code/scg/atlas/.venv/lib/python3.11/site-packages/pandas/core/arrays/datetimes.py:327, in DatetimeArray._from_sequence(cls, scalars, dtype, copy)
    325 @classmethod
    326 def _from_sequence(cls, scalars, *, dtype=None, copy: bool = False):
--> 327     return cls._from_sequence_not_strict(scalars, dtype=dtype, copy=copy)

File ~/code/scg/atlas/.venv/lib/python3.11/site-packages/pandas/core/arrays/datetimes.py:354, in DatetimeArray._from_sequence_not_strict(cls, data, dtype, copy, tz, freq, dayfirst, yearfirst, ambiguous)
    351 else:
    352     tz = timezones.maybe_get_tz(tz)
--> 354 dtype = _validate_dt64_dtype(dtype)
    355 # if dtype has an embedded tz, capture it
    356 tz = _validate_tz_from_dtype(dtype, tz, explicit_tz_none)

File ~/code/scg/atlas/.venv/lib/python3.11/site-packages/pandas/core/arrays/datetimes.py:2550, in _validate_dt64_dtype(dtype)
   2544     raise ValueError(msg)
   2546 if (
   2547     isinstance(dtype, np.dtype)
   2548     and (dtype.kind != "M" or not is_supported_dtype(dtype))
   2549 ) or not isinstance(dtype, (np.dtype, DatetimeTZDtype)):
-> 2550     raise ValueError(
   2551         f"Unexpected value for 'dtype': '{dtype}'. "
   2552         "Must be 'datetime64[s]', 'datetime64[ms]', 'datetime64[us]', "
   2553         "'datetime64[ns]' or DatetimeTZDtype'."
   2554     )
   2556 if getattr(dtype, "tz", None):
   2557     # https://github.com/pandas-dev/pandas/issues/18595
   2558     # Ensure that we have a standard timezone for pytz objects.
   2559     # Without this, things like adding an array of timedeltas and
   2560     # a  tz-aware Timestamp (with a tz specific to its datetime) will
   2561     # be incorrect(ish?) for the array as a whole
   2562     dtype = cast(DatetimeTZDtype, dtype)

ValueError: Unexpected value for 'dtype': 'datetime64[D]'. Must be 'datetime64[s]', 'datetime64[ms]', 'datetime64[us]', 'datetime64[ns]' or DatetimeTZDtype'.

What version of ibis are you using?

my app using 8.0.0.dev210, released Jan 27, has this bug.

In the ibis repo, I can't repro using commits from Jan 27th. Is there a way to see the exact commit SHA that went into 8.0.0.dev210 on PyPI?

Full output of pip freeze in my app:

altair==5.2.0 anyio==4.2.0 appnope==0.1.3 asttokens==2.4.1 attrs==23.1.0 beautifulsoup4==4.12.2 cachetools==5.3.2 certifi==2023.11.17 chardet==5.2.0 charset-normalizer==3.3.2 click==8.1.7 click-default-group==1.2.4 colorama==0.4.6 comm==0.2.1 contourpy==1.2.0 cycler==0.12.1 debugpy==1.8.0 decorator==5.1.1 distlib==0.3.8 duckdb==0.9.3.dev1045+g9c91b3a329 executing==2.0.1 fastjsonschema==2.19.1 filelock==3.13.1 fonttools==4.45.1 gdown==4.7.1 h11==0.14.0 humanize==4.9.0 idna==3.6 igraph==0.11.3 ipykernel==6.28.0 ipython==8.19.0 ipyvue==1.10.1 ipyvuetify==1.8.10 ipywidgets==8.1.1 jedi==0.19.1 Jinja2==3.1.2 jsonschema==4.20.0 jsonschema-specifications==2023.12.1 jupyter_client==8.6.0 jupyter_core==5.7.0 jupyterlab-widgets==3.0.9 kiwisolver==1.4.5 Markdown==3.5.1 markdown-it-py==3.0.0 MarkupSafe==2.1.3 matplotlib==3.8.2 matplotlib-inline==0.1.6 mdurl==0.1.2 nbformat==5.9.2 nest-asyncio==1.5.8 networkx==3.2.1 numpy==1.26.2 optree==0.10.0 packaging==23.2 pandas==2.2.0 parso==0.8.3 pexpect==4.9.0 Pillow==10.1.0 platformdirs==4.1.0 pluggy==1.3.0 prompt-toolkit==3.0.43 protobuf==4.25.2 psutil==5.9.7 psycopg2==2.9.9 ptyprocess==0.7.0 pure-eval==0.2.2 pyarrow==15.0.0 Pygments==2.17.2 pymdown-extensions==10.7 pyparsing==3.1.1 pyproject-api==1.6.1 PySocks==1.7.1 pytask==0.4.2 python-dateutil==2.8.2 pytz==2023.3.post1 PyYAML==6.0.1 pyzmq==25.1.2 reacton==1.8.2 referencing==0.32.1 requests==2.31.0 rich==13.7.0 rich-click==1.7.3 rpds-py==0.16.2 six==1.16.0 sniffio==1.3.0 solara @ git+https://github.com/NickCrews/solara@b4f7eee9d1292dd69eaa32b31f91901545d22a17 soupsieve==2.5 SQLAlchemy==2.0.23 stack-data==0.6.3 starlette==0.34.0 texttable==1.7.0 tomli==2.0.1 toolz==0.12.0 tornado==6.4 tox==4.12.1 tqdm==4.66.1 traitlets==5.14.1 typing_extensions==4.8.0 tzdata==2023.4 urllib3==2.1.0 uvicorn==0.25.0 vegafusion==1.6.1 vegafusion-python-embed==1.6.1 virtualenv==20.25.0 vl-convert-python==1.2.2 watchdog==3.0.0 watchfiles==0.21.0 wcwidth==0.2.13 websockets==12.0 widgetsnbextension==4.0.9

What backend(s) are you using, if any?

duckdb and pandas

Relevant log output

No response

NickCrews avatar Jan 29 '24 07:01 NickCrews

This looks to be caused by pandas==2.2.0, which breaks a bunch of timestamp-related functionality. We have a bot PR (#8056) that I am slowly working through to try to get pandas 2.2.0 working.

It's getting more and more difficult for us to preserve compatibility with pandas 1.x, and apparently even between 2.1 and 2.2 there were some disruptive changes.

cpcloud avatar Jan 29 '24 15:01 cpcloud

It seems like there was a bunch of churn in supported datetime64 units:

  • 1.x: only datetime64[ns] supported
  • 2.1.x: additional units including datetime64[D] supported
  • 2.2.x: datetime64[D] no longer supported

I'm not sure how to create a compatibility layer for that off the top of my head.

cpcloud avatar Jan 29 '24 15:01 cpcloud

Here's the commit SHA for the 210th commit since the last release, which should correspond to the prerelease build you have from PyPI:

$ git rev-list 7.2.0..HEAD --count
$ git rev-parse --short HEAD

gforsyth avatar Jan 29 '24 17:01 gforsyth

@gforsyth ahh, thanks for the explanation of how those prerelease numbers work! Now in the future I can find the exact SHA myself. PS, would it be possible to include the SHA in the build, e.g. as ibis.__sha__ or something? I'm not sure if there is some convention around that, or if there is already an extension for build tooling that does this.

@cpcloud I noticed that I had pandas 2.2.x in my environment and you had 2.1.x in ibis, but I dismissed that as a cause because semantic versioning says it shouldn't break. Buttttttt, we all know how much to trust semantic versioning 😉 I ran `python -m pip install pandas==2.1.4` in my app's environment, and the error went away. Thanks for the quick unblock!

> 2.2.x: datetime64[D] no longer supported

Do you know if this was an explicit choice, or a mistake where they accidentally left this unit out of that conversion/verification logic? I would love to see their reasoning for not supporting it. I would really like them to support it; how else are we supposed to represent dates in pandas?

NickCrews avatar Jan 29 '24 17:01 NickCrews

> 2.2.x: datetime64[D] no longer supported

FWIW: this affected me too in https://github.com/googleapis/python-bigquery-dataframes/pull/492

stack trace
___________________ test_remote_function_stringify_with_ibis ___________________
[gw1] linux -- Python 3.11.6 /tmpfs/src/github/python-bigquery-dataframes/.nox/e2e/bin/python

session = 
scalars_table_id = 'bigframes-load-testing.bigframes_testing.scalars_269e578a0cb35c2ee0eedfef3d91d3fc'
ibis_client = 
dataset_id = 'bigframes-load-testing.bigframes_tests_system_20240322001149_109284_dataset_id'
bq_cf_connection = 'bigframes-rf-conn'

    @pytest.mark.flaky(retries=2, delay=120)
    def test_remote_function_stringify_with_ibis(
            def stringify(x):
                return f"I got {x}"
            project_id, dataset_name, table_name = scalars_table_id.split(".")
            if not ibis_client.dataset:
                ibis_client.dataset = dataset_name
            col_name = "int64_col"
            table = ibis_client.tables[table_name]
            table = table.filter(table[col_name].notnull()).order_by("rowindex").head(10)
>           pandas_df_orig = table.execute()

_ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ 
[.nox/e2e/lib/python3.11/site-packages/ibis/expr/types/core.py:324](https://cs.corp.google.com/piper///depot/google3/.nox/e2e/lib/python3.11/site-packages/ibis/expr/types/core.py?l=324): in execute
    return self._find_backend(use_default=True).execute(
[.nox/e2e/lib/python3.11/site-packages/ibis/backends/bigquery/__init__.py:698](https://cs.corp.google.com/piper///depot/google3/.nox/e2e/lib/python3.11/site-packages/ibis/backends/bigquery/__init__.py?l=698): in execute
    result = self.fetch_from_cursor(cursor, expr.as_table().schema())
[.nox/e2e/lib/python3.11/site-packages/ibis/backends/bigquery/__init__.py:707](https://cs.corp.google.com/piper///depot/google3/.nox/e2e/lib/python3.11/site-packages/ibis/backends/bigquery/__init__.py?l=707): in fetch_from_cursor
    return PandasData.convert_table(df, schema)
[.nox/e2e/lib/python3.11/site-packages/ibis/formats/pandas.py:118](https://cs.corp.google.com/piper///depot/google3/.nox/e2e/lib/python3.11/site-packages/ibis/formats/pandas.py?l=118): in convert_table
    df[name] = cls.convert_column(series, dtype)
[.nox/e2e/lib/python3.11/site-packages/ibis/formats/pandas.py:135](https://cs.corp.google.com/piper///depot/google3/.nox/e2e/lib/python3.11/site-packages/ibis/formats/pandas.py?l=135): in convert_column
    result = convert_method(obj, dtype, pandas_type)
[.nox/e2e/lib/python3.11/site-packages/ibis/formats/pandas.py:201](https://cs.corp.google.com/piper///depot/google3/.nox/e2e/lib/python3.11/site-packages/ibis/formats/pandas.py?l=201): in convert_Date
    return s.astype(pandas_type).dt.date
[.nox/e2e/lib/python3.11/site-packages/pandas/core/generic.py:6640](https://cs.corp.google.com/piper///depot/google3/.nox/e2e/lib/python3.11/site-packages/pandas/core/generic.py?l=6640): in astype
    new_data = self._mgr.astype(dtype=dtype, copy=copy, errors=errors)
[.nox/e2e/lib/python3.11/site-packages/pandas/core/internals/managers.py:430](https://cs.corp.google.com/piper///depot/google3/.nox/e2e/lib/python3.11/site-packages/pandas/core/internals/managers.py?l=430): in astype
    return self.apply(
[.nox/e2e/lib/python3.11/site-packages/pandas/core/internals/managers.py:363](https://cs.corp.google.com/piper///depot/google3/.nox/e2e/lib/python3.11/site-packages/pandas/core/internals/managers.py?l=363): in apply
    applied = getattr(b, f)(**kwargs)
[.nox/e2e/lib/python3.11/site-packages/pandas/core/internals/blocks.py:758](https://cs.corp.google.com/piper///depot/google3/.nox/e2e/lib/python3.11/site-packages/pandas/core/internals/blocks.py?l=758): in astype
    new_values = astype_array_safe(values, dtype, copy=copy, errors=errors)
[.nox/e2e/lib/python3.11/site-packages/pandas/core/dtypes/astype.py:237](https://cs.corp.google.com/piper///depot/google3/.nox/e2e/lib/python3.11/site-packages/pandas/core/dtypes/astype.py?l=237): in astype_array_safe
    new_values = astype_array(values, dtype, copy=copy)
[.nox/e2e/lib/python3.11/site-packages/pandas/core/dtypes/astype.py:182](https://cs.corp.google.com/piper///depot/google3/.nox/e2e/lib/python3.11/site-packages/pandas/core/dtypes/astype.py?l=182): in astype_array
    values = _astype_nansafe(values, dtype, copy=copy)
[.nox/e2e/lib/python3.11/site-packages/pandas/core/dtypes/astype.py:110](https://cs.corp.google.com/piper///depot/google3/.nox/e2e/lib/python3.11/site-packages/pandas/core/dtypes/astype.py?l=110): in _astype_nansafe
    dta = DatetimeArray._from_sequence(arr, dtype=dtype)
[.nox/e2e/lib/python3.11/site-packages/pandas/core/arrays/datetimes.py:327](https://cs.corp.google.com/piper///depot/google3/.nox/e2e/lib/python3.11/site-packages/pandas/core/arrays/datetimes.py?l=327): in _from_sequence
    return cls._from_sequence_not_strict(scalars, dtype=dtype, copy=copy)
[.nox/e2e/lib/python3.11/site-packages/pandas/core/arrays/datetimes.py:354](https://cs.corp.google.com/piper///depot/google3/.nox/e2e/lib/python3.11/site-packages/pandas/core/arrays/datetimes.py?l=354): in _from_sequence_not_strict
    dtype = _validate_dt64_dtype(dtype)
_ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ 

dtype = dtype('<M8[D]')

    ...
>               raise ValueError(
                    f"Unexpected value for 'dtype': '{dtype}'. "
                    "Must be 'datetime64[s]', 'datetime64[ms]', 'datetime64[us]', "
                    "'datetime64[ns]' or DatetimeTZDtype'."
E               ValueError: Unexpected value for 'dtype': 'datetime64[D]'. Must be 'datetime64[s]', 'datetime64[ms]', 'datetime64[us]', 'datetime64[ns]' or DatetimeTZDtype'.

[.nox/e2e/lib/python3.11/site-packages/pandas/core/arrays/datetimes.py:2550](https://cs.corp.google.com/piper///depot/google3/.nox/e2e/lib/python3.11/site-packages/pandas/core/arrays/datetimes.py?l=2550): ValueError
=============================== warnings summary ===============================

I'm working around it by replacing `table.execute()` with `sql = table.compile(); pandas_df_orig = bigquery_client.query(sql).to_dataframe()`, which does the conversion to pandas in a different way.

tswast avatar Mar 22 '24 15:03 tswast

Fixed by #8758.

cpcloud avatar Apr 15 '24 12:04 cpcloud