cudf icon indicating copy to clipboard operation
cudf copied to clipboard

[BUG] datetime comparison is sensitive to ordering with different precision

Open hoxbro opened this issue 1 year ago • 1 comments

Describe the bug Comparison of datetime objects are sensitive to run order when dtype differs.

Steps/Code to reproduce bug

import numpy as np
import cudf

dt1 = np.datetime64('2024-03-04T18:24:35.67')  # ms
dt2 = np.datetime64('2024-03-04T18:24:35.670870310')  # ns

cudf.Series([dt1]) > dt2  # Works
cudf.Series([dt2]) > dt1  # Works
dt1 < cudf.Series([dt2])  # Works
dt2 < cudf.Series([dt1])  # Fails with: `ValueError: Converting an integer to a NumPy datetime requires a specified unit`
---------------------------------------------------------------------------
ValueError                                Traceback (most recent call last)
Cell In[12], line 11
      9 cudf.Series([dt2]) > dt1  # Works
     10 dt1 < cudf.Series([dt2])  # Works
---> 11 dt2 < cudf.Series([dt1])  # Fails

File ~/projects/holoviz/repos/holoviews/.pixi/envs/test-gpu/lib/python3.12/site-packages/cudf/core/indexed_frame.py:4828, in IndexedFrame.__array_ufunc__(self, ufunc, method, *inputs, **kwargs)
   4827 def __array_ufunc__(self, ufunc, method, *inputs, **kwargs):
-> 4828     ret = super().__array_ufunc__(ufunc, method, *inputs, **kwargs)
   4829     fname = ufunc.__name__
   4831     if ret is not None:

File ~/projects/holoviz/repos/holoviews/.pixi/envs/test-gpu/lib/python3.12/site-packages/cudf/utils/performance_tracking.py:51, in _performance_tracking.<locals>.wrapper(*args, **kwargs)
     43 if nvtx.enabled():
     44     stack.enter_context(
     45         nvtx.annotate(
     46             message=func.__qualname__,
   (...)
     49         )
     50     )
---> 51 return func(*args, **kwargs)

File ~/projects/holoviz/repos/holoviews/.pixi/envs/test-gpu/lib/python3.12/site-packages/cudf/core/frame.py:1540, in Frame.__array_ufunc__(self, ufunc, method, *inputs, **kwargs)
   1538 @_performance_tracking
   1539 def __array_ufunc__(self, ufunc, method, *inputs, **kwargs):
-> 1540     return _array_ufunc(self, ufunc, method, inputs, kwargs)

File ~/projects/holoviz/repos/holoviews/.pixi/envs/test-gpu/lib/python3.12/site-packages/cudf/utils/utils.py:92, in _array_ufunc(obj, ufunc, method, inputs, kwargs)
     90     if fname == "float_power":
     91         return getattr(obj, op)(other).astype(float)
---> 92     return getattr(obj, op)(other)
     94 # Special handling for various unary operations.
     95 if fname == "negative":

File ~/projects/holoviz/repos/holoviews/.pixi/envs/test-gpu/lib/python3.12/site-packages/cudf/core/mixins/mixin_factory.py:11, in _partialmethod.<locals>.wrapper(self, *args2, **kwargs2)
     10 def wrapper(self, *args2, **kwargs2):
---> 11     return method(self, *args1, *args2, **kwargs1, **kwargs2)

File ~/projects/holoviz/repos/holoviews/.pixi/envs/test-gpu/lib/python3.12/site-packages/cudf/core/indexed_frame.py:4804, in IndexedFrame._binaryop(self, other, op, fill_value, can_reindex, *args, **kwargs)
   4797     return NotImplemented
   4799 level_names = (
   4800     self._data._level_names if can_use_self_column_name else None
   4801 )
   4802 return self._from_data(
   4803     ColumnAccessor(
-> 4804         type(self)._colwise_binop(operands, op),
   4805         level_names=level_names,
   4806     ),
   4807     index=out_index,
   4808 )

File ~/projects/holoviz/repos/holoviews/.pixi/envs/test-gpu/lib/python3.12/site-packages/cudf/utils/performance_tracking.py:51, in _performance_tracking.<locals>.wrapper(*args, **kwargs)
     43 if nvtx.enabled():
     44     stack.enter_context(
     45         nvtx.annotate(
     46             message=func.__qualname__,
   (...)
     49         )
     50     )
---> 51 return func(*args, **kwargs)

File ~/projects/holoviz/repos/holoviews/.pixi/envs/test-gpu/lib/python3.12/site-packages/cudf/core/frame.py:1528, in Frame._colwise_binop(cls, operands, fn)
   1520         assert False, "At least one operand must be a column."
   1522 # TODO: Disable logical and binary operators between columns that
   1523 # are not numerical using the new binops mixin.
   1525 outcol = (
   1526     getattr(operator, fn)(right_column, left_column)
   1527     if reflect
-> 1528     else getattr(operator, fn)(left_column, right_column)
   1529 )
   1531 if output_mask is not None:
   1532     outcol = outcol.set_mask(output_mask)

File ~/projects/holoviz/repos/holoviews/.pixi/envs/test-gpu/lib/python3.12/site-packages/cudf/core/mixins/mixin_factory.py:11, in _partialmethod.<locals>.wrapper(self, *args2, **kwargs2)
     10 def wrapper(self, *args2, **kwargs2):
---> 11     return method(self, *args1, *args2, **kwargs1, **kwargs2)

File ~/projects/holoviz/repos/holoviews/.pixi/envs/test-gpu/lib/python3.12/site-packages/cudf/core/column/datetime.py:628, in DatetimeColumn._binaryop(self, other, op)
    626 def _binaryop(self, other: ColumnBinaryOperand, op: str) -> ColumnBase:
    627     reflect, op = self._check_reflected_op(op)
--> 628     other = self._wrap_binop_normalization(other)
    629     if other is NotImplemented:
    630         return NotImplemented

File ~/projects/holoviz/repos/holoviews/.pixi/envs/test-gpu/lib/python3.12/site-packages/cudf/core/column/column.py:583, in ColumnBase._wrap_binop_normalization(self, other)
    580     return cudf.Scalar(other, dtype=self.dtype)
    581 if isinstance(other, np.ndarray) and other.ndim == 0:
    582     # Try and maintain the dtype
--> 583     other = other.dtype.type(other.item())
    584 return self.normalize_binop_value(other)

ValueError: Converting an integer to a NumPy datetime requires a specified unit

Expected behavior A clear and concise description of what you expected to happen.

Environment overview (please complete the following information)

  • Environment location: Bare-metal
  • Method of cuDF install: conda

Environment details Please run and paste the output of the cudf/print_env.sh script here, to gather any other relevant environment details

Additional context Add any other context about the problem here.

hoxbro avatar Oct 15 '24 10:10 hoxbro

Thanks for reporting this. From some quick digging, it looks like the issue here is that numpy's datetime object upcasts to an array and attempts to use the __array_ufunc__ overload when the datetime is the LHS. That is why we get different results here depending on the argument ordering. We can avoid the confusing ordering behavior by wrapping the date objects in np.array, i.e.

cudf.Series([dt1]) > np.array([dt2])  # Also fails
cudf.Series([dt2]) > np.array([dt1])  # Also fails

We'll have to do a bit more digging to determine a fix. I expect that we need to add some extra preprocessing to handle the array case instead of the scalar case.

vyasr avatar Oct 16 '24 23:10 vyasr