dask-sql
dask-sql copied to clipboard
[BUG][GPU Error Bug] "SELECT (<column> IN (<string>, <string>, (CASE <column> WHEN <string> THEN <column> END ), ((<column>)||(<string>)))) FROM <table>" brings Error
What happened:
"SELECT (<column> IN (<string>, <string>, (CASE <column> WHEN <string> THEN <column> END ), ((<column>)||(<string>)))) FROM <table>" raises an error when the table is registered with gpu=True.
However, the same query returns a result when the table is registered with gpu=False (CPU).
What you expected to happen:
The query should not raise an error when using the GPU; it should return the same result as on the CPU.
Minimal Complete Verifiable Example:
# Minimal reproducer: run the same IN-list query against two registrations of
# one single-row table, one created with gpu=False and one with gpu=True.
import pandas as pd
import dask.dataframe as dd
from dask_sql import Context
c = Context()
# One-row, one-column table holding the string 'A'.
df0 = pd.DataFrame({
'c0': ['A'],
})
t0 = dd.from_pandas(df0, npartitions=1)
# Register the same dask DataFrame twice; only the gpu flag differs.
c.create_table('t0', t0, gpu=False)
c.create_table('t0_gpu', t0, gpu=True)
print('CPU Result:')
# The IN list mixes plain string literals with column-derived expressions
# (a CASE ... END and a || concatenation).
result1= c.sql("SELECT (t0.c0 IN ('A', 'B', (CASE t0.c0 WHEN 'A' THEN t0.c0 END ), ((t0.c0)||('C')))) FROM t0").compute()
print(result1)
print('GPU Result:')
# Same query against the gpu=True table; this is the statement that the
# reported traceback points at.
result2= c.sql("SELECT (t0_gpu.c0 IN ('A', 'B', (CASE t0_gpu.c0 WHEN 'A' THEN t0_gpu.c0 END ), ((t0_gpu.c0)||('C')))) FROM t0_gpu").compute()
print(result2)
Result:
INFO:numba.cuda.cudadrv.driver:init
CPU Result:
t0.c0 IN (Map { iter: Iter([Utf8("A"), Utf8("B"), CASE t0.c0 WHEN Utf8("A") THEN t0.c0 END, t0.c0 || Utf8("C")]) })
0 True
GPU Result:
Traceback (most recent call last):
File "/opt/conda/envs/rapids/lib/python3.10/site-packages/cudf/core/column/column.py", line 2210, in as_column
memoryview(arbitrary), dtype=dtype, nan_as_null=nan_as_null
TypeError: memoryview: a bytes-like object is required, not 'tuple'
During handling of the above exception, another exception occurred:
Traceback (most recent call last):
File "/opt/conda/envs/rapids/lib/python3.10/site-packages/cudf/core/column/column.py", line 2327, in as_column
pa.array(
File "pyarrow/array.pxi", line 320, in pyarrow.lib.array
File "pyarrow/array.pxi", line 39, in pyarrow.lib._sequence_to_array
File "pyarrow/error.pxi", line 144, in pyarrow.lib.pyarrow_internal_check_status
File "pyarrow/error.pxi", line 123, in pyarrow.lib.check_status
pyarrow.lib.ArrowTypeError: Expected bytes, got a 'Series' object
During handling of the above exception, another exception occurred:
Traceback (most recent call last):
File "/opt/conda/envs/rapids/lib/python3.10/site-packages/cudf/core/column/column.py", line 2375, in _construct_array
arbitrary = cupy.asarray(arbitrary, dtype=dtype)
File "/opt/conda/envs/rapids/lib/python3.10/site-packages/cupy/_creation/from_data.py", line 76, in asarray
return _core.array(a, dtype, False, order)
File "cupy/_core/core.pyx", line 2382, in cupy._core.core.array
File "cupy/_core/core.pyx", line 2406, in cupy._core.core.array
File "cupy/_core/core.pyx", line 2531, in cupy._core.core._array_default
File "/opt/conda/envs/rapids/lib/python3.10/site-packages/dask/dataframe/core.py", line 591, in __array__
x = np.array(self._computed)
File "/opt/conda/envs/rapids/lib/python3.10/site-packages/cudf/core/frame.py", line 454, in __array__
raise TypeError(
TypeError: Implicit conversion to a host NumPy array via __array__ is not allowed, To explicitly construct a GPU matrix, consider using .to_cupy()
To explicitly construct a host matrix, consider using .to_numpy().
During handling of the above exception, another exception occurred:
Traceback (most recent call last):
File "/tmp/bug.py", line 20, in <module>
result2= c.sql("SELECT (t0_gpu.c0 IN ('A', 'B', (CASE t0_gpu.c0 WHEN 'A' THEN t0_gpu.c0 END ), ((t0_gpu.c0)||('C')))) FROM t0_gpu").compute()
File "/opt/conda/envs/rapids/lib/python3.10/site-packages/dask_sql/context.py", line 513, in sql
return self._compute_table_from_rel(rel, return_futures)
File "/opt/conda/envs/rapids/lib/python3.10/site-packages/dask_sql/context.py", line 839, in _compute_table_from_rel
dc = RelConverter.convert(rel, context=self)
File "/opt/conda/envs/rapids/lib/python3.10/site-packages/dask_sql/physical/rel/convert.py", line 61, in convert
df = plugin_instance.convert(rel, context=context)
File "/opt/conda/envs/rapids/lib/python3.10/site-packages/dask_sql/physical/rel/logical/project.py", line 57, in convert
new_columns[random_name] = RexConverter.convert(
File "/opt/conda/envs/rapids/lib/python3.10/site-packages/dask_sql/physical/rex/convert.py", line 74, in convert
df = plugin_instance.convert(rel, rex, dc, context=context)
File "/opt/conda/envs/rapids/lib/python3.10/site-packages/dask_sql/physical/rex/core/call.py", line 1129, in convert
return operation(*operands, **kwargs)
File "/opt/conda/envs/rapids/lib/python3.10/site-packages/dask_sql/physical/rex/core/call.py", line 77, in __call__
return self.f(*operands, **kwargs)
File "/opt/conda/envs/rapids/lib/python3.10/site-packages/dask_sql/physical/rex/core/call.py", line 965, in inList
result = series.isin(operands)
File "/opt/conda/envs/rapids/lib/python3.10/site-packages/dask/dataframe/core.py", line 4147, in isin
return super().isin(values)
File "/opt/conda/envs/rapids/lib/python3.10/site-packages/dask/dataframe/core.py", line 3383, in isin
meta = self._meta_nonempty.isin(values)
File "/opt/conda/envs/rapids/lib/python3.10/site-packages/nvtx/nvtx.py", line 101, in inner
result = func(*args, **kwargs)
File "/opt/conda/envs/rapids/lib/python3.10/site-packages/cudf/core/series.py", line 2868, in isin
{self.name: self._column.isin(values)}, index=self.index
File "/opt/conda/envs/rapids/lib/python3.10/site-packages/cudf/core/column/column.py", line 814, in isin
lhs, rhs = self._process_values_for_isin(values)
File "/opt/conda/envs/rapids/lib/python3.10/site-packages/cudf/core/column/column.py", line 832, in _process_values_for_isin
rhs = as_column(values, nan_as_null=False)
File "/opt/conda/envs/rapids/lib/python3.10/site-packages/cudf/core/column/column.py", line 2360, in as_column
_construct_array(arbitrary, dtype),
File "/opt/conda/envs/rapids/lib/python3.10/site-packages/cudf/core/column/column.py", line 2381, in _construct_array
and infer_dtype(arbitrary)
File "pandas/_libs/lib.pyx", line 1491, in pandas._libs.lib.infer_dtype
File "/opt/conda/envs/rapids/lib/python3.10/site-packages/pandas/core/dtypes/cast.py", line 1784, in construct_1d_object_array_from_listlike
result[:] = values
File "/opt/conda/envs/rapids/lib/python3.10/site-packages/dask/dataframe/core.py", line 591, in __array__
x = np.array(self._computed)
File "/opt/conda/envs/rapids/lib/python3.10/site-packages/cudf/core/frame.py", line 454, in __array__
raise TypeError(
TypeError: Implicit conversion to a host NumPy array via __array__ is not allowed, To explicitly construct a GPU matrix, consider using .to_cupy()
To explicitly construct a host matrix, consider using .to_numpy().
Anything else we need to know?:
Environment:
- dask-sql version: [2023.6.0](https://github.com/dask-contrib/dask-sql/tree/2023.6.0)
- Python version: Python 3.10.11
- Operating System: Ubuntu 22.04
- Install method (conda, pip, source): Docker deploy by https://hub.docker.com/layers/rapidsai/rapidsai-dev/23.06-cuda11.8-devel-ubuntu22.04-py3.10/images/sha256-cfbb61fdf7227b090a435a2e758114f3f1c31872ed8dbd96e5e564bb5fd184a7?context=explore