[BUG] Unable to select same column under multiple aliases on GPU
What happened:
When attempting to select the same column with multiple aliases on a GPU table, we get a ValueError:
```
File ~/dev/dask-sql/main/dask_sql/context.py:503, in Context.sql(self, sql, return_futures, dataframes, gpu, config_options)
498 else:
499 raise RuntimeError(
500 f"Encountered unsupported `LogicalPlan` sql type: {type(sql)}"
501 )
--> 503 return self._compute_table_from_rel(rel, return_futures)
File ~/dev/dask-sql/main/dask_sql/context.py:859, in Context._compute_table_from_rel(self, rel, return_futures)
851 cc = cc.rename(
852 {
853 df_col: select_name
854 for df_col, select_name in zip(cc.columns, select_names)
855 }
856 )
857 dc = DataContainer(dc.df, cc)
--> 859 df = dc.assign()
860 if not return_futures:
861 df = df.compute()
File ~/dev/dask-sql/main/dask_sql/datacontainer.py:229, in DataContainer.assign(self)
218 """
219 Combine the column mapping with the actual data and return
220 a dataframe which has the the columns specified in the
221 stored ColumnContainer.
222 """
223 df = self.df[
224 [
225 self.column_container._frontend_backend_mapping[out_col]
226 for out_col in self.column_container.columns
227 ]
228 ]
--> 229 df.columns = self.column_container.columns
231 return df
File /datasets/charlesb/mambaforge/envs/dask-sql-gpuci-py310/lib/python3.10/site-packages/dask/dataframe/core.py:4887, in DataFrame.__setattr__(self, key, value)
4885 self[key] = value
4886 else:
-> 4887 object.__setattr__(self, key, value)
File /datasets/charlesb/mambaforge/envs/dask-sql-gpuci-py310/lib/python3.10/site-packages/dask/dataframe/core.py:4746, in DataFrame.columns(self, columns)
4744 @columns.setter
4745 def columns(self, columns):
-> 4746 renamed = _rename_dask(self, columns)
4747 self._meta = renamed._meta
4748 self._name = renamed._name
File /datasets/charlesb/mambaforge/envs/dask-sql-gpuci-py310/lib/python3.10/site-packages/dask/dataframe/core.py:7084, in _rename_dask(df, names)
7068 """
7069 Destructively rename columns of dd.DataFrame or name of dd.Series.
7070 Not for pd.DataFrame or pd.Series.
(...)
7080 Column names/Series name
7081 """
7083 assert isinstance(df, _Frame)
-> 7084 metadata = _rename(names, df._meta)
7085 name = f"rename-{tokenize(df, metadata)}"
7087 dsk = partitionwise_graph(_rename, name, metadata, df)
File /datasets/charlesb/mambaforge/envs/dask-sql-gpuci-py310/lib/python3.10/site-packages/dask/dataframe/core.py:7055, in _rename(columns, df)
7053 # deep=False doesn't doesn't copy any data/indices, so this is cheap
7054 df = df.copy(deep=False)
-> 7055 df.columns = columns
7056 return df
7057 elif is_series_like(df) or is_index_like(df):
File /datasets/charlesb/mambaforge/envs/dask-sql-gpuci-py310/lib/python3.10/site-packages/cudf/core/dataframe.py:1096, in DataFrame.__setattr__(self, key, col)
1094 super().__setattr__(key, col)
1095 else:
-> 1096 super().__setattr__(key, col)
File /datasets/charlesb/mambaforge/envs/dask-sql-gpuci-py310/lib/python3.10/contextlib.py:79, in ContextDecorator.__call__.<locals>.inner(*args, **kwds)
76 @wraps(func)
77 def inner(*args, **kwds):
78 with self._recreate_cm():
---> 79 return func(*args, **kwds)
File /datasets/charlesb/mambaforge/envs/dask-sql-gpuci-py310/lib/python3.10/site-packages/cudf/core/dataframe.py:2469, in DataFrame.columns(self, columns)
2466 columns = pd.Index(columns, tupleize_cols=is_multiindex)
2468 if not len(columns) == len(self._data.names):
-> 2469 raise ValueError(
2470 f"Length mismatch: expected {len(self._data.names)} elements, "
2471 f"got {len(columns)} elements"
2472 )
2474 self._set_column_names(columns, is_multiindex, columns.names)
ValueError: Length mismatch: expected 1 elements, got 2 elements
```
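The traceback bottoms out in cuDF's `columns` setter: `DataContainer.assign()` first selects the backend columns and then assigns the frontend names. The failing step can be reproduced in a few lines of cuDF (a condensed sketch of the mechanism, not the actual dask-sql code path):

```python
import cudf

gdf = cudf.DataFrame({"x": [1, 2, 3]})

# assign() selects one backend column per output column; cuDF
# silently deduplicates the repeated name down to a single column
projected = gdf[["x", "x"]]

# ...so assigning the two frontend names no longer matches in length
projected.columns = ["a", "b"]
# ValueError: Length mismatch: expected 1 elements, got 2 elements
```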
What you expected to happen: I would expect this to succeed, as it does with CPU tables.
Minimal Complete Verifiable Example:
```python
from dask_sql import Context
from dask.datasets import timeseries
c = Context()
c.create_table("df", timeseries(), gpu=True)
c.sql(
"""
select
x as a,
x as b
from
df
"""
)
```
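For comparison, dropping `gpu=True` runs the same query without error (a sketch assuming the same environment):

```python
from dask_sql import Context
from dask.datasets import timeseries

c = Context()
c.create_table("df", timeseries())  # plain dask dataframe, no gpu=True
c.sql("select x as a, x as b from df").compute()  # returns columns "a" and "b"
```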
Anything else we need to know?: I think the root issue here is that cuDF doesn't allow duplicate column names in a dataframe:
```python
import cudf
import pandas as pd
gdf = cudf.DataFrame({"a": [1, 2, 3]})
pdf = pd.DataFrame({"a": [1, 2, 3]})
gdf[["a", "a"]]
# a
# 0 1
# 1 2
# 2 3
pdf[["a", "a"]]
# a a
# 0 1 1
# 1 2 2
# 2  3  3
```
dask-sql depends on that pandas behavior (selecting the same column under multiple names) to avoid unnecessary work when we're reusing the same column in multiple places; to circumvent this, we would probably need to refactor some of our column assignment code to avoid creating dataframes with duplicate column names.
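One possible shape for that refactor, sketched at the cuDF level (hypothetical; the real change would live in dask-sql's column assignment code), is to select the shared column once and duplicate it under each additional alias, so the backend frame never holds two columns with the same name:

```python
import cudf

gdf = cudf.DataFrame({"x": [1, 2, 3]})

# Select the shared column once under the first alias...
out = gdf[["x"]].rename(columns={"x": "a"})
# ...then copy it under the second alias; no duplicate names are created
out["b"] = gdf["x"]
# out has columns "a" and "b", both carrying the values of "x"
```

The trade-off is an extra copy per alias, which is exactly the unnecessary work the current duplicate-name approach avoids on CPU.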
Environment:
- dask-sql version: latest
- Python version: 3.10
- Operating System: Ubuntu
- Install method (conda, pip, source): source