dask-sql icon indicating copy to clipboard operation
dask-sql copied to clipboard

Move Dask config overrides to sql.yaml

Open charlesbluca opened this issue 2 years ago • 0 comments

Right now, many trivial SQL queries are failing because some Dask dataframe config defaults are not yet supported in Dask-SQL:

from dask_sql import Context

from tests.integration.test_compatibility import make_rand_df

c = Context()

c.create_table("df", make_rand_df(10, a=float, b=(int, 0), c=(str, 0)))
c.sql(
    """
    SELECT a,b,
        MIN(b) OVER (PARTITION BY b ORDER BY a DESC
            ROWS BETWEEN 2 PRECEDING AND 1 PRECEDING) AS a6
    FROM df
    ORDER BY a NULLS FIRST, b NULLS FIRST, c NULLS FIRST
    """,
).compute()
KeyError                                  Traceback (most recent call last)
File ~/git/dask-contrib/dask-sql/dask_sql/mappings.py:118, in python_to_sql_type(python_type)
    117 try:
--> 118     return DaskTypeMap(_PYTHON_TO_SQL[python_type])
    119 except KeyError:  # pragma: no cover

KeyError: string[pyarrow]

During handling of the above exception, another exception occurred:

NotImplementedError                       Traceback (most recent call last)
Cell In[1], line 8
      5 c = Context()
      7 c.create_table("df", make_rand_df(10, a=float, b=(int, 0), c=(str, 0)))
----> 8 c.sql(
      9     """
     10     SELECT a,b,
     11         MIN(b) OVER (PARTITION BY b ORDER BY a DESC
     12             ROWS BETWEEN 2 PRECEDING AND 1 PRECEDING) AS a6
     13     FROM df
     14     ORDER BY a NULLS FIRST, b NULLS FIRST, c NULLS FIRST
     15     """,
     16 ).compute()

File ~/git/dask-contrib/dask-sql/dask_sql/context.py:509, in Context.sql(self, sql, return_futures, dataframes, gpu, config_options)
    506         self.create_table(df_name, df, gpu=gpu)
    508 if isinstance(sql, str):
--> 509     rel, _ = self._get_ral(sql)
    510 elif isinstance(sql, LogicalPlan):
    511     rel = sql

File ~/git/dask-contrib/dask-sql/dask_sql/context.py:807, in Context._get_ral(self, sql)
    802 self.context.apply_dynamic_partition_pruning(
    803     dask_config.get("sql.dynamic_partition_pruning")
    804 )
    806 # get the schema of what we currently have registered
--> 807 schemas = self._prepare_schemas()
    808 for schema in schemas:
    809     self.context.register_schema(schema.name, schema)

File ~/git/dask-contrib/dask-sql/dask_sql/context.py:760, in Context._prepare_schemas(self)
    758     cc = cc.rename_handle_duplicates(df.columns, columns)
    759     dc.column_container = cc
--> 760 column_type_mapping = list(
    761     zip(columns, map(python_to_sql_type, df.dtypes))
    762 )
    763 table = DaskTable(
    764     schema_name, name, row_count, column_type_mapping, filepath
    765 )
    767 rust_schema.add_table(table)

File ~/git/dask-contrib/dask-sql/dask_sql/mappings.py:120, in python_to_sql_type(python_type)
    118     return DaskTypeMap(_PYTHON_TO_SQL[python_type])
    119 except KeyError:  # pragma: no cover
--> 120     raise NotImplementedError(
    121         f"The python type {python_type} is not implemented (yet)"
    122     )

NotImplementedError: The python type string is not implemented (yet)

Currently, we're overriding the offending Dask config values in our pytest configuration, but it might make sense to bake those overrides into dask-sql's own Dask config file so that users don't have to make those overrides manually to get things working.

charlesbluca avatar Nov 08 '23 20:11 charlesbluca