Move Dask config overrides to sql.yaml
Right now, many trivial SQL queries are failing because some of Dask's DataFrame config defaults aren't supported in Dask-SQL:
```python
from dask_sql import Context
from tests.integration.test_compatibility import make_rand_df


c = Context()

c.create_table("df", make_rand_df(10, a=float, b=(int, 0), c=(str, 0)))
c.sql(
    """
    SELECT a,b,
           MIN(b) OVER (PARTITION BY b ORDER BY a DESC
                        ROWS BETWEEN 2 PRECEDING AND 1 PRECEDING) AS a6
    FROM df
    ORDER BY a NULLS FIRST, b NULLS FIRST, c NULLS FIRST
    """,
).compute()
```
```
KeyError Traceback (most recent call last)
File ~/git/dask-contrib/dask-sql/dask_sql/mappings.py:118, in python_to_sql_type(python_type)
117 try:
--> 118 return DaskTypeMap(_PYTHON_TO_SQL[python_type])
119 except KeyError: # pragma: no cover
KeyError: string[pyarrow]
During handling of the above exception, another exception occurred:
NotImplementedError Traceback (most recent call last)
Cell In[1], line 8
5 c = Context()
7 c.create_table("df", make_rand_df(10, a=float, b=(int, 0), c=(str, 0)))
----> 8 c.sql(
9 """
10 SELECT a,b,
11 MIN(b) OVER (PARTITION BY b ORDER BY a DESC
12 ROWS BETWEEN 2 PRECEDING AND 1 PRECEDING) AS a6
13 FROM df
14 ORDER BY a NULLS FIRST, b NULLS FIRST, c NULLS FIRST
15 """,
16 ).compute()
File ~/git/dask-contrib/dask-sql/dask_sql/context.py:509, in Context.sql(self, sql, return_futures, dataframes, gpu, config_options)
506 self.create_table(df_name, df, gpu=gpu)
508 if isinstance(sql, str):
--> 509 rel, _ = self._get_ral(sql)
510 elif isinstance(sql, LogicalPlan):
511 rel = sql
File ~/git/dask-contrib/dask-sql/dask_sql/context.py:807, in Context._get_ral(self, sql)
802 self.context.apply_dynamic_partition_pruning(
803 dask_config.get("sql.dynamic_partition_pruning")
804 )
806 # get the schema of what we currently have registered
--> 807 schemas = self._prepare_schemas()
808 for schema in schemas:
809 self.context.register_schema(schema.name, schema)
File ~/git/dask-contrib/dask-sql/dask_sql/context.py:760, in Context._prepare_schemas(self)
758 cc = cc.rename_handle_duplicates(df.columns, columns)
759 dc.column_container = cc
--> 760 column_type_mapping = list(
761 zip(columns, map(python_to_sql_type, df.dtypes))
762 )
763 table = DaskTable(
764 schema_name, name, row_count, column_type_mapping, filepath
765 )
767 rust_schema.add_table(table)
File ~/git/dask-contrib/dask-sql/dask_sql/mappings.py:120, in python_to_sql_type(python_type)
118 return DaskTypeMap(_PYTHON_TO_SQL[python_type])
119 except KeyError: # pragma: no cover
--> 120 raise NotImplementedError(
121 f"The python type {python_type} is not implemented (yet)"
122 )
NotImplementedError: The python type string is not implemented (yet)
```
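The failure seems to come from the pyarrow-backed string dtype that Dask now produces by default, rather than from the query itself. Below is a minimal sketch of that, assuming `dataframe.convert-string` is the offending config default (the snippet is illustrative and not taken from dask-sql):

```python
import dask
import dask.dataframe as dd
import pandas as pd

# Assumption: with `dataframe.convert-string` enabled (the current Dask default
# when pyarrow is installed), string columns are converted to pyarrow-backed
# strings when the Dask DataFrame is created.
with dask.config.set({"dataframe.convert-string": True}):
    ddf = dd.from_pandas(pd.DataFrame({"c": ["x", "y"]}), npartitions=1)

# This dtype has no entry in dask_sql.mappings._PYTHON_TO_SQL, hence the
# KeyError / NotImplementedError above.
print(ddf.dtypes["c"])  # string[pyarrow]
```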
Currently, we're overriding the offending Dask config values in our pytest configuration, but it might make sense to bake those overrides into dask-sql's own Dask config file (sql.yaml) so that users don't have to apply them manually to get things working.
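For reference, this is roughly what the manual workaround looks like today, and what shipping the override with dask-sql could look like. The config key and the `update_defaults` call are illustrative assumptions on my part, not a settled proposal:

```python
import dask

# Manual workaround users currently have to apply before creating tables
# (assuming `dataframe.convert-string` is the offending default):
dask.config.set({"dataframe.convert-string": False})

# Rough sketch of shipping the same override with dask-sql itself, alongside
# the sql.* defaults it already registers, so users get it automatically:
dask.config.update_defaults({"dataframe": {"convert-string": False}})
```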