polars
polars copied to clipboard
suggest add some sql syntax supports
Description
many DBMS support following sql that polars reports error. please let them works. by the way, it seems that create table only create a view of select, can it be materialized?
〉select 1 a union select 2;
Error: no table name provided in query
〉create table t(i int);
Error: only CREATE TABLE AS SELECT is supported
〉create table t as select 1 a;
Error: no table name provided in query
〉create table t as select * from read_parquet('yellow_tripdata_2021-01.parquet');
┌──────────────┐
│ Response │
│ --- │
│ str │
╞══════════════╡
│ Create Table │
└──────────────┘
〉select 1 a from t union select 2 from t;
Error: schema names differ: got a, expected literal
〉select 1 as a from t union select 2 from t;
Error: schema names differ: got a, expected literal
〉select * from t limit 3;
┌──────────┬─────────────────┬─────────────────┬────────────────┬───┬────────────────┬──────────────┬────────────────┬─────────────┐
│ VendorID ┆ tpep_pickup_dat ┆ tpep_dropoff_da ┆ passenger_coun ┆ … ┆ improvement_su ┆ total_amount ┆ congestion_sur ┆ airport_fee │
│ --- ┆ etime ┆ tetime ┆ t ┆ ┆ rcharge ┆ --- ┆ charge ┆ --- │
│ i64 ┆ --- ┆ --- ┆ --- ┆ ┆ --- ┆ f64 ┆ --- ┆ f64 │
│ ┆ datetime[ns] ┆ datetime[ns] ┆ f64 ┆ ┆ f64 ┆ ┆ f64 ┆ │
╞══════════╪═════════════════╪═════════════════╪════════════════╪═══╪════════════════╪══════════════╪════════════════╪═════════════╡
│ 1 ┆ 2021-01-01 ┆ 2021-01-01 ┆ 1.0 ┆ … ┆ 0.3 ┆ 11.8 ┆ 2.5 ┆ null │
│ ┆ 00:30:10 ┆ 00:36:12 ┆ ┆ ┆ ┆ ┆ ┆ │
│ 1 ┆ 2021-01-01 ┆ 2021-01-01 ┆ 1.0 ┆ … ┆ 0.3 ┆ 4.3 ┆ 0.0 ┆ null │
│ ┆ 00:51:20 ┆ 00:52:19 ┆ ┆ ┆ ┆ ┆ ┆ │
│ 1 ┆ 2021-01-01 ┆ 2021-01-01 ┆ 1.0 ┆ … ┆ 0.3 ┆ 51.95 ┆ 0.0 ┆ null │
│ ┆ 00:43:30 ┆ 01:11:06 ┆ ┆ ┆ ┆ ┆ ┆ │
└──────────┴─────────────────┴─────────────────┴────────────────┴───┴────────────────┴──────────────┴────────────────┴─────────────┘
〉select vendorID from t limit 3 union all select vendorID from t limit 3;
Error: sql parser error: Expected end of statement, found: union at Line: 1, Column 32
〉select vendorID from t union all select vendorID from t limit 3;
Error: not found: vendorID
Error originated just after this operation:
SELECT [col("VendorID"), col("tpep_pickup_datetime"), col("tpep_dropoff_datetime"), col("passenger_count"), col("trip_distance"), col("RatecodeID"), col("store_and_fwd_flag"), col("PULocationID"), col("DOLocationID"), col("payment_type"), col("fare_amount"), col("extra"), col("mta_tax"), col("tip_amount"), col("tolls_amount"), col("improvement_surcharge"), col("total_amount"), col("congestion_surcharge"), col("airport_fee")] FROM
Parquet SCAN yellow_tripdata_2021-01.parquet
PROJECT */19 COLUMNS
〉select VendorID from t union all select VendorID from t limit 3;
┌──────────┐
│ VendorID │
│ --- │
│ i64 │
╞══════════╡
│ 1 │
│ 1 │
│ 1 │
└──────────┘
〉select * from (select VendorID from t limit 4) union all select VendorID from t limit 3;
Error: derived tables must have aliases
〉select * from (select VendorID from t limit 4)a union all select VendorID from t limit 3;
┌──────────┐
│ VendorID │
│ --- │
│ i64 │
╞══════════╡
│ 1 │
│ 1 │
│ 1 │
└──────────┘
〉select VendorID,count(1) from t union all select VendorID,2 from t limit 3;
┌──────────┬─────────┐
│ VendorID ┆ literal │
│ --- ┆ --- │
│ i64 ┆ i64 │
╞══════════╪═════════╡
│ 1 ┆ 1 │
│ 1 ┆ 1 │
│ 1 ┆ 1 │
└──────────┴─────────┘
〉select VendorID,count(1) from t group by VendorID union all select VendorID,2 from t limit 3;
Error: cannot aggregate a literal
〉select VendorID,count(*) from t group by VendorID union all select VendorID,2 from t limit 3;
Error: schema names differ: got len, expected literal
〉select VendorID,count(*) from t group by VendorID union all select VendorID,count(*) from t limit 3;
┌──────────┬────────┐
│ VendorID ┆ len │
│ --- ┆ --- │
│ i64 ┆ u32 │
╞══════════╪════════╡
│ 6 ┆ 10291 │
│ 1 ┆ 422337 │
│ 2 ┆ 937141 │
└──────────┴────────┘
python version has the issue too
>>> import polars as pl
>>> t = pl.scan_parquet("yellow_tripdata_2021-01.parquet")
>>> s="select 1 a union select 2"
>>> pl.sql(s).collect()
Traceback (most recent call last):
File "<stdin>", line 1, in <module>
File "/usr/local/lib/python3.10/dist-packages/polars/lazyframe/frame.py", line 2050, in collect
return wrap_df(ldf.collect(callback))
polars.exceptions.ComputeError: schema names differ: got a, expected literal
Resolved plan until failure:
---> FAILED HERE RESOLVING 'unique' <---
SELECT [dyn int: 2] FROM
DF []; PROJECT */0 COLUMNS; SELECTION: None
>>> s="select VendorID,count(*) from t group by VendorID union all select VendorID,count(*) from t limit 3"
>>> pl.sql(s).collect()
shape: (3, 2)
┌──────────┬────────┐
│ VendorID ┆ len │
│ --- ┆ --- │
│ i64 ┆ u32 │
╞══════════╪════════╡
│ 6 ┆ 10291 │
│ 2 ┆ 937141 │
│ 1 ┆ 422337 │
└──────────┴────────┘
>>>
when the CTE name same as the table name, the CTE name should be used first in the sql eg. the 1st sql should got the result as the 2nd sql.
〉with t as (select VendorID,passenger_count,count(*)cnt from t group by VendorID,passenger_count)select * from t limit 3;
┌──────────┬─────────────────┬─────────────────┬────────────────┬───┬────────────────┬──────────────┬────────────────┬─────────────┐
│ VendorID ┆ tpep_pickup_dat ┆ tpep_dropoff_da ┆ passenger_coun ┆ … ┆ improvement_su ┆ total_amount ┆ congestion_sur ┆ airport_fee │
│ --- ┆ etime ┆ tetime ┆ t ┆ ┆ rcharge ┆ --- ┆ charge ┆ --- │
│ i64 ┆ --- ┆ --- ┆ --- ┆ ┆ --- ┆ f64 ┆ --- ┆ f64 │
│ ┆ datetime[ns] ┆ datetime[ns] ┆ f64 ┆ ┆ f64 ┆ ┆ f64 ┆ │
╞══════════╪═════════════════╪═════════════════╪════════════════╪═══╪════════════════╪══════════════╪════════════════╪═════════════╡
│ 1 ┆ 2021-01-01 ┆ 2021-01-01 ┆ 1.0 ┆ … ┆ 0.3 ┆ 11.8 ┆ 2.5 ┆ null │
│ ┆ 00:30:10 ┆ 00:36:12 ┆ ┆ ┆ ┆ ┆ ┆ │
│ 1 ┆ 2021-01-01 ┆ 2021-01-01 ┆ 1.0 ┆ … ┆ 0.3 ┆ 4.3 ┆ 0.0 ┆ null │
│ ┆ 00:51:20 ┆ 00:52:19 ┆ ┆ ┆ ┆ ┆ ┆ │
│ 1 ┆ 2021-01-01 ┆ 2021-01-01 ┆ 1.0 ┆ … ┆ 0.3 ┆ 51.95 ┆ 0.0 ┆ null │
│ ┆ 00:43:30 ┆ 01:11:06 ┆ ┆ ┆ ┆ ┆ ┆ │
└──────────┴─────────────────┴─────────────────┴────────────────┴───┴────────────────┴──────────────┴────────────────┴─────────────┘
〉with tm as (select VendorID,passenger_count,count(*)cnt from t group by VendorID,passenger_count)select * from tm limit 3;
┌──────────┬─────────────────┬────────┐
│ VendorID ┆ passenger_count ┆ cnt │
│ --- ┆ --- ┆ --- │
│ i64 ┆ f64 ┆ u32 │
╞══════════╪═════════════════╪════════╡
│ 1 ┆ 6.0 ┆ 286 │
│ 2 ┆ 1.0 ┆ 641520 │
│ 1 ┆ 5.0 ┆ 289 │
└──────────┴─────────────────┴────────┘
follwing sql works in other DBMS too
s="""
select ty1,NULL ty2,NULL ty3,sum(v)s,count(v)c from read_parquet('../t123.parquet') group by ty1 union all
select NULL ,ty2,NULL ,sum(v),count(v) from read_parquet('../t123.parquet') group by ty2 union all
select NULL ,NULL ,ty3,sum(v),count(v) from read_parquet('../t123.parquet') group by ty3
"""
>>>
>>> ctx = polars.SQLContext()
>>>
>>> x=ctx.execute(s)
Traceback (most recent call last):
File "<python-input-17>", line 1, in <module>
x=ctx.execute(s)
File "C:\Users\LD\Downloads\python-3.13\Lib\site-packages\polars\sql\context.py", line 438, in execute
res = wrap_ldf(self._ctxt.execute(query))
~~~~~~~~~~~~~~~~~~^^^^^^^
polars.exceptions.DuplicateError: column with name 'v' has more than one occurrence
Resolved plan until failure:
---> FAILED HERE RESOLVING 'select' <---
Parquet SCAN [../t123.parquet]
PROJECT */4 COLUMNS
s="""
select ty1,NULL ty2,NULL ty3,sum(v)s,count(v)c from read_parquet('t123.parquet') group by ty1 union all
select NULL ,ty2,NULL ,sum(v),count(v) from read_parquet('t123.parquet') group by ty2 union all
select NULL ,NULL ,ty3,sum(v),count(v) from read_parquet('t123.parquet') group by ty3
"""
>>> x=ctx.execute(s)
Traceback (most recent call last):
File "<stdin>", line 1, in <module>
File "D:\Python38\lib\site-packages\polars\sql\context.py", line 268, in execute
res = wrap_ldf(self._ctxt.execute(query))
polars.exceptions.DuplicateError: column with name 'literal' has more than one occurrences
Error originated just after this operation:
Parquet SCAN t123.parquet
PROJECT */4 COLUMNS
LogicalPlan had already failed with the above error; after failure, 1 additional operation was attempted on the LazyFrame
in polars, we must write
s="""
select ty1,NULL ty2,NULL ty3,sum(v)s,count(v)c from read_parquet('t123.parquet') group by ty1 union all
select NULL ty1,ty2,NULL ty3,sum(v)s,count(v)c from read_parquet('t123.parquet') group by ty2 union all
select NULL ty1,NULL ty2,ty3,sum(v)s,count(v)c from read_parquet('t123.parquet') group by ty3
"""
x=ctx.execute(s)
x.collect()