malloy
malloy copied to clipboard
row_number() produces incorrect results
The following query produces incorrect results. id2 should always be 1 or 2 but I'm seeing different numbers.
run: duckdb.table('malloytest.airports') -> {
group_by: faa_region
aggregate: airport_count is count()
calculate: id is row_number()
nest: by_fac_type is {
group_by: fac_type
aggregate: airport_count is count()
calculate: id2 is row_number()
nest: elevation is {
aggregate: avg_elevation is elevation.avg()
}
limit: 2
}
}
It only seems to fail on DuckDB :(
Looks like the reason is that were are getting computation in this column from nested datasets. A possible solution is to bracket with a case
CASE WHEN group_set = 1 THEN ROW_NUMBER() OVER(PARTITION BY group_set, base."faa_region" ORDER BY CASE WHEN group_set=1 THEN
COUNT(1)
END desc NULLS LAST ) END as "id2__1",
For debugging
-- connection: duckdb
WITH __stage0 AS (
SELECT
group_set,
base."faa_region" as "faa_region__0",
CASE WHEN group_set=0 THEN
COUNT(1)
END as "airport_count__0",
ROW_NUMBER() OVER(PARTITION BY group_set ORDER BY CASE WHEN group_set=0 THEN
COUNT(1)
END desc NULLS LAST ) as "id__0",
CASE WHEN group_set IN (1,2) THEN
base."fac_type"
END as "fac_type__1",
CASE WHEN group_set=1 THEN
COUNT(1)
END as "airport_count__1",
CASE WHEN group_set = 1 THEN ROW_NUMBER() OVER(PARTITION BY group_set, base."faa_region" ORDER BY CASE WHEN group_set=1 THEN
COUNT(1)
END desc NULLS LAST ) END as "id2__1",
CASE WHEN group_set=2 THEN
AVG(base."elevation")
END as "avg_elevation__2"
FROM malloytest.airports as base
CROSS JOIN (SELECT UNNEST(GENERATE_SERIES(0,2,1)) as group_set ) as group_set
GROUP BY 1,2,5
)
-- select * from __stage0
-- where group_set = 2
-- order by
-- group_set,
-- airport_count__0 desc,
-- faa_region__0,
-- airport_count__1 desc,
-- fac_type__1
-- limit 1000
, __stage1 AS (
SELECT
CASE WHEN group_set=2 THEN 1 ELSE group_set END as group_set,
"faa_region__0" as "faa_region__0",
FIRST("airport_count__0") FILTER (WHERE "airport_count__0" IS NOT NULL) as "airport_count__0",
FIRST("id__0") FILTER (WHERE "id__0" IS NOT NULL) as "id__0",
CASE WHEN group_set IN (1,2) THEN
"fac_type__1"
END as "fac_type__1",
FIRST("airport_count__1") FILTER (WHERE "airport_count__1" IS NOT NULL) as "airport_count__1",
FIRST("id2__1") FILTER (WHERE "id2__1" IS NOT NULL) as "id2__1",
COALESCE(FIRST({"avg_elevation": "avg_elevation__2" }) FILTER(WHERE group_set=2), {"avg_elevation": NULL}) as "elevation__1"
FROM __stage0
GROUP BY 1,2,5
)
SELECT
"faa_region__0" as "faa_region",
MAX(CASE WHEN group_set=0 THEN "airport_count__0" END) as "airport_count",
MAX(CASE WHEN group_set=0 THEN "id__0" END) as "id",
COALESCE(LIST({
"fac_type": "fac_type__1",
"airport_count": "airport_count__1",
"id2": "id2__1",
"elevation": "elevation__1"} ORDER BY "airport_count__1" desc NULLS LAST) FILTER (WHERE group_set=1)[1:2],[]) as "by_fac_type"
FROM __stage1
GROUP BY 1
ORDER BY 2 desc NULLS LAST
fixed.