Achilles
Achilles copied to clipboard
Analysis 117 gives "Query exceeded resource limits" on BigQuery
This means that the query is very bad performance wise:
Query exceeded resource limits. This query used 5132 CPU seconds but would charge only 18M Analysis bytes. This exceeds the ratio supported by the on-demand pricing model. Please consider moving this workload to the flat-rate reservation pricing model, which does not have this limit. 5132 CPU seconds were used, and this query must use less than 4600 CPU seconds.
Original SQL:
-- 117 Number of persons with at least one day of observation in each month
--HINT DISTRIBUTE_ON_KEY(stratum_1)
-- generating date key sequences in a cross-dialect compatible fashion
with century as (select '19' num union select '20' num),
tens as (select '0' num union select '1' num union select '2' num union select '3' num union select '4' num union select '5' num union select '6' num union select '7' num union select '8' num union select '9' num),
ones as (select '0' num union select '1' num union select '2' num union select '3' num union select '4' num union select '5' num union select '6' num union select '7' num union select '8' num union select '9' num),
months as (select '01' as num union select '02' num union select '03' num union select '04' num union select '05' num union select '06' num union select '07' num union select '08' num union select '09' num union select '10' num union select '11' num union select '12' num),
date_keys as (select cast(concat(century.num, tens.num, ones.num,months.num) as int) obs_month from century cross join tens cross join ones cross join months)
SELECT
117 as analysis_id,
CAST(t1.obs_month AS VARCHAR(255)) as stratum_1,
cast(null as varchar(255)) as stratum_2, cast(null as varchar(255)) as stratum_3, cast(null as varchar(255)) as stratum_4, cast(null as varchar(255)) as stratum_5,
COALESCE(COUNT_BIG(distinct op1.PERSON_ID),0) as count_value
into @scratchDatabaseSchema@schemaDelim@tempAchillesPrefix_117
FROM date_keys t1
left join
(select t2.obs_month, op2.*
from @cdmDatabaseSchema.observation_period op2, date_keys t2
where year(op2.observation_period_start_date)*100 + month(op2.observation_period_start_date) <= t2.obs_month
and year(op2.observation_period_end_date)*100 + month(op2.observation_period_end_date) >= t2.obs_month
) op1 on op1.obs_month = t1.obs_month
group by t1.obs_month
having COALESCE(COUNT_BIG(distinct op1.PERSON_ID),0) > 0;
Translated SQL by SQLRender:
-- 117 Number of persons with at least one day of observation in each month
--HINT DISTRIBUTE_ON_KEY(stratum_1)
-- generating date key sequences in a cross-dialect compatible fashion
CREATE TABLE omop_work_zidder.tmpach_117
AS WITH century as (select '19' num union distinct select '20' num),
tens as (select '0' num union distinct select '1' num union distinct select '2' num union distinct select '3' num union distinct select '4' num union distinct select '5' num union distinct select '6' num union distinct select '7' num union distinct select '8' num union distinct select '9' num),
ones as (select '0' num union distinct select '1' num union distinct select '2' num union distinct select '3' num union distinct select '4' num union distinct select '5' num union distinct select '6' num union distinct select '7' num union distinct select '8' num union distinct select '9' num),
months as (select '01' as num union distinct select '02' num union distinct select '03' num union distinct select '04' num union distinct select '05' num union distinct select '06' num union distinct select '07' num union distinct select '08' num union distinct select '09' num union distinct select '10' num union distinct select '11' num union distinct select '12' num),
date_keys as (select cast(concat(century.num, tens.num, ones.num,months.num) as int64) obs_month from century cross join tens cross join ones cross join months)
SELECT 117 as analysis_id,
cast(t1.obs_month as STRING) as stratum_1,
cast(null as STRING) as stratum_2, cast(null as STRING) as stratum_3, cast(null as STRING) as stratum_4, cast(null as STRING) as stratum_5,
coalesce(cast(COUNT(distinct op1.person_id) as int64), 0) as count_value
FROM date_keys t1
left join
(select t2.obs_month, op2.*
from omop_zidder.observation_period op2, date_keys t2
where EXTRACT(YEAR from op2.observation_period_start_date)*100 + EXTRACT(MONTH from op2.observation_period_start_date) <= t2.obs_month
and EXTRACT(YEAR from op2.observation_period_end_date)*100 + EXTRACT(MONTH from op2.observation_period_end_date) >= t2.obs_month
) op1 on op1.obs_month = t1.obs_month
group by t1.obs_month
having coalesce(cast(COUNT(distinct op1.person_id) as int64), 0) > 0 ;
Can try to check the improved version in this branch? Is the performance better for your data? https://github.com/OHDSI/Achilles/tree/issue-690-speed-up-achilles-117
No same result:
Now the generated query is:
--HINT DISTRIBUTE_ON_KEY(stratum_1)
-- generating date key sequences in a cross-dialect compatible fashion
CREATE TABLE omop_work_zidder.tmpach_117
AS WITH century as (select '19' num union distinct select '20' num),
tens as (select '0' num union distinct select '1' num union distinct select '2' num union distinct select '3' num union distinct select '4' num union distinct select '5' num union distinct select '6' num union distinct select '7' num union distinct select '8' num union distinct select '9' num),
ones as (select '0' num union distinct select '1' num union distinct select '2' num union distinct select '3' num union distinct select '4' num union distinct select '5' num union distinct select '6' num union distinct select '7' num union distinct select '8' num union distinct select '9' num),
months as (select '01' as num union distinct select '02' num union distinct select '03' num union distinct select '04' num union distinct select '05' num union distinct select '06' num union distinct select '07' num union distinct select '08' num union distinct select '09' num union distinct select '10' num union distinct select '11' num union distinct select '12' num),
op_dates as (select min(observation_period_start_date) as min_date, max(observation_period_start_date) as max_date from omop_work.observation_period),
date_keys as (select cast(concat(century.num, tens.num, ones.num,months.num) as int64) ints,
IF(SAFE_CAST(concat(century.num, tens.num, ones.num)||'-'||months.num||'-01' AS DATE) IS NULL,PARSE_DATE('%Y%m%d', cast(concat(century.num, tens.num, ones.num)||'-'||months.num||'-01' AS STRING)),SAFE_CAST(concat(century.num, tens.num, ones.num)||'-'||months.num||'-01' AS DATE)) as date_begin,
DATE_SUB(DATE_TRUNC(DATE_ADD(IF(SAFE_CAST(concat(century.num, tens.num, ones.num)||'-'||months.num||'-01' AS DATE) IS NULL,PARSE_DATE('%Y%m%d', cast(concat(century.num, tens.num, ones.num)||'-'||months.num||'-01' AS STRING)),SAFE_CAST(concat(century.num, tens.num, ones.num)||'-'||months.num||'-01' AS DATE)), INTERVAL 1 MONTH), MONTH), INTERVAL 1 DAY) as date_end
from op_dates, century cross join tens cross join ones cross join months
where DATE_ADD(IF(SAFE_CAST(IF(SAFE_CAST(concat(century.num, tens.num, ones.num)||'-'||months.num||'-01' AS DATE) IS NULL,PARSE_DATE('%Y%m%d', cast(concat(century.num, tens.num, ones.num)||'-'||months.num||'-01' AS STRING)),SAFE_CAST(concat(century.num, tens.num, ones.num)||'-'||months.num||'-01' AS DATE)) AS DATE) IS NULL,PARSE_DATE('%Y%m%d', cast(IF(SAFE_CAST(concat(century.num, tens.num, ones.num)||'-'||months.num||'-01' AS DATE) IS NULL,PARSE_DATE('%Y%m%d', cast(concat(century.num, tens.num, ones.num)||'-'||months.num||'-01' AS STRING)),SAFE_CAST(concat(century.num, tens.num, ones.num)||'-'||months.num||'-01' AS DATE)) AS STRING)),SAFE_CAST(IF(SAFE_CAST(concat(century.num, tens.num, ones.num)||'-'||months.num||'-01' AS DATE) IS NULL,PARSE_DATE('%Y%m%d', cast(concat(century.num, tens.num, ones.num)||'-'||months.num||'-01' AS STRING)),SAFE_CAST(concat(century.num, tens.num, ones.num)||'-'||months.num||'-01' AS DATE)) AS DATE)), interval 1 MONTH) >= op_dates.min_date
and DATE_ADD(IF(SAFE_CAST(DATE_SUB(DATE_TRUNC(DATE_ADD(IF(SAFE_CAST(concat(century.num, tens.num, ones.num)||'-'||months.num||'-01' AS DATE) IS NULL,PARSE_DATE('%Y%m%d', cast(concat(century.num, tens.num, ones.num)||'-'||months.num||'-01' AS STRING)),SAFE_CAST(concat(century.num, tens.num, ones.num)||'-'||months.num||'-01' AS DATE)), INTERVAL 1 MONTH), MONTH), INTERVAL 1 DAY) AS DATE) IS NULL,PARSE_DATE('%Y%m%d', cast(DATE_SUB(DATE_TRUNC(DATE_ADD(IF(SAFE_CAST(concat(century.num, tens.num, ones.num)||'-'||months.num||'-01' AS DATE) IS NULL,PARSE_DATE('%Y%m%d', cast(concat(century.num, tens.num, ones.num)||'-'||months.num||'-01' AS STRING)),SAFE_CAST(concat(century.num, tens.num, ones.num)||'-'||months.num||'-01' AS DATE)), INTERVAL 1 MONTH), MONTH), INTERVAL 1 DAY) AS STRING)),SAFE_CAST(DATE_SUB(DATE_TRUNC(DATE_ADD(IF(SAFE_CAST(concat(century.num, tens.num, ones.num)||'-'||months.num||'-01' AS DATE) IS NULL,PARSE_DATE('%Y%m%d', cast(concat(century.num, tens.num, ones.num)||'-'||months.num||'-01' AS STRING)),SAFE_CAST(concat(century.num, tens.num, ones.num)||'-'||months.num||'-01' AS DATE)), INTERVAL 1 MONTH), MONTH), INTERVAL 1 DAY) AS DATE)), interval 1 MONTH) <= op_dates.max_date)
SELECT 117 as analysis_id,
cast(t1.ints as STRING) as stratum_1,
cast(null as STRING) as stratum_2, cast(null as STRING) as stratum_3, cast(null as STRING) as stratum_4, cast(null as STRING) as stratum_5,
coalesce(cast(COUNT(distinct op1.person_id) as int64), 0) as count_value
FROM date_keys t1
left join
(select t2.ints, op2.*
from omop_work.observation_period op2, date_keys t2
where op2.observation_period_start_date <= t2.date_end
and op2.observation_period_end_date >= t2.date_begin
) op1 on op1.ints = t1.ints
group by t1.ints
having coalesce(cast(COUNT(distinct op1.person_id) as int64), 0) > 0 ;
I also wrote a small python script to test the SQL render translation (I'm not a R fan :)):
from pathlib import Path
import jpype
import jpype.imports
if __name__ == "__main__":
# launch the JVM
sqlrender_path = str(
Path(__file__).parent.resolve()
/ "libs"
/ "SqlRender"
/ "inst"
/ "java"
/ "SqlRender.jar"
)
jpype.startJVM(classpath=[sqlrender_path]) # type: ignore
from org.ohdsi.sql import ( # type: ignore # pylint: disable=import-outside-toplevel,import-error
SqlRender,
SqlTranslate,
)
path_to_replacement_patterns = str(
Path(__file__).parent.resolve()
/ "libs"
/ "SqlRender"
/ "inst"
/ "csv"
/ "replacementPatterns.csv"
)
sql = """
--HINT DISTRIBUTE_ON_KEY(stratum_1)
-- generating date key sequences in a cross-dialect compatible fashion
with century as (select '19' num union select '20' num),
tens as (select '0' num union select '1' num union select '2' num union select '3' num union select '4' num union select '5' num union select '6' num union select '7' num union select '8' num union select '9' num),
ones as (select '0' num union select '1' num union select '2' num union select '3' num union select '4' num union select '5' num union select '6' num union select '7' num union select '8' num union select '9' num),
months as (select '01' as num union select '02' num union select '03' num union select '04' num union select '05' num union select '06' num union select '07' num union select '08' num union select '09' num union select '10' num union select '11' num union select '12' num),
op_dates as (select min(observation_period_start_date) as min_date, max(observation_period_start_date) as max_date from @cdmDatabaseSchema.observation_period),
date_keys as (select cast(concat(century.num, tens.num, ones.num,months.num) as int) ints,
cast(concat(century.num, tens.num, ones.num)||'-'||months.num||'-01' as date) as date_begin,
eomonth(cast(concat(century.num, tens.num, ones.num)||'-'||months.num||'-01' as date)) as date_end
from op_dates, century cross join tens cross join ones cross join months
where dateadd(m, 1, cast(concat(century.num, tens.num, ones.num)||'-'||months.num||'-01' as date)) >= op_dates.min_date
and dateadd(m, 1, eomonth(cast(concat(century.num, tens.num, ones.num)||'-'||months.num||'-01' as date))) <= op_dates.max_date)
SELECT
117 as analysis_id,
CAST(t1.ints AS VARCHAR(255)) as stratum_1,
cast(null as varchar(255)) as stratum_2, cast(null as varchar(255)) as stratum_3, cast(null as varchar(255)) as stratum_4, cast(null as varchar(255)) as stratum_5,
COALESCE(COUNT_BIG(distinct op1.PERSON_ID),0) as count_value
into @scratchDatabaseSchema@schemaDelim@tempAchillesPrefix_117
FROM date_keys t1
left join
(select t2.ints, op2.*
from @cdmDatabaseSchema.observation_period op2, date_keys t2
where op2.observation_period_start_date <= t2.date_end
and op2.observation_period_end_date >= t2.date_begin
) op1 on op1.ints = t1.ints
group by t1.ints
having COALESCE(COUNT_BIG(distinct op1.PERSON_ID),0) > 0;
"""
parameters = {
"cdmDatabaseSchema": "omop_work",
"scratchDatabaseSchema": "omop_work_zidder",
"schemaDelim": ".",
"tempAchillesPrefix": "tmpach",
}
sql = str(
SqlRender.renderSql(sql, list(parameters.keys()), list(parameters.values()))
)
sql = str(
SqlTranslate.translateSqlWithPath(
sql, "bigquery", None, None, path_to_replacement_patterns
)
)
print(sql)
A good example of the 117 query for BigQuery is:
WITH cte_date_range AS (
SELECT
GENERATE_DATE_ARRAY(DATE_TRUNC(op1.observation_period_start_date, MONTH), DATE_TRUNC(op1.observation_period_end_date, MONTH), INTERVAL 1 MONTH) AS date_range,
op1.person_id
FROM omop_zidder.observation_period op1
), cte_flatten_date_range AS (
SELECT
months as ints,
cte1.person_id
FROM cte_date_range cte1
CROSS JOIN UNNEST(cte1.date_range) AS months
)
SELECT
117 AS analysis_id,
FORMAT_DATE('%Y%m', cte2.ints) AS stratum_1,
CAST(NULL AS STRING) AS stratum_2,
CAST(NULL AS STRING) AS stratum_3,
CAST(NULL AS STRING) AS stratum_4,
CAST(NULL AS STRING) AS stratum_5,
COUNT(DISTINCT cte2.person_id) AS count_value
FROM cte_flatten_date_range cte2
GROUP BY cte2.ints
And even better is checking that the observation_period_end_date is not in the future (observation_period_end_date is a required field, if it is not yet known, this date will probably be far in the future ex: 31-Dec-2099)
WITH cte_date_range AS (
SELECT
GENERATE_DATE_ARRAY(DATE_TRUNC(op1.observation_period_start_date, MONTH), DATE_TRUNC(IF(op1.observation_period_end_date > CURRENT_DATE(), CURRENT_DATE(), op1.observation_period_end_date), MONTH), INTERVAL 1 MONTH) AS date_range,
op1.person_id
FROM omop_zidder.observation_period op1
), cte_flatten_date_range AS (
SELECT
months as ints,
cte1.person_id
FROM cte_date_range cte1
CROSS JOIN UNNEST(cte1.date_range) AS months
)
SELECT
117 AS analysis_id,
FORMAT_DATE('%Y%m', cte2.ints) AS stratum_1,
CAST(NULL AS STRING) AS stratum_2,
CAST(NULL AS STRING) AS stratum_3,
CAST(NULL AS STRING) AS stratum_4,
CAST(NULL AS STRING) AS stratum_5,
COUNT(DISTINCT cte2.person_id) AS count_value
FROM cte_flatten_date_range cte2
GROUP BY cte2.ints
-- order by cte2.ints desc
andresults in a query plan thas is much simpler than the initial one:
vs:
Adding the check for end_dates not in the future does the trick (the query stays in the resource limits):
--HINT DISTRIBUTE_ON_KEY(stratum_1)
-- generating date key sequences in a cross-dialect compatible fashion
with century as (select '19' num union select '20' num),
tens as (select '0' num union select '1' num union select '2' num union select '3' num union select '4' num union select '5' num union select '6' num union select '7' num union select '8' num union select '9' num),
ones as (select '0' num union select '1' num union select '2' num union select '3' num union select '4' num union select '5' num union select '6' num union select '7' num union select '8' num union select '9' num),
months as (select '01' as num union select '02' num union select '03' num union select '04' num union select '05' num union select '06' num union select '07' num union select '08' num union select '09' num union select '10' num union select '11' num union select '12' num),
op_dates as (select min(observation_period_start_date) as min_date, max(observation_period_start_date) as max_date from @cdmDatabaseSchema.observation_period),
date_keys as (select cast(concat(century.num, tens.num, ones.num,months.num) as int) ints,
cast(concat(century.num, tens.num, ones.num)||'-'||months.num||'-01' as date) as date_begin,
eomonth(cast(concat(century.num, tens.num, ones.num)||'-'||months.num||'-01' as date)) as date_end
from op_dates, century cross join tens cross join ones cross join months
where dateadd(m, 1, cast(concat(century.num, tens.num, ones.num)||'-'||months.num||'-01' as date)) >= op_dates.min_date
and dateadd(m, 1, eomonth(cast(concat(century.num, tens.num, ones.num)||'-'||months.num||'-01' as date))) <= op_dates.max_date)
SELECT
117 as analysis_id,
CAST(t1.ints AS VARCHAR(255)) as stratum_1,
cast(null as varchar(255)) as stratum_2, cast(null as varchar(255)) as stratum_3, cast(null as varchar(255)) as stratum_4, cast(null as varchar(255)) as stratum_5,
COALESCE(COUNT_BIG(distinct op1.PERSON_ID),0) as count_value
into @scratchDatabaseSchema@schemaDelim@tempAchillesPrefix_117
FROM date_keys t1
left join
(select t2.ints, op2.*
from @cdmDatabaseSchema.observation_period op2, date_keys t2
where op2.observation_period_start_date <= t2.date_end
and IF(op2.observation_period_end_date > GETDATE(), GETDATE(), op2.observation_period_end_date) >= t2.date_begin
) op1 on op1.ints = t1.ints
group by t1.ints
having COALESCE(COUNT_BIG(distinct op1.PERSON_ID),0) > 0;