duckplyr
duckplyr copied to clipboard
Filter pushdown discrepancy between duckdb and duckplyr
This issue is related to the discussion in this comment: https://github.com/duckdblabs/duckplyr/issues/145#issuecomment-2120942326
The following code demonstrates the difference in filter
pushdown between DuckDB and Duckplyr as shown by the explain()
function.
With Duckplyr, the filter does not appear to be pushed down to the Parquet files.
library(duckdb)
#> Loading required package: DBI
library(duckplyr)
#> ✔ Overwriting dplyr methods with duckplyr methods.
#> ℹ Turn off with `duckplyr::methods_restore()`.
#>
#> Attaching package: 'duckplyr'
#> The following objects are masked from 'package:stats':
#>
#> filter, lag
#> The following objects are masked from 'package:base':
#>
#> intersect, setdiff, setequal, union
library(dbplyr)
library(dplyr)
#>
#> Attaching package: 'dplyr'
#> The following objects are masked from 'package:dbplyr':
#>
#> ident, sql
#> The following objects are masked from 'package:stats':
#>
#> filter, lag
#> The following objects are masked from 'package:base':
#>
#> intersect, setdiff, setequal, union
# DuckDB ------------------------------------------------------------------
con <- dbConnect(duckdb())
dbSendQuery(con, "INSTALL httpfs; LOAD httpfs;")
#> <duckdb_result af590 connection=7ce20 statement='INSTALL httpfs; LOAD httpfs;'>
dbSendQuery(con, "SET s3_region='auto';SET s3_endpoint='';")
#> <duckdb_result e70c0 connection=7ce20 statement='SET s3_region='auto';SET s3_endpoint='';'>
f_duckdb <- function() {
df <- tbl(
con,
"read_parquet('s3://duckplyr-demo-taxi-data/taxi-data-2019-partitioned/*/*.parquet')"
)
df |>
filter(total_amount > 0L) |>
filter(!is.na(passenger_count)) |>
mutate(tip_pct = 100 * tip_amount / total_amount) |>
summarise(
avg_tip_pct = median(tip_pct),
n = n(),
.by = passenger_count
) |>
arrange(desc(passenger_count))
}
# Duckplyr ----------------------------------------------------------------
con <- duckplyr:::get_default_duckdb_connection()
dbSendQuery(con, "INSTALL httpfs; LOAD httpfs;")
#> <duckdb_result bab60 connection=cd550 statement='INSTALL httpfs; LOAD httpfs;'>
dbSendQuery(con, "SET s3_region='auto';SET s3_endpoint='';")
#> <duckdb_result e8960 connection=cd550 statement='SET s3_region='auto';SET s3_endpoint='';'>
f_duckplyr <- function() {
duckplyr::duckplyr_df_from_file(
"s3://duckplyr-demo-taxi-data/taxi-data-2019-partitioned/*/*.parquet",
"read_parquet",
options = list(hive_partitioning = TRUE),
class = class(tibble())
) |>
filter(total_amount > 0L) |>
filter(!is.na(passenger_count)) |>
mutate(tip_pct = 100 * tip_amount / total_amount) |>
summarise(
avg_tip_pct = median(tip_pct),
n = n(),
.by = passenger_count
) |>
arrange(desc(passenger_count))
}
explain(f_duckdb())
#> Warning: Missing values are always removed in SQL aggregation functions.
#> Use `na.rm = TRUE` to silence this warning
#> This warning is displayed once every 8 hours.
#> <SQL>
#> SELECT
#> passenger_count,
#> PERCENTILE_CONT(0.5) WITHIN GROUP (ORDER BY tip_pct) AS avg_tip_pct,
#> COUNT(*) AS n
#> FROM (
#> SELECT q01.*, (100.0 * tip_amount) / total_amount AS tip_pct
#> FROM (FROM read_parquet('s3://duckplyr-demo-taxi-data/taxi-data-2019-partitioned/*/*.parquet')) q01
#> WHERE (total_amount > 0) AND (NOT((passenger_count IS NULL)))
#> ) q01
#> GROUP BY passenger_count
#> ORDER BY passenger_count DESC
#>
#> <PLAN>
#> physical_plan
#> ┌───────────────────────────┐
#> │ ORDER_BY │
#> │ ─ ─ ─ ─ ─ ─ ─ ─ ─ ─ ─ │
#> │ ORDERS: │
#> │ q01.passenger_count DESC │
#> └─────────────┬─────────────┘
#> ┌─────────────┴─────────────┐
#> │ HASH_GROUP_BY │
#> │ ─ ─ ─ ─ ─ ─ ─ ─ ─ ─ ─ │
#> │ #0 │
#> │ quantile_cont(#1) │
#> │ count_star() │
#> └─────────────┬─────────────┘
#> ┌─────────────┴─────────────┐
#> │ PROJECTION │
#> │ ─ ─ ─ ─ ─ ─ ─ ─ ─ ─ ─ │
#> │ passenger_count │
#> │ tip_pct │
#> └─────────────┬─────────────┘
#> ┌─────────────┴─────────────┐
#> │ PROJECTION │
#> │ ─ ─ ─ ─ ─ ─ ─ ─ ─ ─ ─ │
#> │ passenger_count │
#> │ tip_pct │
#> └─────────────┬─────────────┘
#> ┌─────────────┴─────────────┐
#> │ FILTER │
#> │ ─ ─ ─ ─ ─ ─ ─ ─ ─ ─ ─ │
#> │ (NOT (passenger_count IS │
#> │ NULL)) │
#> │ ─ ─ ─ ─ ─ ─ ─ ─ ─ ─ ─ │
#> │ EC: 3524697 │
#> └─────────────┬─────────────┘
#> ┌─────────────┴─────────────┐
#> │ READ_PARQUET │
#> │ ─ ─ ─ ─ ─ ─ ─ ─ ─ ─ ─ │
#> │ passenger_count │
#> │ tip_amount │
#> │ total_amount │
#> │ ─ ─ ─ ─ ─ ─ ─ ─ ─ ─ ─ │
#> │ Filters: total_amount>0.0 │
#> │ AND total_amount IS N... │
#> │ ─ ─ ─ ─ ─ ─ ─ ─ ─ ─ ─ │
#> │ EC: 17623488 │
#> └───────────────────────────┘
explain(f_duckplyr())
#> ┌───────────────────────────┐
#> │ ORDER_BY │
#> │ ─ ─ ─ ─ ─ ─ ─ ─ ─ ─ ─ │
#> │ ORDERS: │
#> │ #3 ASC │
#> └─────────────┬─────────────┘
#> ┌─────────────┴─────────────┐
#> │ PROJECTION │
#> │ ─ ─ ─ ─ ─ ─ ─ ─ ─ ─ ─ │
#> │ passenger_count │
#> │ avg_tip_pct │
#> │ n │
#> │ -(passenger_count) │
#> └─────────────┬─────────────┘
#> ┌─────────────┴─────────────┐
#> │ PROJECTION │
#> │ ─ ─ ─ ─ ─ ─ ─ ─ ─ ─ ─ │
#> │ passenger_count │
#> │ avg_tip_pct │
#> │ n │
#> └─────────────┬─────────────┘
#> ┌─────────────┴─────────────┐
#> │ HASH_GROUP_BY │
#> │ ─ ─ ─ ─ ─ ─ ─ ─ ─ ─ ─ │
#> │ #0 │
#> │ median(#1) │
#> │ count_star() │
#> └─────────────┬─────────────┘
#> ┌─────────────┴─────────────┐
#> │ PROJECTION │
#> │ ─ ─ ─ ─ ─ ─ ─ ─ ─ ─ ─ │
#> │ passenger_count │
#> │ tip_pct │
#> └─────────────┬─────────────┘
#> ┌─────────────┴─────────────┐
#> │ PROJECTION │
#> │ ─ ─ ─ ─ ─ ─ ─ ─ ─ ─ ─ │
#> │ passenger_count │
#> │ tip_pct │
#> └─────────────┬─────────────┘
#> ┌─────────────┴─────────────┐
#> │ PROJECTION │
#> │ ─ ─ ─ ─ ─ ─ ─ ─ ─ ─ ─ │
#> │ passenger_count │
#> │ tip_amount │
#> │ total_amount │
#> └─────────────┬─────────────┘
#> ┌─────────────┴─────────────┐
#> │ FILTER │
#> │ ─ ─ ─ ─ ─ ─ ─ ─ ─ ─ ─ │
#> │(r_base::>(total_amount, 0)│
#> │ AND (NOT ((passenger_count│
#> │ IS NULL) OR isnan(CAST │
#> │(passenger_count AS DO... │
#> │ ─ ─ ─ ─ ─ ─ ─ ─ ─ ─ ─ │
#> │ EC: 17623488 │
#> └─────────────┬─────────────┘
#> ┌─────────────┴─────────────┐
#> │ READ_PARQUET │
#> │ ─ ─ ─ ─ ─ ─ ─ ─ ─ ─ ─ │
#> │ total_amount │
#> │ passenger_count │
#> │ tip_amount │
#> │ ─ ─ ─ ─ ─ ─ ─ ─ ─ ─ ─ │
#> │ EC: 88117440 │
#> └───────────────────────────┘
As a result, duckplyr
is slightly slower compared to duckdb
.
bench::mark(f_duckdb(), f_duckplyr(), check = FALSE)
#> Warning: Some expressions had a GC in every iteration; so filtering is
#> disabled.
#> # A tibble: 2 × 6
#> expression min median `itr/sec` mem_alloc `gc/sec`
#> <bch:expr> <bch:tm> <bch:tm> <dbl> <bch:byt> <dbl>
#> 1 f_duckdb() 1.29s 1.29s 0.777 336KB 1.55
#> 2 f_duckplyr() 6.95s 6.95s 0.144 103KB 0
Created on 2024-05-21 with reprex v2.1.0
Session info
sessioninfo::session_info()
#> ─ Session info ───────────────────────────────────────────────────────────────
#> setting value
#> version R version 4.4.0 (2024-04-24)
#> os Linux Mint 21.3
#> system x86_64, linux-gnu
#> ui X11
#> language en_CA:en
#> collate en_CA.UTF-8
#> ctype en_CA.UTF-8
#> tz America/Montreal
#> date 2024-05-21
#> pandoc 3.1.11 @ /usr/lib/rstudio/resources/app/bin/quarto/bin/tools/x86_64/ (via rmarkdown)
#>
#> ─ Packages ───────────────────────────────────────────────────────────────────
#> package * version date (UTC) lib source
#> bench 1.1.3 2023-05-04 [1] RSPM (R 4.4.0)
#> blob 1.2.4 2023-03-17 [1] RSPM
#> cli 3.6.2 2023-12-11 [1] RSPM
#> collections 0.3.7 2023-01-05 [1] RSPM
#> DBI * 1.2.2 2024-02-16 [1] RSPM
#> dbplyr * 2.5.0 2024-03-19 [1] RSPM
#> digest 0.6.35 2024-03-11 [1] RSPM
#> dplyr * 1.1.4 2023-11-17 [1] RSPM
#> duckdb * 0.10.2 2024-05-01 [1] CRAN (R 4.4.0)
#> duckplyr * 0.4.0 2024-05-21 [1] CRAN (R 4.4.0)
#> evaluate 0.23 2023-11-01 [1] RSPM
#> fansi 1.0.6 2023-12-08 [1] RSPM
#> fastmap 1.2.0 2024-05-15 [1] CRAN (R 4.4.0)
#> fs 1.6.4 2024-04-25 [1] CRAN (R 4.4.0)
#> generics 0.1.3 2022-07-05 [1] RSPM
#> glue 1.7.0 2024-01-09 [1] RSPM
#> htmltools 0.5.8.1 2024-04-04 [1] RSPM
#> knitr 1.46 2024-04-06 [1] RSPM
#> lifecycle 1.0.4 2023-11-07 [1] RSPM
#> magrittr 2.0.3 2022-03-30 [1] RSPM
#> pillar 1.9.0 2023-03-22 [1] RSPM
#> pkgconfig 2.0.3 2019-09-22 [1] RSPM
#> profmem 0.6.0 2020-12-13 [1] RSPM (R 4.4.0)
#> purrr 1.0.2 2023-08-10 [1] RSPM
#> R.cache 0.16.0 2022-07-21 [1] RSPM
#> R.methodsS3 1.8.2 2022-06-13 [1] RSPM
#> R.oo 1.26.0 2024-01-24 [1] RSPM
#> R.utils 2.12.3 2023-11-18 [1] RSPM
#> R6 2.5.1 2021-08-19 [1] RSPM
#> reprex 2.1.0 2024-01-11 [1] RSPM
#> rlang 1.1.3 2024-01-10 [1] RSPM
#> rmarkdown 2.27 2024-05-17 [1] RSPM (R 4.4.0)
#> rstudioapi 0.16.0 2024-03-24 [1] RSPM
#> sessioninfo 1.2.2 2021-12-06 [1] RSPM
#> styler 1.10.3 2024-04-07 [1] RSPM
#> tibble 3.2.1 2023-03-20 [1] RSPM
#> tidyselect 1.2.1 2024-03-11 [1] RSPM
#> utf8 1.2.4 2023-10-22 [1] RSPM
#> vctrs 0.6.5 2023-12-01 [1] RSPM
#> withr 3.0.0 2024-01-16 [1] RSPM
#> xfun 0.44 2024-05-15 [1] CRAN (R 4.4.0)
#> yaml 2.3.8 2023-12-11 [1] RSPM
#>
#> [1] /home/filoche/R/x86_64-pc-linux-gnu-library/4.4
#> [2] /usr/local/lib/R/site-library
#> [3] /usr/lib/R/site-library
#> [4] /usr/lib/R/library
#>
#> ──────────────────────────────────────────────────────────────────────────────