pointblank
pointblank copied to clipboard
Improve speed of `rows_distinct()` on large databases
I have been testing pointblank on a very large RSQLite database with more than 2 billion rows.
While most build-in validation functions were running quite fast, the validation of rows_dinstinct()
took about 3.5 hours for 480M rows table. It was crashing on larger tables.
So I was checking if this was generally due to SQL DISTINCT being slow, but could find that this only took 22 minutes on the same data.
I could not detect in the code what makes rows_distinct()
so extremely slow (possibly pre-/post processing?), but honestly I couldn't even see whether the code uses dplyr::distinct()
or dbplyr translations for databases.
To test, I have added a benchmark example below. Be careful, it uses about 500 MB of temp local storage and 5-10 minutes to run the benchmarks.
#> expression rows min median `itr/sec` mem_al…¹ gc/se…²
#> <bch:expr> <dbl> <bch:tm> <bch:tm> <dbl> <bch:by> <dbl>
#> 1 pointblank_distinct_sql 10000 3.58s 3.62s 0.276 7.18MB 0.552
#> 2 pointblank_distinct_tib 10000 3.51s 3.54s 0.282 1.46MB 0.565
#> 3 dplyr_distinct_sql 10000 23ms 24.21ms 41.3 106.28KB 0
#> 4 dplyr_distinct_tib 10000 3.94ms 3.94ms 254. 1.1MB 0
#> 5 pointblank_distinct_sql 100000 3.91s 3.92s 0.255 1.42MB 0.510
#> 6 pointblank_distinct_tib 100000 3.85s 3.85s 0.259 1.42MB 0.519
#> 7 dplyr_distinct_sql 100000 64.13ms 65.14ms 15.4 61.8KB 0
#> 8 dplyr_distinct_tib 100000 10.43ms 10.96ms 91.2 8.65MB 0
#> 9 pointblank_distinct_sql 1000000 7.27s 7.29s 0.137 1.42MB 0.274
#> 10 pointblank_distinct_tib 1000000 7.33s 7.34s 0.136 1.42MB 0.272
#> 11 dplyr_distinct_sql 1000000 495.67ms 507.84ms 1.97 61.8KB 0.985
#> 12 dplyr_distinct_tib 1000000 102.11ms 103.34ms 9.68 87.44MB 0
#> 13 pointblank_distinct_sql 10000000 44.27s 44.33s 0.0226 1.42MB 0.0451
#> 14 pointblank_distinct_tib 10000000 43.01s 43.74s 0.0229 1.42MB 0.0457
#> 15 dplyr_distinct_sql 10000000 5.17s 5.18s 0.193 61.8KB 0
#> 16 dplyr_distinct_tib 10000000 1.5s 1.66s 0.601 906.33MB 2.10
#> # … with abbreviated variable names ¹mem_alloc, ²`gc/sec`
Results are
- that SQL distinct is a bit difficult to compare, but with adding another count to materialize results, without that I always get a few milliseconds independent of number of rows
-
dplyr::distinct()
on a tibble takes max 1.5 second to run -
pointblank::rows_distinct()
however needs almost a minute to run (factor 30-40 slower than dplyr and at least factor 10 slower than SQL distinct). - no notable difference in pointblank timings between tibbles and SQL tables
library(pointblank)
library(DBI)
library(RSQLite)
#> Warning: package 'RSQLite' was built under R version 4.2.2
library(tidyverse)
#> Warning: package 'ggplot2' was built under R version 4.2.2
#> Warning: package 'stringr' was built under R version 4.2.2
library(bench)
#create large synth data
n_pat <- 1E6
n_diag <- 1E7
diag <- tibble(
repid = sample(1:n_pat, size = n_diag, replace = TRUE),
abrq = sample(20101:20224, size = n_diag, replace = TRUE),
icd = sample(c("E01, E02, E11, E12"), size = n_diag, replace = TRUE),
icd_sub = paste0(icd, ".9")
) %>%
arrange(repid)
#connect sql db
sql_loc <- dbConnect(RSQLite::SQLite(), dbname = tempfile())
dbWriteTable(sql_loc, "diag", diag)
#benchmark pointblank::rows_distinct vs. dplyr::distinct
#added another count to make sure that SQL DISTINCT is actually materialized
results <-
bench::press(
rows = c(1E4, 1E5, 1E6, 1E7),
bench::mark(
pointblank_distinct_sql = create_agent(
tbl = {tbl(sql_loc, "diag") %>% head(rows)}) %>%
rows_distinct() %>%
interrogate(extract_failed = FALSE),
pointblank_distinct_tib = create_agent(
tbl = {tbl(sql_loc, "diag") %>% head(rows)}) %>%
rows_distinct() %>%
interrogate(extract_failed = FALSE),
dplyr_distinct_sql = tbl(sql_loc, "diag") %>%
head(n = rows) %>%
distinct() %>%
tally() %>%
collect(),
dplyr_distinct_tib = diag %>%
head(n = rows) %>%
distinct() %>%
tally(),
check = FALSE,
iterations = 2
)
)
#> Running with:
#> rows
#> 1 10000
#> Warning: Some expressions had a GC in every iteration; so filtering is disabled.
#> 2 100000
#> Warning: Some expressions had a GC in every iteration; so filtering is disabled.
#> 3 1000000
#> Warning: Some expressions had a GC in every iteration; so filtering is disabled.
#> 4 10000000
#> Warning: Some expressions had a GC in every iteration; so filtering is disabled.
results
#> # A tibble: 16 × 7
#> expression rows min median `itr/sec` mem_al…¹ gc/se…²
#> <bch:expr> <dbl> <bch:tm> <bch:tm> <dbl> <bch:by> <dbl>
#> 1 pointblank_distinct_sql 10000 3.58s 3.62s 0.276 7.18MB 0.552
#> 2 pointblank_distinct_tib 10000 3.51s 3.54s 0.282 1.46MB 0.565
#> 3 dplyr_distinct_sql 10000 23ms 24.21ms 41.3 106.28KB 0
#> 4 dplyr_distinct_tib 10000 3.94ms 3.94ms 254. 1.1MB 0
#> 5 pointblank_distinct_sql 100000 3.91s 3.92s 0.255 1.42MB 0.510
#> 6 pointblank_distinct_tib 100000 3.85s 3.85s 0.259 1.42MB 0.519
#> 7 dplyr_distinct_sql 100000 64.13ms 65.14ms 15.4 61.8KB 0
#> 8 dplyr_distinct_tib 100000 10.43ms 10.96ms 91.2 8.65MB 0
#> 9 pointblank_distinct_sql 1000000 7.27s 7.29s 0.137 1.42MB 0.274
#> 10 pointblank_distinct_tib 1000000 7.33s 7.34s 0.136 1.42MB 0.272
#> 11 dplyr_distinct_sql 1000000 495.67ms 507.84ms 1.97 61.8KB 0.985
#> 12 dplyr_distinct_tib 1000000 102.11ms 103.34ms 9.68 87.44MB 0
#> 13 pointblank_distinct_sql 10000000 44.27s 44.33s 0.0226 1.42MB 0.0451
#> 14 pointblank_distinct_tib 10000000 43.01s 43.74s 0.0229 1.42MB 0.0457
#> 15 dplyr_distinct_sql 10000000 5.17s 5.18s 0.193 61.8KB 0
#> 16 dplyr_distinct_tib 10000000 1.5s 1.66s 0.601 906.33MB 2.10
#> # … with abbreviated variable names ¹mem_alloc, ²`gc/sec`
Created on 2022-12-20 with reprex v2.0.2
Session info
sessioninfo::session_info()
#> ─ Session info ───────────────────────────────────────────────────────────────
#> setting value
#> version R version 4.2.1 (2022-06-23 ucrt)
#> os Windows 10 x64 (build 19044)
#> system x86_64, mingw32
#> ui RTerm
#> language (EN)
#> collate German_Germany.utf8
#> ctype German_Germany.utf8
#> tz Europe/Berlin
#> date 2022-12-20
#> pandoc 2.19.2 @ C:/Program Files/RStudio/resources/app/bin/quarto/bin/tools/ (via rmarkdown)
#>
#> ─ Packages ───────────────────────────────────────────────────────────────────
#> package * version date (UTC) lib source
#> assertthat 0.2.1 2019-03-21 [1] CRAN (R 4.2.1)
#> backports 1.4.1 2021-12-13 [1] CRAN (R 4.2.0)
#> bench * 1.1.2 2021-11-30 [1] CRAN (R 4.2.1)
#> bit 4.0.5 2022-11-15 [1] CRAN (R 4.2.1)
#> bit64 4.0.5 2020-08-30 [1] CRAN (R 4.2.1)
#> blastula 0.3.2 2020-05-19 [1] CRAN (R 4.2.2)
#> blob 1.2.3 2022-04-10 [1] CRAN (R 4.2.1)
#> broom 1.0.2 2022-12-15 [1] CRAN (R 4.2.1)
#> cachem 1.0.6 2021-08-19 [1] CRAN (R 4.2.1)
#> cellranger 1.1.0 2016-07-27 [1] CRAN (R 4.2.1)
#> cli 3.4.1 2022-09-23 [1] CRAN (R 4.2.1)
#> colorspace 2.0-3 2022-02-21 [1] CRAN (R 4.2.1)
#> crayon 1.5.2 2022-09-29 [1] CRAN (R 4.2.1)
#> DBI * 1.1.3 2022-06-18 [1] CRAN (R 4.2.1)
#> dbplyr 2.2.1 2022-06-27 [1] CRAN (R 4.2.1)
#> digest 0.6.31 2022-12-11 [1] CRAN (R 4.2.2)
#> dplyr * 1.0.10 2022-09-01 [1] CRAN (R 4.2.1)
#> ellipsis 0.3.2 2021-04-29 [1] CRAN (R 4.2.1)
#> evaluate 0.19 2022-12-13 [1] CRAN (R 4.2.2)
#> fansi 1.0.3 2022-03-24 [1] CRAN (R 4.2.1)
#> fastmap 1.1.0 2021-01-25 [1] CRAN (R 4.2.1)
#> forcats * 0.5.2 2022-08-19 [1] CRAN (R 4.2.1)
#> fs 1.5.2 2021-12-08 [1] CRAN (R 4.2.1)
#> gargle 1.2.1 2022-09-08 [1] CRAN (R 4.2.1)
#> generics 0.1.3 2022-07-05 [1] CRAN (R 4.2.1)
#> ggplot2 * 3.4.0 2022-11-04 [1] CRAN (R 4.2.2)
#> glue 1.6.2 2022-02-24 [1] CRAN (R 4.2.1)
#> googledrive 2.0.0 2021-07-08 [1] CRAN (R 4.2.1)
#> googlesheets4 1.0.1 2022-08-13 [1] CRAN (R 4.2.1)
#> gtable 0.3.1 2022-09-01 [1] CRAN (R 4.2.1)
#> haven 2.5.1 2022-08-22 [1] CRAN (R 4.2.1)
#> highr 0.9 2021-04-16 [1] CRAN (R 4.2.1)
#> hms 1.1.2 2022-08-19 [1] CRAN (R 4.2.1)
#> htmltools 0.5.4 2022-12-07 [1] CRAN (R 4.2.2)
#> httr 1.4.4 2022-08-17 [1] CRAN (R 4.2.1)
#> jsonlite 1.8.4 2022-12-06 [1] CRAN (R 4.2.1)
#> knitr 1.41 2022-11-18 [1] CRAN (R 4.2.1)
#> lifecycle 1.0.3 2022-10-07 [1] CRAN (R 4.2.2)
#> lubridate 1.9.0 2022-11-06 [1] CRAN (R 4.2.1)
#> magrittr 2.0.3 2022-03-30 [1] CRAN (R 4.2.1)
#> memoise 2.0.1 2021-11-26 [1] CRAN (R 4.2.1)
#> modelr 0.1.10 2022-11-11 [1] CRAN (R 4.2.2)
#> munsell 0.5.0 2018-06-12 [1] CRAN (R 4.2.1)
#> pillar 1.8.1 2022-08-19 [1] CRAN (R 4.2.1)
#> pkgconfig 2.0.3 2019-09-22 [1] CRAN (R 4.2.1)
#> pointblank * 0.11.2.9000 2022-11-18 [1] Github (rich-iannone/pointblank@721d930)
#> profmem 0.6.0 2020-12-13 [1] CRAN (R 4.2.1)
#> purrr * 0.3.5 2022-10-06 [1] CRAN (R 4.2.1)
#> R.cache 0.16.0 2022-07-21 [1] CRAN (R 4.2.1)
#> R.methodsS3 1.8.2 2022-06-13 [1] CRAN (R 4.2.0)
#> R.oo 1.25.0 2022-06-12 [1] CRAN (R 4.2.0)
#> R.utils 2.12.2 2022-11-11 [1] CRAN (R 4.2.1)
#> R6 2.5.1 2021-08-19 [1] CRAN (R 4.2.1)
#> Rcpp 1.0.9 2022-07-08 [1] CRAN (R 4.2.1)
#> readr * 2.1.3 2022-10-01 [1] CRAN (R 4.2.1)
#> readxl 1.4.1 2022-08-17 [1] CRAN (R 4.2.1)
#> reprex 2.0.2 2022-08-17 [1] CRAN (R 4.2.1)
#> rlang 1.0.6 2022-09-24 [1] CRAN (R 4.2.1)
#> rmarkdown 2.19 2022-12-15 [1] CRAN (R 4.2.1)
#> RSQLite * 2.2.19 2022-11-24 [1] CRAN (R 4.2.2)
#> rstudioapi 0.14 2022-08-22 [1] CRAN (R 4.2.1)
#> rvest 1.0.3 2022-08-19 [1] CRAN (R 4.2.1)
#> scales 1.2.1 2022-08-20 [1] CRAN (R 4.2.1)
#> sessioninfo 1.2.2 2021-12-06 [1] CRAN (R 4.2.1)
#> stringi 1.7.8 2022-07-11 [1] CRAN (R 4.2.1)
#> stringr * 1.5.0 2022-12-02 [1] CRAN (R 4.2.2)
#> styler 1.8.1 2022-11-07 [1] CRAN (R 4.2.2)
#> tibble * 3.1.8 2022-07-22 [1] CRAN (R 4.2.1)
#> tidyr * 1.2.1 2022-09-08 [1] CRAN (R 4.2.1)
#> tidyselect 1.2.0 2022-10-10 [1] CRAN (R 4.2.2)
#> tidyverse * 1.3.2 2022-07-18 [1] CRAN (R 4.2.1)
#> timechange 0.1.1 2022-11-04 [1] CRAN (R 4.2.2)
#> tzdb 0.3.0 2022-03-28 [1] CRAN (R 4.2.1)
#> utf8 1.2.2 2021-07-24 [1] CRAN (R 4.2.1)
#> vctrs 0.5.1 2022-11-16 [1] CRAN (R 4.2.1)
#> withr 2.5.0 2022-03-03 [1] CRAN (R 4.2.1)
#> xfun 0.35 2022-11-16 [1] CRAN (R 4.2.1)
#> xml2 1.3.3 2021-11-30 [1] CRAN (R 4.2.1)
#> yaml 2.3.6 2022-10-18 [1] CRAN (R 4.2.1)
#>
#> [1] C:/Users/__/AppData/Local/R/win-library/4.2
#> [2] C:/Program Files/R/R-4.2.1/library
#>
#> ──────────────────────────────────────────────────────────────────────────────
Hi @marianschmidt I know this is almost a year old, but I just ran into this myself. The problem is stemming from the internal function add_reporting_data
found here: https://github.com/rstudio/pointblank/blob/dc1b917c6f2fee3d717b40d0da36c96bb4618172/R/interrogate.R#L2942-L3045
It appears that the initial query to determine distinct rows https://github.com/rstudio/pointblank/blob/dc1b917c6f2fee3d717b40d0da36c96bb4618172/R/interrogate.R#L2353-L2357 is good (though different than your initial tests). However, in order to collect the counts of how many records exist, passed, and failed result in 3 separate database queries, which is causing the slowdown.
- Records tested: https://github.com/rstudio/pointblank/blob/dc1b917c6f2fee3d717b40d0da36c96bb4618172/R/interrogate.R#L2972-L2976
- Passed: https://github.com/rstudio/pointblank/blob/dc1b917c6f2fee3d717b40d0da36c96bb4618172/R/interrogate.R#L2986-L2991
- Failed: https://github.com/rstudio/pointblank/blob/dc1b917c6f2fee3d717b40d0da36c96bb4618172/R/interrogate.R#L2997-L3002
Changing these to be run on a single query would solve this issue.
Also, I think there is a typo in your benchmark code where you test pointblank_distinct_tib
- this is still pointing to the database, which is why your results were virtually the same as pointblank_distinct_sql
. If you change that tbl reference to point to the data.frame, the function runs much faster. I also added pointblank_source
to emulate the query that interrogate
is using (that ultimately gets run 3 times as stated above).
library(pointblank)
library(DBI)
library(RSQLite)
library(tidyverse)
library(bench)
#create large synth data
n_pat <- 1E6
n_diag <- 1E7
diag <- tibble(
repid = sample(1:n_pat, size = n_diag, replace = TRUE),
abrq = sample(20101:20224, size = n_diag, replace = TRUE),
icd = sample(c("E01, E02, E11, E12"), size = n_diag, replace = TRUE),
icd_sub = paste0(icd, ".9")
) %>%
arrange(repid)
#connect sql db
sql_loc <- dbConnect(RSQLite::SQLite(), dbname = tempfile())
dbWriteTable(sql_loc, "diag", diag)
#benchmark pointblank::rows_distinct vs. dplyr::distinct
#added another count to make sure that SQL DISTINCT is actually materialized
results <-
bench::press(
rows = c(1E4),
bench::mark(
pointblank_distinct_sql = create_agent(
tbl = {tbl(sql_loc, "diag") %>% head(rows)}) %>%
rows_distinct() %>%
interrogate(extract_failed = FALSE),
pointblank_distinct_tib = create_agent(
tbl = {diag %>% head(rows)}) %>% # <------- changed this!
rows_distinct() %>%
interrogate(extract_failed = FALSE),
dplyr_distinct_sql = tbl(sql_loc, "diag") %>%
head(n = rows) %>%
distinct() %>%
tally() %>%
collect(),
dplyr_distinct_tib = diag %>%
head(n = rows) %>%
distinct() %>%
tally(),
pointblank_source = tbl(sql_loc, "diag") %>%
head(rows) %>%
dplyr::select(everything()) %>%
dplyr::group_by() %>%
dplyr::mutate(`pb_is_good_` = ifelse(dplyr::n() == 1, TRUE, FALSE)) %>%
dplyr::ungroup() %>%
collect(),
check = FALSE,
iterations = 2
)
)
#> Running with:
#> rows
#> 1 10000
#> Warning: Some expressions had a GC in every iteration; so filtering is
#> disabled.
results
#> # A tibble: 5 × 7
#> expression rows min median `itr/sec` mem_alloc `gc/sec`
#> <bch:expr> <dbl> <bch:tm> <bch:tm> <dbl> <bch:byt> <dbl>
#> 1 pointblank_distinct_sql 10000 10.03s 10.04s 0.0996 12.84MB 0.299
#> 2 pointblank_distinct_tib 10000 50.19ms 51.9ms 19.3 3.19MB 28.9
#> 3 dplyr_distinct_sql 10000 23.55ms 24.85ms 40.2 212.03KB 20.1
#> 4 dplyr_distinct_tib 10000 1.04ms 1.13ms 887. 1.01MB 0
#> 5 pointblank_source 10000 2.57s 2.58s 0.388 878.67KB 0
Created on 2023-09-22 with reprex v2.0.2