fs icon indicating copy to clipboard operation
fs copied to clipboard

Added helper function call_with_deduplication and use it to speed up path_file and path_dir for vectors with repeats.

Open orgadish opened this issue 1 year ago • 0 comments

See #424.

This PR incurs a slight time cost for fully unique vectors, but I believe the majority of use cases involving long vectors involve many repeated values (e.g. readr::read_csv(x, id = "file_path")).

For a vector with significant duplication, the speedup is about 2x on Mac and about 40x on Windows (see below). In the tests below, fully unique vectors incur an overhead of roughly 7–10 ms (difference in median times).

Some ways to speed this up would be:

  1. Include logic to skip the match call if length(unique(x)) == length(x) (or close) -- note: I tried this, but by that point a chunk of the work (unique(x)) has already been done and is a sunk cost.
  2. Use faster unique and/or match (e.g. collapse::funique or data.table::chmatch).
  3. Figure out a way to use something like vctrs::vec_duplicated_id and/or vctrs::vec_unique_loc since it seems the action of unique(x) and match(x, unique_x) is redundant. I tried this but couldn't figure it out -- it might need to be a new function in vctrs. I will submit an issue separately.

Timing details

Mac

# Apply `func` once per distinct value of `x`, then expand the results back
# out so that element i of the output corresponds to x[i].
#
# Arguments:
#   func  Function called as func(unique(x), ...). Its result is indexed by
#         position, so it is expected to return one element per input element.
#   x     Vector to process; repeated values are passed to `func` only once.
#   ...   Extra arguments forwarded to `func` unchanged.
call_with_deduplication <- function(func, x, ...) {
  distinct_values <- unique(x)
  distinct_results <- func(distinct_values, ...)
  # match() gives each element's position among the distinct values, which is
  # also the position of its result.
  distinct_results[match(x, distinct_values)]
}

# Baseline implementation: replace every non-NA element of `path` with its
# basename(), leave NA elements as they are, and return a plain character
# vector.
path_file_old <- function(path) {
  has_value <- !is.na(path)
  path[has_value] <- basename(path[has_value])
  as.character(path)
}

# Deduplicating implementation: same contract as path_file_old(), but
# basename() is routed through call_with_deduplication() so that it only runs
# on the distinct non-NA paths.
path_file_new <- function(path) {
  has_value <- !is.na(path)
  path[has_value] <- call_with_deduplication(basename, path[has_value])
  as.character(path)
}

# Benchmark inputs: N distinct paths, plus a vector of the same length built
# from a random 1% of those paths repeated 100 times. The seed fixes the
# sample.
set.seed(0)
N <- 1e5
dir_names <- glue::glue("dir{d}", d = 1:N)
paths_all_unique <- fs::path("base", dir_names, "inner")
sampled_paths <- sample(paths_all_unique, N / 100)
paths_1pct_unique <- rep(sampled_paths, 100)

# check = FALSE: the repeated and unique inputs produce different results, so
# the expressions must not be compared against each other.
bench::mark(
  old_rep01 = path_file_old(paths_1pct_unique),
  new_rep01 = path_file_new(paths_1pct_unique),
  old_uni = path_file_old(paths_all_unique),
  new_uni = path_file_new(paths_all_unique),
  iterations = 100,
  check = FALSE
)
#> # A tibble: 4 × 6
#>   expression      min   median `itr/sec` mem_alloc `gc/sec`
#>   <bch:expr> <bch:tm> <bch:tm>     <dbl> <bch:byt>    <dbl>
#> 1 old_rep01   20.68ms   27.4ms      35.1    4.96MB     4.34
#> 2 new_rep01    9.07ms   12.8ms      74.2    7.52MB    19.7 
#> 3 old_uni     24.94ms   28.8ms      32.8   10.68MB     5.78
#> 4 new_uni     34.53ms   39.2ms      25.7   12.45MB    11.0

Created on 2023-07-05 with reprex v2.0.2

Session info
# Print the R version, platform settings and package versions for this run.
sessioninfo::session_info()
#> ─ Session info ───────────────────────────────────────────────────────────────
#>  setting  value
#>  version  R version 4.2.2 (2022-10-31)
#>  os       macOS Big Sur ... 10.16
#>  system   x86_64, darwin17.0
#>  ui       X11
#>  language (EN)
#>  collate  en_US.UTF-8
#>  ctype    en_US.UTF-8
#>  tz       America/Los_Angeles
#>  date     2023-07-05
#>  pandoc   2.19.2 @ /Applications/RStudio.app/Contents/Resources/app/quarto/bin/tools/ (via rmarkdown)
#> 
#> ─ Packages ───────────────────────────────────────────────────────────────────
#>  package     * version date (UTC) lib source
#>  bench         1.1.2   2021-11-30 [1] CRAN (R 4.2.0)
#>  cli           3.6.1   2023-03-23 [1] CRAN (R 4.2.0)
#>  digest        0.6.31  2022-12-11 [1] CRAN (R 4.2.0)
#>  evaluate      0.19    2022-12-13 [1] CRAN (R 4.2.0)
#>  fansi         1.0.4   2023-01-22 [1] CRAN (R 4.2.0)
#>  fastmap       1.1.0   2021-01-25 [1] CRAN (R 4.2.0)
#>  fs            1.6.2   2023-04-25 [1] CRAN (R 4.2.0)
#>  glue          1.6.2   2022-02-24 [1] CRAN (R 4.2.0)
#>  highr         0.10    2022-12-22 [1] CRAN (R 4.2.0)
#>  htmltools     0.5.3   2022-07-18 [1] CRAN (R 4.2.0)
#>  knitr         1.41    2022-11-18 [1] CRAN (R 4.2.0)
#>  lifecycle     1.0.3   2022-10-07 [1] CRAN (R 4.2.0)
#>  magrittr      2.0.3   2022-03-30 [1] CRAN (R 4.2.0)
#>  pillar        1.9.0   2023-03-22 [1] CRAN (R 4.2.0)
#>  pkgconfig     2.0.3   2019-09-22 [1] CRAN (R 4.2.0)
#>  profmem       0.6.0   2020-12-13 [1] CRAN (R 4.2.0)
#>  purrr         1.0.1   2023-01-10 [1] CRAN (R 4.2.0)
#>  R.cache       0.16.0  2022-07-21 [1] CRAN (R 4.2.0)
#>  R.methodsS3   1.8.2   2022-06-13 [1] CRAN (R 4.2.0)
#>  R.oo          1.25.0  2022-06-12 [1] CRAN (R 4.2.0)
#>  R.utils       2.12.2  2022-11-11 [1] CRAN (R 4.2.0)
#>  reprex        2.0.2   2022-08-17 [1] CRAN (R 4.2.0)
#>  rlang         1.1.1   2023-04-28 [1] CRAN (R 4.2.0)
#>  rmarkdown     2.14    2022-04-25 [1] CRAN (R 4.2.0)
#>  rstudioapi    0.13    2020-11-12 [1] CRAN (R 4.2.0)
#>  sessioninfo   1.2.2   2021-12-06 [1] CRAN (R 4.2.0)
#>  stringi       1.7.8   2022-07-11 [1] CRAN (R 4.2.0)
#>  stringr       1.5.0   2022-12-02 [1] CRAN (R 4.2.0)
#>  styler        1.8.1   2022-11-07 [1] CRAN (R 4.2.0)
#>  tibble        3.2.1   2023-03-20 [1] CRAN (R 4.2.0)
#>  utf8          1.2.3   2023-01-31 [1] CRAN (R 4.2.0)
#>  vctrs         0.6.3   2023-06-14 [1] CRAN (R 4.2.0)
#>  withr         2.5.0   2022-03-03 [1] CRAN (R 4.2.0)
#>  xfun          0.36    2022-12-21 [1] CRAN (R 4.2.0)
#>  yaml          2.3.6   2022-10-18 [1] CRAN (R 4.2.0)
#> 
#>  [1] /Library/Frameworks/R.framework/Versions/4.2/Resources/library
#> 
#> ──────────────────────────────────────────────────────────────────────────────

Windows

# Apply `func` once per distinct value of `x`, then expand the results back
# out so that element i of the output corresponds to x[i].
#
# Arguments:
#   func  Function called as func(unique(x), ...). Its result is indexed by
#         position, so it is expected to return one element per input element.
#   x     Vector to process; repeated values are passed to `func` only once.
#   ...   Extra arguments forwarded to `func` unchanged.
call_with_deduplication <- function(func, x, ...) {
  distinct_values <- unique(x)
  distinct_results <- func(distinct_values, ...)
  # match() gives each element's position among the distinct values, which is
  # also the position of its result.
  distinct_results[match(x, distinct_values)]
}

# Baseline implementation: replace every non-NA element of `path` with its
# basename(), leave NA elements as they are, and return a plain character
# vector.
path_file_old <- function(path) {
  has_value <- !is.na(path)
  path[has_value] <- basename(path[has_value])
  as.character(path)
}

# Deduplicating implementation: same contract as path_file_old(), but
# basename() is routed through call_with_deduplication() so that it only runs
# on the distinct non-NA paths.
path_file_new <- function(path) {
  has_value <- !is.na(path)
  path[has_value] <- call_with_deduplication(basename, path[has_value])
  as.character(path)
}

# Benchmark inputs: N distinct paths, plus a vector of the same length built
# from a random 1% of those paths repeated 100 times. The seed fixes the
# sample.
set.seed(0)
N <- 1e5
dir_names <- glue::glue("dir{d}", d = 1:N)
paths_all_unique <- fs::path("base", dir_names, "inner")
sampled_paths <- sample(paths_all_unique, N / 100)
paths_1pct_unique <- rep(sampled_paths, 100)

# check = FALSE: the repeated and unique inputs produce different results, so
# the expressions must not be compared against each other.
bench::mark(
  old_rep01 = path_file_old(paths_1pct_unique),
  new_rep01 = path_file_new(paths_1pct_unique),
  old_uni = path_file_old(paths_all_unique),
  new_uni = path_file_new(paths_all_unique),
  iterations = 100,
  check = FALSE
)
#> # A tibble: 4 × 6
#>   expression      min   median `itr/sec` mem_alloc `gc/sec`
#>   <bch:expr> <bch:tm> <bch:tm>     <dbl> <bch:byt>    <dbl>
#> 1 old_rep01     537ms  542.9ms      1.84    4.96MB     1.22
#> 2 new_rep01    12.3ms   13.1ms     75.3     7.52MB    16.5 
#> 3 old_uni     534.7ms  538.6ms      1.85    6.52MB     1.13
#> 4 new_uni     542.9ms  545.9ms      1.82   12.34MB     3.25

Created on 2023-07-05 with reprex v2.0.2

Session info
# Print the R version, platform settings and package versions for this run.
sessioninfo::session_info()
#> ─ Session info ───────────────────────────────────────────────────────────────
#>  setting  value
#>  version  R version 4.3.0 (2023-04-21 ucrt)
#>  os       Windows 10 x64 (build 19045)
#>  system   x86_64, mingw32
#>  ui       RTerm
#>  language (EN)
#>  collate  English_United States.utf8
#>  ctype    English_United States.utf8
#>  tz       America/Los_Angeles
#>  date     2023-07-05
#>  pandoc   2.19.2 @ C:/Program Files/RStudio/resources/app/bin/quarto/bin/tools/ (via rmarkdown)
#> 
#> ─ Packages ───────────────────────────────────────────────────────────────────
#>  ! package     * version date (UTC) lib source
#>  P bench         1.1.3   2023-05-04 [?] CRAN (R 4.3.0)
#>    cli           3.6.1   2023-03-23 [1] CRAN (R 4.3.0)
#>    digest        0.6.31  2022-12-11 [1] CRAN (R 4.3.0)
#>    evaluate      0.21    2023-05-05 [1] CRAN (R 4.3.0)
#>    fansi         1.0.4   2023-01-22 [1] CRAN (R 4.3.0)
#>    fastmap       1.1.1   2023-02-24 [1] CRAN (R 4.3.0)
#>    fs            1.6.2   2023-04-25 [1] CRAN (R 4.3.0)
#>    glue          1.6.2   2022-02-24 [1] CRAN (R 4.3.0)
#>    htmltools     0.5.5   2023-03-23 [1] CRAN (R 4.3.0)
#>    knitr         1.42    2023-01-25 [1] CRAN (R 4.3.0)
#>    lifecycle     1.0.3   2022-10-07 [1] CRAN (R 4.3.0)
#>    magrittr      2.0.3   2022-03-30 [1] CRAN (R 4.3.0)
#>    pillar        1.9.0   2023-03-22 [1] CRAN (R 4.3.0)
#>    pkgconfig     2.0.3   2019-09-22 [1] CRAN (R 4.3.0)
#>  P profmem       0.6.0   2020-12-13 [?] CRAN (R 4.3.0)
#>    reprex        2.0.2   2022-08-17 [1] CRAN (R 4.3.0)
#>    rlang         1.1.1   2023-04-28 [1] CRAN (R 4.3.0)
#>    rmarkdown     2.21    2023-03-26 [1] CRAN (R 4.3.0)
#>    rstudioapi    0.14    2022-08-22 [1] CRAN (R 4.3.0)
#>    sessioninfo   1.2.2   2021-12-06 [1] CRAN (R 4.3.0)
#>    tibble        3.2.1   2023-03-20 [1] CRAN (R 4.3.0)
#>    utf8          1.2.3   2023-01-31 [1] CRAN (R 4.3.0)
#>    vctrs         0.6.2   2023-04-19 [1] CRAN (R 4.3.0)
#>    withr         2.5.0   2022-03-03 [1] CRAN (R 4.3.0)
#>    xfun          0.39    2023-04-20 [1] CRAN (R 4.3.0)
#>    yaml          2.3.7   2023-01-23 [1] CRAN (R 4.3.0)
#> 
#>  [1] C:/Users/LAB/Desktop/2023-05 Sequence Recovery Analysis/renv/library/R-4.3/x86_64-w64-mingw32
#>  [2] C:/Users/LAB/AppData/Local/R/cache/R/renv/sandbox/R-4.3/x86_64-w64-mingw32/830ce55b
#>  [3] C:/Program Files/R/R-4.3.0/library
#> 
#>  P ── Loaded and on-disk path mismatch.
#> 
#> ──────────────────────────────────────────────────────────────────────────────

orgadish avatar Jul 06 '23 05:07 orgadish