fs
fs copied to clipboard
Added helper function call_with_deduplication and use it to speed up path_file and path_dir for vectors with repeats.
See #424.
This PR incurs a slight time cost for fully unique vectors, but I believe the majority of use cases involving long vectors involve many repeated values (e.g. `readr::read_csv(x, id = "file_path")`).
For a vector with significant duplication, the time savings are roughly 2x on Mac and 40x on Windows (see below). In the tests below, fully unique vectors incur an overhead of roughly 7–10 ms in median run time.
Some ways to speed this up would be:
- Include logic to avoid the `match` call if `length(unique(x)) == length(x)` (or close) -- note: I tried this, but a chunk of the work (`unique(x)`) has already been done by then and becomes a sunk cost.
- Use a faster `unique` and/or `match` (e.g. `collapse::funique` or `data.table::chmatch`).
- Figure out a way to use something like `vctrs::vec_duplicated_id` and/or `vctrs::vec_unique_loc`, since it seems the work done by `unique(x)` and `match(x, unique_x)` is redundant. I tried this but couldn't figure it out -- it might need to be a new function in `vctrs`. I will submit an issue separately.
Timing details
Mac
#' Apply an element-wise function once per distinct value of `x`
#'
#' Evaluates `func` on `unique(x)` only, then uses `match()` to expand the
#' result back to the length and order of `x`.
#'
#' @param func A function that returns exactly one result per element of its
#'   first argument.
#' @param x A vector whose elements may repeat.
#' @param ... Further arguments passed on to `func`.
#' @return `func(unique(x), ...)` indexed so that position `i` holds the
#'   result computed for `x[i]`.
call_with_deduplication <- function(func, x, ...) {
  unique_x <- unique(x)
  unique_result <- func(unique_x, ...)
  # The match()-based expansion below is only valid when `func` yields one
  # result per input element; otherwise it would silently return NA or
  # misaligned values instead of failing.
  if (length(unique_result) != length(unique_x)) {
    stop(
      "`func` must return one value per input element (expected ",
      length(unique_x), ", got ", length(unique_result), ").",
      call. = FALSE
    )
  }
  unique_result[match(x, unique_x)]
}
# Baseline implementation: replace every non-missing element of `path` with
# its basename(), leave NA elements untouched, and return a character vector.
path_file_old <- function(path) {
  present <- !is.na(path)
  file_names <- basename(path[present])
  path[present] <- file_names
  as.character(path)
}
# Deduplicating implementation: same contract as path_file_old(), but
# basename() is routed through call_with_deduplication() for the non-missing
# elements of `path`.
path_file_new <- function(path) {
  present <- !is.na(path)
  file_names <- call_with_deduplication(basename, path[present])
  path[present] <- file_names
  as.character(path)
}
# Fix the RNG seed so the sample() draw below is reproducible.
set.seed(0)
N <- 1e5

# N distinct paths, each built from "base", "dir<d>" and "inner".
paths_all_unique <- fs::path(
  "base",
  glue::glue("dir{d}", d = 1:N),
  "inner"
)

# Same total length, but only 1% distinct values: draw N / 100 paths and
# repeat that draw 100 times.
paths_1pct_unique <- rep(sample(paths_all_unique, N * 1 / 100), 100 / 1)

# Time the old and new implementations on the repeated and the fully unique
# input.
bench::mark(
  old_rep01 = path_file_old(paths_1pct_unique),
  new_rep01 = path_file_new(paths_1pct_unique),
  old_uni = path_file_old(paths_all_unique),
  new_uni = path_file_new(paths_all_unique),
  iterations = 100,
  check = FALSE
)
#> # A tibble: 4 × 6
#> expression min median `itr/sec` mem_alloc `gc/sec`
#> <bch:expr> <bch:tm> <bch:tm> <dbl> <bch:byt> <dbl>
#> 1 old_rep01 20.68ms 27.4ms 35.1 4.96MB 4.34
#> 2 new_rep01 9.07ms 12.8ms 74.2 7.52MB 19.7
#> 3 old_uni 24.94ms 28.8ms 32.8 10.68MB 5.78
#> 4 new_uni 34.53ms 39.2ms 25.7 12.45MB 11.0
Created on 2023-07-05 with reprex v2.0.2
Session info
# Print the R session details and package versions used for the Mac timings.
sessioninfo::session_info()
#> ─ Session info ───────────────────────────────────────────────────────────────
#> setting value
#> version R version 4.2.2 (2022-10-31)
#> os macOS Big Sur ... 10.16
#> system x86_64, darwin17.0
#> ui X11
#> language (EN)
#> collate en_US.UTF-8
#> ctype en_US.UTF-8
#> tz America/Los_Angeles
#> date 2023-07-05
#> pandoc 2.19.2 @ /Applications/RStudio.app/Contents/Resources/app/quarto/bin/tools/ (via rmarkdown)
#>
#> ─ Packages ───────────────────────────────────────────────────────────────────
#> package * version date (UTC) lib source
#> bench 1.1.2 2021-11-30 [1] CRAN (R 4.2.0)
#> cli 3.6.1 2023-03-23 [1] CRAN (R 4.2.0)
#> digest 0.6.31 2022-12-11 [1] CRAN (R 4.2.0)
#> evaluate 0.19 2022-12-13 [1] CRAN (R 4.2.0)
#> fansi 1.0.4 2023-01-22 [1] CRAN (R 4.2.0)
#> fastmap 1.1.0 2021-01-25 [1] CRAN (R 4.2.0)
#> fs 1.6.2 2023-04-25 [1] CRAN (R 4.2.0)
#> glue 1.6.2 2022-02-24 [1] CRAN (R 4.2.0)
#> highr 0.10 2022-12-22 [1] CRAN (R 4.2.0)
#> htmltools 0.5.3 2022-07-18 [1] CRAN (R 4.2.0)
#> knitr 1.41 2022-11-18 [1] CRAN (R 4.2.0)
#> lifecycle 1.0.3 2022-10-07 [1] CRAN (R 4.2.0)
#> magrittr 2.0.3 2022-03-30 [1] CRAN (R 4.2.0)
#> pillar 1.9.0 2023-03-22 [1] CRAN (R 4.2.0)
#> pkgconfig 2.0.3 2019-09-22 [1] CRAN (R 4.2.0)
#> profmem 0.6.0 2020-12-13 [1] CRAN (R 4.2.0)
#> purrr 1.0.1 2023-01-10 [1] CRAN (R 4.2.0)
#> R.cache 0.16.0 2022-07-21 [1] CRAN (R 4.2.0)
#> R.methodsS3 1.8.2 2022-06-13 [1] CRAN (R 4.2.0)
#> R.oo 1.25.0 2022-06-12 [1] CRAN (R 4.2.0)
#> R.utils 2.12.2 2022-11-11 [1] CRAN (R 4.2.0)
#> reprex 2.0.2 2022-08-17 [1] CRAN (R 4.2.0)
#> rlang 1.1.1 2023-04-28 [1] CRAN (R 4.2.0)
#> rmarkdown 2.14 2022-04-25 [1] CRAN (R 4.2.0)
#> rstudioapi 0.13 2020-11-12 [1] CRAN (R 4.2.0)
#> sessioninfo 1.2.2 2021-12-06 [1] CRAN (R 4.2.0)
#> stringi 1.7.8 2022-07-11 [1] CRAN (R 4.2.0)
#> stringr 1.5.0 2022-12-02 [1] CRAN (R 4.2.0)
#> styler 1.8.1 2022-11-07 [1] CRAN (R 4.2.0)
#> tibble 3.2.1 2023-03-20 [1] CRAN (R 4.2.0)
#> utf8 1.2.3 2023-01-31 [1] CRAN (R 4.2.0)
#> vctrs 0.6.3 2023-06-14 [1] CRAN (R 4.2.0)
#> withr 2.5.0 2022-03-03 [1] CRAN (R 4.2.0)
#> xfun 0.36 2022-12-21 [1] CRAN (R 4.2.0)
#> yaml 2.3.6 2022-10-18 [1] CRAN (R 4.2.0)
#>
#> [1] /Library/Frameworks/R.framework/Versions/4.2/Resources/library
#>
#> ──────────────────────────────────────────────────────────────────────────────
Windows
#' Apply an element-wise function once per distinct value of `x`
#'
#' Evaluates `func` on `unique(x)` only, then uses `match()` to expand the
#' result back to the length and order of `x`.
#'
#' @param func A function that returns exactly one result per element of its
#'   first argument.
#' @param x A vector whose elements may repeat.
#' @param ... Further arguments passed on to `func`.
#' @return `func(unique(x), ...)` indexed so that position `i` holds the
#'   result computed for `x[i]`.
call_with_deduplication <- function(func, x, ...) {
  unique_x <- unique(x)
  unique_result <- func(unique_x, ...)
  # The match()-based expansion below is only valid when `func` yields one
  # result per input element; otherwise it would silently return NA or
  # misaligned values instead of failing.
  if (length(unique_result) != length(unique_x)) {
    stop(
      "`func` must return one value per input element (expected ",
      length(unique_x), ", got ", length(unique_result), ").",
      call. = FALSE
    )
  }
  unique_result[match(x, unique_x)]
}
# Baseline implementation: replace every non-missing element of `path` with
# its basename(), leave NA elements untouched, and return a character vector.
path_file_old <- function(path) {
  present <- !is.na(path)
  file_names <- basename(path[present])
  path[present] <- file_names
  as.character(path)
}
# Deduplicating implementation: same contract as path_file_old(), but
# basename() is routed through call_with_deduplication() for the non-missing
# elements of `path`.
path_file_new <- function(path) {
  present <- !is.na(path)
  file_names <- call_with_deduplication(basename, path[present])
  path[present] <- file_names
  as.character(path)
}
# Fix the RNG seed so the sample() draw below is reproducible.
set.seed(0)
N <- 1e5

# N distinct paths, each built from "base", "dir<d>" and "inner".
paths_all_unique <- fs::path(
  "base",
  glue::glue("dir{d}", d = 1:N),
  "inner"
)

# Same total length, but only 1% distinct values: draw N / 100 paths and
# repeat that draw 100 times.
paths_1pct_unique <- rep(sample(paths_all_unique, N * 1 / 100), 100 / 1)

# Time the old and new implementations on the repeated and the fully unique
# input.
bench::mark(
  old_rep01 = path_file_old(paths_1pct_unique),
  new_rep01 = path_file_new(paths_1pct_unique),
  old_uni = path_file_old(paths_all_unique),
  new_uni = path_file_new(paths_all_unique),
  iterations = 100,
  check = FALSE
)
#> # A tibble: 4 × 6
#> expression min median `itr/sec` mem_alloc `gc/sec`
#> <bch:expr> <bch:tm> <bch:tm> <dbl> <bch:byt> <dbl>
#> 1 old_rep01 537ms 542.9ms 1.84 4.96MB 1.22
#> 2 new_rep01 12.3ms 13.1ms 75.3 7.52MB 16.5
#> 3 old_uni 534.7ms 538.6ms 1.85 6.52MB 1.13
#> 4 new_uni 542.9ms 545.9ms 1.82 12.34MB 3.25
Created on 2023-07-05 with reprex v2.0.2
Session info
# Print the R session details and package versions used for the Windows timings.
sessioninfo::session_info()
#> ─ Session info ───────────────────────────────────────────────────────────────
#> setting value
#> version R version 4.3.0 (2023-04-21 ucrt)
#> os Windows 10 x64 (build 19045)
#> system x86_64, mingw32
#> ui RTerm
#> language (EN)
#> collate English_United States.utf8
#> ctype English_United States.utf8
#> tz America/Los_Angeles
#> date 2023-07-05
#> pandoc 2.19.2 @ C:/Program Files/RStudio/resources/app/bin/quarto/bin/tools/ (via rmarkdown)
#>
#> ─ Packages ───────────────────────────────────────────────────────────────────
#> ! package * version date (UTC) lib source
#> P bench 1.1.3 2023-05-04 [?] CRAN (R 4.3.0)
#> cli 3.6.1 2023-03-23 [1] CRAN (R 4.3.0)
#> digest 0.6.31 2022-12-11 [1] CRAN (R 4.3.0)
#> evaluate 0.21 2023-05-05 [1] CRAN (R 4.3.0)
#> fansi 1.0.4 2023-01-22 [1] CRAN (R 4.3.0)
#> fastmap 1.1.1 2023-02-24 [1] CRAN (R 4.3.0)
#> fs 1.6.2 2023-04-25 [1] CRAN (R 4.3.0)
#> glue 1.6.2 2022-02-24 [1] CRAN (R 4.3.0)
#> htmltools 0.5.5 2023-03-23 [1] CRAN (R 4.3.0)
#> knitr 1.42 2023-01-25 [1] CRAN (R 4.3.0)
#> lifecycle 1.0.3 2022-10-07 [1] CRAN (R 4.3.0)
#> magrittr 2.0.3 2022-03-30 [1] CRAN (R 4.3.0)
#> pillar 1.9.0 2023-03-22 [1] CRAN (R 4.3.0)
#> pkgconfig 2.0.3 2019-09-22 [1] CRAN (R 4.3.0)
#> P profmem 0.6.0 2020-12-13 [?] CRAN (R 4.3.0)
#> reprex 2.0.2 2022-08-17 [1] CRAN (R 4.3.0)
#> rlang 1.1.1 2023-04-28 [1] CRAN (R 4.3.0)
#> rmarkdown 2.21 2023-03-26 [1] CRAN (R 4.3.0)
#> rstudioapi 0.14 2022-08-22 [1] CRAN (R 4.3.0)
#> sessioninfo 1.2.2 2021-12-06 [1] CRAN (R 4.3.0)
#> tibble 3.2.1 2023-03-20 [1] CRAN (R 4.3.0)
#> utf8 1.2.3 2023-01-31 [1] CRAN (R 4.3.0)
#> vctrs 0.6.2 2023-04-19 [1] CRAN (R 4.3.0)
#> withr 2.5.0 2022-03-03 [1] CRAN (R 4.3.0)
#> xfun 0.39 2023-04-20 [1] CRAN (R 4.3.0)
#> yaml 2.3.7 2023-01-23 [1] CRAN (R 4.3.0)
#>
#> [1] C:/Users/LAB/Desktop/2023-05 Sequence Recovery Analysis/renv/library/R-4.3/x86_64-w64-mingw32
#> [2] C:/Users/LAB/AppData/Local/R/cache/R/renv/sandbox/R-4.3/x86_64-w64-mingw32/830ce55b
#> [3] C:/Program Files/R/R-4.3.0/library
#>
#> P ── Loaded and on-disk path mismatch.
#>
#> ──────────────────────────────────────────────────────────────────────────────