vctrs
vctrs copied to clipboard
Consider adding `vec_expand_grid()`
It would be nice to have this in packages that can't use tidyr, and we could probably implement it in a slightly more performant way than tidyr.
> tidyr:::expand_grid
# Printed source of tidyr's internal `expand_grid()`, quoted for reference.
# Builds one output column per input, repeating each input so that the
# columns together enumerate every combination of the inputs.
function (..., .name_repair = "check_unique")
{
# NOTE(review): `dots_cols()` is a tidyr-internal helper not shown here;
# presumably it collects `...` into a list of columns -- confirm.
dots <- dots_cols(...)
# Size of each input and the total number of combinations.
ns <- map_int(dots, vec_size)
n <- prod(ns)
if (n == 0) {
# At least one input is empty: slice every input with an empty index.
out <- map(dots, vec_slice, integer())
}
else {
# `each` and `times` are computed per input from the running product of
# the sizes and passed on to `vec_repeat()` for that input.
each <- n/cumprod(ns)
times <- n/each/ns
out <- pmap(list(x = dots, each = each, times = times),
vec_repeat)
}
out <- as_tibble(out, .name_repair = .name_repair)
# NOTE(review): `flatten_nested()` is not shown here; it is the "flattening
# bit" discussed below -- confirm its exact behavior in tidyr.
flatten_nested(out, attr(dots, "named"), .name_repair)
}
I'm not sure whether we would need the flattening bit, which looks like it might only be there for backwards compatibility.
This seems pretty good, although it is possible this should live in funs instead.
library(vctrs)
library(rlang)
library(purrr)
#' Create a data frame from all combinations of the inputs
#'
#' Each input becomes one column of the result. The first input varies
#' slowest and the last input varies fastest.
#'
#' @param ... Name-value pairs of vectors to combine. Data frame inputs are
#'   kept as a single data frame column.
#' @param .name_repair Name repair strategy passed on to `vec_as_names()`.
#'   With the default, `"check_unique"`, unnamed or duplicated inputs are an
#'   error.
#' @return A bare data frame with one column per input and
#'   `prod(list_sizes(list(...)))` rows. If any input has size zero, the
#'   result has zero rows.
vec_expand_grid <- function(..., .name_repair = "check_unique") {
  args <- list2(...)

  # Repair the names up front so that invalid names fail before any work is
  # done, then work on the unnamed list and restore the names at the end.
  names <- names2(args)
  names <- vec_as_names(names, repair = .name_repair, repair_arg = ".name_repair")
  args <- unname(args)

  sizes <- list_sizes(args)

  # `prod()` returns a double. Check it against the integer range *before*
  # coercing: otherwise `as.integer()` silently yields `NA` (with a warning)
  # and the `size == 0L` test below fails with an unhelpful error.
  size <- prod(sizes)
  if (size > .Machine$integer.max) {
    abort(paste0(
      "Long vectors are not yet supported. ",
      "Expansion results in an allocation larger than 2^31-1 elements. ",
      "Attempted allocation size was ", format(size, scientific = FALSE), "."
    ))
  }
  size <- as.integer(size)

  if (size == 0L) {
    # At least one input is empty, so there are no combinations: slice every
    # input down to size zero while keeping its type.
    args <- map(args, vec_slice, NULL)
  } else {
    # Step 1: repeat each element so that earlier inputs vary slowest.
    times <- size / cumprod(sizes)
    args <- map2(args, times, vec_rep_each)

    # Step 2: repeat each expanded vector as a whole to reach `size` rows.
    times <- size / times / sizes
    args <- map2(args, times, vec_rep)
  }

  names(args) <- names
  new_data_frame(args, n = size)
}
Examples
# Named inputs are required by default.
# I think this is a lot safer and more predictable;
# I don't particularly love auto-naming of quosures for this.
vec_expand_grid(1:2, 3:4)
#> Error: Names can't be empty.
#> x Empty names found at locations 1 and 2.
# With named inputs, the first argument varies slowest in the result.
vec_expand_grid(x = 1:2, y = 3:5)
#> x y
#> 1 1 3
#> 2 1 4
#> 3 1 5
#> 4 2 3
#> 5 2 4
#> 6 2 5
# Opting in to `.name_repair = "unique"` accepts unnamed inputs and gives
# them positional names.
vec_expand_grid(1:2, 3:4, .name_repair = "unique")
#> New names:
#> * `` -> ...1
#> * `` -> ...2
#> ...1 ...2
#> 1 1 3
#> 2 1 4
#> 3 2 3
#> 4 2 4
# With data frame columns (df-cols): `df` stays a single data frame column
# and its rows are expanded together. The result is converted to a tibble
# here, presumably just to get clearer printing of the nested column.
df <- data_frame(a = 1:3, b = c("a", "b", "c"))
tibble::as_tibble(vec_expand_grid(x = 1:2, df = df))
#> # A tibble: 6 x 2
#> x df$a $b
#> <int> <int> <chr>
#> 1 1 1 a
#> 2 1 2 b
#> 3 1 3 c
#> 4 2 1 a
#> 5 2 2 b
#> 6 2 3 c
Performance seems to be better than tidyr / base, at least in these simple examples.
# Benchmark inputs: 1000 * 50 * 26 = 1,300,000 combinations.
x <- 1:1000
y <- 1:50
z <- letters
# Compare the proposed implementation against tidyr and base R.
bench::mark(
vctrs = vec_expand_grid(x = x, y = y, z = z),
# Converted to a plain data frame, presumably so the result is comparable
# with the other two expressions.
tidyr = as.data.frame(tidyr::expand_grid(x = x, y = y, z = z)),
# `expand.grid()` varies its first argument fastest, so the inputs are given
# in reverse order and the columns are then put back in x, y, z order.
base = expand.grid(z = z, y = y, x = x, stringsAsFactors = FALSE)[c("x", "y", "z")],
iterations = 50
)
#> # A tibble: 3 x 6
#> expression min median `itr/sec` mem_alloc `gc/sec`
#> <bch:expr> <bch:tm> <bch:tm> <dbl> <bch:byt> <dbl>
#> 1 vctrs 9.38ms 9.38ms 107. 44.7MB 5330.
#> 2 tidyr 61.32ms 64.98ms 14.9 43.7MB 27.3
#> 3 base 27.43ms 32.78ms 31.1 39.9MB 60.4