dplyr
dplyr copied to clipboard
Add dplyr_sticky_cols() or similar
So that when you implement a tibble subclass with sticky columns, you don't need to add a new select()
method. Likely to involve renaming ensure_group_vars()
I think the class should be able to specify front- or back-stickiness. For instance the grouping columns of grouped-df are front-sticky, whereas the geometry columns of sf are back-sticky.
Would stickyness imply focus (#6252)?
I came to make an issue similar to this based on Extending dplyr with new data frame subclasses.
To have a sticky column the third bullet point states:
If you have scalar attributes that depend on columns, implement a dplyr_reconstruct() method and a 1d [ method. For example, if your class requires that certain columns be present, your method should return a data.frame or tibble when those columns are removed.
Based on this one would anticipate that simply implementing dplyr_reconstruct.my_class
and [.my_class
would be sufficient for sticky columns (a la sf).
Based on my exploration this isn't true at the moment. It looks like dplyr_col_select
generic needs to be exported so that dplyr_col_select.my_class
can be implemented.
The below reprex illustrates the current short comings. The idea is there is a data frame class called point_collection
which contains a vctrs class points
which is a list of numeric vectors of length 2. The data frame contains the attribute extent
which has the furthest x and y extents of the point collection (updated when rows are added or removed).
Creating the class
# create some points
pnt_data <- lapply(1:5, \(x) runif(2, 0, 90))
# create new vector class
pnts <- vctrs::new_vctr(pnt_data, class = "points")
# function to calculate extent stored as an attribute
calculate_extent <- function(.x) {
stopifnot(inherits(.x, "points"))
x <- vapply(.x, `[`, numeric(1), 1)
y <- vapply(.x, `[`, numeric(1), 2)
c(xmin = min(x), xmax = max(x), ymin = min(y), ymax = max(y))
}
# helper to find point column
which_is_point_col <- function(x) which(vapply(x, inherits, logical(1), "points"))
# function to create a new point collection
new_point_collection <- function(x) {
point_col <- which_is_point_col(x)
tibble::new_tibble(
x,
class = "point_collection",
point_col = names(point_col)[1],
extent = calculate_extent(x[[point_col]])
)
}
# a special format method of course
format.point_collection <- function(x, ...) {
extent <- attr(x, "extent")
extent_str <- paste(names(extent), round(extent, 2), sep = ": ", collapse = " ")
c(
title = "Point Collection",
bbox = extent_str,
NextMethod()
)
}
Implementing `dplyr_reconstruct` and `[`
# create a new tibble super class
x <- new_point_collection(list(pnts = pnts))
# print w/ format
x
#> Point Collection
#> xmin: 17.64 xmax: 82.44 ymin: 6.51 ymax: 73.79
#> # A tibble: 5 × 1
#> pnts
#> <points>
#> 1 17.64270, 73.78937
#> 2 82.43608, 39.68580
#> 3 39.72003, 25.56988
#> 4 40.48171, 39.16232
#> 5 59.128772, 6.510731
# implement a `[.point_collection` to maintain the `point_col` attribute
# retains points always
dplyr_reconstruct.point_collection <- function(data, template) {
res <- NextMethod()
new_point_collection(res)
}
# 1D selection method. I'm sure theres a way better way to do this but
# alas im a n00b
`[.point_collection` <- function(x, i, j, ...) {
if (missing(i) && missing(j)) return(x)
# extract points column
pnt_col_name <- attr(x, "point_col")
pnt_col <- x[[pnt_col_name]]
res <- NextMethod()
# x[i,]
if (!missing(i) && missing(j)) {
return(res)
}
# x[i]
if (missing(j)) {
res[[pnt_col_name]] <- pnt_col
return(res)
}
# x[i, j]
if (!missing(i) && !missing(j)) {
pnt_col <- pnt_col[i]
res[[pnt_col_name]] <- pnt_col
return(res)
}
#x[, j]
if (missing(i) && !missing(j)) {
res[[pnt_col_name]] <- pnt_col
return(res)
}
res
}
# Demonstrate selection
x[1]
#> Point Collection
#> xmin: 17.64 xmax: 82.44 ymin: 6.51 ymax: 73.79
#> # A tibble: 5 × 1
#> pnts
#> <points>
#> 1 17.64270, 73.78937
#> 2 82.43608, 39.68580
#> 3 39.72003, 25.56988
#> 4 40.48171, 39.16232
#> 5 59.128772, 6.510731
x[1,]
#> Point Collection
#> xmin: 17.64 xmax: 82.44 ymin: 6.51 ymax: 73.79
#> # A tibble: 1 × 1
#> pnts
#> <points>
#> 1 17.64270, 73.78937
x[,1]
#> Point Collection
#> xmin: 17.64 xmax: 82.44 ymin: 6.51 ymax: 73.79
#> # A tibble: 5 × 1
#> pnts
#> <points>
#> 1 17.64270, 73.78937
#> 2 82.43608, 39.68580
#> 3 39.72003, 25.56988
#> 4 40.48171, 39.16232
#> 5 59.128772, 6.510731
x[1,1]
#> Point Collection
#> xmin: 17.64 xmax: 82.44 ymin: 6.51 ymax: 73.79
#> # A tibble: 1 × 1
#> pnts
#> <points>
#> 1 17.64270, 73.78937
Much of the promised dplyr functionality works out of the gate! But transmute()
and select()
do not entirely work.
library(dplyr, warn.conflicts = FALSE)
# I expect the `pnts` to remain
select(x)
#> Point Collection
#> xmin: 17.64 xmax: 82.44 ymin: 6.51 ymax: 73.79
#> # A tibble: 5 × 0
transmute(x, y = 1)
#> Point Collection
#> xmin: 17.64 xmax: 82.44 ymin: 6.51 ymax: 73.79
#> # A tibble: 5 × 1
#> y
#> <dbl>
#> 1 1
#> 2 1
#> 3 1
#> 4 1
#> 5 1
# works as anticipated
select(x, pooooints = pnts)
#> Point Collection
#> xmin: 17.64 xmax: 82.44 ymin: 6.51 ymax: 73.79
#> # A tibble: 5 × 1
#> pooooints
#> <points>
#> 1 17.64270, 73.78937
#> 2 82.43608, 39.68580
#> 3 39.72003, 25.56988
#> 4 40.48171, 39.16232
#> 5 59.128772, 6.510731
relocate(pnt_ids, id)
#> Error in relocate(pnt_ids, id): object 'pnt_ids' not found
slice(x, 2)
#> Point Collection
#> xmin: 82.44 xmax: 82.44 ymin: 39.69 ymax: 39.69
#> # A tibble: 1 × 1
#> pnts
#> <points>
#> 1 82.43608, 39.68580
pnt_ids <- mutate(x, id = row_number()) |>
arrange(-id)
inner_join(
pnt_ids,
tibble::tibble(id = 1:3, abc = letters[1:3])
)
#> Joining with `by = join_by(id)`
#> Point Collection
#> xmin: 17.64 xmax: 82.44 ymin: 25.57 ymax: 73.79
#> # A tibble: 3 × 3
#> pnts id abc
#> <points> <int> <chr>
#> 1 39.72003, 25.56988 3 c
#> 2 82.43608, 39.68580 2 b
#> 3 17.64270, 73.78937 1 a
# does not work (unclear why)
distinct(pnt_ids, id)
#> Error in `vectbl_as_col_location2()`:
#> ! Can't extract column with `point_col`.
#> ✖ Subscript `point_col` must be size 1, not 0.
#> Backtrace:
#> ▆
#> 1. ├─dplyr::distinct(pnt_ids, id)
#> 2. └─dplyr:::distinct.data.frame(pnt_ids, id)
#> 3. ├─dplyr::dplyr_row_slice(out, loc)
#> 4. └─dplyr:::dplyr_row_slice.data.frame(out, loc)
#> 5. └─dplyr::dplyr_reconstruct(vec_slice(data, i), data)
#> 6. ├─dplyr:::dplyr_reconstruct_dispatch(data, template)
#> 7. └─global dplyr_reconstruct.point_collection(data, template)
#> 8. └─global new_point_collection(res)
#> 9. ├─tibble::new_tibble(...)
#> 10. │ └─rlang::pairlist2(...)
#> 11. ├─global calculate_extent(x[[point_col]])
#> 12. │ └─base::stopifnot(inherits(.x, "points"))
#> 13. ├─x[[point_col]]
#> 14. └─tibble:::`[[.tbl_df`(x, point_col)
#> 15. └─tibble:::tbl_subset2(x, j = i, j_arg = substitute(i))
#> 16. └─tibble:::vectbl_as_col_location2(j, length(x), j_arg = j_arg)
#> 17. ├─tibble:::subclass_col_index_errors(...)
#> 18. │ └─base::withCallingHandlers(...)
#> 19. └─vctrs::vec_as_location2(j, n, names)
#> 20. └─vctrs:::result_get(...)
#> 21. └─rlang::cnd_signal(x$err)
Created on 2023-02-24 with reprex v2.0.2
@JosiahParry I think you are misinterpreting the docs a little.
We expect that x[i]
will always return exactly n
columns, where n
is the length of i
.
In other words, [
is a low level column subsetting interface that is not allowed to have sticky columns whatsoever. If I request x["a"]
then I must only return a column named "a"
. I can't also return a column named "sticky"
. This is a strict requirement for us to be able to program around [
correctly.
What that means for you is that [
should return a bare tibble (with no point collection attributes at all) if [
drops the sticky column. That is what is meant by the 2nd sentence here:
If you have scalar attributes that depend on columns, implement a dplyr_reconstruct() method and a 1d [ method. For example, if your class requires that certain columns be present, your method should return a data.frame or tibble when those columns are removed.
If you want a user facing API that allows sticky columns, then we suggesting creating a select.points_collection
method that retains them.
So it becomes:
-
x[i]
for low level programmer-friendly column selection -
select(x, ...)
for high level user-friendly column selection
You can use grouped_df
as an example of this:
library(dplyr)
gdf <- group_by(tibble(g = 1:2, x = c("a", "b")), g)
# grouped data frame
gdf
#> # A tibble: 2 × 2
#> # Groups: g [2]
#> g x
#> <int> <chr>
#> 1 1 a
#> 2 2 b
# retained grouping column, so still grouped
gdf["g"]
#> # A tibble: 2 × 1
#> # Groups: g [2]
#> g
#> <int>
#> 1 1
#> 2 2
# lost grouping column, so it is a tibble.
# notice we ONLY return the `x` column (a programming invariant we must follow)
gdf["x"]
#> # A tibble: 2 × 1
#> x
#> <chr>
#> 1 a
#> 2 b
# user friendly version that retains `g` no matter what
select(gdf, x)
#> Adding missing grouping variables: `g`
#> # A tibble: 2 × 2
#> # Groups: g [2]
#> g x
#> <int> <chr>
#> 1 1 a
#> 2 2 b
Created on 2023-02-27 with reprex v2.0.2.9000