dplyr
dplyr copied to clipboard
Request non-equal join: closest() supports dual-sided inequality <>
For example, I now have the following Table 1 and Table 2. I want to find a value2 from both sides that is closest to the value based on matching the IDs. However, closest() only supports an inequality involving one of: >, >=, <, or <=.
library(purrr)
library(dplyr)
#>
#> Attaching package: 'dplyr'
#> The following objects are masked from 'package:stats':
#>
#> filter, lag
#> The following objects are masked from 'package:base':
#>
#> intersect, setdiff, setequal, union
df1 = tibble(
ID = c("NO1", "NO1", "NO2", "NO3", "NO3", "NO3", "NO4", "NO5",
"NO5", "NO5", "NO5", "NO6", "NO7", "NO7", "NO7", "NO7",
"NO7", "NO8", "NO9", "NO9"),
value = c(11, 25, 22, 24, 29, 1.75, 26, 27, 55, 44, 2.1, 31,
35, 33, 410, 21, 1.44, 80, 75, 52))
df1
#> # A tibble: 20 × 2
#> ID value2
#> <chr> <dbl>
#> 1 NO1 11
#> 2 NO1 25
#> 3 NO2 22
#> 4 NO3 24
#> 5 NO3 29
#> 6 NO3 1.75
#> 7 NO4 26
#> 8 NO5 27
#> 9 NO5 55
#> 10 NO5 44
#> 11 NO5 2.1
#> 12 NO6 31
#> 13 NO7 35
#> 14 NO7 33
#> 15 NO7 410
#> 16 NO7 21
#> 17 NO7 1.44
#> 18 NO8 80
#> 19 NO9 75
#> 20 NO9 52
df2 = tibble(
编号 = c("NO1", "NO1", "NO2", "NO3", "NO3", "NO4", "NO5", "NO5",
"NO5", "NO5", "NO6", "NO7", "NO7", "NO7", "NO8", "NO9", "NO9"),
数值2 = c(11.2, 24.6, 21.6, 22, 29.5, 26.1, 26.9, 26.4, 54.3,
42.1, 30.2, 34.7, 31.2, 20.59, 79.8, 74.2, 50.9))
df2
#> # A tibble: 17 × 2
#> 编号 数值2
#> <chr> <dbl>
#> 1 NO1 11.2
#> 2 NO1 24.6
#> 3 NO2 21.6
#> 4 NO3 22
#> 5 NO3 29.5
#> 6 NO4 26.1
#> 7 NO5 26.9
#> 8 NO5 26.4
#> 9 NO5 54.3
#> 10 NO5 42.1
#> 11 NO6 30.2
#> 12 NO7 34.7
#> 13 NO7 31.2
#> 14 NO7 20.6
#> 15 NO8 79.8
#> 16 NO9 74.2
#> 17 NO9 50.9
The manual method I can think of right now is:
df1 %>%
left_join(df2, join_by(编号), relationship = "many-to-many") %>%
group_nest(编号, 数值) %>%
mutate(数值2 = map2_dbl(数值, data, \(x, y) y$数值2[which.min(abs(y$数值2 - x))]),
误差 = abs(数值 - 数值2)) %>%
select(-data)
#> # A tibble: 20 × 4
#> 编号 数值 数值2 误差
#> <chr> <dbl> <dbl> <dbl>
#> 1 NO1 11 11.2 0.200
#> 2 NO1 25 24.6 0.400
#> 3 NO2 22 21.6 0.400
#> 4 NO3 1.75 22 20.2
#> 5 NO3 24 22 2
#> 6 NO3 29 29.5 0.5
#> 7 NO4 26 26.1 0.100
#> 8 NO5 2.1 26.4 24.3
#> 9 NO5 27 26.9 0.100
#> 10 NO5 44 42.1 1.90
#> 11 NO5 55 54.3 0.700
#> 12 NO6 31 30.2 0.800
#> 13 NO7 1.44 20.6 19.2
#> 14 NO7 21 20.6 0.41
#> 15 NO7 33 34.7 1.70
#> 16 NO7 35 34.7 0.300
#> 17 NO7 410 34.7 375.
#> 18 NO8 80 79.8 0.200
#> 19 NO9 52 50.9 1.10
#> 20 NO9 75 74.2 0.800
Here's an alternative to the near-cross-join; I don't know that double-left-join is much better than the near-cross-join, but this has the same effect I think (while preserving the LHS row order). I'll add df2$rn here primarily to demonstrate that the across supports arbitrary number of columns in the RHS frame.
I renamed the columns because the code was inconsistent.
df1 <- structure(list(ID = c("NO1", "NO1", "NO2", "NO3", "NO3", "NO3", "NO4", "NO5", "NO5", "NO5", "NO5", "NO6", "NO7", "NO7", "NO7", "NO7", "NO7", "NO8", "NO9", "NO9"), value = c(11, 25, 22, 24, 29, 1.75, 26, 27, 55, 44, 2.1, 31, 35, 33, 410, 21, 1.44, 80, 75, 52)), class = c("tbl_df", "tbl", "data.frame"), row.names = c(NA, -20L))
df2 <- structure(list(ID = c("NO1", "NO1", "NO2", "NO3", "NO3", "NO4", "NO5", "NO5", "NO5", "NO5", "NO6", "NO7", "NO7", "NO7", "NO8", "NO9", "NO9"), value2 = c(11.2, 24.6, 21.6, 22, 29.5, 26.1, 26.9, 26.4, 54.3, 42.1, 30.2, 34.7, 31.2, 20.59, 79.8, 74.2, 50.9)), row.names = c(NA, -17L), class = c("tbl_df", "tbl", "data.frame"))
Two rolling joins, the second a reverse of the first:
df2$rn <- 1:nrow(df2)
left_join(df1, df2, join_by(ID, closest(value <= value2))) |>
left_join(df2, join_by(ID, closest(value >= value2)), suffix = c("", ".y")) |>
mutate(
across(any_of(paste0(names(df2), ".y")), ~ coalesce(
if_else(abs(value-value2) < abs(value-value2.y), cur_data()[[ sub("\\.y$", "", cur_column()) ]], .x),
cur_data()[[ sub("\\.y$", "", cur_column()) ]], .x),
.names = "{sub('.y$','',.col)}")
) |>
select(-ends_with(".y"))
# # A tibble: 20 × 4
# ID value value2 rn
# <chr> <dbl> <dbl> <int>
# 1 NO1 11 11.2 1
# 2 NO1 25 24.6 2
# 3 NO2 22 21.6 3
# 4 NO3 24 22 4
# 5 NO3 29 29.5 5
# 6 NO3 1.75 22 4
# 7 NO4 26 26.1 6
# 8 NO5 27 26.9 7
# 9 NO5 55 54.3 9
# 10 NO5 44 42.1 10
# 11 NO5 2.1 26.4 8
# 12 NO6 31 30.2 11
# 13 NO7 35 34.7 12
# 14 NO7 33 34.7 12
# 15 NO7 410 34.7 12
# 16 NO7 21 20.6 14
# 17 NO7 1.44 20.6 14
# 18 NO8 80 79.8 15
# 19 NO9 75 74.2 16
# 20 NO9 52 50.9 17
I fully agree that having a native join_by(ID, closest(time == time)) would be much easier to read, much clearer, less likely to have errors, and depending on how it is implemented, likely faster than this (though I make no claim that this performance is slow).
This comment about closest(==) still holds https://github.com/tidyverse/dplyr/issues/6686#issuecomment-1429988821
@DavisVaughan that's a fantastic comment, I'm glad it's linked in this issue now.
I understand the difficulty in implementation; there is precedent in other places to support various actions only on number-like fields, where an attempt to use closest(==) on character fields could stop(..) quickly. What are your thoughts on partial implementation?