dplyr icon indicating copy to clipboard operation
dplyr copied to clipboard

Request non-equal join: closest() supports dual-sided inequality <>

Open zhjx19 opened this issue 1 year ago • 3 comments

For example, I now have the following Table 1 and Table 2. I want to find a value2 from both sides that is closest to the value based on matching the IDs. However, closest() only supports an inequality involving one of: >, >=, <, or <=.

library(purrr)
library(dplyr)
#> 
#> Attaching package: 'dplyr'
#> The following objects are masked from 'package:stats':
#> 
#>     filter, lag
#> The following objects are masked from 'package:base':
#> 
#>     intersect, setdiff, setequal, union
df1 = tibble(
  ID = c("NO1", "NO1", "NO2", "NO3", "NO3", "NO3", "NO4", "NO5", 
         "NO5", "NO5", "NO5", "NO6", "NO7", "NO7", "NO7", "NO7", 
         "NO7", "NO8", "NO9", "NO9"), 
  value = c(11, 25, 22, 24, 29, 1.75, 26, 27, 55, 44, 2.1, 31, 
         35, 33, 410, 21, 1.44, 80, 75, 52))
df1
#> # A tibble: 20 × 2
#>    ID    value2
#>    <chr>  <dbl>
#>  1 NO1    11   
#>  2 NO1    25   
#>  3 NO2    22   
#>  4 NO3    24   
#>  5 NO3    29   
#>  6 NO3     1.75
#>  7 NO4    26   
#>  8 NO5    27   
#>  9 NO5    55   
#> 10 NO5    44   
#> 11 NO5     2.1 
#> 12 NO6    31   
#> 13 NO7    35   
#> 14 NO7    33   
#> 15 NO7   410   
#> 16 NO7    21   
#> 17 NO7     1.44
#> 18 NO8    80   
#> 19 NO9    75   
#> 20 NO9    52
df2 = tibble(
  编号 = c("NO1", "NO1", "NO2", "NO3", "NO3", "NO4", "NO5", "NO5", 
         "NO5", "NO5", "NO6", "NO7", "NO7", "NO7", "NO8", "NO9", "NO9"), 
  数值2 = c(11.2, 24.6, 21.6, 22, 29.5, 26.1, 26.9, 26.4, 54.3,
            42.1, 30.2, 34.7, 31.2, 20.59, 79.8, 74.2, 50.9))
df2
#> # A tibble: 17 × 2
#>    编号  数值2
#>    <chr> <dbl>
#>  1 NO1    11.2
#>  2 NO1    24.6
#>  3 NO2    21.6
#>  4 NO3    22  
#>  5 NO3    29.5
#>  6 NO4    26.1
#>  7 NO5    26.9
#>  8 NO5    26.4
#>  9 NO5    54.3
#> 10 NO5    42.1
#> 11 NO6    30.2
#> 12 NO7    34.7
#> 13 NO7    31.2
#> 14 NO7    20.6
#> 15 NO8    79.8
#> 16 NO9    74.2
#> 17 NO9    50.9

The manual method I can think of right now is:

df1 %>%
  left_join(df2, join_by(编号), relationship = "many-to-many") %>%
  group_nest(编号, 数值) %>%
  mutate(数值2 = map2_dbl(数值, data, \(x, y) y$数值2[which.min(abs(y$数值2 - x))]),
         误差 = abs(数值 - 数值2)) %>%
  select(-data)
#> # A tibble: 20 × 4
#>    编号    数值 数值2    误差
#>    <chr>  <dbl> <dbl>   <dbl>
#>  1 NO1    11     11.2   0.200
#>  2 NO1    25     24.6   0.400
#>  3 NO2    22     21.6   0.400
#>  4 NO3     1.75  22    20.2  
#>  5 NO3    24     22     2    
#>  6 NO3    29     29.5   0.5  
#>  7 NO4    26     26.1   0.100
#>  8 NO5     2.1   26.4  24.3  
#>  9 NO5    27     26.9   0.100
#> 10 NO5    44     42.1   1.90 
#> 11 NO5    55     54.3   0.700
#> 12 NO6    31     30.2   0.800
#> 13 NO7     1.44  20.6  19.2  
#> 14 NO7    21     20.6   0.41 
#> 15 NO7    33     34.7   1.70 
#> 16 NO7    35     34.7   0.300
#> 17 NO7   410     34.7 375.   
#> 18 NO8    80     79.8   0.200
#> 19 NO9    52     50.9   1.10 
#> 20 NO9    75     74.2   0.800

zhjx19 avatar Feb 21 '24 08:02 zhjx19

Here's an alternative to the near-cross-join; I don't know that double-left-join is much better than the near-cross-join, but this has the same effect I think (while preserving the LHS row order). I'll add df2$rn here primarily to demonstrate that the across supports arbitrary number of columns in the RHS frame.

I renamed the columns because the code was inconsistent.

df1 <- structure(list(ID = c("NO1", "NO1", "NO2", "NO3", "NO3", "NO3", "NO4", "NO5", "NO5", "NO5", "NO5", "NO6", "NO7", "NO7", "NO7", "NO7", "NO7", "NO8", "NO9", "NO9"), value = c(11, 25, 22, 24, 29, 1.75, 26, 27, 55, 44, 2.1, 31, 35, 33, 410, 21, 1.44, 80, 75, 52)), class = c("tbl_df", "tbl", "data.frame"), row.names = c(NA, -20L))
df2 <- structure(list(ID = c("NO1", "NO1", "NO2", "NO3", "NO3", "NO4", "NO5", "NO5", "NO5", "NO5", "NO6", "NO7", "NO7", "NO7", "NO8", "NO9", "NO9"), value2 = c(11.2, 24.6, 21.6, 22, 29.5, 26.1, 26.9, 26.4, 54.3, 42.1, 30.2, 34.7, 31.2, 20.59, 79.8, 74.2, 50.9)), row.names = c(NA, -17L), class = c("tbl_df", "tbl", "data.frame"))

Two rolling joins, the second a reverse of the first:

df2$rn <- 1:nrow(df2)
left_join(df1, df2, join_by(ID, closest(value <= value2))) |>
  left_join(df2, join_by(ID, closest(value >= value2)), suffix = c("", ".y")) |>
  mutate(
    across(any_of(paste0(names(df2), ".y")), ~ coalesce(
      if_else(abs(value-value2) < abs(value-value2.y), cur_data()[[ sub("\\.y$", "", cur_column()) ]], .x),
      cur_data()[[ sub("\\.y$", "", cur_column()) ]], .x),
      .names = "{sub('.y$','',.col)}")
  ) |>
  select(-ends_with(".y"))
# # A tibble: 20 × 4
#    ID     value value2    rn
#    <chr>  <dbl>  <dbl> <int>
#  1 NO1    11      11.2     1
#  2 NO1    25      24.6     2
#  3 NO2    22      21.6     3
#  4 NO3    24      22       4
#  5 NO3    29      29.5     5
#  6 NO3     1.75   22       4
#  7 NO4    26      26.1     6
#  8 NO5    27      26.9     7
#  9 NO5    55      54.3     9
# 10 NO5    44      42.1    10
# 11 NO5     2.1    26.4     8
# 12 NO6    31      30.2    11
# 13 NO7    35      34.7    12
# 14 NO7    33      34.7    12
# 15 NO7   410      34.7    12
# 16 NO7    21      20.6    14
# 17 NO7     1.44   20.6    14
# 18 NO8    80      79.8    15
# 19 NO9    75      74.2    16
# 20 NO9    52      50.9    17

I fully agree that having a native join_by(ID, closest(time == time)) would be much easier to read, much clearer, less likely to have errors, and depending on how it is implemented, likely faster than this (though I make no claim that this performance is slow).

r2evans avatar Mar 14 '25 15:03 r2evans

This comment about closest(==) still holds https://github.com/tidyverse/dplyr/issues/6686#issuecomment-1429988821

DavisVaughan avatar Mar 14 '25 19:03 DavisVaughan

@DavisVaughan that's a fantastic comment, I'm glad it's linked in this issue now.

I understand the difficulty in implementation; there is precedent in other places to support various actions only on number-like fields, where an attempt to use closest(==) on character fields could stop(..) quickly. What are your thoughts on partial implementation?

r2evans avatar Mar 14 '25 19:03 r2evans