Calculation of distance with NA in X and/or Y coordinates
I was comparing how sf::st_distance handled NAs in coordinates depending on the underlying distance method used when I found a strange result from {s2}.
First, the comparison for context:
library(sf)
#> Linking to GEOS 3.13.1, GDAL 3.11.3, PROJ 9.6.0; sf_use_s2() is TRUE
# if st_point is used, and either coordinate columns are NA, returns POINT EMPTY
p1 <- st_point(c(1, 1))
p2 <- st_point(c(2, NA_real_))
p3 <- st_point(c(NA_real_, NA_real_))
print(p1)
#> POINT (1 1)
print(p2)
#> POINT EMPTY
print(p3)
#> POINT EMPTY
# But despite returning POINT EMPTY, st_coordinates shows there are still coords
st_coordinates(p1)
#> X Y
#> [1,] 1 1
st_coordinates(p2)
#> X Y
#> [1,] 2 NA
st_coordinates(p3)
#> X Y
#> [1,] NA NA
# Depending on the method, NAs are variably returned for just the point with NAs for X and Y
# or also the point with only NA for Y
st_distance(st_sfc(p1, p2, p3), which = 'Hausdorff')
#> [,1] [,2] [,3]
#> [1,] 0 NA NA
#> [2,] NA NA NA
#> [3,] NA NA NA
st_distance(st_sfc(p1, p2, p3), which = 'Euclidean')
#> 1 2 3
#> 1 0.000000 1.414214 NA
#> 2 1.414214 0.000000 NA
#> 3 NA NA 0
st_distance(st_sfc(p1, p2, p3), which = 'Frechet')
#> [,1] [,2] [,3]
#> [1,] 0 NA NA
#> [2,] NA NA NA
#> [3,] NA NA NA
sf_use_s2(FALSE)
#> Spherical geometry (s2) switched off
st_distance(st_sfc(p1, p2, p3, crs = 4326))
#> Units: [m]
#> [,1] [,2] [,3]
#> [1,] 0 NA NA
#> [2,] NA NA NA
#> [3,] NA NA NA
sf_use_s2(TRUE)
#> Spherical geometry (s2) switched on
st_distance(st_sfc(p1, p2, p3, crs = 4326))
#> Units: [m]
#> [,1] [,2] [,3]
#> [1,] 0 20015118 NA
#> [2,] 20015118 20015118 NA
#> [3,] NA NA NA
Focusing on {s2}, we see s2::as_s2_geography returns: POINT (1 1), POINT (nan nan) and POINT EMPTY
library(sf)
#> Linking to GEOS 3.13.1, GDAL 3.11.3, PROJ 9.6.0; sf_use_s2() is TRUE
library(s2)
p1 <- st_point(c(1, 1))
p2 <- st_point(c(2, NA_real_))
p3 <- st_point(c(NA_real_, NA_real_))
# Results for s2_distance_matrix
s2_distance_matrix(st_sfc(p1, p2, p3), st_sfc(p1, p2, p3))
#> [,1] [,2] [,3]
#> [1,] 0 20015118 NA
#> [2,] 20015118 20015118 NA
#> [3,] NA NA NA
# And same results for s2_distance
s2_distance(p1, p1)
#> [1] 0
s2_distance(p1, p2)
#> [1] 20015118
s2_distance(p1, p3)
#> [1] NA
s2_distance(p2, p2)
#> [1] 20015118
s2_distance(p2, p3)
#> [1] NA
# s2_distance_* use as_s2_geography to setup x, y
# we see a mix of valid points, POINT (nan nan), and POINT EMPTY
as_s2_geography(p1)
#> <geodesic s2_geography[1] with CRS=OGC:CRS84>
#> [1] POINT (1 1)
as_s2_geography(p2)
#> <geodesic s2_geography[1] with CRS=OGC:CRS84>
#> [1] POINT (nan nan)
as_s2_geography(p3)
#> <geodesic s2_geography[1] with CRS=OGC:CRS84>
#> [1] POINT EMPTY
# And results are the same when we work directly in characters (passed to as_s2_geography)
p1 <- 'POINT (1 1)'
p2 <- 'POINT (2 nan)'
p3 <- 'POINT (nan nan)'
s2_distance_matrix(c(p1, p2, p3), c(p1, p2, p3))
#> [,1] [,2] [,3]
#> [1,] 0 20015118 NA
#> [2,] 20015118 20015118 NA
#> [3,] NA NA NA
The distance being returned in the case of a POINT with nan is equal to pi multiplied by s2::s2_earth_radius_meters().
library(s2)
identical(s2_distance('POINT (1 1)', 'POINT (1 nan)'), pi * s2_earth_radius_meters())
[1] TRUE
identical(s2_distance('POINT (9999 9999)', 'POINT (1 nan)'), pi * s2_earth_radius_meters())
[1] TRUE
My expectation as a user is that NA would be returned and not 20015118. I'm not sure where this value is coming from.
I have added tests for NAs being returned instead to match the other tests in test-s2-accessors (https://github.com/r-spatial/s2/pull/290)
Let me know what you think, thanks!
That is a totally reasonable expectation! The conversion to S2's internal point format probably assumes finite input and we probably only ever checked for "empty point". Thank you for spotting this!
Thanks @paleolimbot - let me know how I can help.
For sf, I think that the sf::st_point() should convert a p2 into a p3.
Gives
library(sf)
# Linking to GEOS 3.12.2, GDAL 3.11.3, PROJ 9.4.1; sf_use_s2() is TRUE
# if st_point is used, and either coordinate columns are NA, returns POINT EMPTY
p1 <- st_point(c(1, 1))
p2 <- st_point(c(2, NA_real_))
p3 <- st_point(c(NA_real_, NA_real_))
st_coordinates(p2)
# X Y
# [1,] NA NA
st_distance(st_sfc(p1, p2, p3), which = 'Euclidean')
# 1 2 3
# 1 0 NA NA
# 2 NA NA NA
# 3 NA NA NA
Thanks @edzer
A related case, though involves the user actively setting na.fail to FALSE in st_as_sf, we see:
# With
# remotes::install_github('r-spatial/sf@34d4aba90763b87397f393c75a45aa4c811c04d7')
library(sf)
#> Linking to GEOS 3.13.1, GDAL 3.11.3, PROJ 9.6.0; sf_use_s2() is TRUE
p <- st_as_sf(data.frame(c(1, 2, NA_real_), c(1, NA_real_, NA_real_)),
coords = 1:2, crs = 4326, na.fail = FALSE)
st_coordinates(p)
#> X Y
#> [1,] 1 1
#> [2,] 2 NA
#> [3,] NA NA
st_distance(p)
#> Units: [m]
#> [,1] [,2] [,3]
#> [1,] 0 20015118 NA
#> [2,] 20015118 20015118 NA
#> [3,] NA NA NA
Created on 2025-10-27 with reprex v2.1.1
Thanks, works better now!
Great, this looks like it makes the handling of NAs more consistent, across all methods in st_distance and other downstream functions
library(sf)
#> Linking to GEOS 3.13.1, GDAL 3.11.3, PROJ 9.6.0; sf_use_s2() is TRUE
# if st_point is used, and either coordinate columns are NA, returns POINT EMPTY
p1 <- st_point(c(1, 1))
p2 <- st_point(c(2, NA_real_))
p3 <- st_point(c(NA_real_, NA_real_))
print(p1)
#> POINT (1 1)
print(p2)
#> POINT EMPTY
print(p3)
#> POINT EMPTY
# Coords are NA
st_coordinates(p1)
#> X Y
#> [1,] 1 1
st_coordinates(p2)
#> X Y
#> [1,] NA NA
st_coordinates(p3)
#> X Y
#> [1,] NA NA
# NAs are consistently returned for all points with any NAs in X or Y
st_distance(st_sfc(p1, p2, p3), which = 'Hausdorff')
#> [,1] [,2] [,3]
#> [1,] 0 NA NA
#> [2,] NA NA NA
#> [3,] NA NA NA
st_distance(st_sfc(p1, p2, p3), which = 'Euclidean')
#> 1 2 3
#> 1 0 NA NA
#> 2 NA NA NA
#> 3 NA NA NA
st_distance(st_sfc(p1, p2, p3), which = 'Frechet')
#> [,1] [,2] [,3]
#> [1,] 0 NA NA
#> [2,] NA NA NA
#> [3,] NA NA NA
sf_use_s2(FALSE)
#> Spherical geometry (s2) switched off
st_distance(st_sfc(p1, p2, p3, crs = 4326))
#> Units: [m]
#> [,1] [,2] [,3]
#> [1,] 0 NA NA
#> [2,] NA NA NA
#> [3,] NA NA NA
sf_use_s2(TRUE)
#> Spherical geometry (s2) switched on
st_distance(st_sfc(p1, p2, p3, crs = 4326))
#> Units: [m]
#> [,1] [,2] [,3]
#> [1,] 0 NA NA
#> [2,] NA NA NA
#> [3,] NA NA NA
I think this just leaves the underlying issue in s2::s2_distance*, where if one column is nan, the result is 20015118 instead of NA.
library(s2)
p1 <- 'POINT (1 1)'
p2 <- 'POINT (2 nan)'
p3 <- 'POINT (nan nan)'
s2_distance(c(p1, p2, p3), c(p1, p2, p3))
#> [1] 0 20015118 NA
s2_distance_matrix(c(p1, p2, p3), c(p1, p2, p3))
#> [,1] [,2] [,3]
#> [1,] 0 20015118 NA
#> [2,] 20015118 20015118 NA
#> [3,] NA NA NA
Thanks all for tracking this down! s2 definitely should do something more purposeful with partial nans given the confusing nature of the failure. I am inclined to error for this case because I am not sure that EMPTY is correct (many GEOS operations also do funny things or error for non-finite input as well and so I think there is precedent for all three of the things that are currently happening 🙂 )
I'm afraid there is no "correct", as the POINT EMPTY is the only type without a WKB representation in the simple feature standard, so GEOS gets all kind of things thrown to it. As NA coordinates is nowhere allowed, I think it is a good idea to treat a single NA also as indicator for empty point. But yes, raising an error would be a alternative, let the user clean up first.
Not related to the issue of returning POINT EMPTY or an error (let me know if I should open this as a separate issue in sf), but another difference in handling of NAs:
While st_distance and eg. st_union handle NAs in coordinates, eg. st_combine errors through c.sfg and specifically at valid_numeric_matrix
# With
# remotes::install_github('r-spatial/sf@fc726596e6166718fd9dce32f0a119fb84765eaa')
library(sf)
#> Linking to GEOS 3.13.1, GDAL 3.11.4, PROJ 9.7.0; sf_use_s2() is TRUE
p1 <- st_point(c(1, 2))
p2 <- st_point(c(1, NA_real_))
pts <- st_sfc(p1, p2)
st_coordinates(pts)
#> X Y
#> [1,] 1 2
#> [2,] NA NA
st_distance(pts)
#> 1 2
#> 1 0 NA
#> 2 NA NA
st_combine(pts)
#> Error in valid_numeric_matrix(x): !anyNA(x) is not TRUE
st_union(pts)
#> Geometry set for 1 feature
#> Geometry type: POINT
#> Dimension: XY
#> Bounding box: xmin: 1 ymin: 2 xmax: 1 ymax: 2
#> CRS: NA
#> POINT (1 2)
st_coordinates(st_union(pts))
#> X Y
#> [1,] 1 2
# With
# install.packages('sf')
library(sf)
#> Linking to GEOS 3.13.1, GDAL 3.11.4, PROJ 9.7.0; sf_use_s2() is TRUE
p1 <- st_point(c(1, 2))
p2 <- st_point(c(1, NA_real_))
pts <- st_sfc(p1, p2)
st_coordinates(pts)
#> X Y
#> [1,] 1 2
#> [2,] 1 NA
st_distance(pts)
#> 1 2
#> 1 0 0
#> 2 0 0
st_combine(pts)
#> Error in valid_numeric_matrix(x): !anyNA(x) is not TRUE
st_union(pts)
#> Geometry set for 1 feature
#> Geometry type: MULTIPOINT
#> Dimension: XY
#> Bounding box: xmin: 1 ymin: NA xmax: 1 ymax: NA
#> CRS: NA
#> MULTIPOINT ((1 2), (1 NA))
st_coordinates(st_union(pts))
#> X Y L1
#> [1,] 1 2 1
#> [2,] 1 NA 1