`aggregate` performance
Would it be possible to improve the performance of aggregate() function, in particular the processing of data that is loaded into memory (it seems fine in proxy case)? For the example below, processing 10 layer raster (7771 x 7871 pixels) fully loaded into memory takes ~14 min for only one polygon. For proxy = TRUE it takes ~0.2 s. At the end, I included timings for other packages.
library(sf)
library(stars)
buffers = read_sf("data/vector/buffers.gpkg")
buffers = buffers[1, ] # use only one polygon
rasters = list.files("data/LC08_L1TP_190024_20200418_20200822_02_T1/",
pattern = "\\.TIF$", full.names = TRUE)
# from memory (RAM usage peak was over 50 GB)
ras = read_stars(rasters, along = 3, proxy = FALSE)
system.time(aggregate(ras, buffers, FUN = mean))
#> user system elapsed
#> 732.833 95.660 828.178
# from file
ras = read_stars(rasters, along = 3, proxy = TRUE)
# should there be warning if 1 polygon was used?
system.time(aggregate(ras, buffers, FUN = mean))
#> user system elapsed
#> 0.158 0.003 0.161
#> Warning message:
#> In c.stars(list(attr = c(9230.78260869565, 24826.6884057971, 23208.2391304348, :
#> along argument ignored; maybe you wanted to use st_redimension?
Timings in other packages for reference:
- terra (memory) - 0.008 s
- terra (file) - 0.032 s
- exactextractr - 2.179 s
Data (851 MB): https://drive.google.com/uc?id=1lzglfQJqlQh9OWT_-czc5L0hQ1AhoR8M&export=download Buffers: https://github.com/kadyb/raster-benchmark/blob/main/data/vector/buffers.gpkg Related issue: #153
Here is the result from Rprof:
Profiler output
$by.self
self.time self.pct total.time total.pct
".Call" 528.44 67.35 528.44 67.35
"lapply" 102.50 13.06 781.14 99.56
"[[" 44.48 5.67 44.48 5.67
"FUN" 37.40 4.77 62.74 8.00
"as.vector" 15.72 2.00 15.72 2.00
"structure" 10.24 1.31 66.32 8.45
"[[<-.data.frame" 6.76 0.86 6.76 0.86
"array" 6.28 0.80 10.28 1.31
"unlist" 6.24 0.80 6.24 0.80
"length" 6.12 0.78 6.12 0.78
"st_xy2sfc" 4.62 0.59 69.42 8.85
"abind" 4.60 0.59 11.66 1.49
"expand_dimensions.dimensions" 2.54 0.32 2.54 0.32
"expand.grid" 2.00 0.25 2.00 0.25
"%*%" 1.50 0.19 1.50 0.19
"st_crs<-.sfc" 1.06 0.14 1.06 0.14
"unique.default" 1.00 0.13 1.00 0.13
"lengths" 0.96 0.12 0.96 0.12
"matrix" 0.86 0.11 0.86 0.11
"which" 0.64 0.08 0.64 0.08
"isTRUE" 0.20 0.03 0.38 0.05
"xy_from_colrow" 0.18 0.02 3.32 0.42
"all" 0.14 0.02 0.14 0.02
"lazyLoadDBfetch" 0.04 0.01 0.04 0.01
"as_units.call" 0.02 0.00 0.02 0.00
"has_raster" 0.02 0.00 0.02 0.00
"strsplit" 0.02 0.00 0.02 0.00
$by.total
total.time total.pct self.time self.pct
"aggregate.stars" 784.54 99.99 0.00 0.00
"aggregate" 784.54 99.99 0.00 0.00
"lapply" 781.14 99.56 102.50 13.06
"sapply" 781.08 99.55 0.00 0.00
"join" 588.20 74.97 0.00 0.00
"st_intersects.stars" 588.20 74.97 0.00 0.00
"st_intersects" 588.20 74.97 0.00 0.00
".Call" 528.44 67.35 528.44 67.35
"st_geos_binop" 491.02 62.58 0.00 0.00
"st_intersects.sf" 491.02 62.58 0.00 0.00
"t" 363.68 46.35 0.00 0.00
"CPL_gdal_dimension" 259.98 33.14 0.00 0.00
"st_dimension" 259.98 33.14 0.00 0.00
"CPL_geos_binop" 215.00 27.40 0.00 0.00
"st_as_sf.stars" 97.18 12.39 0.00 0.00
"st_as_sf" 97.18 12.39 0.00 0.00
"st_xy2sfc" 69.42 8.85 4.62 0.59
"structure" 66.32 8.45 10.24 1.31
"FUN" 62.74 8.00 37.40 4.77
"st_as_sfc.dimensions" 50.08 6.38 0.00 0.00
"st_as_sfc.stars" 50.08 6.38 0.00 0.00
"st_as_sfc" 50.08 6.38 0.00 0.00
"[[" 44.48 5.67 44.48 5.67
"CPL_xy2sfc" 42.36 5.40 0.00 0.00
"as.data.frame" 15.76 2.01 0.00 0.00
"as.vector" 15.72 2.00 15.72 2.00
"as.data.frame.matrix" 15.72 2.00 0.00 0.00
"un_dim" 15.72 2.00 0.00 0.00
"t.sgbp" 15.62 1.99 0.00 0.00
"sgbp" 14.18 1.81 0.00 0.00
"abind" 11.66 1.49 4.60 0.59
"CPL_transpose_sparse_incidence" 11.24 1.43 0.00 0.00
"array" 10.28 1.31 6.28 0.80
"[[<-" 7.38 0.94 0.00 0.00
"st_sf" 7.24 0.92 0.00 0.00
"[[<-.data.frame" 6.76 0.86 6.76 0.86
"unlist" 6.24 0.80 6.24 0.80
"length" 6.12 0.78 6.12 0.78
"st_crs<-.sf" 5.00 0.64 0.00 0.00
"st_crs<-" 5.00 0.64 0.00 0.00
"agr_grps" 3.46 0.44 0.00 0.00
"do.call" 3.46 0.44 0.00 0.00
"simplify2array" 3.42 0.44 0.00 0.00
"xy_from_colrow" 3.32 0.42 0.18 0.02
"apply" 3.22 0.41 0.00 0.00
"st_stars" 3.06 0.39 0.00 0.00
"[[<-.sf" 2.88 0.37 0.00 0.00
"expand_dimensions.dimensions" 2.54 0.32 2.54 0.32
"expand_dimensions.stars" 2.54 0.32 0.00 0.00
"expand_dimensions" 2.54 0.32 0.00 0.00
"NextMethod" 2.26 0.29 0.00 0.00
"expand.grid" 2.00 0.25 2.00 0.25
"unique" 1.96 0.25 0.00 0.00
"%*%" 1.50 0.19 1.50 0.19
"st_crs<-.sfc" 1.06 0.14 1.06 0.14
"unique.default" 1.00 0.13 1.00 0.13
"...elt" 1.00 0.13 0.00 0.00
"stopifnot" 1.00 0.13 0.00 0.00
"lengths" 0.96 0.12 0.96 0.12
"matrix" 0.86 0.11 0.86 0.11
"as.matrix.data.frame" 0.78 0.10 0.00 0.00
"as.matrix" 0.78 0.10 0.00 0.00
"ncol" 0.78 0.10 0.00 0.00
"which" 0.64 0.08 0.64 0.08
"isTRUE" 0.38 0.05 0.20 0.03
"bb_wrap" 0.22 0.03 0.00 0.00
"bbox.Mtrx" 0.22 0.03 0.00 0.00
"CPL_get_bbox" 0.22 0.03 0.00 0.00
"all" 0.14 0.02 0.14 0.02
"crs_parameters" 0.08 0.01 0.00 0.00
"$.crs" 0.06 0.01 0.00 0.00
"$" 0.06 0.01 0.00 0.00
"lazyLoadDBfetch" 0.04 0.01 0.04 0.01
"<Anonymous>" 0.04 0.01 0.00 0.00
"as.data.frame.dimensions" 0.04 0.01 0.00 0.00
"CPL_crs_parameters" 0.04 0.01 0.00 0.00
"format.crs" 0.04 0.01 0.00 0.00
"format" 0.04 0.01 0.00 0.00
"print.dimensions" 0.04 0.01 0.00 0.00
"print.stars" 0.04 0.01 0.00 0.00
"print" 0.04 0.01 0.00 0.00
"st_is_longlat" 0.04 0.01 0.00 0.00
"as_units.call" 0.02 0.00 0.02 0.00
"has_raster" 0.02 0.00 0.02 0.00
"strsplit" 0.02 0.00 0.02 0.00
"as_units.character" 0.02 0.00 0.00 0.00
"as_units" 0.02 0.00 0.00 0.00
"get" 0.02 0.00 0.00 0.00
"row.names" 0.02 0.00 0.00 0.00
"tail" 0.02 0.00 0.00 0.00
Thanks - that's what I thought ;-)
See https://github.com/r-spatial/stars/commit/8b52416ba5618e8ddebdaa5da2ea417f9c2cc455 (this is still in a branch called agg)
This implements this over st_extract, which so far accepted point geometries. aggregate (for stars, not for stars_prxoy - needs documentation!) does something different: it will uniquely assign pixels to a single geometry (like base::aggregate does); st_extract (and aggregate for proxy) handle overlapping geometries and will assign pixels to multiple geoms if needed.
library(sf)
# Linking to GEOS 3.9.0, GDAL 3.2.1, PROJ 7.2.1
library(stars)
# Loading required package: abind
buffers = read_sf("buffers.gpkg")
buffers = buffers[1, ] # use only one polygon
rasters = list.files("LC08_L1TP_190024_20200418_20200822_02_T1/",
pattern = "\\.TIF$", full.names = TRUE)
# from memory (RAM usage peak was over 50 GB)
ras = read_stars(rasters, along = 3, proxy = FALSE)
system.time(s <- st_extract(ras, buffers, FUN = mean))
# user system elapsed
# 0.056 0.000 0.056
# Warning message:
# In c.stars(list(LC08_L1TP_190024_20200418_20200822_02_T1_B1.TIF = c(9232.81542056075, :
# along argument ignored; maybe you wanted to use st_redimension?
# from file
ras = read_stars(rasters, along = 3, proxy = TRUE)
# should there be warning if 1 polygon was used?
system.time(a <- aggregate(ras, buffers, FUN = mean))
# user system elapsed
# 0.087 0.001 0.087
# Warning message:
# In c.stars(list(attr = c(9232.81542056075, 24792.0934579439, 23200.8761682243, :
# along argument ignored; maybe you wanted to use st_redimension?
Thanks, I tried to test this on all polygons but I get an error. This is probably a regression as it works in stars 0.5-2.
library(sf)
library(stars) # v. 0.5.3
buffers = read_sf("data/vector/buffers.gpkg")
rasters = list.files("data/LC08_L1TP_190024_20200418_20200822_02_T1/",
pattern = "\\.TIF$", full.names = TRUE)
ras = read_stars(rasters, along = 3, proxy = FALSE)
system.time(s <- st_extract(ras, buffers, FUN = mean))
#> Error in st_crop.stars(x, i, crop = crop, ...) :
#> NA values in bounding box of y
#> Timing stopped at: 0.147 0.001 0.147
ras = read_stars(rasters, along = 3, proxy = TRUE)
system.time(a <- aggregate(ras, buffers, FUN = mean))
#> Error in st_crop.stars(x, i, crop = crop, ...) :
#> NA values in bounding box of y
#> Timing stopped at: 25.06 9.011 34.08
I now merged branch agg with master; please reinstall & retry. With 0.5-2 this shouldn't work, with
> system.time(s <- st_extract(ras, buffers, FUN = mean))
Error in st_extract.stars(ras, buffers, FUN = mean) :
all(st_dimension(pts) == 0) is not TRUE
With 0.5-2 this shouldn't work
I didn't write it precise. I meant aggregate() by polygon worked in 0.5-2, but the same error occurred in aggregate() and st_extract() by polygon in @agg branch.
After the merge, I ran tests again for 1940 polygons and it's OK now. I noticed over 8 times reduction in processing times between aggregate() and st_extract(), so this is a significant improvment.
library(sf)
library(stars)
buffers = read_sf("data/vector/buffers.gpkg") # 1940 polygons
rasters = list.files("data/LC08_L1TP_190024_20200418_20200822_02_T1/",
pattern = "\\.TIF$", full.names = TRUE)
ras_mem = read_stars(rasters, along = 3, proxy = FALSE)
ras_proxy = read_stars(rasters, along = 3, proxy = TRUE)
# from memory (aggregate)
system.time(a <- aggregate(ras_mem, buffers, FUN = mean))
#> user system elapsed
#> 1114.140 131.276 1246.046
# from memory (st_extract)
system.time(b <- st_extract(ras_mem, buffers, FUN = mean))
#> user system elapsed
#> 146.594 0.553 147.231
# from file (aggregate)
system.time(c <- aggregate(ras_proxy, buffers, FUN = mean))
#> user system elapsed
#> 265.855 2.739 268.719
I also include timings from other packages for comparison:
- terra (memory) - 14.505 s
- terra (file) - 29.34 s
- exactextractr - 14.75 s
Here is another interesting thing. It seems that the fastest way is convert stars to data.frame and rasterize buffers. Probably this can be faster using data.table, but the biggest overhead comes from converting stars to data.frame. Maybe there should be a function to directly convert to data.table?
library(sf)
library(stars)
buffers = read_sf("data/vector/buffers.gpkg") # 1940 polygons
rasters = list.files("data/LC08_L1TP_190024_20200418_20200822_02_T1/",
pattern = "\\.TIF$", full.names = TRUE)
ras = read_stars(rasters, along = 3, proxy = FALSE)
ras_names = c("B1", "B10", "B11", "B2", "B3", "B4", "B5", "B6", "B7", "B9")
ras = st_set_dimensions(ras, 3, values = ras_names, names = "band")
names(ras) = "reflectance"
start = Sys.time()
df = data.frame(ras)
df = unstack(df, form = reflectance ~ band)
dest = st_as_stars(st_bbox(ras), dx = 30, values = NA_integer_)
buffers_ras = data.frame(st_rasterize(buffers, dest))
buffers_ras = as.integer(buffers_ras$ID)
agg = aggregate(df, list(ID = buffers_ras), mean)
Sys.time() - start
#> Time difference of 1.747108 mins