stars icon indicating copy to clipboard operation
stars copied to clipboard

`aggregate` performance

Open kadyb opened this issue 4 years ago • 7 comments

Would it be possible to improve the performance of aggregate() function, in particular the processing of data that is loaded into memory (it seems fine in proxy case)? For the example below, processing 10 layer raster (7771 x 7871 pixels) fully loaded into memory takes ~14 min for only one polygon. For proxy = TRUE it takes ~0.2 s. At the end, I included timings for other packages.

library(sf)
library(stars)

buffers = read_sf("data/vector/buffers.gpkg")
buffers = buffers[1, ] # use only one polygon
rasters = list.files("data/LC08_L1TP_190024_20200418_20200822_02_T1/",
                     pattern = "\\.TIF$", full.names = TRUE)

# from memory (RAM usage peak was over 50 GB)
ras = read_stars(rasters, along = 3, proxy = FALSE)
system.time(aggregate(ras, buffers, FUN = mean))
#>    user  system elapsed
#> 732.833  95.660 828.178

# from file
ras = read_stars(rasters, along = 3, proxy = TRUE)
# should there be warning if 1 polygon was used?
system.time(aggregate(ras, buffers, FUN = mean))
#>  user  system elapsed
#> 0.158   0.003   0.161
#> Warning message:
#>   In c.stars(list(attr = c(9230.78260869565, 24826.6884057971, 23208.2391304348,  :
#>    along argument ignored; maybe you wanted to use st_redimension?

Timings in other packages for reference:

  • terra (memory) - 0.008 s
  • terra (file) - 0.032 s
  • exactextractr - 2.179 s

Data (851 MB): https://drive.google.com/uc?id=1lzglfQJqlQh9OWT_-czc5L0hQ1AhoR8M&export=download Buffers: https://github.com/kadyb/raster-benchmark/blob/main/data/vector/buffers.gpkg Related issue: #153

kadyb avatar May 12 '21 15:05 kadyb

Here is the result from Rprof:

Profiler output
$by.self
                               self.time self.pct total.time total.pct
".Call"                           528.44    67.35     528.44     67.35
"lapply"                          102.50    13.06     781.14     99.56
"[["                               44.48     5.67      44.48      5.67
"FUN"                              37.40     4.77      62.74      8.00
"as.vector"                        15.72     2.00      15.72      2.00
"structure"                        10.24     1.31      66.32      8.45
"[[<-.data.frame"                   6.76     0.86       6.76      0.86
"array"                             6.28     0.80      10.28      1.31
"unlist"                            6.24     0.80       6.24      0.80
"length"                            6.12     0.78       6.12      0.78
"st_xy2sfc"                         4.62     0.59      69.42      8.85
"abind"                             4.60     0.59      11.66      1.49
"expand_dimensions.dimensions"      2.54     0.32       2.54      0.32
"expand.grid"                       2.00     0.25       2.00      0.25
"%*%"                               1.50     0.19       1.50      0.19
"st_crs<-.sfc"                      1.06     0.14       1.06      0.14
"unique.default"                    1.00     0.13       1.00      0.13
"lengths"                           0.96     0.12       0.96      0.12
"matrix"                            0.86     0.11       0.86      0.11
"which"                             0.64     0.08       0.64      0.08
"isTRUE"                            0.20     0.03       0.38      0.05
"xy_from_colrow"                    0.18     0.02       3.32      0.42
"all"                               0.14     0.02       0.14      0.02
"lazyLoadDBfetch"                   0.04     0.01       0.04      0.01
"as_units.call"                     0.02     0.00       0.02      0.00
"has_raster"                        0.02     0.00       0.02      0.00
"strsplit"                          0.02     0.00       0.02      0.00

$by.total
                                 total.time total.pct self.time self.pct
"aggregate.stars"                    784.54     99.99      0.00     0.00
"aggregate"                          784.54     99.99      0.00     0.00
"lapply"                             781.14     99.56    102.50    13.06
"sapply"                             781.08     99.55      0.00     0.00
"join"                               588.20     74.97      0.00     0.00
"st_intersects.stars"                588.20     74.97      0.00     0.00
"st_intersects"                      588.20     74.97      0.00     0.00
".Call"                              528.44     67.35    528.44    67.35
"st_geos_binop"                      491.02     62.58      0.00     0.00
"st_intersects.sf"                   491.02     62.58      0.00     0.00
"t"                                  363.68     46.35      0.00     0.00
"CPL_gdal_dimension"                 259.98     33.14      0.00     0.00
"st_dimension"                       259.98     33.14      0.00     0.00
"CPL_geos_binop"                     215.00     27.40      0.00     0.00
"st_as_sf.stars"                      97.18     12.39      0.00     0.00
"st_as_sf"                            97.18     12.39      0.00     0.00
"st_xy2sfc"                           69.42      8.85      4.62     0.59
"structure"                           66.32      8.45     10.24     1.31
"FUN"                                 62.74      8.00     37.40     4.77
"st_as_sfc.dimensions"                50.08      6.38      0.00     0.00
"st_as_sfc.stars"                     50.08      6.38      0.00     0.00
"st_as_sfc"                           50.08      6.38      0.00     0.00
"[["                                  44.48      5.67     44.48     5.67
"CPL_xy2sfc"                          42.36      5.40      0.00     0.00
"as.data.frame"                       15.76      2.01      0.00     0.00
"as.vector"                           15.72      2.00     15.72     2.00
"as.data.frame.matrix"                15.72      2.00      0.00     0.00
"un_dim"                              15.72      2.00      0.00     0.00
"t.sgbp"                              15.62      1.99      0.00     0.00
"sgbp"                                14.18      1.81      0.00     0.00
"abind"                               11.66      1.49      4.60     0.59
"CPL_transpose_sparse_incidence"      11.24      1.43      0.00     0.00
"array"                               10.28      1.31      6.28     0.80
"[[<-"                                 7.38      0.94      0.00     0.00
"st_sf"                                7.24      0.92      0.00     0.00
"[[<-.data.frame"                      6.76      0.86      6.76     0.86
"unlist"                               6.24      0.80      6.24     0.80
"length"                               6.12      0.78      6.12     0.78
"st_crs<-.sf"                          5.00      0.64      0.00     0.00
"st_crs<-"                             5.00      0.64      0.00     0.00
"agr_grps"                             3.46      0.44      0.00     0.00
"do.call"                              3.46      0.44      0.00     0.00
"simplify2array"                       3.42      0.44      0.00     0.00
"xy_from_colrow"                       3.32      0.42      0.18     0.02
"apply"                                3.22      0.41      0.00     0.00
"st_stars"                             3.06      0.39      0.00     0.00
"[[<-.sf"                              2.88      0.37      0.00     0.00
"expand_dimensions.dimensions"         2.54      0.32      2.54     0.32
"expand_dimensions.stars"              2.54      0.32      0.00     0.00
"expand_dimensions"                    2.54      0.32      0.00     0.00
"NextMethod"                           2.26      0.29      0.00     0.00
"expand.grid"                          2.00      0.25      2.00     0.25
"unique"                               1.96      0.25      0.00     0.00
"%*%"                                  1.50      0.19      1.50     0.19
"st_crs<-.sfc"                         1.06      0.14      1.06     0.14
"unique.default"                       1.00      0.13      1.00     0.13
"...elt"                               1.00      0.13      0.00     0.00
"stopifnot"                            1.00      0.13      0.00     0.00
"lengths"                              0.96      0.12      0.96     0.12
"matrix"                               0.86      0.11      0.86     0.11
"as.matrix.data.frame"                 0.78      0.10      0.00     0.00
"as.matrix"                            0.78      0.10      0.00     0.00
"ncol"                                 0.78      0.10      0.00     0.00
"which"                                0.64      0.08      0.64     0.08
"isTRUE"                               0.38      0.05      0.20     0.03
"bb_wrap"                              0.22      0.03      0.00     0.00
"bbox.Mtrx"                            0.22      0.03      0.00     0.00
"CPL_get_bbox"                         0.22      0.03      0.00     0.00
"all"                                  0.14      0.02      0.14     0.02
"crs_parameters"                       0.08      0.01      0.00     0.00
"$.crs"                                0.06      0.01      0.00     0.00
"$"                                    0.06      0.01      0.00     0.00
"lazyLoadDBfetch"                      0.04      0.01      0.04     0.01
"<Anonymous>"                          0.04      0.01      0.00     0.00
"as.data.frame.dimensions"             0.04      0.01      0.00     0.00
"CPL_crs_parameters"                   0.04      0.01      0.00     0.00
"format.crs"                           0.04      0.01      0.00     0.00
"format"                               0.04      0.01      0.00     0.00
"print.dimensions"                     0.04      0.01      0.00     0.00
"print.stars"                          0.04      0.01      0.00     0.00
"print"                                0.04      0.01      0.00     0.00
"st_is_longlat"                        0.04      0.01      0.00     0.00
"as_units.call"                        0.02      0.00      0.02     0.00
"has_raster"                           0.02      0.00      0.02     0.00
"strsplit"                             0.02      0.00      0.02     0.00
"as_units.character"                   0.02      0.00      0.00     0.00
"as_units"                             0.02      0.00      0.00     0.00
"get"                                  0.02      0.00      0.00     0.00
"row.names"                            0.02      0.00      0.00     0.00
"tail"                                 0.02      0.00      0.00     0.00

kadyb avatar May 13 '21 10:05 kadyb

Thanks - that's what I thought ;-)

edzer avatar May 13 '21 10:05 edzer

See https://github.com/r-spatial/stars/commit/8b52416ba5618e8ddebdaa5da2ea417f9c2cc455 (this is still in a branch called agg)

This implements this over st_extract, which so far accepted point geometries. aggregate (for stars, not for stars_prxoy - needs documentation!) does something different: it will uniquely assign pixels to a single geometry (like base::aggregate does); st_extract (and aggregate for proxy) handle overlapping geometries and will assign pixels to multiple geoms if needed.

library(sf)
# Linking to GEOS 3.9.0, GDAL 3.2.1, PROJ 7.2.1
library(stars)
# Loading required package: abind

buffers = read_sf("buffers.gpkg")
buffers = buffers[1, ] # use only one polygon
rasters = list.files("LC08_L1TP_190024_20200418_20200822_02_T1/",
                     pattern = "\\.TIF$", full.names = TRUE)

# from memory (RAM usage peak was over 50 GB)
ras = read_stars(rasters, along = 3, proxy = FALSE)
system.time(s <- st_extract(ras, buffers, FUN = mean))
#    user  system elapsed 
#   0.056   0.000   0.056 
# Warning message:
# In c.stars(list(LC08_L1TP_190024_20200418_20200822_02_T1_B1.TIF = c(9232.81542056075,  :
#   along argument ignored; maybe you wanted to use st_redimension?

# from file
ras = read_stars(rasters, along = 3, proxy = TRUE)
# should there be warning if 1 polygon was used?
system.time(a <- aggregate(ras, buffers, FUN = mean))
#    user  system elapsed 
#   0.087   0.001   0.087 
# Warning message:
# In c.stars(list(attr = c(9232.81542056075, 24792.0934579439, 23200.8761682243,  :
#   along argument ignored; maybe you wanted to use st_redimension?

edzer avatar May 17 '21 20:05 edzer

Thanks, I tried to test this on all polygons but I get an error. This is probably a regression as it works in stars 0.5-2.

library(sf)
library(stars) # v. 0.5.3

buffers = read_sf("data/vector/buffers.gpkg")
rasters = list.files("data/LC08_L1TP_190024_20200418_20200822_02_T1/",
                     pattern = "\\.TIF$", full.names = TRUE)

ras = read_stars(rasters, along = 3, proxy = FALSE)
system.time(s <- st_extract(ras, buffers, FUN = mean))
#> Error in st_crop.stars(x, i, crop = crop, ...) : 
#>   NA values in bounding box of y
#> Timing stopped at: 0.147 0.001 0.147

ras = read_stars(rasters, along = 3, proxy = TRUE)
system.time(a <- aggregate(ras, buffers, FUN = mean))
#> Error in st_crop.stars(x, i, crop = crop, ...) : 
#>   NA values in bounding box of y
#> Timing stopped at: 25.06 9.011 34.08

kadyb avatar May 17 '21 21:05 kadyb

I now merged branch agg with master; please reinstall & retry. With 0.5-2 this shouldn't work, with

> system.time(s <- st_extract(ras, buffers, FUN = mean))
Error in st_extract.stars(ras, buffers, FUN = mean) : 
  all(st_dimension(pts) == 0) is not TRUE

edzer avatar May 18 '21 07:05 edzer

With 0.5-2 this shouldn't work

I didn't write it precise. I meant aggregate() by polygon worked in 0.5-2, but the same error occurred in aggregate() and st_extract() by polygon in @agg branch.

After the merge, I ran tests again for 1940 polygons and it's OK now. I noticed over 8 times reduction in processing times between aggregate() and st_extract(), so this is a significant improvment.

library(sf)
library(stars)

buffers = read_sf("data/vector/buffers.gpkg") # 1940 polygons
rasters = list.files("data/LC08_L1TP_190024_20200418_20200822_02_T1/",
                     pattern = "\\.TIF$", full.names = TRUE)

ras_mem = read_stars(rasters, along = 3, proxy = FALSE)
ras_proxy = read_stars(rasters, along = 3, proxy = TRUE)

# from memory (aggregate)
system.time(a <- aggregate(ras_mem, buffers, FUN = mean))
#>     user   system  elapsed
#> 1114.140  131.276 1246.046

# from memory (st_extract)
system.time(b <- st_extract(ras_mem, buffers, FUN = mean))
#>    user  system elapsed
#> 146.594   0.553 147.231

# from file (aggregate)
system.time(c <- aggregate(ras_proxy, buffers, FUN = mean))
#>    user  system elapsed
#> 265.855   2.739 268.719

I also include timings from other packages for comparison:

  • terra (memory) - 14.505 s
  • terra (file) - 29.34 s
  • exactextractr - 14.75 s

kadyb avatar May 18 '21 10:05 kadyb

Here is another interesting thing. It seems that the fastest way is convert stars to data.frame and rasterize buffers. Probably this can be faster using data.table, but the biggest overhead comes from converting stars to data.frame. Maybe there should be a function to directly convert to data.table?

library(sf)
library(stars)

buffers = read_sf("data/vector/buffers.gpkg") # 1940 polygons
rasters = list.files("data/LC08_L1TP_190024_20200418_20200822_02_T1/",
                     pattern = "\\.TIF$", full.names = TRUE)

ras = read_stars(rasters, along = 3, proxy = FALSE)
ras_names = c("B1", "B10", "B11", "B2", "B3", "B4", "B5", "B6", "B7", "B9")
ras = st_set_dimensions(ras, 3, values = ras_names, names = "band")
names(ras) = "reflectance"

start = Sys.time()
df = data.frame(ras)
df = unstack(df, form = reflectance ~ band)
dest = st_as_stars(st_bbox(ras), dx = 30, values = NA_integer_)
buffers_ras = data.frame(st_rasterize(buffers, dest))
buffers_ras = as.integer(buffers_ras$ID)

agg = aggregate(df, list(ID = buffers_ras), mean)
Sys.time() - start
#> Time difference of 1.747108 mins

kadyb avatar Jul 30 '21 13:07 kadyb