sf icon indicating copy to clipboard operation
sf copied to clipboard

st_write significantly slower when writing logical (boolean) field types

Open wkmor1 opened this issue 4 years ago • 4 comments

I have noticed that when an sf object contains logical (boolean) columns, writing it out to a file is much slower. The following illustrates the issue:

# Write an sf point layer holding `x` repeated `n` times to a temporary file.
#
# x:   value(s) to repeat; becomes the single attribute column.
# n:   number of times `x` is repeated.
# fmt: file extension handed to tempfile().
f <- function(x, n, fmt = ".csv") {
  # Keep the name `x`: data.frame() takes the column name from this symbol.
  x <- rep(x, n)
  # The two constant 0 columns are consumed as the point coordinates.
  pts <- sf::st_as_sf(data.frame(0, 0, x), coords = 1:2)
  out_path <- tempfile(fileext = fmt)
  sf::st_write(pts, out_path, quiet = TRUE)
}

# Time f() for a logical, integer, double and character column, first with
# the default ".csv" extension and then with ".gpkg"; each expression is
# evaluated 10 times.
microbenchmark::microbenchmark(
  f(FALSE, 10000),
  f(0L, 10000),
  f(0, 10000),
  f("FALSE", 10000),
  f(FALSE, 10000, ".gpkg"),
  f(0L, 10000, ".gpkg"),
  f(0, 10000, ".gpkg"),
  f("FALSE", 10000, ".gpkg"),
  times = 10
)
#> Unit: milliseconds
#>                        expr       min        lq      mean    median        uq
#>             f(FALSE, 10000) 189.83119 204.43109 307.18853 225.08582 256.84633
#>                f(0L, 10000)  35.12176  35.90689  43.35830  38.03695  42.48434
#>                 f(0, 10000)  37.55803  39.72591  47.20795  41.14088  43.24004
#>           f("FALSE", 10000)  34.27908  35.90282  38.39523  37.67921  40.45558
#>    f(FALSE, 10000, ".gpkg") 284.02960 300.72006 350.00660 337.46137 368.82369
#>       f(0L, 10000, ".gpkg") 126.78843 133.17075 154.76982 137.61813 161.53709
#>        f(0, 10000, ".gpkg") 129.27981 129.50055 140.01243 133.58174 140.45955
#>  f("FALSE", 10000, ".gpkg") 125.95240 130.73475 146.92906 135.61447 157.52841
#>         max neval
#>  1025.06738    10
#>    67.04742    10
#>   104.12921    10
#>    44.11557    10
#>   486.99444    10
#>   244.68378    10
#>   173.41739    10
#>   201.76884    10

Created on 2021-06-08 by the reprex package (v2.0.0)

Session info
sessionInfo()
#> R version 4.1.0 (2021-05-18)
#> Platform: x86_64-pc-linux-gnu (64-bit)
#> Running under: Ubuntu 18.04.5 LTS
#> 
#> Matrix products: default
#> BLAS:   /usr/lib/x86_64-linux-gnu/blas/libblas.so.3.7.1
#> LAPACK: /usr/lib/x86_64-linux-gnu/lapack/liblapack.so.3.7.1
#> 
#> locale:
#>  [1] LC_CTYPE=en_US.UTF-8       LC_NUMERIC=C              
#>  [3] LC_TIME=en_AU.UTF-8        LC_COLLATE=en_US.UTF-8    
#>  [5] LC_MONETARY=en_AU.UTF-8    LC_MESSAGES=en_US.UTF-8   
#>  [7] LC_PAPER=en_AU.UTF-8       LC_NAME=C                 
#>  [9] LC_ADDRESS=C               LC_TELEPHONE=C            
#> [11] LC_MEASUREMENT=en_AU.UTF-8 LC_IDENTIFICATION=C       
#> 
#> attached base packages:
#> [1] stats     graphics  grDevices utils     datasets  methods   base     
#> 
#> loaded via a namespace (and not attached):
#>  [1] Rcpp_1.0.6           pillar_1.6.1         compiler_4.1.0      
#>  [4] highr_0.9            R.methodsS3_1.8.1    R.utils_2.10.1      
#>  [7] class_7.3-19         tools_4.1.0          digest_0.6.27       
#> [10] evaluate_0.14        lifecycle_1.0.0      tibble_3.1.2        
#> [13] R.cache_0.15.0       pkgconfig_2.0.3      rlang_0.4.11        
#> [16] reprex_2.0.0         DBI_1.1.1            microbenchmark_1.4-7
#> [19] yaml_2.2.1           xfun_0.23            e1071_1.7-7         
#> [22] dplyr_1.0.6          withr_2.4.2          styler_1.4.1        
#> [25] stringr_1.4.0        knitr_1.33           generics_0.1.0      
#> [28] fs_1.5.0             vctrs_0.3.8          tidyselect_1.1.1    
#> [31] grid_4.1.0           classInt_0.4-3       glue_1.4.2          
#> [34] R6_2.5.0             sf_0.9-8             fansi_0.5.0         
#> [37] rmarkdown_2.8        purrr_0.3.4          magrittr_2.0.1      
#> [40] units_0.7-1          backports_1.2.1      ellipsis_0.3.2      
#> [43] htmltools_0.5.1.1    assertthat_0.2.1     utf8_1.2.1          
#> [46] KernSmooth_2.23-20   stringi_1.6.2        proxy_0.4-25        
#> [49] crayon_1.4.1         R.oo_1.24.0

wkmor1 avatar Jun 08 '21 12:06 wkmor1

Interesting 🤔 ... Can confirm on MacOS

library(sf)
#> Linking to GEOS 3.8.1, GDAL 3.1.1, PROJ 6.3.1

# Write an sf point layer holding `x` repeated `n` times to a temporary file.
#
# x:   value(s) to repeat; becomes the single attribute column.
# n:   number of times `x` is repeated.
# fmt: file extension handed to tempfile().
f <- function(x, n, fmt = ".csv") {
  # Keep the name `x`: data.frame() takes the column name from this symbol.
  x <- rep(x, n)
  # The two constant 0 columns are consumed as the point coordinates.
  pts <- sf::st_as_sf(data.frame(0, 0, x), coords = 1:2)
  out_path <- tempfile(fileext = fmt)
  sf::st_write(pts, out_path, quiet = TRUE)
}

# Time f() for a logical, integer, double and character column, first with
# the default ".csv" extension and then with ".gpkg"; each expression is
# evaluated 10 times.
microbenchmark::microbenchmark(
  f(FALSE, 10000),
  f(0L, 10000),
  f(0, 10000),
  f("FALSE", 10000),
  f(FALSE, 10000, ".gpkg"),
  f(0L, 10000, ".gpkg"),
  f(0, 10000, ".gpkg"),
  f("FALSE", 10000, ".gpkg"),
  times = 10)
#> Unit: milliseconds
#>                        expr       min        lq      mean    median        uq
#>             f(FALSE, 10000) 209.85522 230.28042 238.60764 234.83916 241.14995
#>                f(0L, 10000)  32.27804  34.97311  36.81165  35.90451  38.60695
#>                 f(0, 10000)  35.26579  36.09508  36.74345  36.49578  37.87945
#>           f("FALSE", 10000)  34.76497  35.47865  37.41611  36.41996  38.75253
#>    f(FALSE, 10000, ".gpkg") 273.42011 282.26102 300.65752 290.11968 312.02942
#>       f(0L, 10000, ".gpkg")  92.12086  93.14257  97.89846  98.11679  99.52682
#>        f(0, 10000, ".gpkg")  94.32044  97.65801  99.13713  98.62017 100.35575
#>  f("FALSE", 10000, ".gpkg")  94.62713  96.33537  98.80034  97.97429  99.84456
#>        max neval
#>  279.97131    10
#>   41.73219    10
#>   38.51427    10
#>   43.07141    10
#>  378.14181    10
#>  107.44184    10
#>  105.80861    10
#>  105.31229    10

Created on 2021-06-08 by the reprex package (v2.0.0)

Session info
sessioninfo::session_info()
#> ─ Session info ───────────────────────────────────────────────────────────────
#>  setting  value                       
#>  version  R version 4.0.2 (2020-06-22)
#>  os       macOS Mojave 10.14.3        
#>  system   x86_64, darwin17.0          
#>  ui       X11                         
#>  language (EN)                        
#>  collate  en_AU.UTF-8                 
#>  ctype    en_AU.UTF-8                 
#>  tz       Australia/Melbourne         
#>  date     2021-06-08                  
#> 
#> ─ Packages ───────────────────────────────────────────────────────────────────
#>  package        * version date       lib source        
#>  assertthat       0.2.1   2019-03-21 [1] CRAN (R 4.0.0)
#>  backports        1.2.1   2020-12-09 [1] CRAN (R 4.0.2)
#>  class            7.3-17  2020-04-26 [1] CRAN (R 4.0.2)
#>  classInt         0.4-3   2020-04-07 [1] CRAN (R 4.0.0)
#>  cli              2.5.0   2021-04-26 [1] CRAN (R 4.0.2)
#>  crayon           1.4.1   2021-02-08 [1] CRAN (R 4.0.2)
#>  DBI              1.1.0   2019-12-15 [1] CRAN (R 4.0.0)
#>  digest           0.6.27  2020-10-24 [1] CRAN (R 4.0.2)
#>  dplyr            1.0.6   2021-05-05 [1] CRAN (R 4.0.2)
#>  e1071            1.7-4   2020-10-14 [1] CRAN (R 4.0.2)
#>  ellipsis         0.3.2   2021-04-29 [1] CRAN (R 4.0.2)
#>  evaluate         0.14    2019-05-28 [1] CRAN (R 4.0.0)
#>  fansi            0.4.2   2021-01-15 [1] CRAN (R 4.0.2)
#>  fs               1.5.0   2020-07-31 [1] CRAN (R 4.0.2)
#>  generics         0.1.0   2020-10-31 [1] CRAN (R 4.0.2)
#>  glue             1.4.2   2020-08-27 [1] CRAN (R 4.0.2)
#>  highr            0.9     2021-04-16 [1] CRAN (R 4.0.2)
#>  htmltools        0.5.1.1 2021-01-22 [1] CRAN (R 4.0.2)
#>  KernSmooth       2.23-17 2020-04-26 [1] CRAN (R 4.0.2)
#>  knitr            1.33    2021-04-24 [1] CRAN (R 4.0.2)
#>  lifecycle        1.0.0   2021-02-15 [1] CRAN (R 4.0.2)
#>  magrittr         2.0.1   2020-11-17 [1] CRAN (R 4.0.2)
#>  microbenchmark * 1.4-7   2019-09-24 [1] CRAN (R 4.0.2)
#>  pillar           1.6.1   2021-05-16 [1] CRAN (R 4.0.2)
#>  pkgconfig        2.0.3   2019-09-22 [1] CRAN (R 4.0.0)
#>  purrr            0.3.4   2020-04-17 [1] CRAN (R 4.0.0)
#>  R6               2.5.0   2020-10-28 [1] CRAN (R 4.0.2)
#>  Rcpp             1.0.6   2021-01-15 [1] CRAN (R 4.0.2)
#>  reprex           2.0.0   2021-04-02 [1] CRAN (R 4.0.2)
#>  rlang            0.4.11  2021-04-30 [1] CRAN (R 4.0.2)
#>  rmarkdown        2.5     2020-10-21 [1] CRAN (R 4.0.2)
#>  sessioninfo      1.1.1   2018-11-05 [1] CRAN (R 4.0.0)
#>  sf             * 0.9-6   2020-09-13 [1] CRAN (R 4.0.2)
#>  stringi          1.6.2   2021-05-17 [1] CRAN (R 4.0.2)
#>  stringr          1.4.0   2019-02-10 [1] CRAN (R 4.0.0)
#>  styler           1.4.1   2021-03-30 [1] CRAN (R 4.0.2)
#>  tibble           3.1.2   2021-05-16 [1] CRAN (R 4.0.2)
#>  tidyselect       1.1.1   2021-04-30 [1] CRAN (R 4.0.2)
#>  units            0.6-7   2020-06-13 [1] CRAN (R 4.0.0)
#>  utf8             1.2.1   2021-03-12 [1] CRAN (R 4.0.2)
#>  vctrs            0.3.8   2021-04-29 [1] CRAN (R 4.0.2)
#>  withr            2.4.2   2021-04-18 [1] CRAN (R 4.0.2)
#>  xfun             0.23    2021-05-15 [1] CRAN (R 4.0.2)
#>  yaml             2.2.1   2020-02-01 [1] CRAN (R 4.0.0)
#> 
#> [1] /Library/Frameworks/R.framework/Versions/4.0/Resources/library

johnbaums avatar Jun 08 '21 13:06 johnbaums

Can also confirm. FlatGeobuf helps speed in general, but it is also slower for logical (bool) columns.

library(sf)
#> Linking to GEOS 3.9.0, GDAL 3.2.1, PROJ 7.2.1

# Write an sf point layer holding `x` repeated `n` times to a temporary file.
#
# x:   value(s) to repeat; becomes the single attribute column.
# n:   number of times `x` is repeated.
# fmt: file extension handed to tempfile().
f <- function(x, n, fmt = ".csv") {
  # Keep the name `x`: data.frame() takes the column name from this symbol.
  x <- rep(x, n)
  # The two constant 0 columns are consumed as the point coordinates.
  pts <- sf::st_as_sf(data.frame(0, 0, x), coords = 1:2)
  out_path <- tempfile(fileext = fmt)
  sf::st_write(pts, out_path, quiet = TRUE)
}

# Time f() for a logical, integer, double and character column, first with
# the default ".csv" extension and then with ".fgb"; each expression is
# evaluated 10 times.
microbenchmark::microbenchmark(
  f(FALSE, 10000),
  f(0L, 10000),
  f(0, 10000),
  f("FALSE", 10000),
  f(FALSE, 10000, ".fgb"),
  f(0L, 10000, ".fgb"),
  f(0, 10000, ".fgb"),
  f("FALSE", 10000, ".fgb"),
  times = 10)
#> Unit: milliseconds
#>                       expr       min        lq      mean    median        uq
#>            f(FALSE, 10000) 116.03899 120.08824 142.52775 132.83730 146.39599
#>               f(0L, 10000)  23.38568  24.20939  25.40193  26.00254  26.17420
#>                f(0, 10000)  25.33234  25.73252  26.94360  26.05844  28.04113
#>          f("FALSE", 10000)  21.54567  21.88413  22.14330  22.15599  22.50748
#>    f(FALSE, 10000, ".fgb") 124.45799 129.56546 143.20141 140.17218 153.84233
#>       f(0L, 10000, ".fgb")  31.12163  31.95204  32.87096  33.04909  33.58276
#>        f(0, 10000, ".fgb")  30.96207  31.16706  32.97509  31.75533  34.07206
#>  f("FALSE", 10000, ".fgb")  31.68097  32.49478  33.56320  33.04091  34.89044
#>        max neval
#>  234.55547    10
#>   27.09925    10
#>   31.91724    10
#>   22.56170    10
#>  186.41495    10
#>   35.00678    10
#>   39.57030    10
#>   36.27695    10

Created on 2021-06-08 by the reprex package (v2.0.0)

tim-salabim avatar Jun 08 '21 13:06 tim-salabim

Same here. We had to convert the dummy variables to integers. It seems that converting them to characters is also slow to export. Strange!

antuki avatar Sep 24 '21 08:09 antuki

It's treated differently from the others by GDAL: https://trac.osgeo.org/gdal/wiki/rfc50_ogr_field_subtype

edzer avatar Sep 24 '21 08:09 edzer

This should now be fixed with https://github.com/r-spatial/sf/commit/558c693f647d8a1cef44cdd36dd311f4b20dd5ca (see also #1409).

kadyb avatar Feb 07 '23 19:02 kadyb

Thanks!

edzer avatar Feb 07 '23 20:02 edzer