readr icon indicating copy to clipboard operation
readr copied to clipboard

`read_csv2_chunked()` needs to adjust locale like `read_csv2()`

Open dpprdan opened this issue 2 years ago • 3 comments

read_csv2_chunked() does not parse decimals correctly with the default_locale().

library(readr)
tf <- tempfile()
write_csv2(head(mtcars), tf)

Just to make sure: this really is a CSV2 file, and read_csv2() reads the data as expected.

read_lines(tf) |> head(2)
#> [1] "mpg;cyl;disp;hp;drat;wt;qsec;vs;am;gear;carb"
#> [2] "21,0;6;160;110;3,90;2,620;16,46;0;1;4;4"
read_csv2(tf)
#> ℹ Using "','" as decimal and "'.'" as grouping mark. Use `read_delim()` for more control.
#> Rows: 6 Columns: 11
#> ── Column specification ────────────────────────────────────────────────────────
#> Delimiter: ";"
#> dbl (11): mpg, cyl, disp, hp, drat, wt, qsec, vs, am, gear, carb
#> 
#> ℹ Use `spec()` to retrieve the full column specification for this data.
#> ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
#> # A tibble: 6 × 11
#>     mpg   cyl  disp    hp  drat    wt  qsec    vs    am  gear  carb
#>   <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
#> 1  21       6   160   110  3.9   2.62  16.5     0     1     4     4
#> 2  21       6   160   110  3.9   2.88  17.0     0     1     4     4
#> 3  22.8     4   108    93  3.85  2.32  18.6     1     1     4     1
#> 4  21.4     6   258   110  3.08  3.22  19.4     1     0     3     1
#> 5  18.7     8   360   175  3.15  3.44  17.0     0     0     3     2
#> 6  18.1     6   225   105  2.76  3.46  20.2     1     0     3     1

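With default_locale(), i.e. decimal_mark = ".", the commas are not treated as decimal marks: the affected columns are guessed as col_number() and the values are parsed as whole numbers (e.g. "21,0" becomes 210).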

read_csv2_chunked(tf, DataFrameCallback$new(data.frame), chunk_size = 3)
#> 
#> ── Column specification ────────────────────────────────────────────────────────
#> cols(
#>   mpg = col_number(),
#>   cyl = col_double(),
#>   disp = col_double(),
#>   hp = col_double(),
#>   drat = col_number(),
#>   wt = col_number(),
#>   qsec = col_number(),
#>   vs = col_double(),
#>   am = col_double(),
#>   gear = col_double(),
#>   carb = col_double()
#> )
#>   mpg cyl disp  hp drat   wt qsec vs am gear carb index
#> 1 210   6  160 110  390 2620 1646  0  1    4    4     1
#> 2 210   6  160 110  390 2875 1702  0  1    4    4     1
#> 3 228   4  108  93  385 2320 1861  1  1    4    1     1
#> 4 214   6  258 110  308 3215 1944  1  0    3    1     4
#> 5 187   8  360 175  315 3440 1702  0  0    3    2     4
#> 6 181   6  225 105  276 3460 2022  1  0    3    1     4

Everything is parsed correctly when the appropriate locale is set explicitly.

read_csv2_chunked(tf, DataFrameCallback$new(data.frame), chunk_size = 3, locale = locale(decimal_mark = ",", grouping_mark = "."))
#> 
#> ── Column specification ────────────────────────────────────────────────────────
#> cols(
#>   mpg = col_double(),
#>   cyl = col_double(),
#>   disp = col_double(),
#>   hp = col_double(),
#>   drat = col_double(),
#>   wt = col_double(),
#>   qsec = col_double(),
#>   vs = col_double(),
#>   am = col_double(),
#>   gear = col_double(),
#>   carb = col_double()
#> )
#>    mpg cyl disp  hp drat    wt  qsec vs am gear carb index
#> 1 21.0   6  160 110 3.90 2.620 16.46  0  1    4    4     1
#> 2 21.0   6  160 110 3.90 2.875 17.02  0  1    4    4     1
#> 3 22.8   4  108  93 3.85 2.320 18.61  1  1    4    1     1
#> 4 21.4   6  258 110 3.08 3.215 19.44  1  0    3    1     4
#> 5 18.7   8  360 175 3.15 3.440 17.02  0  0    3    2     4
#> 6 18.1   6  225 105 2.76 3.460 20.22  1  0    3    1     4

Long story short, the default locale of read_csv2_chunked() is wrong. This could be fixed easily with a default_locale2()/locale(decimal_mark = ",", grouping_mark = ".") default (cf. #1445).

dpprdan avatar Feb 07 '23 20:02 dpprdan

Could you please rework your reproducible example to use the reprex package? That makes it easier to see both the input and the output, formatted in such a way that I can easily re-run in a local session.

hadley avatar Jul 31 '23 22:07 hadley

reprex without prose:

library(readr)
tf <- tempfile()
write_csv2(head(mtcars), tf)

# decimals parse incorrectly with the `default_locale()`
read_csv2_chunked(tf, DataFrameCallback$new(data.frame), chunk_size = 3)
#> 
#> ── Column specification ────────────────────────────────────────────────────────
#> cols(
#>   mpg = col_number(),
#>   cyl = col_double(),
#>   disp = col_double(),
#>   hp = col_double(),
#>   drat = col_number(),
#>   wt = col_number(),
#>   qsec = col_number(),
#>   vs = col_double(),
#>   am = col_double(),
#>   gear = col_double(),
#>   carb = col_double()
#> )
#>   mpg cyl disp  hp drat   wt qsec vs am gear carb index
#> 1 210   6  160 110  390 2620 1646  0  1    4    4     1
#> 2 210   6  160 110  390 2875 1702  0  1    4    4     1
#> 3 228   4  108  93  385 2320 1861  1  1    4    1     1
#> 4 214   6  258 110  308 3215 1944  1  0    3    1     4
#> 5 187   8  360 175  315 3440 1702  0  0    3    2     4
#> 6 181   6  225 105  276 3460 2022  1  0    3    1     4

# decimals parse correctly with the adequate locale set
read_csv2_chunked(
  tf,
  DataFrameCallback$new(data.frame),
  chunk_size = 3,
  locale = locale(decimal_mark = ",", grouping_mark = ".")
)
#> 
#> ── Column specification ────────────────────────────────────────────────────────
#> cols(
#>   mpg = col_double(),
#>   cyl = col_double(),
#>   disp = col_double(),
#>   hp = col_double(),
#>   drat = col_double(),
#>   wt = col_double(),
#>   qsec = col_double(),
#>   vs = col_double(),
#>   am = col_double(),
#>   gear = col_double(),
#>   carb = col_double()
#> )
#>    mpg cyl disp  hp drat    wt  qsec vs am gear carb index
#> 1 21.0   6  160 110 3.90 2.620 16.46  0  1    4    4     1
#> 2 21.0   6  160 110 3.90 2.875 17.02  0  1    4    4     1
#> 3 22.8   4  108  93 3.85 2.320 18.61  1  1    4    1     1
#> 4 21.4   6  258 110 3.08 3.215 19.44  1  0    3    1     4
#> 5 18.7   8  360 175 3.15 3.440 17.02  0  0    3    2     4
#> 6 18.1   6  225 105 2.76 3.460 20.22  1  0    3    1     4
Session info
sessioninfo::session_info()
#> ─ Session info ───────────────────────────────────────────────────────────────
#>  setting  value
#>  version  R version 4.3.1 (2023-06-16 ucrt)
#>  os       Windows 10 x64 (build 19044)
#>  system   x86_64, mingw32
#>  ui       RTerm
#>  language en
#>  collate  German_Germany.utf8
#>  ctype    German_Germany.utf8
#>  tz       Europe/Berlin
#>  date     2023-08-01
#>  pandoc   3.1.5 @ C:/Users/DANIEL~1.AK-/AppData/Local/Pandoc/ (via rmarkdown)
#> 
#> ─ Packages ───────────────────────────────────────────────────────────────────
#>  package     * version date (UTC) lib source
#>  bit           4.0.5   2022-11-15 [1] CRAN (R 4.3.0)
#>  bit64         4.0.5   2020-08-30 [1] CRAN (R 4.3.0)
#>  cli           3.6.1   2023-03-23 [1] CRAN (R 4.3.0)
#>  crayon        1.5.2   2022-09-29 [1] CRAN (R 4.3.0)
#>  digest        0.6.33  2023-07-07 [1] CRAN (R 4.3.1)
#>  evaluate      0.21    2023-05-05 [1] CRAN (R 4.3.0)
#>  fansi         1.0.4   2023-01-22 [1] CRAN (R 4.3.0)
#>  fastmap       1.1.1   2023-02-24 [1] CRAN (R 4.3.0)
#>  fs            1.6.3   2023-07-20 [1] CRAN (R 4.3.1)
#>  glue          1.6.2   2022-02-24 [1] CRAN (R 4.3.0)
#>  hms           1.1.3   2023-03-21 [1] CRAN (R 4.3.0)
#>  htmltools     0.5.5   2023-03-23 [1] CRAN (R 4.3.0)
#>  knitr         1.43    2023-05-25 [1] CRAN (R 4.3.0)
#>  lifecycle     1.0.3   2022-10-07 [1] CRAN (R 4.3.0)
#>  magrittr      2.0.3   2022-03-30 [1] CRAN (R 4.3.0)
#>  pillar        1.9.0   2023-03-22 [1] CRAN (R 4.3.0)
#>  pkgconfig     2.0.3   2019-09-22 [1] CRAN (R 4.3.0)
#>  purrr         1.0.1   2023-01-10 [1] CRAN (R 4.3.0)
#>  R.cache       0.16.0  2022-07-21 [1] CRAN (R 4.3.0)
#>  R.methodsS3   1.8.2   2022-06-13 [1] CRAN (R 4.3.0)
#>  R.oo          1.25.0  2022-06-12 [1] CRAN (R 4.3.0)
#>  R.utils       2.12.2  2022-11-11 [1] CRAN (R 4.3.0)
#>  R6            2.5.1   2021-08-19 [1] CRAN (R 4.3.0)
#>  readr       * 2.1.4   2023-02-10 [1] CRAN (R 4.3.0)
#>  reprex        2.0.2   2022-08-17 [1] CRAN (R 4.3.0)
#>  rlang         1.1.1   2023-04-28 [1] CRAN (R 4.3.0)
#>  rmarkdown     2.23    2023-07-01 [1] CRAN (R 4.3.1)
#>  rstudioapi    0.15.0  2023-07-07 [1] CRAN (R 4.3.1)
#>  sessioninfo   1.2.2   2021-12-06 [1] CRAN (R 4.3.0)
#>  styler        1.10.1  2023-06-05 [1] CRAN (R 4.3.0)
#>  tibble        3.2.1   2023-03-20 [1] CRAN (R 4.3.0)
#>  tidyselect    1.2.0   2022-10-10 [1] CRAN (R 4.3.0)
#>  tzdb          0.4.0   2023-05-12 [1] CRAN (R 4.3.0)
#>  utf8          1.2.3   2023-01-31 [1] CRAN (R 4.3.0)
#>  vctrs         0.6.3   2023-06-14 [1] CRAN (R 4.3.1)
#>  vroom         1.6.3   2023-04-28 [1] CRAN (R 4.3.0)
#>  withr         2.5.0   2022-03-03 [1] CRAN (R 4.3.0)
#>  xfun          0.39    2023-04-20 [1] CRAN (R 4.3.0)
#>  yaml          2.3.7   2023-01-23 [1] CRAN (R 4.3.0)
#> 
#>  [1] C:/Users/Daniel.AK-HAMBURG/AppData/Local/R/win-library/4.3
#>  [2] C:/Program Files/R/R-4.3.1/library
#> 
#> ──────────────────────────────────────────────────────────────────────────────

Long story short, the default locale of read_csv2_chunked() is wrong. This could be fixed easily with a default_locale2()/locale(decimal_mark = ",", grouping_mark = ".") default (cf. #1445).

dpprdan avatar Aug 01 '23 11:08 dpprdan

Somewhat more minimal reprex:

library(readr)

path <- tempfile()
write_lines(c("x;y", "1,3;2,3"), path)

# wrong
read_csv2_chunked(
  path,
  DataFrameCallback$new(data.frame),
  chunk_size = 3,
  col_types = list()
)
#>    x  y index
#> 1 13 23     1

# right
read_csv2(path, col_types = list())
#> ℹ Using "','" as decimal and "'.'" as grouping mark. Use `read_delim()` for more control.
#> # A tibble: 1 × 2
#>       x     y
#>   <dbl> <dbl>
#> 1   1.3   2.3

# right
read_csv2_chunked(
  path,
  DataFrameCallback$new(data.frame),
  chunk_size = 3,
  locale = locale(decimal_mark = ",", grouping_mark = "."),
  col_types = list()
)
#>     x   y index
#> 1 1.3 2.3     1

Created on 2023-08-01 with reprex v2.0.2

hadley avatar Aug 01 '23 12:08 hadley