r-polars icon indicating copy to clipboard operation
r-polars copied to clipboard

`pl$read_csv()` throws error for compressed files

Open jesse-smith opened this issue 10 months ago • 5 comments

When using pl$read_csv() to read a gzipped file, an error message is thrown directing the user to "use read_csv for compressed data" (which is what is being used). It looks like read_csv in R is just a wrapper around scan_csv. scan_csv can't be used for compressed data, so this error is thrown for read_csv as well.

I've added a reprex below. I think a previous version of r-polars let me use read_csv on compressed data, but I could be wrong; this could also be a breaking change on the Rust side.

library(polars)

if (!suppressMessages(require(R.utils))) stop("Need `R.utils` to gzip file")

# Version used
packageVersion("polars")
#> [1] '0.16.1'

# Make some data
pl_df <- pl$DataFrame(col1 = letters, col2 = 1:26)

# Save
path <- tempfile(fileext = "csv")
pl_df$write_csv(path)

# Compress
gz_path <- R.utils::gzip(path)

# Try to read back - throws error
pl$read_csv(gz_path)
#> Error: Execution halted with the following contexts
#>    0: In R: in pl$read_csv():
#>    0: During function call [base::tryCatch(base::withCallingHandlers({
#>           NULL
#>           base::saveRDS(base::do.call(base::do.call, base::c(base::readRDS("C:\\Users\\jsmith79\\AppData\\Local\\Temp\\RtmpMbY9aJ\\callr-fun-98e475e83a42"), 
#>               base::list(envir = .GlobalEnv, quote = TRUE)), envir = .GlobalEnv, 
#>               quote = TRUE), file = "C:\\Users\\jsmith79\\AppData\\Local\\Temp\\RtmpMbY9aJ\\callr-res-98e415445ef", 
#>               compress = FALSE)
#>           base::flush(base::stdout())
#>           base::flush(base::stderr())
#>           NULL
#>           base::invisible()
#>       }, error = function(e) {
#>           {
#>               callr_data <- base::as.environment("tools:callr")$`__callr_data__`
#>               err <- callr_data$err
#>               if (FALSE) {
#>                   base::assign(".Traceback", base::.traceback(4), envir = callr_data)
#>                   utils::dump.frames("__callr_dump__")
#>                   base::assign(".Last.dump", .GlobalEnv$`__callr_dump__`, 
#>                       envir = callr_data)
#>                   base::rm("__callr_dump__", envir = .GlobalEnv)
#>               }
#>               e <- err$process_call(e)
#>               e2 <- err$new_error("error in callr subprocess")
#>               class <- base::class
#>               class(e2) <- base::c("callr_remote_error", class(e2))
#>               e2 <- err$add_trace_back(e2)
#>               cut <- base::which(e2$trace$scope == "global")[1]
#>               if (!base::is.na(cut)) {
#>                   e2$trace <- e2$trace[-(1:cut), ]
#>               }
#>               base::saveRDS(base::list("error", e2, e), file = base::paste0("C:\\Users\\jsmith79\\AppData\\Local\\Temp\\RtmpMbY9aJ\\callr-res-98e415445ef", 
#>                   ".error"))
#>           }
#>       }, interrupt = function(e) {
#>           {
#>               callr_data <- base::as.environment("tools:callr")$`__callr_data__`
#>               err <- callr_data$err
#>               if (FALSE) {
#>                   base::assign(".Traceback", base::.traceback(4), envir = callr_data)
#>                   utils::dump.frames("__callr_dump__")
#>                   base::assign(".Last.dump", .GlobalEnv$`__callr_dump__`, 
#>                       envir = callr_data)
#>                   base::rm("__callr_dump__", envir = .GlobalEnv)
#>               }
#>               e <- err$process_call(e)
#>               e2 <- err$new_error("error in callr subprocess")
#>               class <- base::class
#>               class(e2) <- base::c("callr_remote_error", class(e2))
#>               e2 <- err$add_trace_back(e2)
#>               cut <- base::which(e2$trace$scope == "global")[1]
#>               if (!base::is.na(cut)) {
#>                   e2$trace <- e2$trace[-(1:cut), ]
#>               }
#>               base::saveRDS(base::list("error", e2, e), file = base::paste0("C:\\Users\\jsmith79\\AppData\\Local\\Temp\\RtmpMbY9aJ\\callr-res-98e415445ef", 
#>                   ".error"))
#>           }
#>       }, callr_message = function(e) {
#>           base::try(base::signalCondition(e))
#>       }), error = function(e) {
#>           NULL
#>           if (FALSE) {
#>               base::try(base::stop(e))
#>           }
#>           else {
#>               base::invisible()
#>           }
#>       }, interrupt = function(e) {
#>           NULL
#>           if (FALSE) {
#>               e
#>           }
#>           else {
#>               base::invisible()
#>           }
#>       })]
#>    1: Encountered the following error in Rust-Polars:
#>          cannot scan compressed csv; use `read_csv` for compressed data

Created on 2024-04-23 with reprex v2.1.0

Session info
sessioninfo::session_info()
#> ─ Session info ───────────────────────────────────────────────────────────────
#>  setting  value
#>  version  R version 4.3.3 (2024-02-29 ucrt)
#>  os       Windows 11 x64 (build 22631)
#>  system   x86_64, mingw32
#>  ui       RTerm
#>  language (EN)
#>  collate  English_United States.utf8
#>  ctype    English_United States.utf8
#>  tz       America/Chicago
#>  date     2024-04-23
#>  pandoc   3.1.1 @ C:/Program Files/RStudio/resources/app/bin/quarto/bin/tools/ (via rmarkdown)
#> 
#> ─ Packages ───────────────────────────────────────────────────────────────────
#>  package     * version date (UTC) lib source
#>  cli           3.6.2   2023-12-11 [1] CRAN (R 4.3.3)
#>  digest        0.6.35  2024-03-11 [1] CRAN (R 4.3.3)
#>  evaluate      0.23    2023-11-01 [1] CRAN (R 4.3.3)
#>  fastmap       1.1.1   2023-02-24 [1] CRAN (R 4.3.3)
#>  fs            1.6.3   2023-07-20 [1] CRAN (R 4.3.3)
#>  glue          1.7.0   2024-01-09 [1] CRAN (R 4.3.3)
#>  htmltools     0.5.8.1 2024-04-04 [1] CRAN (R 4.3.3)
#>  knitr         1.46    2024-04-06 [1] CRAN (R 4.3.3)
#>  lifecycle     1.0.4   2023-11-07 [1] CRAN (R 4.3.3)
#>  magrittr      2.0.3   2022-03-30 [1] CRAN (R 4.3.3)
#>  polars      * 0.16.1  2024-04-16 [1] https://r~
#>  purrr         1.0.2   2023-08-10 [1] CRAN (R 4.3.3)
#>  R.cache       0.16.0  2022-07-21 [1] CRAN (R 4.3.3)
#>  R.methodsS3 * 1.8.2   2022-06-13 [1] CRAN (R 4.3.3)
#>  R.oo        * 1.26.0  2024-01-24 [1] CRAN (R 4.3.3)
#>  R.utils     * 2.12.3  2023-11-18 [1] CRAN (R 4.3.3)
#>  reprex        2.1.0   2024-01-11 [1] CRAN (R 4.3.3)
#>  rlang         1.1.3   2024-01-10 [1] CRAN (R 4.3.3)
#>  rmarkdown     2.26    2024-03-05 [1] CRAN (R 4.3.3)
#>  rstudioapi    0.16.0  2024-03-24 [1] CRAN (R 4.3.3)
#>  sessioninfo   1.2.2   2021-12-06 [1] CRAN (R 4.3.3)
#>  styler        1.10.3  2024-04-07 [1] CRAN (R 4.3.3)
#>  vctrs         0.6.5   2023-12-01 [1] CRAN (R 4.3.3)
#>  withr         3.0.0   2024-01-16 [1] CRAN (R 4.3.3)
#>  xfun          0.43    2024-03-25 [1] CRAN (R 4.3.3)
#>  yaml          2.3.8   2023-12-11 [1] CRAN (R 4.3.2)
#> 
#>  [1] D:/ProgramFiles/R/R-4.3.3/library
#> 
#> ──────────────────────────────────────────────────────────────────────────────

jesse-smith avatar Apr 23 '24 20:04 jesse-smith

Thanks for the report and the reprex. I can confirm that `pl.read_csv()` in Python works on this gzipped file. We need to check whether the Python code has a special path for this kind of file.

etiennebacher avatar Apr 24 '24 10:04 etiennebacher

Similar issue: pola-rs/polars-cli#60

eitsupi avatar Apr 27 '24 04:04 eitsupi

Perhaps polars::prelude::CsvReader::new() should be used here as in Python. https://github.com/pola-rs/polars/blob/f1846a93f347b7967176d5f0276ad58584781bd6/py-polars/src/dataframe/io.rs#L89-L120

Contributions are welcome!

eitsupi avatar Apr 28 '24 10:04 eitsupi

I'm happy to give it a go, actually! I've been dipping my toes into Rust for the past couple of months and looking for ways to put that to use.

jesse-smith avatar May 01 '24 04:05 jesse-smith

Glad to hear that! Feel free to open WIP PRs.

eitsupi avatar May 01 '24 16:05 eitsupi