mlr3pipelines icon indicating copy to clipboard operation
mlr3pipelines copied to clipboard

Error in task_data(self, rows, cols, data_format, ordered) : Assertion on 'rows' failed

Open mohammadreza-sheykhmousa opened this issue 4 years ago • 4 comments

@mllg Dear mlr3 team, I have been trying to run a model with spcv resampling, which gives me the above-mentioned error. However, the same setup with CV works just fine. I think this has to do with a bug in resampling sparse data with spcv and similar methods. Here is a reprex for your reference. Data can be downloaded from here.

# Reprex for the reported error: builds a spatial classification task from a
# local sample file, assembles a branched pipeline, and trains it.
library(mlr3verse)
library(mlr3spatiotempcv)
library(future)
library(progressr)
library(checkmate)
library(dplyr)
# Load the sample data from a local, machine-specific path.
cm.croatia <- readRDS("/data/eumap/sample-data/R-sample-tiles/9529/9529_croatia_samples.rds")
df <-  cm.croatia
# The target column (see `target = "lc_class"` below) is converted to a factor.
df$lc_class <- as.factor(df$lc_class)
# NOTE(review): group_by_if() returns a grouped data frame; presumably the
# intent was only to turn character columns into factors -- confirm.
df <- df %>% group_by_if(is.character, as.factor)
# Remove these columns from the data frame.
df$row.id <- NULL
df$survey_date <- NULL
df$lucas <- NULL
df$Date <- NULL
df$id <- NULL
df$year <- NULL
df$tile_id <- NULL
df$confidence <- NULL
# Rename columns 2 and 3 to "x" and "y"; these names are passed as
# coordinate_names when the task is created.
colnames(df)[2] <- "x"
colnames(df)[3] <- "y"
# Wrap the data frame in a data backend and create the task on top of it.
df.trf = mlr3::as_data_backend(df)
# NOTE(review): `positive = "TRUE"` presumably only makes sense if lc_class is
# binary with a level named "TRUE" -- confirm against the data.
tsk_clf = TaskClassifST$new(id = "df", backend = df.trf, target = "lc_class",extra_args = list( positive = "TRUE", coordinate_names = c("x","y"),coords_as_features = FALSE, crs = "+init=epsg:3035"))
# Preprocessing shared by all branches: the "encode", "imputemode" and
# "removeconstants" PipeOps, chained in that order.
pre =  po("encode") %>>%  po("imputemode") %>>% po("removeconstants")
# Three parallel branches, each ending in a po("learner_cv"), are joined by
# po("featureunion") and followed by a final ranger learner.
# According to the maintainer's reply in this thread, po("subsample") leaves
# its branch with fewer rows than the other branches, so po("featureunion")
# cannot bind the branch outputs -- this is the cause of the reported error.
g = pre %>>% 
  gunion(
    list(
      po("select") %>>% po("learner_cv", id = "kknn", lrn("classif.kknn")),
      po("pca") %>>% po("learner_cv", id = "featureless", lrn("classif.featureless")),
      po("subsample") %>>% po("learner_cv", id = "rpart", lrn("classif.rpart"))
      
    )
  ) %>>%
  po("featureunion") %>>%
  po("learner", lrn("classif.ranger",importance ="permutation")) 
# Resampling definition: "repeated_spcv_coords" with 2 folds and 4 repeats.
resampling_sp = rsmp("repeated_spcv_coords", folds = 2, repeats = 4)
# NOTE(review): `tsk_regr` is not defined anywhere in this snippet, and rsmp()
# is called with task/learner/resampling arguments; presumably
# `resample(task = tsk_clf, ...)` was intended -- confirm with the reporter.
rr_sp = rsmp(
  task = tsk_regr, learner = g,
  resampling = resampling_sp
)
# NOTE(review): keep_results is assigned the string "TRUE"; presumably the
# logical TRUE was intended -- confirm.
g$keep_results = "TRUE"
# Train the graph directly on the task; the log lines and the error quoted
# after this snippet follow this call.
g$train(tsk_clf)
#> INFO  [16:25:08.727] Applying learner 'classif.kknn' on task 'df' (iter 1/3) 
#> INFO  [16:25:08.887] Applying learner 'classif.kknn' on task 'df' (iter 3/3) 
#> INFO  [16:25:08.998] Applying learner 'classif.kknn' on task 'df' (iter 2/3) 
#> INFO  [16:25:09.258] Applying learner 'classif.featureless' on task 'df' (iter 2/3) 
#> INFO  [16:25:09.283] Applying learner 'classif.featureless' on task 'df' (iter 3/3) 
#> INFO  [16:25:09.307] Applying learner 'classif.featureless' on task 'df' (iter 1/3) 
#> INFO  [16:25:09.501] Applying learner 'classif.rpart' on task 'df' (iter 1/3) 
#> INFO  [16:25:09.557] Applying learner 'classif.rpart' on task 'df' (iter 3/3) 
#> INFO  [16:25:09.611] Applying learner 'classif.rpart' on task 'df' (iter 2/3)
#> Error in task_data(self, rows, cols, data_format, ordered): Assertion on 'rows' failed: Must be a subset of {'1','2','3','5','8','9','10','12','13','17','18','21','23','24','26','28','32','33','36','39','41','43','44','46','48','49','50','51','52','53','56','60','64','65','66','68','70','71','73','75','76','78','79','80','81','84','85','86','87','91','92','93','95','98','99','100','101','103','104','106','107','108','109','##some value deleted ##
86','687','688','689','690','692','693','694','695','700','702','703','704','707','708','709','711','712','713','715','717','718','719','720','721','722','724','725','726','731','732','734','735','736','737','738','739','740','742','743','744','745','746','747','748','749','750','751','755','756','757','758'}, but is {'1','2','3','4','5','6','7','8','9','10','11','12','13','14','15','16','17','18','19','20','21','22','23','24','25','26','27','28','29','30','31','32','33','34','35','36','37','38','39','40','41','42','43','44','45','46','47','48','49','50','51','52','53','54','55','56','57','58','59','60','61','62','63','64','65','66','67'
##some value deleted ##
'8','709','710','711','712','713','714','715','716','717','718','719','720','721','722','723','724','725','726','727','728','729','730','731','732','733','734','735','736','737','738','739','740','741','742','743','744','745','746','747','748','749','750','751','752','753','754','755','756','757','758','759'}.

@be-marc can you try to reproduce?

mllg avatar Feb 02 '21 19:02 mllg

Yes I can reproduce. I am on it.

Simplified reprex.

# Simplified reprex of the reported error, using the "iris" task and a
# holdout resampling instead of the reporter's data and spatial resampling.
library(mlr3)
library(mlr3pipelines)

task = tsk("iris")
resampling = rsmp("holdout")

# Two parallel branches, each ending in a po("learner_cv"), joined by
# po("featureunion") and followed by a final rpart learner.
# Per the follow-up comment in this thread, po("subsample") leaves its branch
# with fewer rows than the other branch, so po("featureunion") cannot bind
# the two branch outputs.
graph = gunion(
  list(
    po("pca") %>>% po("learner_cv", id = "featureless", lrn("classif.featureless")),
    po("subsample") %>>% po("learner_cv", id = "rpart", lrn("classif.rpart")))
  ) %>>%
  po("featureunion") %>>%
  po("learner", lrn("classif.rpart")) 

# Resample the graph on the task; this is the call the simplified reprex
# uses to trigger the error.
resample(task, graph, resampling)

be-marc avatar Feb 03 '21 07:02 be-marc

@mohammadreza-sheykhmousa You cannot use po("subsample"). The resulting task has fewer rows than the other tasks created by po("learner_cv"), and therefore po("featureunion") cannot bind them.

be-marc avatar Feb 03 '21 08:02 be-marc

Can we improve the error message in POFeatureUnion @mb706 ?

mllg avatar Feb 03 '21 08:02 mllg