mlr3pipelines
mlr3pipelines copied to clipboard
Error in task_data(self, rows, cols, data_format, ordered) : Assertion on 'rows' failed
@mllg
Dear mlr3 team,
I have been trying to run a model with spcv
resampling, which gives me the error mentioned above. However, the same setup with CV
works just fine. I think this has to do with a bug in sparse data resampling with spcv
and similar methods.
Here is a reprex for your reference. The data can be downloaded from here.
# Reprex: spatial cross-validation of a stacked mlr3pipelines graph.
# po("subsample") is kept on purpose: it is the step under discussion.
library(mlr3verse)
library(mlr3spatiotempcv)
library(future)
library(progressr)
library(checkmate)
library(dplyr)

# Load the sample data (local path; adjust to wherever the file was saved).
cm.croatia <- readRDS("/data/eumap/sample-data/R-sample-tiles/9529/9529_croatia_samples.rds")
df <- cm.croatia
df$lc_class <- as.factor(df$lc_class)

# Convert character columns to factors. The original used
# group_by_if(is.character, as.factor), which also left the data grouped.
df <- df %>% mutate(across(where(is.character), as.factor))

# Drop bookkeeping columns that must not be used as features.
df$row.id <- NULL
df$survey_date <- NULL
df$lucas <- NULL
df$Date <- NULL
df$id <- NULL
df$year <- NULL
df$tile_id <- NULL
df$confidence <- NULL

# After the removals above, columns 2 and 3 are renamed to the coordinate
# names passed to the task below.
colnames(df)[2] <- "x"
colnames(df)[3] <- "y"

df.trf <- mlr3::as_data_backend(df)

# NOTE(review): `positive` normally only applies to binary targets -- confirm
# that "TRUE" is really a level of lc_class.
tsk_clf <- TaskClassifST$new(
  id = "df",
  backend = df.trf,
  target = "lc_class",
  extra_args = list(
    positive = "TRUE",
    coordinate_names = c("x", "y"),
    coords_as_features = FALSE,
    crs = "+init=epsg:3035"
  )
)

# Shared preprocessing, followed by three parallel base-learner branches whose
# cross-validated predictions are combined and fed to a ranger super learner.
pre <- po("encode") %>>% po("imputemode") %>>% po("removeconstants")
g <- pre %>>%
  gunion(
    list(
      po("select") %>>% po("learner_cv", id = "kknn", lrn("classif.kknn")),
      po("pca") %>>% po("learner_cv", id = "featureless", lrn("classif.featureless")),
      po("subsample") %>>% po("learner_cv", id = "rpart", lrn("classif.rpart"))
    )
  ) %>>%
  po("featureunion") %>>%
  po("learner", lrn("classif.ranger", importance = "permutation"))

resampling_sp <- rsmp("repeated_spcv_coords", folds = 2, repeats = 4)

# Fixed: the original called rsmp() (the resampling constructor) instead of
# resample(), and passed the undefined `tsk_regr` instead of `tsk_clf`.
rr_sp <- resample(
  task = tsk_clf, learner = g,
  resampling = resampling_sp
)

# Fixed: keep_results is a logical flag, not the string "TRUE".
g$keep_results <- TRUE
g$train(tsk_clf)
#> INFO [16:25:08.727] Applying learner 'classif.kknn' on task 'df' (iter 1/3)
#> INFO [16:25:08.887] Applying learner 'classif.kknn' on task 'df' (iter 3/3)
#> INFO [16:25:08.998] Applying learner 'classif.kknn' on task 'df' (iter 2/3)
#> INFO [16:25:09.258] Applying learner 'classif.featureless' on task 'df' (iter 2/3)
#> INFO [16:25:09.283] Applying learner 'classif.featureless' on task 'df' (iter 3/3)
#> INFO [16:25:09.307] Applying learner 'classif.featureless' on task 'df' (iter 1/3)
#> INFO [16:25:09.501] Applying learner 'classif.rpart' on task 'df' (iter 1/3)
#> INFO [16:25:09.557] Applying learner 'classif.rpart' on task 'df' (iter 3/3)
#> INFO [16:25:09.611] Applying learner 'classif.rpart' on task 'df' (iter 2/3)
#> Error in task_data(self, rows, cols, data_format, ordered): Assertion on 'rows' failed: Must be a subset of {'1','2','3','5','8','9','10','12','13','17','18','21','23','24','26','28','32','33','36','39','41','43','44','46','48','49','50','51','52','53','56','60','64','65','66','68','70','71','73','75','76','78','79','80','81','84','85','86','87','91','92','93','95','98','99','100','101','103','104','106','107','108','109','##some value deleted ##
86','687','688','689','690','692','693','694','695','700','702','703','704','707','708','709','711','712','713','715','717','718','719','720','721','722','724','725','726','731','732','734','735','736','737','738','739','740','742','743','744','745','746','747','748','749','750','751','755','756','757','758'}, but is {'1','2','3','4','5','6','7','8','9','10','11','12','13','14','15','16','17','18','19','20','21','22','23','24','25','26','27','28','29','30','31','32','33','34','35','36','37','38','39','40','41','42','43','44','45','46','47','48','49','50','51','52','53','54','55','56','57','58','59','60','61','62','63','64','65','66','67'
##some value deleted ##
'8','709','710','711','712','713','714','715','716','717','718','719','720','721','722','723','724','725','726','727','728','729','730','731','732','733','734','735','736','737','738','739','740','741','742','743','744','745','746','747','748','749','750','751','752','753','754','755','756','757','758','759'}.
@be-marc can you try to reproduce?
Yes I can reproduce. I am on it.
Simplified reprex.
# Minimal reproduction: two learner_cv branches, one of them behind
# po("subsample"), joined by po("featureunion") and resampled with holdout.
library(mlr3)
library(mlr3pipelines)

task <- tsk("iris")
resampling <- rsmp("holdout")

# Branch 1: PCA features into a cross-validated featureless learner.
branch_pca <- po("pca") %>>%
  po("learner_cv", id = "featureless", lrn("classif.featureless"))

# Branch 2: subsampled rows into a cross-validated rpart learner.
branch_subsample <- po("subsample") %>>%
  po("learner_cv", id = "rpart", lrn("classif.rpart"))

# Join both branches and put an rpart learner on top of the union.
graph <- gunion(list(branch_pca, branch_subsample)) %>>%
  po("featureunion") %>>%
  po("learner", lrn("classif.rpart"))

resample(task, graph, resampling)
@mohammadreza-sheykhmousa You cannot use po("subsample") here.
The resulting task has fewer rows than the other tasks created by po("learner_cv"),
and therefore po("featureunion")
cannot bind them.
Can we improve the error message in PipeOpFeatureUnion, @mb706?