caretEnsemble icon indicating copy to clipboard operation
caretEnsemble copied to clipboard

c operator to combine train objects into a list should check for identical indexes (and fail if they're not)

Open · brent-halen opened this issue 9 years ago · 2 comments

I ran into this issue while working on a different data set/project, but the minimal dataset detailed below seems to reproduce it. However, running the code below produces several warnings that I don't get with my other data. If necessary, I'll try to construct a more representative facsimile of my data.

When I attempt to use the 'caretStack' function, I'm getting a strange error message:

"Error { .... is not TRUE"

I'm including a screenshot for verification.

https://imgur.com/JbDFqlR

I have no idea how to go about fixing the problem, as I have no idea what is actually broken. I was getting this error in both Windows 10 and Ubuntu 14.04.

Minimal dataset:

# Build a tiny 10-row regression dataset: a numeric target plus one integer
# predictor (X) and one predictor drawn from "yes"/"no" (Y).
# No seed is set, so every run draws different values.
library(caret)
col <- c(rnorm(10)*2000)
Data <- data.frame(
    X = sample(1:10),
    Y = sample(c("yes", "no"), 10, replace = TRUE)
)
# Append the numeric target as the third column and name it "loss".
Data <- cbind(Data,col)
colnames(Data)[3] <- "loss"

# Dummy-encode the predictors (everything except loss) with dummyVars.
dmy <- dummyVars(loss~ ., data = Data)
Data.1 <- predict(dmy, newdata=Data)
Data.1.df <- as.data.frame(Data.1)

# Replace Data with the encoded predictors and re-attach the target.
# Column index 4 assumes the encoding produced exactly three predictor
# columns — TODO confirm.
Data <- Data.1.df
Data <- cbind(Data,col)
colnames(Data)[4] <- "loss"

Minimal, runnable code:

# Reproduction script for the reported caretStack error.
# Attach the modelling packages and caretEnsemble (caret itself is attached
# by the dataset snippet).
library(elasticnet)
library(pls)
library(nnet)
library(e1071)
library(randomForest)
library(gbm)
library(plyr)
library(MASS)
library(caretEnsemble)


# Shared resampling settings: 10-fold CV repeated 3 times.
# No explicit `index` is passed, so the folds are not fixed across the
# separate train() calls below.
control <- trainControl(method="repeatedcv", number=10, repeats=3, verboseIter=TRUE)

# Fit seven regression models on the same data, one train() call each.
model1 <- train(loss~., data = as.data.frame(Data), method='glm', trControl=control)
model2 <- train(loss~., data = as.data.frame(Data), method='svmRadial', trControl=control)
model3 <- train(loss~., data = as.data.frame(Data), method='svmPoly', trControl=control)
model4 <- train(loss~., data = as.data.frame(Data), method='elm', trControl=control)
model5 <- train(loss~., data = as.data.frame(Data), method='nnet', trControl=control)
model6 <- train(loss~., data = as.data.frame(Data), method='rf', trControl=control)
model7 <- train(loss~., data = as.data.frame(Data), method='lasso', trControl=control)



# Combine the fitted train objects with c(); the issue title asks for this
# operator to verify that all models share identical resampling indexes.
models <- c(model1, model2, model3, model4, model5, model6, model7)

# caretStack calls reported in this issue to fail with
# "Error { .... is not TRUE".
caretStack(models, method = "rf")
caretStack(models, method = "gbm")
# Same attempt with only two of the models.
models <- c(model1,model3)
caretStack(models,method = "rf")
caretStack(models,method = "gbm")

Session Info:

>sessionInfo()
R version 3.3.1 (2016-06-21)
Platform: x86_64-w64-mingw32/x64 (64-bit)
Running under: Windows 10 x64 (build 10586)

locale:
[1] LC_COLLATE=English_United States.1252  LC_CTYPE=English_United States.1252    LC_MONETARY=English_United States.1252
[4] LC_NUMERIC=C                           LC_TIME=English_United States.1252    

attached base packages:
[1] parallel  splines   stats     graphics  grDevices utils     datasets  methods   base     

other attached packages:
 [1] elmNN_1.0           kernlab_0.9-24      caretEnsemble_2.0.0 MASS_7.3-45         plyr_1.8.4          gbm_2.1.1           survival_2.39-4    
 [8] randomForest_4.6-12 e1071_1.6-7         caret_6.0-71        ggplot2_2.1.0       lattice_0.20-33     nnet_7.3-12         pls_2.5-0          
[15] elasticnet_1.1      lars_1.2           

loaded via a namespace (and not attached):
 [1] Rcpp_0.12.5        compiler_3.3.1     nloptr_1.0.4       iterators_1.0.8    class_7.3-14       tools_3.3.1        lme4_1.1-12       
 [8] digest_0.6.10      nlme_3.1-128       gtable_0.2.0       mgcv_1.8-12        Matrix_1.2-6       foreach_1.4.3      SparseM_1.7       
[15] gridExtra_2.2.1    stringr_1.1.0      MatrixModels_0.4-1 stats4_3.3.1       grid_3.3.1         data.table_1.9.6   pbapply_1.3-0     
[22] minqa_1.2.4        reshape2_1.4.1     car_2.1-2          magrittr_1.5       scales_0.4.0       codetools_0.2-14   pbkrtest_0.4-6    
[29] colorspace_1.2-6   quantreg_5.26      stringi_1.1.1      munsell_0.4.3      chron_2.3-47      


If there's anything else I need to provide, let me know.

brent-halen avatar Oct 13 '16 19:10 brent-halen

The problem is that your models all use different re-sampling folds, because you do not explicitly define them in your trainControl.

Please use the caretList helper function:

# Suggested workaround 1: let caretList fit all models from one trainControl
# instead of calling train() once per model.
# The data setup is the reporter's, repeated here (no seed is set, so values
# differ per run).
library(caret)
col <- c(rnorm(10)*2000)
Data <- data.frame(
  X = sample(1:10),
  Y = sample(c("yes", "no"), 10, replace = TRUE)
)
# Append the numeric target as the third column and name it "loss".
Data <- cbind(Data,col)
colnames(Data)[3] <- "loss"

# Dummy-encode the predictors (everything except loss).
dmy <- dummyVars(loss~ ., data = Data)
Data.1 <- predict(dmy, newdata=Data)
Data.1.df <- as.data.frame(Data.1)

# Rebuild Data from the encoded predictors plus the target.
# Column index 4 assumes three encoded predictor columns — TODO confirm.
Data <- Data.1.df
Data <- cbind(Data,col)
colnames(Data)[4] <- "loss"

# Attach the modelling packages and caretEnsemble.
library(elasticnet)
library(pls)
library(nnet)
library(e1071)
library(randomForest)
library(gbm)
library(plyr)
library(MASS)
library(caretEnsemble)


# One trainControl and one caretList call covering six methods ('lasso' from
# the original report is not included in methodList).
control <- trainControl(method="repeatedcv", number=10, repeats=3, verboseIter=TRUE)
models <- caretList(loss~., data = as.data.frame(Data), methodList = c('glm', 'svmRadial', 'svmPoly', 'elm', 'nnet', 'rf'), trControl=control)

# Stack the models with a random forest, then with gbm.
# The gbm call pins a single-row tuneGrid; n.minobsinnode=1 is presumably
# needed because the dataset has only 10 rows — confirm.
caretStack(models, method = "rf")
caretStack(models, method = "gbm", tuneGrid=expand.grid(n.minobsinnode=1, n.trees=10, interaction.depth=1, shrinkage=0.1))

zachmayer avatar Oct 13 '16 19:10 zachmayer

If you must create the models one at a time, you MUST pass an explicit index to trainControl:

# Suggested workaround 2: keep the one-train()-per-model approach, but pass
# an explicit resampling index to trainControl so all models share it.
# The data setup is the reporter's, repeated here (no seed is set, so values
# differ per run).
library(caret)
col <- c(rnorm(10)*2000)
Data <- data.frame(
  X = sample(1:10),
  Y = sample(c("yes", "no"), 10, replace = TRUE)
)
# Append the numeric target as the third column and name it "loss".
Data <- cbind(Data,col)
colnames(Data)[3] <- "loss"

# Dummy-encode the predictors (everything except loss).
dmy <- dummyVars(loss~ ., data = Data)
Data.1 <- predict(dmy, newdata=Data)
Data.1.df <- as.data.frame(Data.1)

# Rebuild Data from the encoded predictors plus the target.
# Column index 4 assumes three encoded predictor columns — TODO confirm.
Data <- Data.1.df
Data <- cbind(Data,col)
colnames(Data)[4] <- "loss"

# Attach the modelling packages and caretEnsemble.
library(elasticnet)
library(pls)
library(nnet)
library(e1071)
library(randomForest)
library(gbm)
library(plyr)
library(MASS)
library(caretEnsemble)

# Create the resampling index once from the target (arguments 10 and 3 match
# number=10, repeats=3 below) and hand it to trainControl via `index`, so
# every train() call uses the same folds.
# savePredictions=TRUE keeps the resample predictions (presumably what
# caretStack stacks on — confirm).
# NOTE(review): loss is numeric, so classProbs=TRUE looks unnecessary for
# this regression example — confirm.
index <- createMultiFolds(Data[['loss']], 10, 3)
control <- trainControl(method="repeatedcv", number=10, repeats=3, verboseIter=TRUE, index=index, savePredictions=TRUE, classProbs=TRUE)
model1 <- train(loss~., data = as.data.frame(Data), method='glm', trControl=control)
model2 <- train(loss~., data = as.data.frame(Data), method='svmRadial', trControl=control)
model3 <- train(loss~., data = as.data.frame(Data), method='svmPoly', trControl=control)
model4 <- train(loss~., data = as.data.frame(Data), method='elm', trControl=control)
model5 <- train(loss~., data = as.data.frame(Data), method='nnet', trControl=control)
model6 <- train(loss~., data = as.data.frame(Data), method='rf', trControl=control)
#model7 <- train(loss~., data = as.data.frame(Data), method='lasso', trControl=control) #Always fails

# Combine the six fitted models (the lasso model is left out, see above).
models <- c(model1, model2, model3, model4, model5, model6)

# Stack with a random forest, then with gbm using a fixed single-row tuneGrid.
caretStack(models, method = "rf")
caretStack(models, method = "gbm", tuneGrid=expand.grid(n.minobsinnode=1, n.trees=10, interaction.depth=1, shrinkage=0.1))
# Repeat the stacking with only two of the models.
models <- c(model1,model3)
caretStack(models,method = "rf")
caretStack(models,method = "gbm", tuneGrid=expand.grid(n.minobsinnode=1, n.trees=10, interaction.depth=1, shrinkage=0.1))

zachmayer avatar Oct 13 '16 20:10 zachmayer