caret-machine-learning
caret-machine-learning copied to clipboard
training results are random with same seed set in caret
A number of methods in caret_6.0-58 give slightly different results on repeated runs even though the seed is set. Setting the seed should produce the same RMSE and R2 every time (80 methods in caret do). Exceptions are:
treebag gbm bdk evtree elm xyf parRF
Example code
# Demonstrate that some caret methods give non-reproducible results even
# when the RNG seed is set immediately before each call to train().
# Data: the solubility data set from AppliedPredictiveModeling.
library(caret)
library(DT)       # loaded in the original report; not used below
library(mlbench)  # loaded in the original report; not used below
library(AppliedPredictiveModeling)
data(solubility)

# Coerce the solubility data into single data frames (legacy layout).
training_data <- data.frame(solTrainX, solTrainY)[1:80, ]
testing_data <- data.frame(solTestX, solTestY)

# Rename the response columns to 'y' to stay consistent with the code below.
colnames(training_data)[colnames(training_data) == "solTrainY"] <- "y"
colnames(testing_data)[colnames(testing_data) == "solTestY"] <- "y"

# Split predictors and response for the train(x, y, method) interface;
# 'y' is the last column of training_data, so drop it from the predictors.
y <- training_data$y
x <- training_data[, -ncol(training_data)]

# Register a parallel back-end (8 workers, as in the original report).
library(doParallel)
cl <- makeCluster(8)
registerDoParallel(cl)

# Fit one method 'times' times with the same seed and print the resampled
# performance each time. A reproducible method prints identical RMSE/R2
# on every run; a non-reproducible one prints different numbers.
train_three_times <- function(method, seed = 123, times = 3) {
  for (i in seq_len(times)) {
    set.seed(seed)
    result <- train(x, y, method)
    print(getTrainPerf(result))
  }
}

# RMSE and R2 results should be the same across all three runs: see knn.
train_three_times("knn")

# Now the methods that give random results despite the fixed seed.
train_three_times("treebag")
train_three_times("gbm")
train_three_times("bdk")
train_three_times("evtree")

# elm is the worst offender.
train_three_times("elm")
train_three_times("xyf")
train_three_times("parRF")

# And a reproducible method again, for contrast.
train_three_times("rvmLinear")

# Stop the parallel workers and register the sequential front-end.
stopCluster(cl)
registerDoSEQ()
Results (showing both correct and incorrect cases; for parRF, increase the number of training rows to 160 to get any results at all).
> # RMSE and R2 results should be the same, three times see knn
> set.seed(123); result <- train(x,y,"knn"); getTrainPerf(result)
TrainRMSE TrainRsquared method
1 0.2901132 0.03771506 knn
> set.seed(123); result <- train(x,y,"knn"); getTrainPerf(result)
TrainRMSE TrainRsquared method
1 0.2901132 0.03771506 knn
> set.seed(123); result <- train(x,y,"knn"); getTrainPerf(result)
TrainRMSE TrainRsquared method
1 0.2901132 0.03771506 knn
>
> # Now the methods that give random results
> set.seed(123); result <- train(x,y,"treebag"); getTrainPerf(result)
TrainRMSE TrainRsquared method
1 0.295095 0.04499159 treebag
> set.seed(123); result <- train(x,y,"treebag"); getTrainPerf(result)
TrainRMSE TrainRsquared method
1 0.2935537 0.04086652 treebag
> set.seed(123); result <- train(x,y,"treebag"); getTrainPerf(result)
TrainRMSE TrainRsquared method
1 0.2925668 0.03898617 treebag
>
> set.seed(123); result <- train(x,y,"gbm"); getTrainPerf(result)
TrainRMSE TrainRsquared method
1 0.2947946 0.02526446 gbm
> set.seed(123); result <- train(x,y,"gbm"); getTrainPerf(result)
TrainRMSE TrainRsquared method
1 0.2953287 0.02494099 gbm
> set.seed(123); result <- train(x,y,"gbm"); getTrainPerf(result)
TrainRMSE TrainRsquared method
1 0.2937742 0.02779757 gbm
> set.seed(123); result <- train(x,y,"bdk"); getTrainPerf(result)
TrainRMSE TrainRsquared method
1 0.3475528 0.0539665 bdk
> set.seed(123); result <- train(x,y,"bdk"); getTrainPerf(result)
TrainRMSE TrainRsquared method
1 0.3485172 0.05422798 bdk
> set.seed(123); result <- train(x,y,"bdk"); getTrainPerf(result)
TrainRMSE TrainRsquared method
1 0.3497129 0.03251845 bdk
>
> set.seed(123); result <- train(x,y,"evtree"); getTrainPerf(result)
TrainRMSE TrainRsquared method
1 0.2964285 0.02148905 evtree
> set.seed(123); result <- train(x,y,"evtree"); getTrainPerf(result)
TrainRMSE TrainRsquared method
1 0.2970565 0.02178834 evtree
> set.seed(123); result <- train(x,y,"evtree"); getTrainPerf(result)
TrainRMSE TrainRsquared method
1 0.2970565 0.02178834 evtree
>
> # elm is the worst offender
> set.seed(123); result <- train(x,y,"elm"); getTrainPerf(result)
Warning message:
In nominalTrainWorkflow(x = x, y = y, wts = weights, info = trainInfo, :
There were missing values in resampled performance measures.
TrainRMSE TrainRsquared method
1 0.3061933 0.02887321 elm
> set.seed(123); result <- train(x,y,"elm"); getTrainPerf(result)
Warning message:
In nominalTrainWorkflow(x = x, y = y, wts = weights, info = trainInfo, :
There were missing values in resampled performance measures.
TrainRMSE TrainRsquared method
1 0.3223984 0.0338859 elm
> set.seed(123); result <- train(x,y,"elm"); getTrainPerf(result)
Warning message:
In nominalTrainWorkflow(x = x, y = y, wts = weights, info = trainInfo, :
There were missing values in resampled performance measures.
TrainRMSE TrainRsquared method
1 0.2954139 0.02630014 elm
>
>
> set.seed(123); result <- train(x,y,"xyf"); getTrainPerf(result)
TrainRMSE TrainRsquared method
1 0.2798505 0.0172159 xyf
> set.seed(123); result <- train(x,y,"xyf"); getTrainPerf(result)
Warning message:
In nominalTrainWorkflow(x = x, y = y, wts = weights, info = trainInfo, :
There were missing values in resampled performance measures.
TrainRMSE TrainRsquared method
1 0.2790053 0.01891035 xyf
> set.seed(123); result <- train(x,y,"xyf"); getTrainPerf(result)
TrainRMSE TrainRsquared method
1 0.279014 0.01531723 xyf
> set.seed(123); result <- train(x,y,"parRF"); getTrainPerf(result)
Something is wrong; all the RMSE metric values are missing:
> set.seed(123); result <- train(x,y,"parRF"); getTrainPerf(result)
Something is wrong; all the RMSE metric values are missing:
> set.seed(123); result <- train(x,y,"parRF"); getTrainPerf(result)
Something is wrong; all the RMSE metric values are missing:
> # and some working results again
> set.seed(123); result <- train(x,y,"rvmLinear"); getTrainPerf(result)
TrainRMSE TrainRsquared method
1 1.066702 0.03438293 rvmLinear
> set.seed(123); result <- train(x,y,"rvmLinear"); getTrainPerf(result)
TrainRMSE TrainRsquared method
1 1.066702 0.03438293 rvmLinear
> set.seed(123); result <- train(x,y,"rvmLinear"); getTrainPerf(result)
TrainRMSE TrainRsquared method
1 1.066702 0.03438293 rvmLinear
One easy way to run fully reproducible models in parallel mode with the caret package is to supply the seeds argument to trainControl().
see also http://stackoverflow.com/questions/13403427/fully-reproducible-parallel-models-using-caret