
Training results are random with the same seed set in caret

Open · tobigithub opened this issue 9 years ago · 1 comment

A number of methods in caret_6.0-58 give slightly different results across repeated runs even when the seed is set. Setting the seed should produce identical RMSE and R2 values each time (80 methods in caret behave that way). The exceptions are:

treebag gbm bdk evtree elm xyf parRF

Example code

# load caret and supporting packages, plus the solubility data set
require(caret); require(DT);  require(mlbench);
library(AppliedPredictiveModeling)
data(solubility)

# combine predictors and response into single data frames
# (only the first 80 training rows, to keep runtimes short)
training_data <- data.frame(solTrainX, solTrainY)[1:80, ]
testing_data <- data.frame(solTestX, solTestY)

# just rename columns to stay consistent with the style below
colnames(training_data)[colnames(training_data) == 'solTrainY'] <- 'y'
colnames(testing_data)[colnames(testing_data) == 'solTestY'] <- 'y'


# all the training data (just named x and y)
y <- training_data$y
x <- training_data[, -ncol(training_data)]

# load all libraries
library(doParallel); cl <- makeCluster(8); registerDoParallel(cl)

# RMSE and R2 results should be identical across all three runs; see knn
set.seed(123); result <- train(x,y,"knn"); getTrainPerf(result)
set.seed(123); result <- train(x,y,"knn"); getTrainPerf(result)
set.seed(123); result <- train(x,y,"knn"); getTrainPerf(result)

# Now the methods that give random results
set.seed(123); result <- train(x,y,"treebag"); getTrainPerf(result)
set.seed(123); result <- train(x,y,"treebag"); getTrainPerf(result)
set.seed(123); result <- train(x,y,"treebag"); getTrainPerf(result)

set.seed(123); result <- train(x,y,"gbm"); getTrainPerf(result)
set.seed(123); result <- train(x,y,"gbm"); getTrainPerf(result)
set.seed(123); result <- train(x,y,"gbm"); getTrainPerf(result)

set.seed(123); result <- train(x,y,"bdk"); getTrainPerf(result)
set.seed(123); result <- train(x,y,"bdk"); getTrainPerf(result)
set.seed(123); result <- train(x,y,"bdk"); getTrainPerf(result)

set.seed(123); result <- train(x,y,"evtree"); getTrainPerf(result)
set.seed(123); result <- train(x,y,"evtree"); getTrainPerf(result)
set.seed(123); result <- train(x,y,"evtree"); getTrainPerf(result)

# elm is the worst offender
set.seed(123); result <- train(x,y,"elm"); getTrainPerf(result)
set.seed(123); result <- train(x,y,"elm"); getTrainPerf(result)
set.seed(123); result <- train(x,y,"elm"); getTrainPerf(result)


set.seed(123); result <- train(x,y,"xyf"); getTrainPerf(result)
set.seed(123); result <- train(x,y,"xyf"); getTrainPerf(result)
set.seed(123); result <- train(x,y,"xyf"); getTrainPerf(result)

set.seed(123); result <- train(x,y,"parRF"); getTrainPerf(result)
set.seed(123); result <- train(x,y,"parRF"); getTrainPerf(result)
set.seed(123); result <- train(x,y,"parRF"); getTrainPerf(result)

# and some working results again
set.seed(123); result <- train(x,y,"rvmLinear"); getTrainPerf(result)
set.seed(123); result <- train(x,y,"rvmLinear"); getTrainPerf(result)
set.seed(123); result <- train(x,y,"rvmLinear"); getTrainPerf(result)

# stop the parallel processing and register the sequential back end
stopCluster(cl); registerDoSEQ();

Results (note the correct and incorrect ones; for parRF, increase the number of training rows to 160).

> # RMSE and R2 results should be the same, three times see knn
> set.seed(123); result <- train(x,y,"knn"); getTrainPerf(result)
  TrainRMSE TrainRsquared method
1 0.2901132    0.03771506    knn
> set.seed(123); result <- train(x,y,"knn"); getTrainPerf(result)
  TrainRMSE TrainRsquared method
1 0.2901132    0.03771506    knn
> set.seed(123); result <- train(x,y,"knn"); getTrainPerf(result)
  TrainRMSE TrainRsquared method
1 0.2901132    0.03771506    knn
> 
> # Now the methods that give random results
> set.seed(123); result <- train(x,y,"treebag"); getTrainPerf(result)
  TrainRMSE TrainRsquared  method
1  0.295095    0.04499159 treebag
> set.seed(123); result <- train(x,y,"treebag"); getTrainPerf(result)
  TrainRMSE TrainRsquared  method
1 0.2935537    0.04086652 treebag
> set.seed(123); result <- train(x,y,"treebag"); getTrainPerf(result)
  TrainRMSE TrainRsquared  method
1 0.2925668    0.03898617 treebag
> 
> set.seed(123); result <- train(x,y,"gbm"); getTrainPerf(result)
  TrainRMSE TrainRsquared method
1 0.2947946    0.02526446    gbm
> set.seed(123); result <- train(x,y,"gbm"); getTrainPerf(result)
  TrainRMSE TrainRsquared method
1 0.2953287    0.02494099    gbm
> set.seed(123); result <- train(x,y,"gbm"); getTrainPerf(result)
  TrainRMSE TrainRsquared method
1 0.2937742    0.02779757    gbm
> set.seed(123); result <- train(x,y,"bdk"); getTrainPerf(result)
  TrainRMSE TrainRsquared method
1 0.3475528     0.0539665    bdk
> set.seed(123); result <- train(x,y,"bdk"); getTrainPerf(result)
  TrainRMSE TrainRsquared method
1 0.3485172    0.05422798    bdk
> set.seed(123); result <- train(x,y,"bdk"); getTrainPerf(result)
  TrainRMSE TrainRsquared method
1 0.3497129    0.03251845    bdk
> 
> set.seed(123); result <- train(x,y,"evtree"); getTrainPerf(result)
  TrainRMSE TrainRsquared method
1 0.2964285    0.02148905 evtree
> set.seed(123); result <- train(x,y,"evtree"); getTrainPerf(result)
  TrainRMSE TrainRsquared method
1 0.2970565    0.02178834 evtree
> set.seed(123); result <- train(x,y,"evtree"); getTrainPerf(result)
  TrainRMSE TrainRsquared method
1 0.2970565    0.02178834 evtree
> 
> # elm is the worst offender
> set.seed(123); result <- train(x,y,"elm"); getTrainPerf(result)
Warning message:
In nominalTrainWorkflow(x = x, y = y, wts = weights, info = trainInfo,  :
  There were missing values in resampled performance measures.
  TrainRMSE TrainRsquared method
1 0.3061933    0.02887321    elm
> set.seed(123); result <- train(x,y,"elm"); getTrainPerf(result)
Warning message:
In nominalTrainWorkflow(x = x, y = y, wts = weights, info = trainInfo,  :
  There were missing values in resampled performance measures.
  TrainRMSE TrainRsquared method
1 0.3223984     0.0338859    elm
> set.seed(123); result <- train(x,y,"elm"); getTrainPerf(result)
Warning message:
In nominalTrainWorkflow(x = x, y = y, wts = weights, info = trainInfo,  :
  There were missing values in resampled performance measures.
  TrainRMSE TrainRsquared method
1 0.2954139    0.02630014    elm
> 
> 
> set.seed(123); result <- train(x,y,"xyf"); getTrainPerf(result)
  TrainRMSE TrainRsquared method
1 0.2798505     0.0172159    xyf
> set.seed(123); result <- train(x,y,"xyf"); getTrainPerf(result)
Warning message:
In nominalTrainWorkflow(x = x, y = y, wts = weights, info = trainInfo,  :
  There were missing values in resampled performance measures.
  TrainRMSE TrainRsquared method
1 0.2790053    0.01891035    xyf
> set.seed(123); result <- train(x,y,"xyf"); getTrainPerf(result)
  TrainRMSE TrainRsquared method
1  0.279014    0.01531723    xyf
> set.seed(123); result <- train(x,y,"parRF"); getTrainPerf(result)
Something is wrong; all the RMSE metric values are missing:
> set.seed(123); result <- train(x,y,"parRF"); getTrainPerf(result)
Something is wrong; all the RMSE metric values are missing:
> set.seed(123); result <- train(x,y,"parRF"); getTrainPerf(result)
Something is wrong; all the RMSE metric values are missing:

> # and some working results again
> set.seed(123); result <- train(x,y,"rvmLinear"); getTrainPerf(result)
  TrainRMSE TrainRsquared    method
1  1.066702    0.03438293 rvmLinear
> set.seed(123); result <- train(x,y,"rvmLinear"); getTrainPerf(result)
  TrainRMSE TrainRsquared    method
1  1.066702    0.03438293 rvmLinear
> set.seed(123); result <- train(x,y,"rvmLinear"); getTrainPerf(result)
  TrainRMSE TrainRsquared    method
1  1.066702    0.03438293 rvmLinear

tobigithub commented on Dec 30 '15

One easy way to run fully reproducible models in parallel mode with the caret package is to use the seeds argument of trainControl().

See also http://stackoverflow.com/questions/13403427/fully-reproducible-parallel-models-using-caret
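
A minimal sketch of that approach, assuming the defaults used above (bootstrap resampling with 25 repetitions and a 3-value tuning grid; both numbers would need to change for other trainControl or tuneLength settings): pre-generate one integer vector per resample plus one final seed, and pass the list to trainControl via its seeds argument.

library(caret)

# one seed vector per resample (25) plus one element for the final model
set.seed(123)
seeds <- vector(mode = "list", length = 26)
for (i in 1:25) seeds[[i]] <- sample.int(10000, 3)  # 3 = number of tuning candidates
seeds[[26]] <- sample.int(10000, 1)                 # single seed for the final fit

ctrl <- trainControl(method = "boot", number = 25, seeds = seeds)

# with the same seeds list, repeated calls return identical resampling results
# even when a parallel backend (e.g. doParallel) is registered
result <- train(x, y, method = "knn", trControl = ctrl)
getTrainPerf(result)

Because the seed is set at each resampling iteration on the worker itself, this should also pin down stochastic methods such as gbm or treebag that otherwise drift between runs under a parallel backend.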

tobigithub commented on Dec 30 '15