Compare predictive performance of individual models and classification average in trained graph
My goal is to compare the predictive performance of individual models (say ranger, kknn and log_reg) with the classifavg ensemble of all models.
I have created the following graph, with the individual learners and a classifavg PipeOp at the end:
library(mlr3verse)
task = tsk("german_credit")
# learners
learners = list(
ranger = lrn("classif.ranger", predict_type = "prob", id = "ranger"),
log_reg = lrn("classif.log_reg", predict_type = "prob", id = "log_reg"),
kknn = lrn("classif.kknn", predict_type = "prob", id = "kknn")
)
# create complete graph
graph = po("removeconstants", ratio = 0.05) %>>%
po("branch", options = c("nop_prep", "yeojohnson", "pca", "ica"), id = "prep_branch") %>>%
gunion(list(po("nop", id = "nop_prep"), po("yeojohnson"), po("pca", scale. = TRUE), po("ica"))) %>>%
po("unbranch", id = "prep_unbranch") %>>%
learners %>>%
po("classifavg", innum = length(learners))
plot(graph)
graph_learner = as_learner(graph)
as.data.table(graph_learner$param_set)[1:70, .(id, class, lower, upper)]
search_space = ps(
# preprocessing
# interaction_branch.selection = p_fct(levels = c("nop_filter", "modelmatrix")),
prep_branch.selection = p_fct(levels = c("nop_prep", "yeojohnson", "pca", "ica")),
pca.rank. = p_int(2, 6, depends = prep_branch.selection == "pca"),
ica.n.comp = p_int(2, 6, depends = prep_branch.selection == "ica"),
yeojohnson.standardize = p_lgl(depends = prep_branch.selection == "yeojohnson"),
# models
ranger.ranger.mtry.ratio = p_dbl(0.2, 1),
ranger.ranger.max.depth = p_int(2, 6),
kknn.kknn.k = p_int(5, 20)
)
# plan("multisession", workers = 4L)
at = auto_tuner(
method = "random_search",
learner = graph_learner,
resampling = rsmp("cv", folds = 3),
measure = msr("classif.acc"),
search_space = search_space,
term_evals = 15
)
at$train(task)
How can I now compare the predictive performance of the average with the predictive performance of the individual models?
I know I can train those models separately (inside the graph or in a new learner list), but then I am training the same models multiple times.
Sorry for the late response. Does the following help?
library(mlr3verse)
#> Loading required package: mlr3
task = tsk("german_credit")
# learners
learners = list(
ranger = lrn("classif.ranger", predict_type = "prob", id = "ranger"),
log_reg = lrn("classif.log_reg", predict_type = "prob", id = "log_reg"),
kknn = lrn("classif.kknn", predict_type = "prob", id = "kknn")
)
# create complete graph
graph = po("removeconstants", ratio = 0.05) %>>%
po("branch", options = c("nop_prep", "yeojohnson", "pca", "ica"), id = "prep_branch") %>>%
gunion(list(po("nop", id = "nop_prep"), po("yeojohnson"), po("pca", scale. = TRUE), po("ica"))) %>>%
po("unbranch", id = "prep_unbranch") %>>%
learners %>>%
po("classifavg", innum = length(learners))
plot(graph)
graph_learner = as_learner(graph)
as.data.table(graph_learner$param_set)[1:70, .(id, class, lower, upper)]
#> id class lower upper
#> 1: removeconstants.ratio ParamDbl 0 1
#> 2: removeconstants.rel_tol ParamDbl 0 Inf
#> 3: removeconstants.abs_tol ParamDbl 0 Inf
#> 4: removeconstants.na_ignore ParamLgl NA NA
#> 5: removeconstants.affect_columns ParamUty NA NA
#> 6: prep_branch.selection ParamFct NA NA
#> 7: yeojohnson.eps ParamDbl 0 Inf
#> 8: yeojohnson.standardize ParamLgl NA NA
#> 9: yeojohnson.lower ParamDbl -Inf Inf
#> 10: yeojohnson.upper ParamDbl -Inf Inf
#> 11: yeojohnson.affect_columns ParamUty NA NA
#> 12: pca.center ParamLgl NA NA
#> 13: pca.scale. ParamLgl NA NA
#> 14: pca.rank. ParamInt 1 Inf
#> 15: pca.affect_columns ParamUty NA NA
#> 16: ica.n.comp ParamInt 1 Inf
#> 17: ica.alg.typ ParamFct NA NA
#> 18: ica.fun ParamFct NA NA
#> 19: ica.alpha ParamDbl 1 2
#> 20: ica.method ParamFct NA NA
#> 21: ica.row.norm ParamLgl NA NA
#> 22: ica.maxit ParamInt 1 Inf
#> 23: ica.tol ParamDbl 0 Inf
#> 24: ica.verbose ParamLgl NA NA
#> 25: ica.w.init ParamUty NA NA
#> 26: ica.affect_columns ParamUty NA NA
#> 27: ranger.ranger.alpha ParamDbl -Inf Inf
#> 28: ranger.ranger.always.split.variables ParamUty NA NA
#> 29: ranger.ranger.class.weights ParamUty NA NA
#> 30: ranger.ranger.holdout ParamLgl NA NA
#> 31: ranger.ranger.importance ParamFct NA NA
#> 32: ranger.ranger.keep.inbag ParamLgl NA NA
#> 33: ranger.ranger.max.depth ParamInt 0 Inf
#> 34: ranger.ranger.min.node.size ParamInt 1 Inf
#> 35: ranger.ranger.min.prop ParamDbl -Inf Inf
#> 36: ranger.ranger.minprop ParamDbl -Inf Inf
#> 37: ranger.ranger.mtry ParamInt 1 Inf
#> 38: ranger.ranger.mtry.ratio ParamDbl 0 1
#> 39: ranger.ranger.num.random.splits ParamInt 1 Inf
#> 40: ranger.ranger.num.threads ParamInt 1 Inf
#> 41: ranger.ranger.num.trees ParamInt 1 Inf
#> 42: ranger.ranger.oob.error ParamLgl NA NA
#> 43: ranger.ranger.regularization.factor ParamUty NA NA
#> 44: ranger.ranger.regularization.usedepth ParamLgl NA NA
#> 45: ranger.ranger.replace ParamLgl NA NA
#> 46: ranger.ranger.respect.unordered.factors ParamFct NA NA
#> 47: ranger.ranger.sample.fraction ParamDbl 0 1
#> 48: ranger.ranger.save.memory ParamLgl NA NA
#> 49: ranger.ranger.scale.permutation.importance ParamLgl NA NA
#> 50: ranger.ranger.se.method ParamFct NA NA
#> 51: ranger.ranger.seed ParamInt -Inf Inf
#> 52: ranger.ranger.split.select.weights ParamUty NA NA
#> 53: ranger.ranger.splitrule ParamFct NA NA
#> 54: ranger.ranger.verbose ParamLgl NA NA
#> 55: ranger.ranger.write.forest ParamLgl NA NA
#> 56: log_reg.log_reg.dispersion ParamUty NA NA
#> 57: log_reg.log_reg.epsilon ParamDbl -Inf Inf
#> 58: log_reg.log_reg.etastart ParamUty NA NA
#> 59: log_reg.log_reg.maxit ParamDbl -Inf Inf
#> 60: log_reg.log_reg.model ParamLgl NA NA
#> 61: log_reg.log_reg.mustart ParamUty NA NA
#> 62: log_reg.log_reg.offset ParamUty NA NA
#> 63: log_reg.log_reg.singular.ok ParamLgl NA NA
#> 64: log_reg.log_reg.start ParamUty NA NA
#> 65: log_reg.log_reg.trace ParamLgl NA NA
#> 66: log_reg.log_reg.x ParamLgl NA NA
#> 67: log_reg.log_reg.y ParamLgl NA NA
#> 68: kknn.kknn.k ParamInt 1 Inf
#> 69: kknn.kknn.distance ParamDbl 0 Inf
#> 70: kknn.kknn.kernel ParamFct NA NA
#> id class lower upper
search_space = ps(
# preprocessing
# interaction_branch.selection = p_fct(levels = c("nop_filter", "modelmatrix")),
prep_branch.selection = p_fct(levels = c("nop_prep", "yeojohnson", "pca", "ica")),
pca.rank. = p_int(2, 6, depends = prep_branch.selection == "pca"),
ica.n.comp = p_int(2, 6, depends = prep_branch.selection == "ica"),
yeojohnson.standardize = p_lgl(depends = prep_branch.selection == "yeojohnson"),
# models
ranger.ranger.mtry.ratio = p_dbl(0.2, 1),
ranger.ranger.max.depth = p_int(2, 6),
kknn.kknn.k = p_int(5, 20)
)
# plan("multisession", workers = 4L)
at = auto_tuner(
method = "random_search",
learner = graph_learner,
resampling = rsmp("cv", folds = 3),
measure = msr("classif.acc"),
search_space = search_space,
term_evals = 2L,
store_models = TRUE
)
at$train(task)
#> INFO [12:03:59.591] [bbotk] Starting to optimize 7 parameter(s) with '<OptimizerRandomSearch>' and '<TerminatorEvals> [n_evals=2, k=0]'
#> INFO [12:03:59.894] [bbotk] Evaluating 1 configuration(s)
#> INFO [12:03:59.964] [mlr3] Running benchmark with 3 resampling iterations
#> INFO [12:04:00.010] [mlr3] Applying learner 'removeconstants.prep_branch.nop_prep.yeojohnson.pca.ica.prep_unbranch.ranger.ranger.log_reg.log_reg.kknn.kknn.classifavg' on task 'german_credit' (iter 3/3)
#> INFO [12:04:00.434] [mlr3] Applying learner 'removeconstants.prep_branch.nop_prep.yeojohnson.pca.ica.prep_unbranch.ranger.ranger.log_reg.log_reg.kknn.kknn.classifavg' on task 'german_credit' (iter 1/3)
#> INFO [12:04:00.834] [mlr3] Applying learner 'removeconstants.prep_branch.nop_prep.yeojohnson.pca.ica.prep_unbranch.ranger.ranger.log_reg.log_reg.kknn.kknn.classifavg' on task 'german_credit' (iter 2/3)
#> INFO [12:04:01.206] [mlr3] Finished benchmark
#> INFO [12:04:01.274] [bbotk] Result of batch 1:
#> INFO [12:04:01.276] [bbotk] prep_branch.selection pca.rank. ica.n.comp yeojohnson.standardize
#> INFO [12:04:01.276] [bbotk] nop_prep NA NA NA
#> INFO [12:04:01.276] [bbotk] ranger.ranger.mtry.ratio ranger.ranger.max.depth kknn.kknn.k classif.acc
#> INFO [12:04:01.276] [bbotk] 0.2590138 6 5 0.7419725
#> INFO [12:04:01.276] [bbotk] warnings errors runtime_learners uhash
#> INFO [12:04:01.276] [bbotk] 0 0 1.169 bdd13c00-6492-4705-9b48-cf97aab78e8d
#> INFO [12:04:01.286] [bbotk] Evaluating 1 configuration(s)
#> INFO [12:04:01.635] [mlr3] Running benchmark with 3 resampling iterations
#> INFO [12:04:01.640] [mlr3] Applying learner 'removeconstants.prep_branch.nop_prep.yeojohnson.pca.ica.prep_unbranch.ranger.ranger.log_reg.log_reg.kknn.kknn.classifavg' on task 'german_credit' (iter 3/3)
#> INFO [12:04:02.087] [mlr3] Applying learner 'removeconstants.prep_branch.nop_prep.yeojohnson.pca.ica.prep_unbranch.ranger.ranger.log_reg.log_reg.kknn.kknn.classifavg' on task 'german_credit' (iter 1/3)
#> INFO [12:04:02.539] [mlr3] Applying learner 'removeconstants.prep_branch.nop_prep.yeojohnson.pca.ica.prep_unbranch.ranger.ranger.log_reg.log_reg.kknn.kknn.classifavg' on task 'german_credit' (iter 2/3)
#> INFO [12:04:02.986] [mlr3] Finished benchmark
#> INFO [12:04:03.035] [bbotk] Result of batch 2:
#> INFO [12:04:03.036] [bbotk] prep_branch.selection pca.rank. ica.n.comp yeojohnson.standardize
#> INFO [12:04:03.036] [bbotk] yeojohnson NA NA FALSE
#> INFO [12:04:03.036] [bbotk] ranger.ranger.mtry.ratio ranger.ranger.max.depth kknn.kknn.k classif.acc
#> INFO [12:04:03.036] [bbotk] 0.3456153 5 12 0.7509575
#> INFO [12:04:03.036] [bbotk] warnings errors runtime_learners uhash
#> INFO [12:04:03.036] [bbotk] 0 0 1.322 079a7fb2-f87a-4623-ba62-b5d10a296696
#> INFO [12:04:03.060] [bbotk] Finished optimizing after 2 evaluation(s)
#> INFO [12:04:03.060] [bbotk] Result:
#> INFO [12:04:03.061] [bbotk] prep_branch.selection pca.rank. ica.n.comp yeojohnson.standardize
#> INFO [12:04:03.061] [bbotk] yeojohnson NA NA FALSE
#> INFO [12:04:03.061] [bbotk] ranger.ranger.mtry.ratio ranger.ranger.max.depth kknn.kknn.k
#> INFO [12:04:03.061] [bbotk] 0.3456153 5 12
#> INFO [12:04:03.061] [bbotk] learner_param_vals x_domain classif.acc
#> INFO [12:04:03.061] [bbotk] <list[13]> <list[5]> 0.7509575
# extract the fitted graph from the tuned learner and tell it to keep
# each PipeOp's intermediate result, so individual predictions stay accessible
graph = at$learner$graph_model
graph$keep_results = TRUE
graph$predict(task)
#> $classifavg.output
#> <PredictionClassif> for 1000 observations:
#> row_ids truth response prob.good prob.bad
#> 1 good good 0.9426428 0.05735724
#> 2 bad bad 0.3555267 0.64447326
#> 3 good good 0.9582165 0.04178351
#> ---
#> 998 good good 0.9369601 0.06303990
#> 999 bad bad 0.3653883 0.63461169
#> 1000 good good 0.7739578 0.22604221
# prediction of the individual kknn learner, taken from its stored result
graph$pipeops$kknn.kknn$.result$output
#> <PredictionClassif> for 1000 observations:
#> row_ids truth response prob.good prob.bad
#> 1 good good 1.0000000 0.00000000
#> 2 bad bad 0.3840454 0.61595460
#> 3 good good 0.9877564 0.01224364
#> ---
#> 998 good good 0.9687620 0.03123795
#> 999 bad bad 0.4657351 0.53426487
#> 1000 good good 0.9269728 0.07302720
res = graph$predict(task)
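To turn this into the comparison you asked for, each stored prediction can be scored with the same measure. A minimal sketch (the learner PipeOp ids ranger.ranger and log_reg.log_reg follow the same naming as kknn.kknn above); note these are predictions on the training data, so the numbers are optimistic and only useful as a rough side-by-side view:
# score the ensemble and each individual learner with the same measure
measure = msr("classif.acc")
res$classifavg.output$score(measure)
graph$pipeops$ranger.ranger$.result$output$score(measure)
graph$pipeops$log_reg.log_reg$.result$output$score(measure)
graph$pipeops$kknn.kknn$.result$output$score(measure)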
Created on 2022-04-03 by the reprex package (v2.0.1)
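One caveat: predicting on the task the graph was trained on gives optimistic scores for every model. For a fairer comparison you could hold out a test set before training; a rough sketch along the same lines, assuming the objects from above:
# hold out a test set, train on the rest, compare on unseen rows only
split = partition(task, ratio = 0.8)
at$train(task, row_ids = split$train)
graph = at$learner$graph_model
graph$keep_results = TRUE
res = graph$predict(task$clone()$filter(split$test))
res$classifavg.output$score(msr("classif.acc"))
graph$pipeops$kknn.kknn$.result$output$score(msr("classif.acc"))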