Xgboost objective function customization error.
This issue is a continuation of #873 and #875.
A model trained with the customized "logregobj" objective function and one trained with the default options should produce the same results, but they didn't, so I saved the predictions to compare.
The predicted probabilities for the two classes sum to 1, but the individual values are not bounded between 0 and 1.
A minimal reproducible example is attached below.
- What steps should I take to figure out why this is happening on my own? I would like to know how to debug it.
library(tidymodels)
library(xgboost)
data(agaricus.train, package = "xgboost")
data(agaricus.test, package = "xgboost")
dtrain <- with(agaricus.train, xgb.DMatrix(data, label = label, nthread = 2))
dtest <- with(agaricus.test, xgb.DMatrix(data, label = label, nthread = 2))
watchlist <- list(train = dtrain, eval = dtest)
param <- list(max_depth = 2, eta = 1, nthread = 2,
objective = "binary:logistic", eval_metric = "auc")
bst <- xgb.train(param, dtrain, nrounds = 2, watchlist)
#> [1] train-auc:0.958228 eval-auc:0.960373
#> [2] train-auc:0.981413 eval-auc:0.979930
logregobj <- function(preds, dtrain) {
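# `preds` arrive as raw margin scores; apply the sigmoid, then return the
# gradient and hessian of the log loss with respect to the margin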
labels <- getinfo(dtrain, "label")
preds <- 1/(1 + exp(-preds))
grad <- preds - labels
hess <- preds * (1 - preds)
return(list(grad = grad, hess = hess))
}
param$objective <- logregobj
bst <- xgb.train(param, dtrain, nrounds = 2, watchlist)
#> [1] train-auc:0.958228 eval-auc:0.960373
#> [2] train-auc:0.981413 eval-auc:0.979930
set.seed(100)
res <-
fit_resamples(
boost_tree("classification") %>% set_engine("xgboost"),
Class ~ .,
control = control_grid(verbose = FALSE,
allow_par = FALSE,
parallel_over = NULL, save_pred = TRUE),
bootstraps(two_class_dat, 5)
)
set.seed(100)
res_custom <-
fit_resamples(
boost_tree("classification") %>% set_engine("xgboost", objective = logregobj),
Class ~ .,
control = control_grid(verbose = FALSE,
allow_par = FALSE,
parallel_over = NULL, save_pred = TRUE),
bootstraps(two_class_dat, 5)
)
collect_metrics(res)
#> # A tibble: 2 × 6
#> .metric .estimator mean n std_err .config
#> <chr> <chr> <dbl> <int> <dbl> <chr>
#> 1 accuracy binary 0.788 5 0.0115 Preprocessor1_Model1
#> 2 roc_auc binary 0.864 5 0.00930 Preprocessor1_Model1
collect_metrics(res_custom)
#> # A tibble: 2 × 6
#> .metric .estimator mean n std_err .config
#> <chr> <chr> <dbl> <int> <dbl> <chr>
#> 1 accuracy binary 0.782 5 0.0125 Preprocessor1_Model1
#> 2 roc_auc binary 0.860 5 0.00995 Preprocessor1_Model1
origin <- res$.predictions[[1]] |>
select(contains("pred"), .row, -.config, -.pred_class)
custom <- res_custom$.predictions[[1]] |>
select(.pred_Class1_custom = .pred_Class1,
.pred_Class2_custom = .pred_Class2,
-.config, .row)
left_join(origin, custom, by = ".row") |> head(15)
#> # A tibble: 15 × 5
#> .pred_Class1 .pred_Class2 .row .pred_Class1_custom .pred_Class2_custom
#> <dbl> <dbl> <int> <dbl> <dbl>
#> 1 0.752 0.248 3 1.52 -0.522
#> 2 0.0827 0.917 5 -2.64 3.64
#> 3 0.378 0.622 8 -1.10 2.10
#> 4 0.985 0.0153 20 3.94 -2.94
#> 5 0.941 0.0585 22 3.76 -2.76
#> 6 0.0951 0.905 25 -2.69 3.69
#> 7 0.982 0.0179 27 3.97 -2.97
#> 8 0.975 0.0252 29 3.93 -2.93
#> 9 0.739 0.261 33 0.586 0.414
#> 10 0.0386 0.961 35 -3.37 4.37
#> 11 0.465 0.535 36 0.120 0.880
#> 12 0.979 0.0213 39 4.15 -3.15
#> 13 0.983 0.0172 40 3.94 -2.94
#> 14 0.185 0.815 41 -1.41 2.41
#> 15 0.967 0.0327 42 3.84 -2.84
Created on 2023-09-09 with reprex v2.0.2
Ah, interesting! Thanks for the issue.
The xgboost engine itself will return the same predictions given that objective function, but tidymodels then only knows how to post-process predictions from the default objective function to convert them to probabilities. This is why the _custom "probabilities" aren't in $[0,~1]$. The code responsible is here, in parsnip::xgb_predict().
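Roughly, that post-processing only special-cases the objectives parsnip knows how to handle, and anything else falls through untouched. A paraphrased sketch of the relevant logic (postprocess_sketch is a made-up name for illustration; the real function also builds the xgb.DMatrix and calls predict()):

# paraphrased sketch of the post-processing inside parsnip::xgb_predict()
postprocess_sketch <- function(object, res) {
  switch(
    object$params$objective %||% "unknown",
    # raw log-odds from "binary:logitraw" are mapped to probabilities
    "binary:logitraw" = stats::binomial()$linkinv(res),
    # "multi:softprob" returns a flat vector that is reshaped to a matrix
    "multi:softprob" = matrix(res, ncol = object$params$num_class, byrow = TRUE),
    # anything parsnip doesn't recognize (including a custom objective) passes through as-is
    res
  )
}

Because a custom objective lands in that fall-through branch, the raw margins are treated as if they were already probabilities, and the second class is then computed as 1 - x, which is why each pair of _custom values still sums to 1.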
In this case, translate() may have aided the debugging process: the training function parsnip::xgb_train() shown in its output is documented in the same source file as the offending prediction function.
library(parsnip)
boost_tree("classification") %>% set_engine("xgboost") %>% translate()
#> Boosted Tree Model Specification (classification)
#>
#> Computational engine: xgboost
#>
#> Model fit template:
#> parsnip::xgb_train(x = missing_arg(), y = missing_arg(), weights = missing_arg(),
#> nthread = 1, verbose = 0)
It was also helpful to see when xgboost itself knows that the predictions it outputs are log-odds and when it doesn't:
library(tidymodels)
library(xgboost)
#>
#> Attaching package: 'xgboost'
#> The following object is masked from 'package:dplyr':
#>
#> slice
data(agaricus.train, package = "xgboost")
data(agaricus.test, package = "xgboost")
dtrain <- with(agaricus.train, xgb.DMatrix(data, label = label, nthread = 2))
dtest <- with(agaricus.test, xgb.DMatrix(data, label = label, nthread = 2))
watchlist <- list(train = dtrain, eval = dtest)
# no `objective` argument at all
param <- list(max_depth = 2, eta = 1, nthread = 2, eval_metric = "auc")
bst <- xgb.train(param, dtrain, nrounds = 2, watchlist)
#> [1] train-auc:0.958228 eval-auc:0.960373
#> [2] train-auc:0.987601 eval-auc:0.986900
head(predict(bst, dtest, outputmargin = FALSE))
#> [1] 0.08585284 0.94223028 0.08585284 0.08585284 0.02808682 0.94223028
head(predict(bst, dtest, outputmargin = TRUE))
#> [1] 0.08585284 0.94223028 0.08585284 0.08585284 0.02808682 0.94223028
# `objective` argument is a pre-configured option
param$objective <- "binary:logistic"
bst <- xgb.train(param, dtrain, nrounds = 2, watchlist)
#> [1] train-auc:0.958228 eval-auc:0.960373
#> [2] train-auc:0.981413 eval-auc:0.979930
head(predict(bst, dtest, outputmargin = FALSE))
#> [1] 0.28583017 0.92392391 0.28583017 0.28583017 0.05169873 0.92392391
head(predict(bst, dtest, outputmargin = TRUE))
#> [1] -0.915723 2.496895 -0.915723 -0.915723 -2.909239 2.496895
# `objective` argument is "custom"
param$objective <- function(preds, dtrain) {
labels <- getinfo(dtrain, "label")
preds <- 1/(1 + exp(-preds))
grad <- preds - labels
hess <- preds * (1 - preds)
return(list(grad = grad, hess = hess))
}
bst <- xgb.train(param, dtrain, nrounds = 2, watchlist)
#> [1] train-auc:0.958228 eval-auc:0.960373
#> [2] train-auc:0.981413 eval-auc:0.979930
head(predict(bst, dtest, outputmargin = FALSE))
#> [1] -1.049978 2.575050 -1.049978 -1.049978 -3.019169 2.575050
head(predict(bst, dtest, outputmargin = TRUE))
#> [1] -1.049978 2.575050 -1.049978 -1.049978 -3.019169 2.575050
Created on 2023-10-06 with reprex v2.0.2
This is tricky. We may want to consider whether there's some way to embed that prediction post-processing function into the model specification, which links up nicely with related work on post-processing in tidymodels. :)
The method you gave me was really helpful.
I found out that I can handle this by registering a new engine called xgboost_custom that keeps parsnip::xgb_train() for fitting but swaps in my own probability conversion at prediction time. Thank you.
# build custom engine -----------------------------------------------------
#set_new_model("boost_tree")
library(tidymodels)
set_model_mode("boost_tree", "classification")
set_model_engine("boost_tree", "classification", "xgboost_custom")
set_dependency("boost_tree", eng = "xgboost_custom",pkg = "xgboost",mode = "classification")
set_model_arg(
model = "boost_tree",
eng = "xgboost_custom",
parsnip = "tree_depth",
original = "max_depth",
func = list(pkg = "dials", fun = "tree_depth"),
has_submodel = FALSE
)
set_model_arg(
model = "boost_tree",
eng = "xgboost_custom",
parsnip = "trees",
original = "nrounds",
func = list(pkg = "dials", fun = "trees"),
has_submodel = TRUE
)
set_model_arg(
model = "boost_tree",
eng = "xgboost_custom",
parsnip = "learn_rate",
original = "eta",
func = list(pkg = "dials", fun = "learn_rate"),
has_submodel = FALSE
)
set_model_arg(
model = "boost_tree",
eng = "xgboost_custom",
parsnip = "mtry",
original = "colsample_bynode",
func = list(pkg = "dials", fun = "mtry"),
has_submodel = FALSE
)
set_model_arg(
model = "boost_tree",
eng = "xgboost_custom",
parsnip = "min_n",
original = "min_child_weight",
func = list(pkg = "dials", fun = "min_n"),
has_submodel = FALSE
)
set_model_arg(
model = "boost_tree",
eng = "xgboost_custom",
parsnip = "loss_reduction",
original = "gamma",
func = list(pkg = "dials", fun = "loss_reduction"),
has_submodel = FALSE
)
set_model_arg(
model = "boost_tree",
eng = "xgboost_custom",
parsnip = "sample_size",
original = "subsample",
func = list(pkg = "dials", fun = "sample_size"),
has_submodel = FALSE
)
set_model_arg(
model = "boost_tree",
eng = "xgboost_custom",
parsnip = "stop_iter",
original = "early_stop",
func = list(pkg = "dials", fun = "stop_iter"),
has_submodel = FALSE
)
set_encoding(
model = "boost_tree",
eng = "xgboost_custom",
mode = "regression",
options = list(
predictor_indicators = "one_hot",
compute_intercept = FALSE,
remove_intercept = TRUE,
allow_sparse_x = TRUE
)
)
set_fit(
model = "boost_tree",
eng = "xgboost_custom",
mode = "classification",
value = list(
interface = "matrix",
protect = c("x", "y", "weights"),
func = c(pkg = "parsnip", fun = "xgb_train"),
defaults = list(nthread = 1, verbose = 0)
)
)
set_encoding(
model = "boost_tree",
eng = "xgboost_custom",
mode = "classification",
options = list(
predictor_indicators = "one_hot",
compute_intercept = FALSE,
remove_intercept = TRUE,
allow_sparse_x = TRUE
)
)
set_pred(
model = "boost_tree",
eng = "xgboost_custom",
mode = "classification",
type = "class",
value = list(
pre = NULL,
post = function(x, object) {
if (is.vector(x)) {
event_level <- parsnip:::get_event_level(object$spec)
if (event_level == "first") {
x <- ifelse(x >= 0.5, object$lvl[1], object$lvl[2])
} else {
x <- ifelse(x >= 0.5, object$lvl[2], object$lvl[1])
}
} else {
x <- object$lvl[apply(x, 1, which.max)]
}
x
},
func = c(pkg = NULL, fun = "xgb_predict_custom"),
args = list(object = quote(object$fit), new_data = quote(new_data))
)
)
set_pred(
model = "boost_tree",
eng = "xgboost_custom",
mode = "classification",
type = "prob",
value = list(
pre = NULL,
post = function(x, object) {
if (is.vector(x)) {
event_level <- parsnip:::get_event_level(object$spec)
if (event_level == "first") {
x <- tibble(v1 = x, v2 = 1 - x)
} else {
x <- tibble(v1 = 1 - x, v2 = x)
}
} else {
x <- as_tibble(x, .name_repair = "minimal")
}
colnames(x) <- object$lvl
x
},
func = c(pkg = NULL, fun = "xgb_predict_custom"),
args = list(object = quote(object$fit), new_data = quote(new_data))
)
)
set_pred(
model = "boost_tree",
eng = "xgboost_custom",
mode = "classification",
type = "raw",
value = list(
pre = NULL,
post = NULL,
func = c(fun = "xgb_predict_custom"),
args = list(object = quote(object$fit), new_data = quote(new_data))
)
)
##################################################################################
##################################################################################
library(xgboost)
#>
#> Attaching package: 'xgboost'
#> The following object is masked from 'package:dplyr':
#>
#> slice
logregobj <- function(preds, dtrain) {
labels <- getinfo(dtrain, "label")
preds <- 1/(1 + exp(-preds))
grad <- preds - labels
hess <- preds * (1 - preds)
return(list(grad = grad, hess = hess))
}
# logit_default -----------------------------------------------------------
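# A standard logistic "link-glm" object: its linkinv() is the sigmoid used
# below to map raw margins from the custom objective back to probabilities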
logistic_test <- function() {
## link
linkfun <- function(mu){log(mu/(1-mu))}
## inverse link
linkinv <- function(eta) {1/(1+exp(-eta))}
## derivative of invlink wrt eta
mu.eta <- function(eta) { exp(-eta)/(1+exp(-eta))^2 }
valideta <- function(eta) {TRUE}
link <- "logistic_test"
structure(list(linkfun = linkfun, linkinv = linkinv,
mu.eta = mu.eta, valideta = valideta,
name = link),
class = "link-glm")
}
logistic<- logistic_test()
xgb_predict_custom <- function(object, new_data, ...) {
if (!inherits(new_data, "xgb.DMatrix")) {
new_data <- maybe_matrix(new_data)
new_data <- xgboost::xgb.DMatrix(data = new_data, missing = NA)
}
res <- predict(object, new_data, ...)
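# When the objective is a custom function, xgb.train() moves it out of params
# into its `obj` argument, so object$params$objective is NULL here; `%||% 3L`
# then selects the third (default) branch and applies the logistic inverse link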
x <- switch(
object$params$objective %||% 3L,
"binary:logitraw" = stats::binomial()$linkinv(res),
"multi:softprob" = matrix(res, ncol = object$params$num_class, byrow = TRUE),
# Link functions related to custom objective functions
logistic$linkinv(res))
x
}
set.seed(100)
res <-
fit_resamples(
boost_tree("classification") %>% set_engine("xgboost",objective = "binary:logitraw"),
Class ~ .,
control = control_grid(verbose = FALSE,
allow_par = FALSE,
parallel_over = NULL, save_pred = TRUE),
bootstraps(two_class_dat, 5)
)
set.seed(100)
res_custom <-
fit_resamples(
boost_tree("classification") %>% set_engine("xgboost_custom", objective = logregobj),
Class ~ .,
control = control_grid(verbose = FALSE,
allow_par = FALSE,
parallel_over = NULL, save_pred = TRUE),
bootstraps(two_class_dat, 5)
)
collect_metrics(res)
#> # A tibble: 2 × 6
#> .metric .estimator mean n std_err .config
#> <chr> <chr> <dbl> <int> <dbl> <chr>
#> 1 accuracy binary 0.788 5 0.00719 Preprocessor1_Model1
#> 2 roc_auc binary 0.860 5 0.00995 Preprocessor1_Model1
collect_metrics(res_custom)
#> # A tibble: 2 × 6
#> .metric .estimator mean n std_err .config
#> <chr> <chr> <dbl> <int> <dbl> <chr>
#> 1 accuracy binary 0.788 5 0.00719 Preprocessor1_Model1
#> 2 roc_auc binary 0.860 5 0.00995 Preprocessor1_Model1
origin <- res$.predictions[[1]] |>
select(contains("pred"), .row, -.config, -.pred_class)
custom <- res_custom$.predictions[[1]] |>
select(.pred_Class1_custom = .pred_Class1,
.pred_Class2_custom = .pred_Class2,
-.config, .row)
left_join(origin, custom, by = ".row") |> head(15)
#> # A tibble: 15 × 5
#> .pred_Class1 .pred_Class2 .row .pred_Class1_custom .pred_Class2_custom
#> <dbl> <dbl> <int> <dbl> <dbl>
#> 1 0.821 0.179 3 0.821 0.179
#> 2 0.0663 0.934 5 0.0663 0.934
#> 3 0.249 0.751 8 0.249 0.751
#> 4 0.981 0.0191 20 0.981 0.0191
#> 5 0.977 0.0228 22 0.977 0.0228
#> 6 0.0636 0.936 25 0.0636 0.936
#> 7 0.981 0.0186 27 0.981 0.0186
#> 8 0.981 0.0193 29 0.981 0.0193
#> 9 0.643 0.357 33 0.643 0.357
#> 10 0.0331 0.967 35 0.0331 0.967
#> 11 0.530 0.470 36 0.530 0.470
#> 12 0.984 0.0155 39 0.984 0.0155
#> 13 0.981 0.0191 40 0.981 0.0191
#> 14 0.196 0.804 41 0.196 0.804
#> 15 0.979 0.0210 42 0.979 0.0210
Created on 2023-10-08 with reprex v2.0.2