
Xgboost objective function customization error

SHo-JANG opened this issue 2 years ago · 2 comments

This issue is a continuation of #873 and #875.

A model trained with the custom objective function logregobj and one trained with the default options should produce the same results, but they didn't, so I saved the predictions to compare.

The two predicted probability values sum to 1, but the individual values are not bounded between 0 and 1.

A minimal reproducible example is attached below.

  • What steps can I take to figure out on my own why this is happening? I would like to know how to debug it.
library(tidymodels)
library(xgboost)


data(agaricus.train, package = "xgboost")
data(agaricus.test, package = "xgboost")

dtrain <- with(agaricus.train, xgb.DMatrix(data, label = label, nthread = 2))
dtest <- with(agaricus.test, xgb.DMatrix(data, label = label, nthread = 2))
watchlist <- list(train = dtrain, eval = dtest)

param <- list(max_depth = 2, eta = 1, nthread = 2,
              objective = "binary:logistic", eval_metric = "auc")
bst <- xgb.train(param, dtrain, nrounds = 2, watchlist)
#> [1]  train-auc:0.958228  eval-auc:0.960373 
#> [2]  train-auc:0.981413  eval-auc:0.979930


logregobj <- function(preds, dtrain) {
  labels <- getinfo(dtrain, "label")
  preds <- 1/(1 + exp(-preds))
  grad <- preds - labels
  hess <- preds * (1 - preds)
  return(list(grad = grad, hess = hess))
}
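# Note: grad = p - y and hess = p * (1 - p) are the first and second
# derivatives of the binary log loss with respect to the raw score, i.e. the
# same quantity that the built-in "binary:logistic" objective optimizes.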

param$objective <- logregobj

bst <- xgb.train(param, dtrain, nrounds = 2, watchlist)
#> [1]  train-auc:0.958228  eval-auc:0.960373 
#> [2]  train-auc:0.981413  eval-auc:0.979930




set.seed(100)
res <-
  fit_resamples(
    boost_tree("classification") %>% set_engine("xgboost"),
    Class ~ .,
    control = control_grid(verbose = FALSE,
                           allow_par = FALSE,
                           parallel_over = NULL, save_pred = TRUE),
    bootstraps(two_class_dat, 5)
  )


set.seed(100)
res_custom <-
  fit_resamples(
    boost_tree("classification") %>% set_engine("xgboost", objective = logregobj),
    Class ~ .,
    control = control_grid(verbose = FALSE,
                           allow_par = FALSE,
                           parallel_over = NULL, save_pred = TRUE),
    bootstraps(two_class_dat, 5)
  )

collect_metrics(res)
#> # A tibble: 2 × 6
#>   .metric  .estimator  mean     n std_err .config             
#>   <chr>    <chr>      <dbl> <int>   <dbl> <chr>               
#> 1 accuracy binary     0.788     5 0.0115  Preprocessor1_Model1
#> 2 roc_auc  binary     0.864     5 0.00930 Preprocessor1_Model1
collect_metrics(res_custom)
#> # A tibble: 2 × 6
#>   .metric  .estimator  mean     n std_err .config             
#>   <chr>    <chr>      <dbl> <int>   <dbl> <chr>               
#> 1 accuracy binary     0.782     5 0.0125  Preprocessor1_Model1
#> 2 roc_auc  binary     0.860     5 0.00995 Preprocessor1_Model1


origin <- res$.predictions[[1]] |>
  select(contains("pred"), .row, -.config, -.pred_class)
custom <- res_custom$.predictions[[1]] |>
  select(.pred_Class1_custom = .pred_Class1,
         .pred_Class2_custom = .pred_Class2,
         .row)

left_join(origin, custom, by = ".row") |> head(15)
#> # A tibble: 15 × 5
#>    .pred_Class1 .pred_Class2  .row .pred_Class1_custom .pred_Class2_custom
#>           <dbl>        <dbl> <int>               <dbl>               <dbl>
#>  1       0.752        0.248      3               1.52               -0.522
#>  2       0.0827       0.917      5              -2.64                3.64 
#>  3       0.378        0.622      8              -1.10                2.10 
#>  4       0.985        0.0153    20               3.94               -2.94 
#>  5       0.941        0.0585    22               3.76               -2.76 
#>  6       0.0951       0.905     25              -2.69                3.69 
#>  7       0.982        0.0179    27               3.97               -2.97 
#>  8       0.975        0.0252    29               3.93               -2.93 
#>  9       0.739        0.261     33               0.586               0.414
#> 10       0.0386       0.961     35              -3.37                4.37 
#> 11       0.465        0.535     36               0.120               0.880
#> 12       0.979        0.0213    39               4.15               -3.15 
#> 13       0.983        0.0172    40               3.94               -2.94 
#> 14       0.185        0.815     41              -1.41                2.41 
#> 15       0.967        0.0327    42               3.84               -2.84

Created on 2023-09-09 with reprex v2.0.2

SHo-JANG · Sep 09 '23

Ah, interesting! Thanks for the issue.

The xgboost engine itself will return the same predictions given that objective function, but tidymodels then only knows how to post-process predictions from the default objective function to convert them to probabilities. This is why the _custom "probabilities" aren't in [0, 1]. The code responsible is in parsnip::xgb_predict().
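
Roughly, that post-processing back-transforms the raw xgboost predictions according to the objective recorded on the fitted booster, and anything it doesn't recognize (including a custom objective function) passes through untouched. A paraphrased sketch, not the exact parsnip source:

post_process <- function(res, object) {
  obj <- object$params$objective
  if (is.null(obj) || !is.character(obj)) obj <- "other"
  switch(
    obj,
    "binary:logitraw" = stats::binomial()$linkinv(res),
    "multi:softprob"  = matrix(res, ncol = object$params$num_class, byrow = TRUE),
    res  # anything else, e.g. a custom objective, is returned as-is
  )
}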

In this case, translate() may have aided the debugging process: the training function parsnip::xgb_train() shown in its output is documented in the same source file as the offending prediction function.

library(parsnip)
  
boost_tree("classification") %>% set_engine("xgboost") %>% translate()
#> Boosted Tree Model Specification (classification)
#> 
#> Computational engine: xgboost 
#> 
#> Model fit template:
#> parsnip::xgb_train(x = missing_arg(), y = missing_arg(), weights = missing_arg(), 
#>     nthread = 1, verbose = 0)

It was also helpful to see when xgboost itself knows that the outputted predictions are log-odds and when it doesn't:

library(tidymodels)
library(xgboost)
#> 
#> Attaching package: 'xgboost'
#> The following object is masked from 'package:dplyr':
#> 
#>     slice

data(agaricus.train, package = "xgboost")
data(agaricus.test, package = "xgboost")

dtrain <- with(agaricus.train, xgb.DMatrix(data, label = label, nthread = 2))
dtest <- with(agaricus.test, xgb.DMatrix(data, label = label, nthread = 2))
watchlist <- list(train = dtrain, eval = dtest)

# no `objective` argument at all
param <- list(max_depth = 2, eta = 1, nthread = 2, eval_metric = "auc")
bst <- xgb.train(param, dtrain, nrounds = 2, watchlist)
#> [1]  train-auc:0.958228  eval-auc:0.960373 
#> [2]  train-auc:0.987601  eval-auc:0.986900
head(predict(bst, dtest, outputmargin = FALSE))
#> [1] 0.08585284 0.94223028 0.08585284 0.08585284 0.02808682 0.94223028
head(predict(bst, dtest, outputmargin = TRUE))
#> [1] 0.08585284 0.94223028 0.08585284 0.08585284 0.02808682 0.94223028

# `objective` argument is a pre-configured option
param$objective <- "binary:logistic"
bst <- xgb.train(param, dtrain, nrounds = 2, watchlist)
#> [1]  train-auc:0.958228  eval-auc:0.960373 
#> [2]  train-auc:0.981413  eval-auc:0.979930
head(predict(bst, dtest, outputmargin = FALSE))
#> [1] 0.28583017 0.92392391 0.28583017 0.28583017 0.05169873 0.92392391
head(predict(bst, dtest, outputmargin = TRUE))
#> [1] -0.915723  2.496895 -0.915723 -0.915723 -2.909239  2.496895

# `objective` argument is "custom"
param$objective <- function(preds, dtrain) {
  labels <- getinfo(dtrain, "label")
  preds <- 1/(1 + exp(-preds))
  grad <- preds - labels
  hess <- preds * (1 - preds)
  return(list(grad = grad, hess = hess))
}
bst <- xgb.train(param, dtrain, nrounds = 2, watchlist)
#> [1]  train-auc:0.958228  eval-auc:0.960373 
#> [2]  train-auc:0.981413  eval-auc:0.979930
head(predict(bst, dtest, outputmargin = FALSE))
#> [1] -1.049978  2.575050 -1.049978 -1.049978 -3.019169  2.575050
head(predict(bst, dtest, outputmargin = TRUE))
#> [1] -1.049978  2.575050 -1.049978 -1.049978 -3.019169  2.575050

Created on 2023-10-06 with reprex v2.0.2

This is tricky. We may want to consider whether there's some way to embed that prediction post-processing function into the model specification, which links up nicely with related work on post-processing in tidymodels. :)

simonpcouch · Oct 06 '23

The method you suggested was really helpful. I found that I can handle this by registering a new engine called xgboost_custom that keeps parsnip::xgb_train() for fitting but replaces the prediction step that converts raw scores to probabilities. Thank you.

# build custom engine -----------------------------------------------------
# set_new_model("boost_tree")  # not needed: "boost_tree" already exists in parsnip
library(tidymodels)
set_model_mode("boost_tree", "classification")
set_model_engine("boost_tree", "classification", "xgboost_custom")
set_dependency("boost_tree", eng = "xgboost_custom", pkg = "xgboost", mode = "classification")

set_model_arg(
  model = "boost_tree",
  eng = "xgboost_custom",
  parsnip = "tree_depth",
  original = "max_depth",
  func = list(pkg = "dials", fun = "tree_depth"),
  has_submodel = FALSE
)
set_model_arg(
  model = "boost_tree",
  eng = "xgboost_custom",
  parsnip = "trees",
  original = "nrounds",
  func = list(pkg = "dials", fun = "trees"),
  has_submodel = TRUE
)
set_model_arg(
  model = "boost_tree",
  eng = "xgboost_custom",
  parsnip = "learn_rate",
  original = "eta",
  func = list(pkg = "dials", fun = "learn_rate"),
  has_submodel = FALSE
)
set_model_arg(
  model = "boost_tree",
  eng = "xgboost_custom",
  parsnip = "mtry",
  original = "colsample_bynode",
  func = list(pkg = "dials", fun = "mtry"),
  has_submodel = FALSE
)
set_model_arg(
  model = "boost_tree",
  eng = "xgboost_custom",
  parsnip = "min_n",
  original = "min_child_weight",
  func = list(pkg = "dials", fun = "min_n"),
  has_submodel = FALSE
)
set_model_arg(
  model = "boost_tree",
  eng = "xgboost_custom",
  parsnip = "loss_reduction",
  original = "gamma",
  func = list(pkg = "dials", fun = "loss_reduction"),
  has_submodel = FALSE
)
set_model_arg(
  model = "boost_tree",
  eng = "xgboost_custom",
  parsnip = "sample_size",
  original = "subsample",
  func = list(pkg = "dials", fun = "sample_size"),
  has_submodel = FALSE
)
set_model_arg(
  model = "boost_tree",
  eng = "xgboost_custom",
  parsnip = "stop_iter",
  original = "early_stop",
  func = list(pkg = "dials", fun = "stop_iter"),
  has_submodel = FALSE
)


set_encoding(
  model = "boost_tree",
  eng = "xgboost_custom",
  mode = "regression",
  options = list(
    predictor_indicators = "one_hot",
    compute_intercept = FALSE,
    remove_intercept = TRUE,
    allow_sparse_x = TRUE
  )
)

set_fit(
  model = "boost_tree",
  eng = "xgboost_custom",
  mode = "classification",
  value = list(
    interface = "matrix",
    protect = c("x", "y", "weights"),
    func = c(pkg = "parsnip", fun = "xgb_train"),
    defaults = list(nthread = 1, verbose = 0)
  )
)

set_encoding(
  model = "boost_tree",
  eng = "xgboost_custom",
  mode = "classification",
  options = list(
    predictor_indicators = "one_hot",
    compute_intercept = FALSE,
    remove_intercept = TRUE,
    allow_sparse_x = TRUE
  )
)
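
# Prediction methods: these route predictions through xgb_predict_custom()
# (defined further below) instead of parsnip's default xgb_predict(), so that
# the conversion to probabilities can account for the custom objective.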

set_pred(
  model = "boost_tree",
  eng = "xgboost_custom",
  mode = "classification",
  type = "class",
  value = list(
    pre = NULL,
    post = function(x, object) {
      if (is.vector(x)) {
        event_level <- parsnip:::get_event_level(object$spec)
        if (event_level == "first") {
          x <- ifelse(x >= 0.5, object$lvl[1], object$lvl[2])
        } else {
          x <- ifelse(x >= 0.5, object$lvl[2], object$lvl[1])
        }
      } else {
        x <- object$lvl[apply(x, 1, which.max)]
      }
      x
    },
    func = c(pkg = NULL, fun = "xgb_predict_custom"),
    args = list(object = quote(object$fit), new_data = quote(new_data))
  )
)

set_pred(
  model = "boost_tree",
  eng = "xgboost_custom",
  mode = "classification",
  type = "prob",
  value = list(
    pre = NULL,
    post = function(x, object) {
      if (is.vector(x)) {
        event_level <- parsnip:::get_event_level(object$spec)
        if (event_level == "first") {
          x <- tibble(v1 = x, v2 = 1 - x)
        } else {
          x <- tibble(v1 = 1 - x, v2 = x)
        }
      } else {
        x <- as_tibble(x, .name_repair = "minimal")
      }
      colnames(x) <- object$lvl
      x
    },
    func = c(pkg = NULL, fun = "xgb_predict_custom"),
    args = list(object = quote(object$fit), new_data = quote(new_data))
  )
)

set_pred(
  model = "boost_tree",
  eng = "xgboost_custom",
  mode = "classification",
  type = "raw",
  value = list(
    pre = NULL,
    post = NULL,
    func = c(fun = "xgb_predict_custom"),
    args = list(object = quote(object$fit), new_data = quote(new_data))
  )
)

##################################################################################
##################################################################################


library(xgboost)
#> 
#> Attaching package: 'xgboost'
#> The following object is masked from 'package:dplyr':
#> 
#>     slice


logregobj <- function(preds, dtrain) {
  labels <- getinfo(dtrain, "label")
  preds <- 1/(1 + exp(-preds))
  grad <- preds - labels
  hess <- preds * (1 - preds)
  return(list(grad = grad, hess = hess))
}


# logit_default -----------------------------------------------------------
logistic_test <- function() {
  ## link
  linkfun <- function(mu){log(mu/(1-mu))} 
  ## inverse link
  linkinv <- function(eta) {1/(1+exp(-eta))} 
  ## derivative of invlink wrt eta
  mu.eta <- function(eta) { exp(-eta)/(1+exp(-eta))^2 }
  valideta <- function(eta) {TRUE}
  link <- "logistic_test"
  structure(list(linkfun = linkfun, linkinv = linkinv,
                 mu.eta = mu.eta, valideta = valideta, 
                 name = link),
            class = "link-glm")
}
logistic <- logistic_test()




xgb_predict_custom <- function(object, new_data, ...) {
  if (!inherits(new_data, "xgb.DMatrix")) {
    new_data <- maybe_matrix(new_data)
    new_data <- xgboost::xgb.DMatrix(data = new_data, missing = NA)
  }
  
  res <- predict(object, new_data, ...)
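
  # Back-transform the raw predictions based on the objective string recorded
  # on the booster; when none is recorded (e.g. with a custom objective),
  # `%||% 3L` falls through to the third, unnamed branch, i.e. the logistic
  # inverse link defined above.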
  
  x <- switch(
    object$params$objective %||% 3L,
    "binary:logitraw" = stats::binomial()$linkinv(res),
    "multi:softprob" = matrix(res, ncol = object$params$num_class, byrow = TRUE),
    # Link functions related to custom objective functions
    logistic$linkinv(res))
  
  x
}



set.seed(100)
res <-
  fit_resamples(
    boost_tree("classification") %>% set_engine("xgboost", objective = "binary:logitraw"),
    Class ~ .,
    control = control_grid(verbose = FALSE,
                           allow_par = FALSE,
                           parallel_over = NULL, save_pred = TRUE),
    bootstraps(two_class_dat, 5)
  )


set.seed(100)
res_custom <-
  fit_resamples(
    boost_tree("classification") %>% set_engine("xgboost_custom", objective = logregobj),
    Class ~ .,
    control = control_grid(verbose = FALSE,
                           allow_par = FALSE,
                           parallel_over = NULL, save_pred = TRUE),
    bootstraps(two_class_dat, 5)
  )

collect_metrics(res)
#> # A tibble: 2 × 6
#>   .metric  .estimator  mean     n std_err .config             
#>   <chr>    <chr>      <dbl> <int>   <dbl> <chr>               
#> 1 accuracy binary     0.788     5 0.00719 Preprocessor1_Model1
#> 2 roc_auc  binary     0.860     5 0.00995 Preprocessor1_Model1

collect_metrics(res_custom)
#> # A tibble: 2 × 6
#>   .metric  .estimator  mean     n std_err .config             
#>   <chr>    <chr>      <dbl> <int>   <dbl> <chr>               
#> 1 accuracy binary     0.788     5 0.00719 Preprocessor1_Model1
#> 2 roc_auc  binary     0.860     5 0.00995 Preprocessor1_Model1


origin <- res$.predictions[[1]] |>
  select(contains("pred"), .row, -.config, -.pred_class)
custom <- res_custom$.predictions[[1]] |>
  select(.pred_Class1_custom = .pred_Class1,
         .pred_Class2_custom = .pred_Class2,
         .row)

left_join(origin, custom, by = ".row") |> head(15)
#> # A tibble: 15 × 5
#>    .pred_Class1 .pred_Class2  .row .pred_Class1_custom .pred_Class2_custom
#>           <dbl>        <dbl> <int>               <dbl>               <dbl>
#>  1       0.821        0.179      3              0.821               0.179 
#>  2       0.0663       0.934      5              0.0663              0.934 
#>  3       0.249        0.751      8              0.249               0.751 
#>  4       0.981        0.0191    20              0.981               0.0191
#>  5       0.977        0.0228    22              0.977               0.0228
#>  6       0.0636       0.936     25              0.0636              0.936 
#>  7       0.981        0.0186    27              0.981               0.0186
#>  8       0.981        0.0193    29              0.981               0.0193
#>  9       0.643        0.357     33              0.643               0.357 
#> 10       0.0331       0.967     35              0.0331              0.967 
#> 11       0.530        0.470     36              0.530               0.470 
#> 12       0.984        0.0155    39              0.984               0.0155
#> 13       0.981        0.0191    40              0.981               0.0191
#> 14       0.196        0.804     41              0.196               0.804 
#> 15       0.979        0.0210    42              0.979               0.0210

Created on 2023-10-08 with reprex v2.0.2

SHo-JANG · Oct 08 '23