finetune icon indicating copy to clipboard operation
finetune copied to clipboard

Error that occurs when using the custom metric during parallel hyperparameter tuning.

Open AKALeon opened this issue 1 year ago • 5 comments

I have defined a custom metric (partial ROC AUC) myself; the code is as follows:

# Load packages
library(tidyverse)
library(tidymodels)
library(modeldata)
library(finetune)
library(baguette)
library(doParallel)
# Use roughly a third of the available cores for the cluster, but never fewer
# than one: detectCores() may return NA when the count is unknown, and rounding
# a small count (e.g. 1 / 3) yields 0 -- neither is a usable worker count.
ncores <- max(1, round(parallel::detectCores() / 3), na.rm = TRUE)

# Resolve `event_level` to the factor level treated as the event:
# the first level of `truth` when `event_level` is exactly "first",
# otherwise the second level.
event_col <- function(truth, event_level) {
  lvls <- levels(truth)
  position <- if (identical(event_level, "first")) 1 else 2
  lvls[position]
}

# Compute the partial ROC AUC over the sensitivity range [0.8, 1] via pROC.
#
# Arguments:
#   truth       Factor of observed classes.
#   estimate    Numeric scores passed to pROC::roc() as the predictor.
#   estimator   Estimator name; only "binary" is implemented.
#   event_level "first" or otherwise; selects which level of `truth` is the
#               case (event) level through event_col().
#
# Returns the partial AUC as a bare numeric value.
# Signals an error when `estimator` is not "binary" (previously this path
# failed later with "object 'pauc_value' not found").
pauc_impl <- function(truth, estimate, estimator = "binary", event_level) {
  if (!isTRUE(estimator == "binary")) {
    stop(
      "`pauc()` only supports the \"binary\" estimator, not ",
      paste(deparse(estimator), collapse = " "),
      ".",
      call. = FALSE
    )
  }

  level_case <- event_col(truth = truth, event_level = event_level)
  level_control <- setdiff(levels(truth), level_case)

  # `estimate` is the first positional argument and `response` is named, so
  # `estimate` ends up as the predictor.
  # NOTE(review): `direction` is not supplied, so pROC appears to pick it from
  # the data ("Setting direction: ..." message) -- confirm this is intended
  # for a metric that is being maximized.
  roc_fit <- pROC::roc(
    estimate,
    response = truth,
    levels = c(level_control, level_case),
    partial.auc = c(0.8, 1),
    partial.auc.focus = "sensitivity"
  )

  as.numeric(roc_fit$auc)
}

# Vector interface for the partial AUC metric.
#
# Finalizes the estimator (finalize_estimator() calls
# finalize_estimator_internal() internally), validates the inputs with
# check_prob_metric(), handles missing values according to `na_rm`, and then
# delegates to pauc_impl(). Returns NA_real_ when `na_rm` is FALSE and
# yardstick_any_missing() reports missing values. Arguments in `...` are
# accepted but not used here.
pauc_vec <- function(truth,
                     estimate,
                     estimator = NULL,
                     na_rm = TRUE,
                     case_weights = NULL,
                     event_level = "first",
                     ...) {
  estimator <- finalize_estimator(truth, estimator, metric_class = "pauc")
  check_prob_metric(truth, estimate, case_weights, estimator)

  if (!na_rm) {
    if (yardstick_any_missing(truth, estimate, case_weights)) {
      return(NA_real_)
    }
  } else {
    complete <- yardstick_remove_missing(truth, estimate, case_weights)
    truth <- complete$truth
    estimate <- complete$estimate
    # Kept for parity with the other inputs; pauc_impl() does not take weights.
    case_weights <- complete$case_weights
  }

  pauc_impl(truth, estimate, estimator, event_level)
}



# S3 generic for the partial AUC metric, wrapped with new_prob_metric() and
# marked as a metric to maximize. Dispatch happens on `data`.
pauc <- new_prob_metric(
  function(data, ...) {
    UseMethod("pauc")
  },
  direction = "maximize"
)

# Data frame method for pauc().
#
# Captures the `truth`, `estimate` and `case_weights` column expressions and
# forwards them, together with the remaining settings, to
# prob_metric_summarizer(), which applies pauc_vec(). `options` is passed on
# through `fn_options`.
pauc.data.frame <- function(data,
                            truth,
                            estimate,
                            estimator = NULL,
                            na_rm = TRUE,
                            case_weights = NULL,
                            event_level = "first",
                            options = list()) {
  truth_quo <- enquo(truth)
  estimate_quo <- enquo(estimate)
  weights_quo <- enquo(case_weights)

  prob_metric_summarizer(
    name = "pauc",
    fn = pauc_vec,
    data = data,
    truth = !!truth_quo,
    # The estimate column is supplied unnamed.
    !!estimate_quo,
    estimator = estimator,
    na_rm = na_rm,
    case_weights = !!weights_quo,
    event_level = event_level,
    fn_options = list(options = options)
  )
}

I can use the defined metric function pauc on the example data:

# Evaluate the custom metric on `two_class_example`, using `truth` as the
# truth column and `Class1` as the estimate column.
pauc(data = two_class_example,truth = truth,Class1)

The results are as follows:

> pauc(data = two_class_example,truth = truth,Class1)
Setting direction: controls < cases
# A tibble: 1 × 3
  .metric .estimator .estimate
  <chr>   <chr>          <dbl>
1 pauc    binary         0.149

Then, I used tune_race_anova to tune the bag_tree model.

# Seeded train/test split of lending_club, stratified on Class.
set.seed(123)
data("lending_club", package = "modeldata")
split <- initial_split(lending_club, strata = Class)
train <- training(split)
test <- testing(split)

# 10-fold cross-validation on the training set, stratified on Class.
fold <- vfold_cv(data = train, v = 10, strata = Class)

# Recipe: model Class on all other columns and normalize the columns
# selected by all_numeric().
rec <- recipe(Class ~ ., train) %>%
  step_normalize(all_numeric())

# Bagged tree on the rpart engine with `tree_depth` marked for tuning.
mod <- bag_tree(tree_depth = tune()) %>%
  set_engine("rpart") %>%
  set_mode("classification")

# Workflow set built from the single recipe and the single model.
wf_set <- workflow_set(
  preproc = list(base = rec),
  models = list(bag = mod),
  cross = TRUE
)

When not using parallel computation, using the defined pauc metric works correctly:

# Sequential run: tune every workflow in `wf_set` with tune_race_anova(),
# scoring candidates with the custom pauc metric.
race_result <- workflow_map(
  wf_set,
  fn = "tune_race_anova",
  resamples = fold,
  grid = 5,
  metrics = metric_set(pauc)
)

# Show the best configuration(s) of the "base_bag" workflow by pauc.
race_result %>%
  extract_workflow_set_result(id = "base_bag") %>%
  show_best(metric = "pauc")
> race_result %>% 
+   extract_workflow_set_result(id = 'base_bag') %>% 
+   show_best(metric = 'pauc')
# A tibble: 1 × 7
  tree_depth .metric .estimator   mean     n std_err .config             
       <int> <chr>   <chr>       <dbl> <int>   <dbl> <chr>               
1          6 pauc    binary     0.0647    10 0.00621 Preprocessor1_Model2

However, when I use parallel computation, an error occurs:

# Parallel run: the same tuning call as the sequential one, but with a PSOCK
# cluster of `ncores` workers registered as the foreach backend.
# NOTE(review): the workers apparently cannot find `pauc.data.frame`, which is
# defined only in this session (see the reported error) -- confirm.
cl <- makePSOCKcluster(ncores)
registerDoParallel(cl)

race_result <- workflow_map(
  wf_set,
  fn = "tune_race_anova",
  resamples = fold,
  grid = 5,
  metrics = metric_set(pauc)
)

# Shut the workers down once tuning has finished.
stopCluster(cl)
Warning message:
All models failed. Run `show_notes(.Last.tune.result)` for more information. 
> show_notes(.Last.tune.result)
unique notes:
───────────────────────────────────────────────────────────────────────────────────
Error in `metric_set()`:
! Failed to compute `pauc()`.
Caused by error in `UseMethod()`:
! no applicable method for 'pauc' applied to an object of class "c('grouped_df', 'tbl_df', 'tbl', 'data.frame')"

When I use roc_auc as the metric for parallel hyperparameter tuning, everything works fine. Therefore, I believe the source of the error is in the parallel computation.

AKALeon avatar Jul 24 '24 12:07 AKALeon

Just noting that I can reproduce this.

An admittedly cumbersome way around this right now is to drop pauc in a package and then supply it to control_race(pkgs), but we ought to think about how we can better support this.

simonpcouch avatar Jul 29 '24 14:07 simonpcouch

@simonpcouch Sorry, I should have loaded the packages at the beginning. Now these results can be reproduced.

AKALeon avatar Jul 30 '24 04:07 AKALeon

I'm not sure I understand this most recent reply, but we'll be coming back to this as we continue to improve our support for parallelism!

simonpcouch avatar Jul 31 '24 19:07 simonpcouch

> I'm not sure I understand this most recent reply, but we'll be coming back to this as we continue to improve our support for parallelism!

Anyway, thank you!

AKALeon avatar Aug 01 '24 05:08 AKALeon

When I change "cl = makePSOCKcluster(ncores)" to "cl = makeForkCluster(ncores)", everything works fine!

AKALeon avatar Aug 01 '24 05:08 AKALeon

Turns out we can replicate this issue with tune alone—I just filed an issue there and will address soon! Thank you!

simonpcouch avatar Sep 05 '24 14:09 simonpcouch