Error in modeling Cox regression
Hi,
I repeatedly received the following error while training XGBoost (objective: survival:cox, eval_metric: cox-nloglik). I am using xgboost 1.5.2 with Python 3.8 on Windows 10, and I've noticed that others have run into the same problem (#6885), which appears to be unresolved. I am new to XGBoost, so I have no idea how to fix this, and I would be grateful for any suggestions. The error report, my code, and my data are attached below.
Error report:
```
ValueError Traceback (most recent call last)
Input In [4], in <module>
37 if __name__ == "__main__":
38 study = optuna.create_study(
39 pruner=optuna.pruners.MedianPruner(n_warmup_steps=5), direction="maximize"
40 )
---> 41 study.optimize(objective, n_trials=100)
42 print(study.best_trial)
File D:\Softwares\ANACONDA\envs\xgboost\lib\site-packages\optuna\study\study.py:400, in Study.optimize(self, func, n_trials, timeout, n_jobs, catch, callbacks, gc_after_trial, show_progress_bar)
392 if n_jobs != 1:
393 warnings.warn(
394 "`n_jobs` argument has been deprecated in v2.7.0. "
395 "This feature will be removed in v4.0.0. "
396 "See https://github.com/optuna/optuna/releases/tag/v2.7.0.",
397 FutureWarning,
398 )
--> 400 _optimize(
401 study=self,
402 func=func,
403 n_trials=n_trials,
404 timeout=timeout,
405 n_jobs=n_jobs,
406 catch=catch,
407 callbacks=callbacks,
408 gc_after_trial=gc_after_trial,
409 show_progress_bar=show_progress_bar,
410 )
File D:\Softwares\ANACONDA\envs\xgboost\lib\site-packages\optuna\study\_optimize.py:66, in _optimize(study, func, n_trials, timeout, n_jobs, catch, callbacks, gc_after_trial, show_progress_bar)
64 try:
65 if n_jobs == 1:
---> 66 _optimize_sequential(
67 study,
68 func,
69 n_trials,
70 timeout,
71 catch,
72 callbacks,
73 gc_after_trial,
74 reseed_sampler_rng=False,
75 time_start=None,
76 progress_bar=progress_bar,
77 )
78 else:
79 if show_progress_bar:
File D:\Softwares\ANACONDA\envs\xgboost\lib\site-packages\optuna\study\_optimize.py:163, in _optimize_sequential(study, func, n_trials, timeout, catch, callbacks, gc_after_trial, reseed_sampler_rng, time_start, progress_bar)
160 break
162 try:
--> 163 trial = _run_trial(study, func, catch)
164 except Exception:
165 raise
File D:\Softwares\ANACONDA\envs\xgboost\lib\site-packages\optuna\study\_optimize.py:264, in _run_trial(study, func, catch)
261 assert False, "Should not reach."
263 if state == TrialState.FAIL and func_err is not None and not isinstance(func_err, catch):
--> 264 raise func_err
265 return trial
File D:\Softwares\ANACONDA\envs\xgboost\lib\site-packages\optuna\study\_optimize.py:213, in _run_trial(study, func, catch)
210 thread.start()
212 try:
--> 213 value_or_values = func(trial)
214 except exceptions.TrialPruned as e:
215 # TODO(mamu): Handle multi-objective cases.
216 state = TrialState.PRUNED
Input In [4], in objective(trial)
28 # Add a callback for pruning.
29 pruning_callback = optuna.integration.XGBoostPruningCallback(trial, "validation-cox-nloglik")
---> 30 bst = xgb.train(param, dtrain, evals=[(dvalid, "validation")], callbacks=[pruning_callback])
31 preds = bst.predict(dvalid)
32 pred_labels = np.rint(preds)
File D:\Softwares\ANACONDA\envs\xgboost\lib\site-packages\xgboost\training.py:188, in train(params, dtrain, num_boost_round, evals, obj, feval, maximize, early_stopping_rounds, evals_result, verbose_eval, xgb_model, callbacks)
115 def train(params, dtrain, num_boost_round=10, evals=(), obj=None, feval=None,
116 maximize=None, early_stopping_rounds=None, evals_result=None,
117 verbose_eval=True, xgb_model=None, callbacks=None):
118 # pylint: disable=too-many-statements,too-many-branches, attribute-defined-outside-init
119 """Train a booster with given parameters.
120
121 Parameters
(...)
186 Booster : a trained booster model
187 """
--> 188 bst = _train_internal(params, dtrain,
189 num_boost_round=num_boost_round,
190 evals=evals,
191 obj=obj, feval=feval,
192 xgb_model=xgb_model, callbacks=callbacks,
193 verbose_eval=verbose_eval,
194 evals_result=evals_result,
195 maximize=maximize,
196 early_stopping_rounds=early_stopping_rounds)
197 return bst
File D:\Softwares\ANACONDA\envs\xgboost\lib\site-packages\xgboost\training.py:82, in _train_internal(params, dtrain, num_boost_round, evals, obj, feval, xgb_model, callbacks, evals_result, maximize, verbose_eval, early_stopping_rounds)
80 break
81 bst.update(dtrain, i, obj)
---> 82 if callbacks.after_iteration(bst, i, dtrain, evals):
83 break
85 bst = callbacks.after_training(bst)
File D:\Softwares\ANACONDA\envs\xgboost\lib\site-packages\xgboost\callback.py:438, in CallbackContainer.after_iteration(self, model, epoch, dtrain, evals)
436 # split up `test-error:0.1234`
437 score = [tuple(s.split(':')) for s in score]
--> 438 self._update_history(score, epoch)
439 ret = any(c.after_iteration(model, epoch, self.history)
440 for c in self.callbacks)
441 return ret
File D:\Softwares\ANACONDA\envs\xgboost\lib\site-packages\xgboost\callback.py:404, in CallbackContainer._update_history(self, score, epoch)
402 def _update_history(self, score, epoch):
403 for d in score:
--> 404 name, s = d[0], float(d[1])
405 if self.is_cv:
406 std = float(d[2])
ValueError: could not convert string to float: '-nan(ind)'
```
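For context: '-nan(ind)' is how the MSVC runtime on Windows formats an indeterminate NaN, and Python's float() only accepts the standard 'nan'/'inf' spellings, so the conversion in the callback fails. A minimal illustration:

```python
# float() parses the standard NaN/inf spellings...
float("nan")    # -> nan
float("-nan")   # -> nan
float("inf")    # -> inf
# ...but not the MSVC-specific rendering that appears in the log above:
float("-nan(ind)")  # ValueError: could not convert string to float: '-nan(ind)'
```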
My code:
```python
import numpy as np
import optuna
import warnings
import os
import datatable as dt
import xgboost as xgb
import matplotlib.pyplot as plt
%matplotlib inline
from modin import pandas as pd
from sklearn.model_selection import train_test_split
from optuna.visualization import plot_contour
from optuna.visualization import plot_edf
from optuna.visualization import plot_intermediate_values
from optuna.visualization import plot_optimization_history
from optuna.visualization import plot_parallel_coordinate
from optuna.visualization import plot_param_importances
from optuna.visualization import plot_slice

os.chdir("D:/BioI/Py_hub/Feature_selection_SURVIVAL")  # set my wd
df = dt.fread("data.txt", sep='\t', header=True)
df = df.to_pandas().set_index('id', drop=True)
X = df.iloc[:, 2:df.shape[1]]
y = df.iloc[:, 0:2]
y['OS'] = y['OS'].replace(1, True)
y['OS'] = y['OS'].replace(0, False)
y = y.loc[:, ['OS', 'OS.time']]
y = y.to_records(index=False)
# survival:cox label convention: positive = event time, negative = censored time
y = [x[1] if x[0] else -x[1] for x in y]

def c_statistic_harrell(pred, labels):
    total = 0
    matches = 0
    for i in range(len(labels)):
        for j in range(len(labels)):
            if labels[j] > 0 and abs(labels[i]) > labels[j]:
                total += 1
                if pred[j] > pred[i]:
                    matches += 1
    return matches / total

def objective(trial):
    data, target = X, y
    train_x, valid_x, train_y, valid_y = train_test_split(data, target, test_size=0.20)
    dtrain = xgb.DMatrix(train_x, label=train_y)
    dvalid = xgb.DMatrix(valid_x, label=valid_y)
    param = {
        "verbosity": 0,
        "objective": "survival:cox",
        "eval_metric": "cox-nloglik",
        "booster": trial.suggest_categorical("booster", ["gbtree", "gblinear", "dart"]),
        "lambda": trial.suggest_float("lambda", 1e-8, 1.0, log=True),
        "alpha": trial.suggest_float("alpha", 1e-8, 1.0, log=True),
    }
    # xgb.train ignores "n_estimators" inside params; pass num_boost_round instead
    n_estimators = trial.suggest_int("n_estimators", 500, 2000)
    if param["booster"] == "gbtree" or param["booster"] == "dart":
        param["max_depth"] = trial.suggest_int("max_depth", 3, 10)
        param["eta"] = trial.suggest_float("eta", 1e-8, 1.0, log=True)
        param["gamma"] = trial.suggest_float("gamma", 1e-8, 1.0, log=True)
        param["grow_policy"] = trial.suggest_categorical("grow_policy", ["depthwise", "lossguide"])
    if param["booster"] == "dart":
        param["sample_type"] = trial.suggest_categorical("sample_type", ["uniform", "weighted"])
        param["normalize_type"] = trial.suggest_categorical("normalize_type", ["tree", "forest"])
        param["rate_drop"] = trial.suggest_float("rate_drop", 1e-8, 1.0, log=True)
        param["skip_drop"] = trial.suggest_float("skip_drop", 1e-8, 1.0, log=True)
    # Add a callback for pruning.
    pruning_callback = optuna.integration.XGBoostPruningCallback(trial, "validation-cox-nloglik")
    bst = xgb.train(param, dtrain, num_boost_round=n_estimators,
                    evals=[(dvalid, "validation")], callbacks=[pruning_callback])
    preds = bst.predict(dvalid)
    pred_labels = np.rint(preds)
    accuracy = c_statistic_harrell(preds, valid_y)
    return accuracy

if __name__ == "__main__":
    study = optuna.create_study(
        pruner=optuna.pruners.MedianPruner(n_warmup_steps=5), direction="maximize"
    )
    study.optimize(objective, n_trials=100)
    print(study.best_trial)
```
My data:
Apologies for the long silence. I will gather all the errors with cox and take a deeper look.
Hi trivialfis,
I tested this hack and it seems to help. The root cause is that the nloglik evaluation goes to infinity or NaN, at which point the score string can't be converted into a float.
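For intuition (my own toy example, not code from XGBoost): the Cox partial likelihood exponentiates the boosting margin, np.exp overflows float64 once the margin exceeds about 709, and the follow-on arithmetic with inf then produces NaN:

```python
import numpy as np

# Toy example: a large boosting margin overflows exp(), and arithmetic
# with the resulting inf produces the NaN that ends up in the eval log.
margin = np.array([800.0, 1.0])
risk = np.exp(margin)        # -> [inf, 2.718...] (overflow warning)
print(np.log(risk.sum()))    # inf
print(risk[0] / risk.sum())  # inf / inf -> nan
```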
In the xgboost/callback.py file, change the line

```python
cvmap[(metric_idx, k)].append(float(v))
```

to

```python
try:
    cvmap[(metric_idx, k)].append(float(v))
except ValueError:
    cvmap[(metric_idx, k)].append(numpy.nan)
```

(catching ValueError specifically rather than using a bare except, so unrelated errors are not swallowed)
Thanks
Thank you for sharing. Yeah, that seems like a reasonable quick fix. I would like to avoid generating the NaN in the first place if possible. Also, we might find a better way to serialize the floating point values.
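For example (just a sketch; serialize_score is a hypothetical helper, not anything in the code base), emitting only spellings that float() can parse back would make the log round-trip on every platform:

```python
import math

def serialize_score(x: float) -> str:
    """Format an eval score so that float() can always parse it back."""
    if math.isnan(x):
        return "nan"                       # avoids MSVC's 'nan(ind)' rendering
    if math.isinf(x):
        return "inf" if x > 0 else "-inf"
    return repr(x)                         # repr() round-trips float64 exactly

# Round-trip checks:
assert math.isnan(float(serialize_score(float("nan"))))
assert float(serialize_score(float("-inf"))) == float("-inf")
assert float(serialize_score(1.0 / 3.0)) == 1.0 / 3.0
```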