xgboost
xgboost copied to clipboard
`scale_pos_weight` and `sample_weight` give inconsistent results.
Setting the model parameter `scale_pos_weight` and passing the equivalent per-sample weights through the fit parameter `sample_weight` give inconsistent results (not due to randomness) when fitting binary classifiers using the sklearn interface.
Language: Python 3.10.12 XGBoost Version: 2.1.1
Printout from MWE:
> class_weight: {0: 1.0, 1: 4.1}, scale_pos_weight: 4.1, sample_weights: [1. 4.1]
> accuracy using scale_pos_weight (mean, std, n): (0.625, 0.0)
> accuracy using class_weight (mean, std, n): (0.635, 0.0)
MWE:
import numpy as np
import xgboost as xgb
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from sklearn.utils import class_weight
# Seed NumPy's global RNG so the synthetic feature matrix is reproducible.
np.random.seed(42)

# Synthetic data: 1000 rows x 10 features drawn with np.random.rand.
X = np.random.rand(1000, 10)

# Imbalanced labels: the first 800 rows are class 0, the last 200 are class 1.
y = np.concatenate([np.zeros(800), np.ones(200)])

# Hold out 20% of the rows for testing (fixed random_state for repeatability).
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

# Negative/positive count ratio on the training labels, rounded to one decimal;
# used below both as scale_pos_weight and as the class-1 weight.
spw = np.round((y_train == 0).sum() / (y_train == 1).sum(), 1)

# Express the same weighting per class, then expand it to one weight per
# training row so it can be handed to fit(sample_weight=...).
cws = {0: 1.0, 1: spw}
sample_weight = class_weight.compute_sample_weight(class_weight=cws, y=y_train)

print(f'class_weight: {cws}, scale_pos_weight: {spw}, sample_weights: {np.unique(sample_weight)}')
def train_and_evaluate(n_runs, model_params, fit_params):
    """Fit a fresh XGBClassifier ``n_runs`` times and summarise test accuracy.

    Each run builds ``xgb.XGBClassifier(**model_params)``, fits it on the
    module-level ``X_train`` / ``y_train`` with ``**fit_params`` forwarded to
    ``fit``, and scores its predictions on ``X_test`` against ``y_test``.

    Args:
        n_runs: Number of independent fit/predict repetitions.
        model_params: Keyword arguments for the ``XGBClassifier`` constructor.
        fit_params: Extra keyword arguments for ``fit`` (e.g. ``sample_weight``).

    Returns:
        A ``(mean, std)`` tuple of the per-run accuracies, computed with
        ``np.mean`` and ``np.std``.
    """
    def _score_once():
        # A new estimator per repetition, so no state carries over between runs.
        clf = xgb.XGBClassifier(**model_params)
        clf.fit(X_train, y_train, **fit_params)
        return accuracy_score(y_test, clf.predict(X_test))

    scores = [_score_once() for _ in range(n_runs)]
    return np.mean(scores), np.std(scores)
# Set the number of repeat runs
n_runs = 5

# Settings shared by both configurations, kept in one place so the two
# experiments differ only in how the class weighting is supplied.
base_params = {'random_state': 42, 'n_estimators': 7}

# Train XGBoost with scale_pos_weight (weighting passed as a model parameter)
model_params = {'scale_pos_weight': spw, **base_params}
fit_params = {}
mean_acc, std_acc = train_and_evaluate(n_runs, model_params, fit_params)
# Include n_runs in the tuple so the "(mean, std, n)" label matches what is printed.
print(f'accuracy using scale_pos_weight (mean, std, n): {mean_acc, std_acc, n_runs}')

# Train XGBoost with class_weight (weighting passed per row via sample_weight)
model_params = dict(base_params)
fit_params = {'sample_weight': sample_weight}
mean_acc, std_acc = train_and_evaluate(n_runs, model_params, fit_params)
# Include n_runs in the tuple so the "(mean, std, n)" label matches what is printed.
print(f'accuracy using class_weight (mean, std, n): {mean_acc, std_acc, n_runs}')