
[Bug]: flaml.default.XGBRegressor does not preprocess eval_set

Open fingoldo opened this issue 2 months ago • 2 comments

Describe the bug

Sometimes (for example, when there are categorical columns) FLAML, at least in zero-shot mode, rearranges the columns of X. But for XGBRegressor/XGBClassifier it fails to apply the same preprocessing to the validation dataframes passed via eval_set.

Steps to reproduce

import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from xgboost import XGBClassifier
from sklearn.metrics import accuracy_score

# --- 1. Create synthetic dataset with numeric + categorical features ---
np.random.seed(42)
n = 1000

df = pd.DataFrame({
    "num1": np.random.randn(n),
    "num2": np.random.rand(n) * 10,
    "cat1": np.random.choice(["A", "B", "C"], size=n),
    "cat2": np.random.choice(["X", "Y"], size=n),
    "target": np.random.choice([0, 1], size=n)
})

# --- 2. Split data ---
X = df.drop(columns="target")
y = df["target"]

X_train, X_valid, y_train, y_valid = train_test_split(X, y, test_size=0.2, random_state=0)

# --- 3. Convert categorical columns to pandas 'category' dtype ---
for col in X_train.select_dtypes(include="object").columns:
    X_train[col] = X_train[col].astype("category")
    X_valid[col] = X_valid[col].astype("category")


# --- 4. Define XGBoost model ---
model = XGBClassifier(
    tree_method="hist",              # Efficient, supports categorical features
    enable_categorical=True,         # Important!
    eval_metric="logloss",
    use_label_encoder=False,
    early_stopping_rounds=10,
    random_state=0
)

# --- 5. Fit model with early stopping ---
model.fit(
    X_train, y_train,
    eval_set=[(X_valid, y_valid)],   # validation set for early stopping    
    verbose=True
)

[0]	validation_0-logloss:0.69096
[1]	validation_0-logloss:0.69439
[2]	validation_0-logloss:0.70184
[3]	validation_0-logloss:0.70530
[4]	validation_0-logloss:0.70542
[5]	validation_0-logloss:0.70719
[6]	validation_0-logloss:0.71508
[7]	validation_0-logloss:0.71836
[8]	validation_0-logloss:0.72136
[9]	validation_0-logloss:0.72541


import flaml.default as flaml_zeroshot

model = flaml_zeroshot.XGBClassifier(
    tree_method="hist",              # Efficient, supports categorical features
    enable_categorical=True,         # Important!
    eval_metric="logloss",
    use_label_encoder=False,
    early_stopping_rounds=10,
    random_state=0
)

# --- 5. Fit model with early stopping ---
model.fit(
    X_train, y_train,
    eval_set=[(X_valid, y_valid)],   # validation set for early stopping    
    verbose=True
)


---------------------------------------------------------------------------
ValueError                                Traceback (most recent call last)
Cell In[4], line 13
      3 model = flaml_zeroshot.XGBClassifier(
      4     tree_method="hist",              # Efficient, supports categorical features
      5     enable_categorical=True,         # Important!
    (...)
      9     random_state=0
     10 )
     12 # --- 5. Fit model with early stopping ---
---> 13 model.fit(
     14     X_train, y_train,
     15     eval_set=[(X_valid, y_valid)],   # validation set for early stopping
     16     verbose=True
     17 )

File /venv/main/lib/python3.12/site-packages/flaml/default/estimator.py:106, in flamlize_estimator.<locals>.EstimatorClass.fit(self, X, y, *args, **params)
     97 self.set_params(**hyperparams)
     98 if self._label_transformer and estimator_name in [
     99     "rf",
    100     "extra_tree",
    (...)
    104 ]:
    105     # rf and et have trouble in handling boolean labels; xgboost requires integer labels
--> 106 fitted = super().fit(X, y_transformed, *args, **params)
    107 # if hasattr(self, "_classes"):
    108 #     self._classes = self.label_transformer.classes
    109 # else:
    110 try:

File /venv/main/lib/python3.12/site-packages/xgboost/core.py:774, in require_keyword_args.<locals>.throw_if.<locals>.inner_f(*args, **kwargs)
    772 for k, arg in zip(sig.parameters, args):
    773     kwargs[k] = arg
--> 774 return func(**kwargs)

File /venv/main/lib/python3.12/site-packages/xgboost/sklearn.py:1803, in XGBClassifier.fit(self, X, y, sample_weight, base_margin, eval_set, verbose, xgb_model, sample_weight_eval_set, base_margin_eval_set, feature_weights)
   1783 evals_result: EvalsLog = {}
   1784 train_dmatrix, evals = _wrap_evaluation_matrices(
   1785     missing=self.missing,
   1786     X=X,
   (...)
   1800     feature_types=feature_types,
   1801 )
-> 1803 self._Booster = train(
   1804     params,
   1805     train_dmatrix,
   1806     self.get_num_boosting_rounds(),
   1807     evals=evals,
   1808     early_stopping_rounds=self.early_stopping_rounds,
   1809     evals_result=evals_result,
   1810     obj=obj,
   1811     custom_metric=metric,
   1812     verbose_eval=verbose,
   1813     xgb_model=model,
   1814     callbacks=self.callbacks,
   1815 )
   1817 if not callable(self.objective):
   1818     self.objective = params["objective"]

File /venv/main/lib/python3.12/site-packages/xgboost/core.py:774, in require_keyword_args.<locals>.throw_if.<locals>.inner_f(*args, **kwargs)
    772 for k, arg in zip(sig.parameters, args):
    773     kwargs[k] = arg
--> 774 return func(**kwargs)

File /venv/main/lib/python3.12/site-packages/xgboost/training.py:182, in train(params, dtrain, num_boost_round, evals, obj, maximize, early_stopping_rounds, evals_result, verbose_eval, xgb_model, callbacks, custom_metric)
    175 if (
    176     isinstance(va, _RefMixIn)
    177     and va.ref is not weakref.ref(dtrain)
    178     and va is not dtrain
    179 ):
    180     raise ValueError(_RefError)
--> 182 bst = Booster(params, [dtrain] + [d[0] for d in evals], model_file=xgb_model)
    183 start_iteration = 0
    185 if verbose_eval:

File /venv/main/lib/python3.12/site-packages/xgboost/core.py:2005, in Booster.__init__(self, params, cache, model_file)
   1998 _check_call(
   1999     _LIB.XGBoosterCreate(
   2000         dmats, c_bst_ulong(len(cache)), ctypes.byref(self.handle)
   2001     )
   2002 )
   2003 for d in cache:
   2004     # Validate feature only after the feature names are saved into booster.
-> 2005     self._assign_dmatrix_features(d)
   2007 if isinstance(model_file, Booster):
   2008     assert self.handle is not None

File /venv/main/lib/python3.12/site-packages/xgboost/core.py:3396, in Booster._assign_dmatrix_features(self, data)
   3393 if self.feature_types is None:
   3394     self.feature_types = ft
-> 3396 self._validate_features(fn)

File /venv/main/lib/python3.12/site-packages/xgboost/core.py:3431, in Booster._validate_features(self, feature_names)
   3425 if my_missing:
   3426     msg += (
   3427         "\ntraining data did not have the following fields: "
   3428         + ", ".join(str(s) for s in my_missing)
   3429     )
-> 3431 raise ValueError(msg.format(self.feature_names, feature_names))
ValueError: feature_names mismatch: ['cat1', 'cat2', 'num1', 'num2'] ['num1', 'num2', 'cat1', 'cat2']
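
Until this is fixed, one workaround sketch (an assumption on my part, not an official fix: it presumes FLAML only reorders the columns here, categorical first, and does not otherwise transform their values; if it also encodes them, the fitted transformer itself would have to be applied to X_valid) is to pass the validation frame in the order the booster expects, taken from the error message above:

# Workaround sketch: align the eval_set column order with the order the
# fitted booster expects (categorical columns first, per the ValueError).
feature_order = ["cat1", "cat2", "num1", "num2"]  # column order from the error above
model.fit(
    X_train, y_train,
    eval_set=[(X_valid[feature_order], y_valid)],  # reordered to match the booster
    verbose=True
)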

Model Used

No response

Expected Behavior

FLAML should apply the same preprocessing to all validation sets passed via eval_set, so that they match the transformed training data.

Surprisingly, flaml_zeroshot.LGBMClassifier does handle this:


model = flaml_zeroshot.LGBMClassifier(
    early_stopping_rounds=10,
    verbose=2,
    random_state=0
)

# --- 5. Fit model with early stopping ---
model.fit(
    X_train, y_train,
    eval_set=(X_valid, y_valid),   # validation set for early stopping    
)

Early stopping, best iteration is:
[1]	valid_0's binary_logloss: 0.691358
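
The LGBM path suggests what a fix could look like: the XGB wrapper in flaml/default/estimator.py could run eval_set through the same preprocessing it applies to X. A minimal sketch (the helper name and the transform interface are assumptions, not FLAML's actual API):

def preprocess_eval_set(transformer, eval_set):
    # Hypothetical helper: apply the transformer that was fitted on the
    # training X to every (X, y) pair in eval_set before handing them to
    # the underlying estimator's fit.
    if eval_set is None or transformer is None:
        return eval_set
    return [(transformer.transform(X_val), y_val) for X_val, y_val in eval_set]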

I'm also not a fan of FLAML copying the entire X dataframe; this makes it impractical for big datasets. Could a flag be added that tells FLAML to do its preprocessing in place instead of copying the entire X dataframe?

Screenshots and logs

No response

Additional Information

No response

fingoldo avatar Nov 08 '25 10:11 fingoldo

In fact, why do the XGB* estimators even need to change X at all? XGBoost supports automatic handling of features with dtype category, so I vote for dropping the current logic and just ensuring that the enable_categorical=True param is passed to the estimator; see the sketch below.
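
For illustration, a minimal sketch reusing the repro data above: with enable_categorical=True, XGBoost's native path consumes the category-dtype frames as-is, so the train and eval frames only need the same column order, with no FLAML-specific rearrangement:

import xgboost as xgb

# Both DMatrix objects keep the original column order, so feature names match.
dtrain = xgb.DMatrix(X_train, label=y_train, enable_categorical=True)
dvalid = xgb.DMatrix(X_valid, label=y_valid, enable_categorical=True)
assert dtrain.feature_names == dvalid.feature_names

# Training with the untouched frames works fine.
xgb.train({"objective": "binary:logistic"}, dtrain, num_boost_round=5,
          evals=[(dvalid, "valid")])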

fingoldo avatar Nov 08 '25 12:11 fingoldo

Thank you, @fingoldo, for the feedback! Would you like to raise a PR for the fix?

thinkall avatar Nov 19 '25 04:11 thinkall