onnxmltools
onnxmltools copied to clipboard
Spark XGB converter accesses missing param improperly
Description
https://github.com/onnx/onnxmltools/pull/373 introduced this constraint on the converter
if hasattr(xgb_node, 'missing') and not np.isnan(xgb_node.missing):
    raise RuntimeError("Cannot convert a XGBoost model where missing values are not "
                       "nan but {}.".format(xgb_node.missing))
Even when I explicitly initialize `SparkXGBClassifier(missing=np.nan)`, this check still fails with:
TypeError: ufunc 'isnan' not supported for the input types,
and the inputs could not be safely coerced to any supported types according to the casting rule ''safe''
Upon inspecting, `xgb_node.missing` is of type `pyspark.ml.param.Param`, so it makes sense that numpy cannot apply `isnan` to it. @xadupre, can you provide some context on this change? I couldn't find much in the PR or the linked issues, but it seems the check is missing something like `Param.value` to access the underlying data before passing it to numpy.
Example
Reproducible script using the following library versions
numpy==1.26.4
onnxconverter_common==1.14.0
onnxmltools==1.12.0
pyspark==3.5.1
xgboost==2.1.0
import numpy as np
from onnxconverter_common.data_types import FloatTensorType
from onnxconverter_common.data_types import Int64TensorType
from onnxconverter_common.registration import register_converter, register_shape_calculator
from onnxmltools import convert_sparkml
from onnxmltools.convert.xgboost.operator_converters.XGBoost import convert_xgboost
from onnxmltools.convert.sparkml.ops_names import sparkml_operator_name_map
from onnxmltools.convert.sparkml.ops_input_output import io_name_map
from pyspark.sql import SparkSession
from pyspark.ml import Pipeline
from pyspark.ml.feature import VectorAssembler
from xgboost.spark import SparkXGBClassifier, SparkXGBClassifierModel
# Local Spark session used to fit the demo pipeline below.
spark = SparkSession.builder.appName("example").getOrCreate()
def calculate_xgb_output_shape(operator):
    """Shape calculator for the custom XGBOOST operator.

    Declares two outputs sized by the input batch dimension:
    an int64 label tensor of shape [N] and a float probability
    tensor of shape [N, 1].
    """
    batch_size = operator.inputs[0].type.shape[0]
    operator.outputs[0].type = Int64TensorType(shape=[batch_size])
    operator.outputs[1].type = FloatTensorType(shape=[batch_size, 1])
# Train a dummy model: two numeric columns assembled into a feature vector,
# with the first column reused as the label.
df = spark.createDataFrame([(0.0, 1.0), (1.0, 2.0), (2.0, 3.0)], ["input", "input_2"])
pipe = Pipeline(stages=[
    VectorAssembler(outputCol="features", inputCols=["input", "input_2"]),
    SparkXGBClassifier(
        features_col="features",
        label_col="input",
        missing=np.nan,  # <-- doesn't take effect: the converter still receives a pyspark Param
    )
])
fitted_pipe = pipe.fit(df)
# Register converters so SparkML recognizes XGB: map the Spark model class to
# an operator name, then attach the XGBoost converter and shape calculator.
sparkml_operator_name_map[SparkXGBClassifierModel] = "XGBOOST"
register_converter("XGBOOST", convert_xgboost, overwrite=True)
register_shape_calculator("XGBOOST", calculate_xgb_output_shape, overwrite=True)
# Tell the sparkml conversion which model columns feed the operator's
# inputs and which columns it produces as outputs.
io_name_map["XGBOOST"] = (
    lambda model: [model.getFeaturesCol()],
    lambda model: [model.getPredictionCol(), model.getProbabilityCol()],
)
# Convert to ONNX — one FloatTensorType per original input column.
initial_types = [
    ("input", FloatTensorType([None, 1])),
    ("input_2", FloatTensorType([None, 1]))
]
onnx_pipe = convert_sparkml(fitted_pipe, "pipe", initial_types) # Triggers numpy TypeError