sklearn-onnx
sklearn-onnx copied to clipboard
Got input with wrong type during conversion when using pipeline
I am trying to convert a regression model that needs preprocessing over numerical and categorical features. The categorical part works fine, but something is off with the numerical part. The code is as follows:
# %%
import numpy as np
import pandas as pd
import sklearn
from sklearn.model_selection import train_test_split
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.pipeline import Pipeline
import skl2onnx
from skl2onnx.common.data_types import StringTensorType, Int32TensorType, FloatTensorType
from pyquickhelper.helpgen.graphviz_helper import plot_graphviz
from mlprodict.onnxrt import OnnxInference
import onnxruntime as rt
# Reproducer: fit a ColumnTransformer + GradientBoostingRegressor pipeline on
# a CSV file and convert the fitted pipeline to ONNX with skl2onnx.
path = "./datasets/unbalanced_unique.csv"
# %%
# Load the training data; `head()` only previews it in an interactive cell.
df_train = pd.read_csv(path)
df_train.head()
# %%
# Feature matrix and target as NumPy arrays, then an 80/20 train/test split
# with a fixed seed.
X = df_train[
    ["rows", "query_type", "query_size", "unique", "mode", "threads", "backend"]
].to_numpy()
y = df_train["execution_mean"].to_numpy()
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)
# %%
# Column positions refer to the order of the column list used to build X:
# 0=rows, 2=query_size, 3=unique, 5=threads are scaled;
# 1=query_type, 4=mode, 6=backend are one-hot encoded.
numeric_features = [0, 2, 3, 5]
categorical_features = [1, 4, 6]
numeric_transformer = Pipeline(steps=[
    ('scaler', sklearn.preprocessing.StandardScaler()),
])
categorical_transformer = Pipeline(steps=[
    # `sparse=True` is intentionally not passed: sparse output is already the
    # encoder's default, and the `sparse` keyword was renamed to
    # `sparse_output` in scikit-learn 1.2 and removed in 1.4, so passing it
    # raises TypeError on current releases.
    ('onehot', sklearn.preprocessing.OneHotEncoder(handle_unknown='ignore')),
])
preprocessor = sklearn.compose.ColumnTransformer(
    transformers=[
        ('num', numeric_transformer, numeric_features),
        ('cat', categorical_transformer, categorical_features),
    ])
# %%
# Full model: preprocessing followed by the regressor, fitted on the
# training split.
reg = GradientBoostingRegressor()
pipeline = Pipeline([
    ('preprocess', preprocessor),
    ('reg', reg),
])
pipeline.fit(X_train, y_train)
# %%
# One ONNX graph input per source column, each declared with shape [1, 1],
# listed in the same order as the columns of X.
initial_type = [
    ("rows", Int32TensorType([1, 1])),
    ("query_type", StringTensorType([1, 1])),
    ("query_size", Int32TensorType([1, 1])),
    ("unique", FloatTensorType([1, 1])),
    ("mode", StringTensorType([1, 1])),
    ("threads", Int32TensorType([1, 1])),
    ("backend", StringTensorType([1, 1])),
]
# %%
# Convert the fitted pipeline and write the serialized model to disk.
# The write must sit inside the `with` block so the file handle is open.
onx = skl2onnx.convert_sklearn(pipeline, initial_types=initial_type)
with open("./out/regression.onnx", "wb") as f:
    f.write(onx.SerializeToString())
The error message is
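RuntimeError: Operator SklearnScaler (type: SklearnScaler) got an input merged_columns with a wrong type <...>. Only [<...>, <...>, <...>] are allowed
(Note: the class names printed inside angle brackets appear to have been lost when this message was rendered; `<...>` marks each missing name.)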
I tried with a dummy set and it works. Maybe pandas changed the type of a column because one row is misaligned or for some other reason.
import numpy as np
import pandas as pd
import sklearn
from sklearn.model_selection import train_test_split
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.pipeline import Pipeline
import skl2onnx
from skl2onnx.common.data_types import (
StringTensorType,
Int32TensorType,
FloatTensorType,
)
from pyquickhelper.helpgen.graphviz_helper import plot_graphviz
from onnx.reference import ReferenceEvaluator
import onnxruntime as rt
# Self-contained variant of the reproducer: a two-row in-memory frame stands
# in for "./datasets/unbalanced_unique.csv", so no input file is needed.
# %%
df_train = pd.DataFrame(
    [
        {
            "rows": 5,
            "query_type": "A",
            "query_size": 4,
            "unique": 1,
            "mode": "E",
            "threads": 5,
            "backend": "ZZ",
            "execution_mean": 5.5,
        },
        {
            "rows": 4,
            "query_type": "B",
            "query_size": 2,
            "unique": 0,
            "mode": "FF",
            "threads": 5,
            "backend": "WWW",
            "execution_mean": 4.5,
        },
    ]
)
# Preview only; has no effect outside an interactive cell.
df_train.head()
# %%
# Feature matrix and target as NumPy arrays, then a train/test split with a
# fixed seed.
X = df_train[
    ["rows", "query_type", "query_size", "unique", "mode", "threads", "backend"]
].to_numpy()
y = df_train["execution_mean"].to_numpy()
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)
# %%
# Column positions refer to the order of the column list used to build X:
# 0=rows, 2=query_size, 3=unique, 5=threads are scaled;
# 1=query_type, 4=mode, 6=backend are one-hot encoded.
numeric_features = [0, 2, 3, 5]
categorical_features = [1, 4, 6]
numeric_transformer = Pipeline(
    steps=[("scaler", sklearn.preprocessing.StandardScaler())]
)
categorical_transformer = Pipeline(
    steps=[
        (
            "onehot",
            # `sparse=True` is intentionally not passed: sparse output is
            # already the encoder's default, and the `sparse` keyword was
            # renamed to `sparse_output` in scikit-learn 1.2 and removed in
            # 1.4, so passing it raises TypeError on current releases.
            sklearn.preprocessing.OneHotEncoder(handle_unknown="ignore"),
        ),
    ]
)
preprocessor = sklearn.compose.ColumnTransformer(
    transformers=[
        ("num", numeric_transformer, numeric_features),
        ("cat", categorical_transformer, categorical_features),
    ]
)
# %%
# Full model: preprocessing followed by the regressor, fitted on the
# training split.
reg = GradientBoostingRegressor()
pipeline = Pipeline([("preprocess", preprocessor), ("reg", reg)])
pipeline.fit(X_train, y_train)
# %%
# One ONNX graph input per source column, each declared with shape [1, 1],
# listed in the same order as the columns of X.
initial_type = [
    ("rows", Int32TensorType([1, 1])),
    ("query_type", StringTensorType([1, 1])),
    ("query_size", Int32TensorType([1, 1])),
    ("unique", FloatTensorType([1, 1])),
    ("mode", StringTensorType([1, 1])),
    ("threads", Int32TensorType([1, 1])),
    ("backend", StringTensorType([1, 1])),
]
# %%
# Convert the fitted pipeline and write the serialized model to disk.
# The write must sit inside the `with` block so the file handle is open.
onx = skl2onnx.convert_sklearn(pipeline, initial_types=initial_type)
with open("regression.onnx", "wb") as f:
    f.write(onx.SerializeToString())