sklearn-onnx
sklearn-onnx copied to clipboard
cannot save/serialize model with sparse matrix
In the code below, I am able to serialize a model but only if the feature vectors are converted from sparse (e.g. 'scipy.sparse.csr.csr_matrix') to dense.
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier
import numpy as np
from skl2onnx import convert_sklearn
from skl2onnx.common.data_types import FloatTensorType
# Minimal reproduction: convert_sklearn raises a TypeError when the
# KNeighborsClassifier was fitted on a scipy sparse matrix.

# Tiny intent-classification corpus of [utterance, label] pairs.
data = [
    ["schedule a meeting", 0],
    ["schedule a sync with the team", 0],
    ["slot in a meeting", 0],
    ["call ron", 1],
    ["make a phone call", 1],
    ["call in on the phone", 2],
]
docs = [doc for doc, _ in data]
labels = [label for _, label in data]

# fit_transform learns the vocabulary and vectorizes the documents in a single
# pass; the result is a scipy sparse matrix.
vectorizer = TfidfVectorizer()
embeddings = vectorizer.fit_transform(docs)
dim = embeddings.shape[1]

# Workaround: uncommenting the next line (casting to a dense matrix) makes the
# conversion succeed; leaving the embeddings sparse triggers the crash below.
# embeddings = embeddings.todense()

clf = KNeighborsClassifier(n_neighbors=1)
clf.fit(embeddings, labels)

# Declare a single float input named 'float_input' with shape [1, dim].
initial_type = [('float_input', FloatTensorType([1, dim]))]

# The crash occurs on this call when clf was fitted on sparse embeddings.
onnx_model = convert_sklearn(clf, initial_types=initial_type)

# Only reached when the conversion succeeds (i.e. with dense embeddings).
with open('model.onnx', 'wb') as f:
    f.write(onnx_model.SerializeToString())
If the `embeddings = embeddings.todense()` line in the snippet above is left commented out
(so the embeddings stay sparse), I get this error:
Traceback (most recent call last):
File ".\xo.py", line 32, in <module>
onnx_model = convert_sklearn(clf, initial_types=initial_type) # this is line 37, where the crash occurs
File "C:\Users\Stefan Larson\AppData\Local\Programs\Python\Python38\lib\site-packages\skl2onnx\convert.py", line 160, in convert_sklearn
onnx_model = convert_topology(topology, name, doc_string, target_opset,
File "C:\Users\Stefan Larson\AppData\Local\Programs\Python\Python38\lib\site-packages\skl2onnx\common\_topology.py", line 1087, in convert_topology
conv(scope, operator, container)
File "C:\Users\Stefan Larson\AppData\Local\Programs\Python\Python38\lib\site-packages\skl2onnx\common\_registration.py", line 29, in __call__
return self._fct(*args)
File "C:\Users\Stefan Larson\AppData\Local\Programs\Python\Python38\lib\site-packages\skl2onnx\operator_converters\nearest_neighbours.py", line 442, in convert_nearest_neighbors_classifier
many = _convert_nearest_neighbors(operator, container)
File "C:\Users\Stefan Larson\AppData\Local\Programs\Python\Python38\lib\site-packages\skl2onnx\operator_converters\nearest_neighbours.py", line 250, in _convert_nearest_neighbors
top_indices = onnx_nearest_neighbors_indices_k(
File "C:\Users\Stefan Larson\AppData\Local\Programs\Python\Python38\lib\site-packages\skl2onnx\operator_converters\nearest_neighbours.py", line 89, in onnx_nearest_neighbors_indices_k
dist = onnx_cdist(X, Y, metric=metric, dtype=dtype,
File "C:\Users\Stefan Larson\AppData\Local\Programs\Python\Python38\lib\site-packages\skl2onnx\algebra\complex_functions.py", line 88, in onnx_cdist
res = _onnx_cdist_sqeuclidean(
File "C:\Users\Stefan Larson\AppData\Local\Programs\Python\Python38\lib\site-packages\skl2onnx\algebra\complex_functions.py", line 148, in _onnx_cdist_sqeuclidean
return _onnx_cdist_end(XA, XB, id_next, flat, dtype, op_version,
File "C:\Users\Stefan Larson\AppData\Local\Programs\Python\Python38\lib\site-packages\skl2onnx\algebra\complex_functions.py", line 130, in _onnx_cdist_end
node = OnnxScan(XA, XB, output_names=['u(scan0)', 'u(scan1)'],
File "C:\Users\Stefan Larson\AppData\Local\Programs\Python\Python38\lib\site-packages\skl2onnx\algebra\onnx_ops.py", line 79, in __init__
OnnxOperator.__init__(self, *args, **kwargs)
File "C:\Users\Stefan Larson\AppData\Local\Programs\Python\Python38\lib\site-packages\skl2onnx\algebra\onnx_operator.py", line 311, in __init__
raise TypeError(
TypeError: Unable to interpret the input name for type <class 'scipy.sparse.csr.csr_matrix'> in operator 'OnnxScan' (value= (0, 3) 0.70710677
(0, 7) 0.70710677
(1, 7) 0.39339983
(1, 9) 0.47974753
(1, 10) 0.47974753
(1, 11) 0.39339983
(1, 12) 0.47974753
(2, 1) 0.53550583
(2, 3) 0.53550583
(2, 8) 0.65304446
(3, 0) 0.5692126
(3, 6) 0.82219034
(4, 0) 0.47196442
(4, 2) 0.6817217
(4, 5) 0.55902153
(5, 0) 0.37023818
(5, 1) 0.43853122
(5, 4) 0.53478485
(5, 5) 0.43853122
(5, 11) 0.43853122).
Possibly related: https://github.com/microsoft/onnxruntime/issues/3144
The error message is not clear enough, but sparse tensors are not yet fully supported in onnxruntime. It is safer to remove every use of sparse matrices when converting a pipeline with sklearn-onnx.