scikit-learn-intelex
scikit-learn-intelex copied to clipboard
joblib Object is not serializable
Describe the bug
When I add the following code:
from sklearnex import patch_sklearn
patch_sklearn()
After that, calling joblib.dump or pickle.dump raises an error.
Steps/Code to Reproduce
from sklearnex import patch_sklearn
import joblib
import pandas as pd
# Calling patch_sklearn() on the next line triggers the error, but I need the acceleration from sklearnex.
patch_sklearn()
from sklearn.base import TransformerMixin, BaseEstimator
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.pipeline import FeatureUnion, Pipeline
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.feature_extraction import DictVectorizer
class ItemSelector(TransformerMixin, BaseEstimator):
    """Stateless transformer that selects item(s) from its input by key.

    Parameters
    ----------
    keys
        Stored as-is and used to index the input in ``transform``
        (``dataframe[keys]``).
    """

    def __init__(self, keys):
        self.keys = keys

    def fit(self, x, y=None):
        """Do nothing and return ``self``; both arguments are ignored."""
        return self

    def transform(self, dataframe):
        """Return ``dataframe[self.keys]``."""
        return dataframe[self.keys]
class VotesToDictTransformer(TransformerMixin, BaseEstimator):
    """Stateless transformer that wraps each ``'overall'`` value in a dict."""

    def fit(self, x, y=None):
        """Do nothing and return ``self``; both arguments are ignored."""
        return self

    def transform(self, votes):
        """Return a list with one ``{'overall': value}`` dict per entry.

        The entries are taken from iterating ``votes['overall']``.
        """
        overall = votes['overall']
        return [{'overall': o} for o in overall]
# Minimal two-row training set: one text column and one numeric column.
X_train = pd.DataFrame({'reviewText':["i hate you", "i love you"], 'overall': [1.2, 1.2]})
# NOTE(review): the target is a one-column DataFrame rather than a 1-D
# array/Series — confirm this is intended for the reproduction.
y_train = pd.DataFrame({'label': [1, 0]})
# Two feature branches are combined by FeatureUnion and fed to an SVC.
pipe = Pipeline([
    ('union', FeatureUnion(
        transformer_list=[
            # Branch 1: the 'reviewText' column goes through TfidfVectorizer.
            ('bagofwords', Pipeline([
                ('selector', ItemSelector(keys='reviewText')),
                ('counts', TfidfVectorizer()),
            ])),
            # Branch 2: the 'overall' column is turned into dicts and then
            # passed to DictVectorizer.
            ('votes', Pipeline([
                ('selector', ItemSelector(keys=['overall'])),
                ('votes_to_dict', VotesToDictTransformer()),
                ('vectorizer', DictVectorizer()),
            ])),
        ],
        # Per-branch weights, keyed by the names used in transformer_list.
        transformer_weights={
            'bagofwords': 3.0,
            'votes': 1
        },
    )),
    ('clf', SVC()),
])
pipe.fit(X_train, y_train)
# This is the call reported to fail with "ValueError: Object is not
# serializable" when patch_sklearn() has been applied.
joblib.dump(pipe, f'model.joblib')
Expected Results
No error is raised and the fitted pipeline is saved to model.joblib.
Actual Results
---------------------------------------------------------------------------
ValueError Traceback (most recent call last)
~\AppData\Local\Temp/ipykernel_25092/3721571015.py in <module>
63
64 pipe.fit(X_train, y_train)
---> 65 joblib.dump(pipe, f'model.joblib')
e:\ProgramData\Anaconda3\envs\python38\lib\site-packages\joblib\numpy_pickle.py in dump(value, filename, compress, protocol, cache_size)
478 elif is_filename:
479 with open(filename, 'wb') as f:
--> 480 NumpyPickler(f, protocol=protocol).dump(value)
481 else:
482 NumpyPickler(filename, protocol=protocol).dump(value)
e:\ProgramData\Anaconda3\envs\python38\lib\pickle.py in dump(self, obj)
485 if self.proto >= 4:
486 self.framer.start_framing()
--> 487 self.save(obj)
488 self.write(STOP)
489 self.framer.end_framing()
e:\ProgramData\Anaconda3\envs\python38\lib\site-packages\joblib\numpy_pickle.py in save(self, obj)
280 return
281
--> 282 return Pickler.save(self, obj)
...
--> 578 rv = reduce(self.proto)
579 else:
580 reduce = getattr(obj, "__reduce__", None)
ValueError: Object is not serializable
Versions
System:
python: 3.8.12 | packaged by conda-forge | (default, Oct 12 2021, 21:19:05) [MSC v.1916 64 bit (AMD64)]
executable: e:\ProgramData\Anaconda3\envs\python38\python.exe
machine: Windows-10-10.0.19042-SP0
Python dependencies:
pip: 20.3.3
setuptools: 52.0.0.post20210125
sklearn: 0.24.1
numpy: 1.20.1
scipy: 1.6.1
Cython: 0.29.23
pandas: 1.3.0
matplotlib: 3.4.2
joblib: 1.0.1
threadpoolctl: 2.1.0
Built with OpenMP: True
OS: Windows10 PRO 20H2