scikit-learn-intelex icon indicating copy to clipboard operation
scikit-learn-intelex copied to clipboard

joblib Object is not serializable

Open coolmian opened this issue 1 year ago • 0 comments

Describe the bug

When I add this code:

from sklearnex import patch_sklearn
patch_sklearn()

and then save a fitted model with joblib.dump or pickle.dump, an error is raised.

Steps/Code to Reproduce

from sklearnex import patch_sklearn
import joblib
import pandas as pd
# Adding the patch_sklearn() call below makes joblib.dump fail, but I need the acceleration from sklearnex.
patch_sklearn()
from sklearn.base import TransformerMixin, BaseEstimator
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.pipeline import FeatureUnion, Pipeline
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.feature_extraction import DictVectorizer

class ItemSelector(TransformerMixin, BaseEstimator):
    """Transformer that indexes its input with ``keys``."""

    def __init__(self, keys):
        # Key (or list of keys) later used to index the input in transform().
        self.keys = keys

    def fit(self, x, y=None):
        """Learn nothing; return this instance unchanged."""
        return self

    def transform(self, dataframe):
        """Return ``dataframe[self.keys]``."""
        selection = dataframe[self.keys]
        return selection


class VotesToDictTransformer(TransformerMixin, BaseEstimator):
    """Transformer that wraps each ``votes['overall']`` item in a one-key dict."""

    def fit(self, x, y=None):
        """Learn nothing; return this instance unchanged."""
        return self

    def transform(self, votes):
        """Return a list with one ``{'overall': value}`` dict per item of ``votes['overall']``."""
        records = []
        for score in votes['overall']:
            records.append({'overall': score})
        return records

# Two-row toy dataset: a text column ('reviewText') and a numeric column ('overall').
X_train = pd.DataFrame({'reviewText':["i hate you", "i love you"], 'overall': [1.2, 1.2]})
# Labels are supplied as a one-column DataFrame.
y_train = pd.DataFrame({'label': [1, 0]})

# Full model: a FeatureUnion of two sub-pipelines followed by an SVC classifier.
pipe = Pipeline([

    ('union', FeatureUnion(
        transformer_list=[

            # Text branch: select 'reviewText', then vectorize it with TfidfVectorizer.
            ('bagofwords', Pipeline([
                ('selector', ItemSelector(keys='reviewText')),
                ('counts', TfidfVectorizer()),
            ])),

            # Numeric branch: select ['overall'], convert to dicts, then DictVectorizer.
            ('votes', Pipeline([
                ('selector', ItemSelector(keys=['overall'])),
                ('votes_to_dict', VotesToDictTransformer()),
                ('vectorizer', DictVectorizer()),
            ])),

        ],

        # Weights passed to FeatureUnion, keyed by the branch names above.
        transformer_weights={
            'bagofwords': 3.0,
            'votes': 1
        },

    )),

    # NOTE(review): SVC is imported after patch_sklearn(), so this is presumably
    # the sklearnex-patched estimator -- confirm it is the non-serializable part.
    ('clf', SVC()),
])

pipe.fit(X_train, y_train)
# This is the statement the traceback points at (ValueError: Object is not serializable).
joblib.dump(pipe, f'model.joblib')

Expected Results

No error: joblib.dump should write the fitted pipeline to model.joblib.

Actual Results

---------------------------------------------------------------------------
ValueError                                Traceback (most recent call last)
~\AppData\Local\Temp/ipykernel_25092/3721571015.py in <module>
     63 
     64 pipe.fit(X_train, y_train)
---> 65 joblib.dump(pipe, f'model.joblib')

e:\ProgramData\Anaconda3\envs\python38\lib\site-packages\joblib\numpy_pickle.py in dump(value, filename, compress, protocol, cache_size)
    478     elif is_filename:
    479         with open(filename, 'wb') as f:
--> 480             NumpyPickler(f, protocol=protocol).dump(value)
    481     else:
    482         NumpyPickler(filename, protocol=protocol).dump(value)

e:\ProgramData\Anaconda3\envs\python38\lib\pickle.py in dump(self, obj)
    485         if self.proto >= 4:
    486             self.framer.start_framing()
--> 487         self.save(obj)
    488         self.write(STOP)
    489         self.framer.end_framing()

e:\ProgramData\Anaconda3\envs\python38\lib\site-packages\joblib\numpy_pickle.py in save(self, obj)
    280             return
    281 
--> 282         return Pickler.save(self, obj)
...
--> 578                     rv = reduce(self.proto)
    579                 else:
    580                     reduce = getattr(obj, "__reduce__", None)

ValueError: Object is not serializable

Versions

System:
    python: 3.8.12 | packaged by conda-forge | (default, Oct 12 2021, 21:19:05) [MSC v.1916 64 bit (AMD64)]
executable: e:\ProgramData\Anaconda3\envs\python38\python.exe
   machine: Windows-10-10.0.19042-SP0

Python dependencies:
          pip: 20.3.3
   setuptools: 52.0.0.post20210125
      sklearn: 0.24.1
        numpy: 1.20.1
        scipy: 1.6.1
       Cython: 0.29.23
       pandas: 1.3.0
   matplotlib: 3.4.2
       joblib: 1.0.1
threadpoolctl: 2.1.0

Built with OpenMP: True

OS: Windows10 PRO 20H2

coolmian avatar Aug 11 '22 02:08 coolmian