kiwipiepy
`SwTokenizer`가 피클되지 않는 문제
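키위 토크나이저(`KiwiTokenizer`)를 멀티프로세싱과 함께 사용하려다 발견한 문제입니다. 멀티프로세싱은 객체를 워커 프로세스로 전달할 때 pickle로 직렬화하므로, 토크나이저 내부의 `SwTokenizer`가 피클되지 않으면 토크나이저를 워커 프로세스에 넘길 수 없습니다.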
개발환경
Windows 11
python 3.10.12
kiwipiepy 0.15.2
transformers 4.31.0
tokenizers 0.13.3
dill 0.3.7
문제
import kiwipiepy.transformers_addon
from transformers import AutoTokenizer
repo = "kiwi-farm/roberta-base-32k"
tk = AutoTokenizer.from_pretrained(repo)
import pickle
pickled = pickle.dumps(tk)
---------------------------------------------------------------------------
TypeError Traceback (most recent call last)
Cell In[2], line 4
1 import pickle
2 import dill
----> 4 pickled = pickle.dumps(tk)
TypeError: cannot pickle 'SwTokenizer' object
import dill
pickled = dill.dumps(tk)
---------------------------------------------------------------------------
TypeError Traceback (most recent call last)
Cell In[3], line 3
1 import dill
----> 3 pickled = dill.dumps(tk)
File [c:\Users\dowon\miniconda3\envs\kiwi\lib\site-packages\dill\_dill.py:278](file:///C:/Users/dowon/miniconda3/envs/kiwi/lib/site-packages/dill/_dill.py:278), in dumps(obj, protocol, byref, fmode, recurse, **kwds)
254 """
255 Pickle an object to a string.
256
(...)
275 Default values for keyword arguments can be set in :mod:`dill.settings`.
276 """
277 file = StringIO()
--> 278 dump(obj, file, protocol, byref, fmode, recurse, **kwds)#, strictio)
279 return file.getvalue()
File [c:\Users\dowon\miniconda3\envs\kiwi\lib\site-packages\dill\_dill.py:250](file:///C:/Users/dowon/miniconda3/envs/kiwi/lib/site-packages/dill/_dill.py:250), in dump(obj, file, protocol, byref, fmode, recurse, **kwds)
248 _kwds = kwds.copy()
249 _kwds.update(dict(byref=byref, fmode=fmode, recurse=recurse))
--> 250 Pickler(file, protocol, **_kwds).dump(obj)
251 return
File [c:\Users\dowon\miniconda3\envs\kiwi\lib\site-packages\dill\_dill.py:418](file:///C:/Users/dowon/miniconda3/envs/kiwi/lib/site-packages/dill/_dill.py:418), in Pickler.dump(self, obj)
416 def dump(self, obj): #NOTE: if settings change, need to update attributes
417 logger.trace_setup(self)
--> 418 StockPickler.dump(self, obj)
File [c:\Users\dowon\miniconda3\envs\kiwi\lib\pickle.py:487](file:///C:/Users/dowon/miniconda3/envs/kiwi/lib/pickle.py:487), in _Pickler.dump(self, obj)
485 if self.proto >= 4:
486 self.framer.start_framing()
--> 487 self.save(obj)
488 self.write(STOP)
489 self.framer.end_framing()
File [c:\Users\dowon\miniconda3\envs\kiwi\lib\site-packages\dill\_dill.py:412](file:///C:/Users/dowon/miniconda3/envs/kiwi/lib/site-packages/dill/_dill.py:412), in Pickler.save(self, obj, save_persistent_id)
410 msg = "Can't pickle %s: attribute lookup builtins.generator failed" % GeneratorType
411 raise PicklingError(msg)
--> 412 StockPickler.save(self, obj, save_persistent_id)
File [c:\Users\dowon\miniconda3\envs\kiwi\lib\pickle.py:603](file:///C:/Users/dowon/miniconda3/envs/kiwi/lib/pickle.py:603), in _Pickler.save(self, obj, save_persistent_id)
599 raise PicklingError("Tuple returned by %s must have "
600 "two to six elements" % reduce)
602 # Save the reduce() output and finally memoize the object
--> 603 self.save_reduce(obj=obj, *rv)
File [c:\Users\dowon\miniconda3\envs\kiwi\lib\pickle.py:717](file:///C:/Users/dowon/miniconda3/envs/kiwi/lib/pickle.py:717), in _Pickler.save_reduce(self, func, args, state, listitems, dictitems, state_setter, obj)
715 if state is not None:
716 if state_setter is None:
--> 717 save(state)
718 write(BUILD)
719 else:
720 # If a state_setter is specified, call it instead of load_build
721 # to update obj's with its previous state.
722 # First, push state_setter and its tuple of expected arguments
723 # (obj, state) onto the stack.
File [c:\Users\dowon\miniconda3\envs\kiwi\lib\site-packages\dill\_dill.py:412](file:///C:/Users/dowon/miniconda3/envs/kiwi/lib/site-packages/dill/_dill.py:412), in Pickler.save(self, obj, save_persistent_id)
410 msg = "Can't pickle %s: attribute lookup builtins.generator failed" % GeneratorType
411 raise PicklingError(msg)
--> 412 StockPickler.save(self, obj, save_persistent_id)
File [c:\Users\dowon\miniconda3\envs\kiwi\lib\pickle.py:560](file:///C:/Users/dowon/miniconda3/envs/kiwi/lib/pickle.py:560), in _Pickler.save(self, obj, save_persistent_id)
558 f = self.dispatch.get(t)
559 if f is not None:
--> 560 f(self, obj) # Call unbound method with explicit self
561 return
563 # Check private dispatch table if any, or else
564 # copyreg.dispatch_table
File [c:\Users\dowon\miniconda3\envs\kiwi\lib\site-packages\dill\_dill.py:1212](file:///C:/Users/dowon/miniconda3/envs/kiwi/lib/site-packages/dill/_dill.py:1212), in save_module_dict(pickler, obj)
1209 if is_dill(pickler, child=False) and pickler._session:
1210 # we only care about session the first pass thru
1211 pickler._first_pass = False
-> 1212 StockPickler.save_dict(pickler, obj)
1213 logger.trace(pickler, "# D2")
1214 return
File [c:\Users\dowon\miniconda3\envs\kiwi\lib\pickle.py:972](file:///C:/Users/dowon/miniconda3/envs/kiwi/lib/pickle.py:972), in _Pickler.save_dict(self, obj)
969 self.write(MARK + DICT)
971 self.memoize(obj)
--> 972 self._batch_setitems(obj.items())
File [c:\Users\dowon\miniconda3\envs\kiwi\lib\pickle.py:998](file:///C:/Users/dowon/miniconda3/envs/kiwi/lib/pickle.py:998), in _Pickler._batch_setitems(self, items)
996 for k, v in tmp:
997 save(k)
--> 998 save(v)
999 write(SETITEMS)
1000 elif n:
File [c:\Users\dowon\miniconda3\envs\kiwi\lib\site-packages\dill\_dill.py:412](file:///C:/Users/dowon/miniconda3/envs/kiwi/lib/site-packages/dill/_dill.py:412), in Pickler.save(self, obj, save_persistent_id)
410 msg = "Can't pickle %s: attribute lookup builtins.generator failed" % GeneratorType
411 raise PicklingError(msg)
--> 412 StockPickler.save(self, obj, save_persistent_id)
File [c:\Users\dowon\miniconda3\envs\kiwi\lib\pickle.py:578](file:///C:/Users/dowon/miniconda3/envs/kiwi/lib/pickle.py:578), in _Pickler.save(self, obj, save_persistent_id)
576 reduce = getattr(obj, "__reduce_ex__", None)
577 if reduce is not None:
--> 578 rv = reduce(self.proto)
579 else:
580 reduce = getattr(obj, "__reduce__", None)
TypeError: cannot pickle 'SwTokenizer' object
dill.detect.baditems(tk)
# repr 문제를 해결한 브랜치에서 실행했기 때문에 이 호출 자체는 에러 없이 동작합니다.
[KiwiTokenizer(name_or_path='kiwi-farm/roberta-base-32k', vocab_size=32000, model_max_length=512, is_fast=False, padding_side='right', truncation_side='right', special_tokens={'bos_token': '[BOS]', 'eos_token': '[EOS]', 'unk_token': '[UNK]', 'sep_token': '[SEP]', 'pad_token': '[PAD]', 'cls_token': '[CLS]', 'mask_token': '[MASK]'}, clean_up_tokenization_spaces=True)]
dill.detect.baditems(tk._tokenizer)
[<kiwipiepy.sw_tokenizer.SwTokenizer object at 0x0000023F82CE9040>]