tesserocr
                        (Parallelization) TypeError: no default __reduce__ due to non-trivial __cinit__
I was trying to set up Dask to parallelize OCR across multiple documents, but the PyTessBaseAPI class doesn't appear to be picklable. Is this something that could be implemented in the future, or is there a supported alternative for multiprocessing that I'm missing?
TypeError                                 Traceback (most recent call last)
<ipython-input-8-eed971444e6e> in <module>
----> 1 client.map(t.extract, ["samples/statement.pdf"])
//anaconda3/envs/eve/lib/python3.7/site-packages/distributed/client.py in map(self, func, key, workers, retries, resources, priority, allow_other_workers, fifo_timeout, actor, actors, pure, *iterables, **kwargs)
   1672             user_priority=priority,
   1673             fifo_timeout=fifo_timeout,
-> 1674             actors=actor,
   1675         )
   1676         logger.debug("map(%s, ...)", funcname(func))
//anaconda3/envs/eve/lib/python3.7/site-packages/distributed/client.py in _graph_to_futures(self, dsk, keys, restrictions, loose_restrictions, priority, user_priority, resources, retries, fifo_timeout, actors)
   2486                 {
   2487                     "op": "update-graph",
-> 2488                     "tasks": valmap(dumps_task, dsk3),
   2489                     "dependencies": dependencies,
   2490                     "keys": list(flatkeys),
//anaconda3/envs/eve/lib/python3.7/site-packages/cytoolz/dicttoolz.pyx in cytoolz.dicttoolz.valmap()
//anaconda3/envs/eve/lib/python3.7/site-packages/cytoolz/dicttoolz.pyx in cytoolz.dicttoolz.valmap()
//anaconda3/envs/eve/lib/python3.7/site-packages/distributed/worker.py in dumps_task(task)
   3236             return d
   3237         elif not any(map(_maybe_complex, task[1:])):
-> 3238             return {"function": dumps_function(task[0]), "args": warn_dumps(task[1:])}
   3239     return to_serialize(task)
   3240
//anaconda3/envs/eve/lib/python3.7/site-packages/distributed/worker.py in dumps_function(func)
   3201         result = cache[func]
   3202     except KeyError:
-> 3203         result = pickle.dumps(func)
   3204         if len(result) < 100000:
   3205             cache[func] = result
//anaconda3/envs/eve/lib/python3.7/site-packages/distributed/protocol/pickle.py in dumps(x)
     49     except Exception:
     50         try:
---> 51             return cloudpickle.dumps(x, protocol=pickle.HIGHEST_PROTOCOL)
     52         except Exception as e:
     53             logger.info("Failed to serialize %s. Exception: %s", x, e)
//anaconda3/envs/eve/lib/python3.7/site-packages/cloudpickle/cloudpickle.py in dumps(obj, protocol)
   1123     try:
   1124         cp = CloudPickler(file, protocol=protocol)
-> 1125         cp.dump(obj)
   1126         return file.getvalue()
   1127     finally:
//anaconda3/envs/eve/lib/python3.7/site-packages/cloudpickle/cloudpickle.py in dump(self, obj)
    480         self.inject_addons()
    481         try:
--> 482             return Pickler.dump(self, obj)
    483         except RuntimeError as e:
    484             if 'recursion' in e.args[0]:
//anaconda3/envs/eve/lib/python3.7/pickle.py in dump(self, obj)
    435         if self.proto >= 4:
    436             self.framer.start_framing()
--> 437         self.save(obj)
    438         self.write(STOP)
    439         self.framer.end_framing()
//anaconda3/envs/eve/lib/python3.7/pickle.py in save(self, obj, save_persistent_id)
    502         f = self.dispatch.get(t)
    503         if f is not None:
--> 504             f(self, obj) # Call unbound method with explicit self
    505             return
    506
//anaconda3/envs/eve/lib/python3.7/site-packages/cloudpickle/cloudpickle.py in save_instancemethod(self, obj)
    888         else:
    889             if PY3:  # pragma: no branch
--> 890                 self.save_reduce(types.MethodType, (obj.__func__, obj.__self__), obj=obj)
    891             else:
    892                 self.save_reduce(
//anaconda3/envs/eve/lib/python3.7/pickle.py in save_reduce(self, func, args, state, listitems, dictitems, obj)
    636         else:
    637             save(func)
--> 638             save(args)
    639             write(REDUCE)
    640
//anaconda3/envs/eve/lib/python3.7/pickle.py in save(self, obj, save_persistent_id)
    502         f = self.dispatch.get(t)
    503         if f is not None:
--> 504             f(self, obj) # Call unbound method with explicit self
    505             return
    506
//anaconda3/envs/eve/lib/python3.7/pickle.py in save_tuple(self, obj)
    772         if n <= 3 and self.proto >= 2:
    773             for element in obj:
--> 774                 save(element)
    775             # Subtle.  Same as in the big comment below.
    776             if id(obj) in memo:
//anaconda3/envs/eve/lib/python3.7/pickle.py in save(self, obj, save_persistent_id)
    547
    548         # Save the reduce() output and finally memoize the object
--> 549         self.save_reduce(obj=obj, *rv)
    550
    551     def persistent_id(self, obj):
//anaconda3/envs/eve/lib/python3.7/pickle.py in save_reduce(self, func, args, state, listitems, dictitems, obj)
    660
    661         if state is not None:
--> 662             save(state)
    663             write(BUILD)
    664
//anaconda3/envs/eve/lib/python3.7/pickle.py in save(self, obj, save_persistent_id)
    502         f = self.dispatch.get(t)
    503         if f is not None:
--> 504             f(self, obj) # Call unbound method with explicit self
    505             return
    506
//anaconda3/envs/eve/lib/python3.7/pickle.py in save_dict(self, obj)
    857
    858         self.memoize(obj)
--> 859         self._batch_setitems(obj.items())
    860
    861     dispatch[dict] = save_dict
//anaconda3/envs/eve/lib/python3.7/pickle.py in _batch_setitems(self, items)
    883                 for k, v in tmp:
    884                     save(k)
--> 885                     save(v)
    886                 write(SETITEMS)
    887             elif n:
//anaconda3/envs/eve/lib/python3.7/pickle.py in save(self, obj, save_persistent_id)
    522             reduce = getattr(obj, "__reduce_ex__", None)
    523             if reduce is not None:
--> 524                 rv = reduce(self.proto)
    525             else:
    526                 reduce = getattr(obj, "__reduce__", None)
//anaconda3/envs/eve/lib/python3.7/site-packages/tesserocr.cpython-37m-darwin.so in tesserocr.PyTessBaseAPI.__reduce_cython__()
TypeError: no default __reduce__ due to non-trivial __cinit__
I'm not a Dask user myself so I'm not familiar with how it works, but normally what I'd do is initialize a PyTessBaseAPI in each process on startup (see the sketch below).
Making the class picklable isn't a bad idea though and shouldn't be too hard; I'll look into it whenever I have some time, and any help is appreciated.
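A minimal sketch of that per-process approach, assuming a multiprocessing.Pool with an initializer (the names init_worker and extract_text are illustrative, not part of tesserocr). The same idea applies to Dask: create the API lazily inside the mapped function rather than mapping a bound method of an object that holds the API.

    from multiprocessing import Pool

    from tesserocr import PyTessBaseAPI, PSM

    api = None  # one PyTessBaseAPI per worker process

    def init_worker():
        # runs once in each child process, so the API itself is never pickled
        global api
        api = PyTessBaseAPI(psm=PSM.AUTO_OSD)

    def extract_text(path):
        api.SetImageFile(path)  # uses this worker's own instance
        return api.GetUTF8Text()

    if __name__ == '__main__':
        with Pool(processes=4, initializer=init_worker) as pool:
            print(pool.map(extract_text, ['samples/a.png', 'samples/b.png']))

Only plain strings cross the process boundary here; each worker keeps its API instance alive for the lifetime of the pool instead of re-initializing Tesseract per document.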
from multiprocessing import Process

from tesserocr import PyTessBaseAPI, PSM

class TesseractProcess(Process):
    def __init__(self, language, img):
        Process.__init__(self)
        self.tesseractInstance = PyTessBaseAPI(
            lang=language, psm=PSM.AUTO_OSD)
        self.img = img

    def run(self):
        # average the per-word confidences reported by Tesseract
        confidence = 0
        self.tesseractInstance.SetImage(self.img)
        print("Image set")
        arr = list(self.tesseractInstance.AllWordConfidences())
        if len(arr) > 0:
            confidence = sum(arr) / float(len(arr))
        print(confidence)
        return confidence
@sirfz Regarding the approach you mentioned, this still throws TypeError: no default __reduce__ due to non-trivial __cinit__. Any idea what I'm doing wrong?
self.tesseractInstance is being serialized for use in the child process, and that's what fails: the PyTessBaseAPI is created in the parent's __init__, and starting the Process pickles the whole object, including the unpicklable API instance.
My suggestion was to have a pool of processes which each initialize the API on startup and then process requests (e.g. via a queue), as sketched below.
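A rough sketch of that queue-based pool, under the same assumptions as above (illustrative names, image files rather than PDFs):

    from multiprocessing import Process, Queue

    from tesserocr import PyTessBaseAPI, PSM

    def worker(tasks, results):
        # the API is created inside the child, so nothing unpicklable crosses processes
        with PyTessBaseAPI(psm=PSM.AUTO_OSD) as api:
            for path in iter(tasks.get, None):  # None is the shutdown sentinel
                api.SetImageFile(path)
                results.put((path, api.GetUTF8Text()))

    if __name__ == '__main__':
        paths = ['samples/a.png', 'samples/b.png', 'samples/c.png']
        tasks, results = Queue(), Queue()
        procs = [Process(target=worker, args=(tasks, results)) for _ in range(2)]
        for p in procs:
            p.start()
        for path in paths:
            tasks.put(path)   # only plain strings go through the queue
        for _ in procs:
            tasks.put(None)   # one sentinel per worker
        for _ in paths:
            print(results.get())
        for p in procs:
            p.join()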