mars
mars copied to clipboard
[BUG] Failover remote functions failed
trafficstars
Describe the bug
Fail-over of remote functions fails: graph execution aborts with `AttributeError: 'NoneType' object has no attribute 'chunk_size'`.
To Reproduce: To help us reproduce this bug, please provide the information below:
- Your Python version
- The version of Mars you use
- Versions of crucial packages, such as numpy, scipy and protobuf
- Full stack of the error.
- Minimized code to reproduce the error.
---------------------------------------------------------------------------
AttributeError Traceback (most recent call last)
/home/admin/work/_public-mars-0.4.2.zip/mars/scheduler/graph.py in _execute_graph()
/home/admin/work/_public-mars-0.4.2.zip/mars/utils.py in _wrapped()
/home/admin/work/_public-mars-0.4.2.zip/mars/scheduler/graph.py in analyze_graph()
/home/admin/work/_public-mars-0.4.2.zip/mars/scheduler/graph.py in _assign_initial_workers()
/opt/conda/lib/python3.7/site-packages/gevent/_gevent_cevent.cpython-37m-x86_64-linux-gnu.so in gevent._gevent_cevent.AsyncResult.result()
/opt/conda/lib/python3.7/site-packages/gevent/_gevent_cevent.cpython-37m-x86_64-linux-gnu.so in gevent._gevent_cevent.AsyncResult.get()
/opt/conda/lib/python3.7/site-packages/gevent/_gevent_cevent.cpython-37m-x86_64-linux-gnu.so in gevent._gevent_cevent.AsyncResult.get()
/opt/conda/lib/python3.7/site-packages/gevent/_gevent_cevent.cpython-37m-x86_64-linux-gnu.so in gevent._gevent_cevent.AsyncResult.get()
/opt/conda/lib/python3.7/site-packages/gevent/_gevent_cevent.cpython-37m-x86_64-linux-gnu.so in gevent._gevent_cevent.AsyncResult._raise_exception()
/opt/conda/lib/python3.7/site-packages/gevent/_compat.py in reraise()
/opt/conda/lib/python3.7/site-packages/gevent/threadpool.py in __run_task()
/usr/local/lib/python3.7/site-packages/mars/actors/pool/gevent_pool.pyx in mars.actors.pool.gevent_pool.GeventThreadPool._wrap_watch.inner()
126 gevent.spawn(check, event)
--> 127 result = fn(*args, **kwargs)
128 event.set()
/home/admin/work/_public-mars-0.4.2.zip/mars/scheduler/graph.py in _do_assign()
/home/admin/work/_public-mars-0.4.2.zip/mars/scheduler/graph.py in assign_operand_workers()
/home/admin/work/_public-mars-0.4.2.zip/mars/scheduler/analyzer.py in calc_operand_assignments()
/home/admin/work/_public-mars-0.4.2.zip/mars/scheduler/analyzer.py in _iter_assignments_by_transfer_sizes()
/home/admin/work/_public-mars-0.4.2.zip/mars/scheduler/analyzer.py in <genexpr>()
/home/admin/work/_public-mars-0.4.2.zip/mars/scheduler/analyzer.py in <genexpr>()
AttributeError: 'NoneType' object has no attribute 'chunk_size'
The above exception was the direct cause of the following exception:
ExecutionFailed Traceback (most recent call last)
/home/admin/work/_public-mars-0.4.2.zip/mars/promise.py in _wrapped()
/home/admin/work/_public-mars-0.4.2.zip/mars/worker/calc.py in <lambda>()
/home/admin/work/_public-mars-0.4.2.zip/mars/worker/calc.py in _start_calc()
/home/admin/work/_public-mars-0.4.2.zip/mars/worker/calc.py in _calc_results()
/opt/conda/lib/python3.7/site-packages/gevent/_gevent_cevent.cpython-37m-x86_64-linux-gnu.so in gevent._gevent_cevent.AsyncResult.result()
/opt/conda/lib/python3.7/site-packages/gevent/_gevent_cevent.cpython-37m-x86_64-linux-gnu.so in gevent._gevent_cevent.AsyncResult.get()
/opt/conda/lib/python3.7/site-packages/gevent/_gevent_cevent.cpython-37m-x86_64-linux-gnu.so in gevent._gevent_cevent.AsyncResult.get()
/opt/conda/lib/python3.7/site-packages/gevent/_gevent_cevent.cpython-37m-x86_64-linux-gnu.so in gevent._gevent_cevent.AsyncResult.get()
/opt/conda/lib/python3.7/site-packages/gevent/_gevent_cevent.cpython-37m-x86_64-linux-gnu.so in gevent._gevent_cevent.AsyncResult._raise_exception()
/opt/conda/lib/python3.7/site-packages/gevent/_compat.py in reraise()
/opt/conda/lib/python3.7/site-packages/gevent/threadpool.py in __run_task()
/usr/local/lib/python3.7/site-packages/mars/actors/pool/gevent_pool.pyx in mars.actors.pool.gevent_pool.GeventThreadPool._wrap_watch.inner()
126 gevent.spawn(check, event)
--> 127 result = fn(*args, **kwargs)
128 event.set()
/home/admin/work/_public-mars-0.4.2.zip/mars/executor.py in execute_graph()
/home/admin/work/_public-mars-0.4.2.zip/mars/executor.py in execute()
/opt/conda/lib/python3.7/concurrent/futures/_base.py in result()
/opt/conda/lib/python3.7/concurrent/futures/_base.py in __get_result()
/opt/conda/lib/python3.7/concurrent/futures/thread.py in run()
/home/admin/work/_public-mars-0.4.2.zip/mars/executor.py in _execute_operand()
/home/admin/work/_public-mars-0.4.2.zip/mars/executor.py in handle()
/home/admin/work/_public-pyodps-0.9.3.zip/odps/mars_extension/core.py in wrapper()
/home/admin/work/_public-mars-0.4.2.zip/mars/remote/core.py in execute()
<ipython-input-4-8acd440be626> in process_shap()
54
---> 55 return mr.ExecutableTuple(results).execute()
56
/home/admin/work/_public-mars-0.4.2.zip/mars/core.py in execute()
/home/admin/work/_public-mars-0.4.2.zip/mars/core.py in execute()
/home/admin/work/_public-mars-0.4.2.zip/mars/session.py in run()
/home/admin/work/_public-mars-0.4.2.zip/mars/session.py in run()
ExecutionFailed: '\'\\\'\\\\\\\'"\\\\\\\\\\\\\\\'Graph execution failed.\\\\\\\\\\\\\\\'"\\\\\\\'\\\'\''
The above exception was the direct cause of the following exception:
ExecutionFailed Traceback (most recent call last)
<ipython-input-4-8acd440be626> in <module>
55 return mr.ExecutableTuple(results).execute()
56
---> 57 print(mr.spawn(process_shap, retry_when_fail=False).execute().fetch())
/usr/local/lib/python3.7/site-packages/mars/core.py in execute(self, session, **kw)
368
369 # no more fetch, thus just fire run
--> 370 session.run(self, **kw)
371 # return Tileable or ExecutableTuple itself
372 return self
/usr/local/lib/python3.7/site-packages/mars/session.py in run(self, *tileables, **kw)
426 tileables = tuple(mt.tensor(t) if not isinstance(t, (Entity, Base)) else t
427 for t in tileables)
--> 428 result = self._sess.run(*tileables, **kw)
429
430 for t in tileables:
/usr/local/lib/python3.7/site-packages/mars/web/session.py in run(self, *tileables, **kw)
185 timeout_val = min(check_interval, timeout - time_elapsed) if timeout > 0 else check_interval
186 try:
--> 187 if self._check_response_finished(graph_url, timeout_val):
188 break
189 except KeyboardInterrupt:
/usr/local/lib/python3.7/site-packages/mars/web/session.py in _check_response_finished(self, graph_url, timeout)
144 exc_info = pickle.loads(base64.b64decode(resp_json['exc_info']))
145 exc = exc_info[1].with_traceback(exc_info[2])
--> 146 raise ExecutionFailed('Graph execution failed.') from exc
147 else:
148 raise ExecutionFailed('Graph execution failed with unknown reason.')
ExecutionFailed: 'Graph execution failed.'
This problem has nothing to do with mars.remote. The actual cause is that we do not support fail-over in graphs with Fetch nodes.