mars icon indicating copy to clipboard operation
mars copied to clipboard

[BUG] Failover remote functions failed

Open qinxuye opened this issue 5 years ago • 1 comment

Describe the bug

Fail-over of remote functions failed.

To Reproduce: To help us reproduce this bug, please provide the information below:

  1. Your Python version
  2. The version of Mars you use
  3. Versions of crucial packages, such as numpy, scipy and protobuf
  4. Full stack of the error.
  5. Minimized code to reproduce the error.
---------------------------------------------------------------------------
AttributeError                            Traceback (most recent call last)
/home/admin/work/_public-mars-0.4.2.zip/mars/scheduler/graph.py in _execute_graph()

/home/admin/work/_public-mars-0.4.2.zip/mars/utils.py in _wrapped()

/home/admin/work/_public-mars-0.4.2.zip/mars/scheduler/graph.py in analyze_graph()

/home/admin/work/_public-mars-0.4.2.zip/mars/scheduler/graph.py in _assign_initial_workers()

/opt/conda/lib/python3.7/site-packages/gevent/_gevent_cevent.cpython-37m-x86_64-linux-gnu.so in gevent._gevent_cevent.AsyncResult.result()

/opt/conda/lib/python3.7/site-packages/gevent/_gevent_cevent.cpython-37m-x86_64-linux-gnu.so in gevent._gevent_cevent.AsyncResult.get()

/opt/conda/lib/python3.7/site-packages/gevent/_gevent_cevent.cpython-37m-x86_64-linux-gnu.so in gevent._gevent_cevent.AsyncResult.get()

/opt/conda/lib/python3.7/site-packages/gevent/_gevent_cevent.cpython-37m-x86_64-linux-gnu.so in gevent._gevent_cevent.AsyncResult.get()

/opt/conda/lib/python3.7/site-packages/gevent/_gevent_cevent.cpython-37m-x86_64-linux-gnu.so in gevent._gevent_cevent.AsyncResult._raise_exception()

/opt/conda/lib/python3.7/site-packages/gevent/_compat.py in reraise()

/opt/conda/lib/python3.7/site-packages/gevent/threadpool.py in __run_task()

/usr/local/lib/python3.7/site-packages/mars/actors/pool/gevent_pool.pyx in mars.actors.pool.gevent_pool.GeventThreadPool._wrap_watch.inner()
    126             gevent.spawn(check, event)
--> 127             result = fn(*args, **kwargs)
    128             event.set()

/home/admin/work/_public-mars-0.4.2.zip/mars/scheduler/graph.py in _do_assign()

/home/admin/work/_public-mars-0.4.2.zip/mars/scheduler/graph.py in assign_operand_workers()

/home/admin/work/_public-mars-0.4.2.zip/mars/scheduler/analyzer.py in calc_operand_assignments()

/home/admin/work/_public-mars-0.4.2.zip/mars/scheduler/analyzer.py in _iter_assignments_by_transfer_sizes()

/home/admin/work/_public-mars-0.4.2.zip/mars/scheduler/analyzer.py in <genexpr>()

/home/admin/work/_public-mars-0.4.2.zip/mars/scheduler/analyzer.py in <genexpr>()

AttributeError: 'NoneType' object has no attribute 'chunk_size'

The above exception was the direct cause of the following exception:

ExecutionFailed                           Traceback (most recent call last)
/home/admin/work/_public-mars-0.4.2.zip/mars/promise.py in _wrapped()

/home/admin/work/_public-mars-0.4.2.zip/mars/worker/calc.py in <lambda>()

/home/admin/work/_public-mars-0.4.2.zip/mars/worker/calc.py in _start_calc()

/home/admin/work/_public-mars-0.4.2.zip/mars/worker/calc.py in _calc_results()

/opt/conda/lib/python3.7/site-packages/gevent/_gevent_cevent.cpython-37m-x86_64-linux-gnu.so in gevent._gevent_cevent.AsyncResult.result()

/opt/conda/lib/python3.7/site-packages/gevent/_gevent_cevent.cpython-37m-x86_64-linux-gnu.so in gevent._gevent_cevent.AsyncResult.get()

/opt/conda/lib/python3.7/site-packages/gevent/_gevent_cevent.cpython-37m-x86_64-linux-gnu.so in gevent._gevent_cevent.AsyncResult.get()

/opt/conda/lib/python3.7/site-packages/gevent/_gevent_cevent.cpython-37m-x86_64-linux-gnu.so in gevent._gevent_cevent.AsyncResult.get()

/opt/conda/lib/python3.7/site-packages/gevent/_gevent_cevent.cpython-37m-x86_64-linux-gnu.so in gevent._gevent_cevent.AsyncResult._raise_exception()

/opt/conda/lib/python3.7/site-packages/gevent/_compat.py in reraise()

/opt/conda/lib/python3.7/site-packages/gevent/threadpool.py in __run_task()

/usr/local/lib/python3.7/site-packages/mars/actors/pool/gevent_pool.pyx in mars.actors.pool.gevent_pool.GeventThreadPool._wrap_watch.inner()
    126             gevent.spawn(check, event)
--> 127             result = fn(*args, **kwargs)
    128             event.set()

/home/admin/work/_public-mars-0.4.2.zip/mars/executor.py in execute_graph()

/home/admin/work/_public-mars-0.4.2.zip/mars/executor.py in execute()

/opt/conda/lib/python3.7/concurrent/futures/_base.py in result()

/opt/conda/lib/python3.7/concurrent/futures/_base.py in __get_result()

/opt/conda/lib/python3.7/concurrent/futures/thread.py in run()

/home/admin/work/_public-mars-0.4.2.zip/mars/executor.py in _execute_operand()

/home/admin/work/_public-mars-0.4.2.zip/mars/executor.py in handle()

/home/admin/work/_public-pyodps-0.9.3.zip/odps/mars_extension/core.py in wrapper()

/home/admin/work/_public-mars-0.4.2.zip/mars/remote/core.py in execute()

<ipython-input-4-8acd440be626> in process_shap()
     54 
---> 55     return mr.ExecutableTuple(results).execute()
     56 

/home/admin/work/_public-mars-0.4.2.zip/mars/core.py in execute()

/home/admin/work/_public-mars-0.4.2.zip/mars/core.py in execute()

/home/admin/work/_public-mars-0.4.2.zip/mars/session.py in run()

/home/admin/work/_public-mars-0.4.2.zip/mars/session.py in run()

ExecutionFailed: '\'\\\'\\\\\\\'"\\\\\\\\\\\\\\\'Graph execution failed.\\\\\\\\\\\\\\\'"\\\\\\\'\\\'\''

The above exception was the direct cause of the following exception:

ExecutionFailed                           Traceback (most recent call last)
<ipython-input-4-8acd440be626> in <module>
     55     return mr.ExecutableTuple(results).execute()
     56 
---> 57 print(mr.spawn(process_shap, retry_when_fail=False).execute().fetch())

/usr/local/lib/python3.7/site-packages/mars/core.py in execute(self, session, **kw)
    368 
    369         # no more fetch, thus just fire run
--> 370         session.run(self, **kw)
    371         # return Tileable or ExecutableTuple itself
    372         return self

/usr/local/lib/python3.7/site-packages/mars/session.py in run(self, *tileables, **kw)
    426         tileables = tuple(mt.tensor(t) if not isinstance(t, (Entity, Base)) else t
    427                           for t in tileables)
--> 428         result = self._sess.run(*tileables, **kw)
    429 
    430         for t in tileables:

/usr/local/lib/python3.7/site-packages/mars/web/session.py in run(self, *tileables, **kw)
    185             timeout_val = min(check_interval, timeout - time_elapsed) if timeout > 0 else check_interval
    186             try:
--> 187                 if self._check_response_finished(graph_url, timeout_val):
    188                     break
    189             except KeyboardInterrupt:

/usr/local/lib/python3.7/site-packages/mars/web/session.py in _check_response_finished(self, graph_url, timeout)
    144                 exc_info = pickle.loads(base64.b64decode(resp_json['exc_info']))
    145                 exc = exc_info[1].with_traceback(exc_info[2])
--> 146                 raise ExecutionFailed('Graph execution failed.') from exc
    147             else:
    148                 raise ExecutionFailed('Graph execution failed with unknown reason.')

ExecutionFailed: 'Graph execution failed.'

qinxuye avatar Jul 24 '20 10:07 qinxuye

This problem has nothing to do with mars.remote. The actual cause is that we do not support fail-over in graphs with Fetch nodes.

wjsi avatar Jul 28 '20 06:07 wjsi