[BUG] Mars storage data key error when autoscaling in workers
When the autoscaler scales in workers, migrating storage data off the released bands fails with a DataNotExist error:
2022-02-24 17:04:33,625 ERROR autoscale.py:343 -- Exception occurred when try to auto scale
Traceback (most recent call last):
File "/home/admin/ray-pack/tmp/job/99000080/pyenv/lib/python3.7/site-packages/mars/services/scheduling/supervisor/autoscale.py", line 339, in _run
await self._run_round()
File "/home/admin/ray-pack/tmp/job/99000080/pyenv/lib/python3.7/site-packages/mars/services/scheduling/supervisor/autoscale.py", line 351, in _run_round
await self._scale_in()
File "/home/admin/ray-pack/tmp/job/99000080/pyenv/lib/python3.7/site-packages/mars/services/scheduling/supervisor/autoscale.py", line 449, in _scale_in
await self._autoscaler.release_workers(worker_addresses)
File "/home/admin/ray-pack/tmp/job/99000080/pyenv/lib/python3.7/site-packages/mars/services/scheduling/supervisor/autoscale.py", line 167, in release_workers
await asyncio.gather(*[release_worker(address) for address in addresses])
File "/home/admin/ray-pack/tmp/job/99000080/pyenv/lib/python3.7/site-packages/mars/services/scheduling/supervisor/autoscale.py", line 158, in release_worker
await self._migrate_data_of_bands(worker_bands, excluded_bands)
File "/home/admin/ray-pack/tmp/job/99000080/pyenv/lib/python3.7/site-packages/mars/services/scheduling/supervisor/autoscale.py", line 232, in _migrate_data_of_bands
*[api.fetch.batch(*fetches) for api, fetches in batch_fetch.items()]
File "/home/admin/ray-pack/tmp/job/99000080/pyenv/lib/python3.7/site-packages/mars/oscar/batch.py", line 144, in _async_batch
return await self.batch_func(args_list, kwargs_list)
File "/home/admin/ray-pack/tmp/job/99000080/pyenv/lib/python3.7/site-packages/mars/services/storage/api/oscar.py", line 205, in batch_fetch
self._session_id, data_keys, *extracted_args
File "/home/admin/ray-pack/tmp/job/99000080/pyenv/lib/python3.7/site-packages/mars/oscar/backends/context.py", line 186, in send
return self._process_result_message(result)
File "/home/admin/ray-pack/tmp/job/99000080/pyenv/lib/python3.7/site-packages/mars/oscar/backends/context.py", line 70, in _process_result_message
raise message.as_instanceof_cause()
File "/home/admin/ray-pack/tmp/job/99000080/pyenv/lib/python3.7/site-packages/mars/oscar/backends/pool.py", line 590, in send
result = await self._run_coro(message.message_id, coro)
File "/home/admin/ray-pack/tmp/job/99000080/pyenv/lib/python3.7/site-packages/mars/oscar/backends/pool.py", line 343, in _run_coro
return await coro
File "/home/admin/ray-pack/tmp/job/99000080/pyenv/lib/python3.7/site-packages/mars/oscar/api.py", line 115, in __on_receive__
return await super().__on_receive__(message)
File "mars/oscar/core.pyx", line 371, in __on_receive__
raise ex
File "mars/oscar/core.pyx", line 343, in mars.oscar.core._BaseActor.__on_receive__
async with self._lock:
File "mars/oscar/core.pyx", line 344, in mars.oscar.core._BaseActor.__on_receive__
with debug_async_timeout('actor_lock_timeout',
File "mars/oscar/core.pyx", line 349, in mars.oscar.core._BaseActor.__on_receive__
result = await result
File "/home/admin/ray-pack/tmp/job/99000080/pyenv/lib/python3.7/site-packages/mars/services/storage/handler.py", line 556, in fetch_batch
await async_call(asyncio.gather(*transfer_tasks))
File "/home/admin/ray-pack/tmp/job/99000080/pyenv/lib/python3.7/site-packages/mars/oscar/debug.py", line 146, in async_call
return await coro
File "/home/admin/ray-pack/tmp/job/99000080/pyenv/lib/python3.7/site-packages/mars/services/storage/handler.py", line 434, in _fetch_remote
data_infos = await remote_manager_ref.get_data_info.batch(*get_data_infos)
File "/home/admin/ray-pack/tmp/job/99000080/pyenv/lib/python3.7/site-packages/mars/oscar/backends/context.py", line 186, in send
return self._process_result_message(result)
File "/home/admin/ray-pack/tmp/job/99000080/pyenv/lib/python3.7/site-packages/mars/oscar/backends/context.py", line 70, in _process_result_message
raise message.as_instanceof_cause()
File "/home/admin/ray-pack/tmp/job/99000080/pyenv/lib/python3.7/site-packages/mars/oscar/backends/pool.py", line 590, in send
result = await self._run_coro(message.message_id, coro)
File "/home/admin/ray-pack/tmp/job/99000080/pyenv/lib/python3.7/site-packages/mars/oscar/backends/pool.py", line 343, in _run_coro
return await coro
File "/home/admin/ray-pack/tmp/job/99000080/pyenv/lib/python3.7/site-packages/mars/oscar/api.py", line 115, in __on_receive__
return await super().__on_receive__(message)
File "mars/oscar/core.pyx", line 371, in __on_receive__
raise ex
File "mars/oscar/core.pyx", line 352, in mars.oscar.core._BaseActor.__on_receive__
async with self._lock:
File "mars/oscar/core.pyx", line 353, in mars.oscar.core._BaseActor.__on_receive__
with debug_async_timeout('actor_lock_timeout',
File "mars/oscar/core.pyx", line 359, in mars.oscar.core._BaseActor.__on_receive__
result = func.batch(*delays)
File "/home/admin/ray-pack/tmp/job/99000080/pyenv/lib/python3.7/site-packages/mars/oscar/batch.py", line 170, in batch
return self._sync_batch(*delays)
File "/home/admin/ray-pack/tmp/job/99000080/pyenv/lib/python3.7/site-packages/mars/oscar/batch.py", line 164, in _sync_batch
return [self.func(*d.args, **d.kwargs) for d in delays]
File "/home/admin/ray-pack/tmp/job/99000080/pyenv/lib/python3.7/site-packages/mars/oscar/batch.py", line 164, in <listcomp>
return [self.func(*d.args, **d.kwargs) for d in delays]
File "/home/admin/ray-pack/tmp/job/99000080/pyenv/lib/python3.7/site-packages/mars/services/storage/core.py", line 235, in get_data_info
return self._get_data_info(session_id, data_key, band_name, error)
File "/home/admin/ray-pack/tmp/job/99000080/pyenv/lib/python3.7/site-packages/mars/services/storage/core.py", line 221, in _get_data_info
infos = self._get_data_infos(session_id, data_key, band_name, error)
File "/home/admin/ray-pack/tmp/job/99000080/pyenv/lib/python3.7/site-packages/mars/services/storage/core.py", line 206, in _get_data_infos
raise DataNotExist(f"Data key {session_id, data_key} not exists.")
types._MarsError: [address=ray://mars_cluster_1645693134_75/0/0, pid=60755] Data key ('tcgdIQh1lK7yvw5bVv46qTSW', '9013e195154fb1d927c4afaf01464ff8') not exists.
2022-02-24 17:10:22,667 INFO manager.py:289 -- Finished subtask sdzuO030ANAehRBByDzTXky0 with r
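
Reading the traceback: release_workers snapshots the bands of the worker being released, then _migrate_data_of_bands issues batched fetches, which call get_data_info on the remote StorageManagerActor; _get_data_infos then raises DataNotExist for a key that is no longer (or was never) registered on that band. That pattern looks like a read-after-delete race: the key list is captured at one point, and the key disappears (for example via lifecycle GC or another concurrent scale-in round) before the batched lookup runs. The sketch below is a minimal, self-contained illustration of that race under those assumptions; DataNotExist is copied from the traceback, while FakeStorageManager, migrate, and release are hypothetical stand-ins, not Mars code.

```python
import asyncio


class DataNotExist(Exception):
    pass


class FakeStorageManager:
    """Hypothetical in-memory stand-in for the per-band data-info table
    consulted in mars/services/storage/core.py; an illustration of the
    failure pattern, not Mars' actual implementation."""

    def __init__(self):
        self._infos = {("session", "key-1"): "info-1"}

    def get_data_info(self, session_id, data_key):
        try:
            return self._infos[(session_id, data_key)]
        except KeyError:
            # Mirrors _get_data_infos raising DataNotExist in the traceback.
            raise DataNotExist(f"Data key {(session_id, data_key)} not exists.")

    def delete(self, session_id, data_key):
        self._infos.pop((session_id, data_key), None)


async def migrate(manager, keys):
    # Scale-in path: the key list was snapshotted before migration runs.
    await asyncio.sleep(0.01)  # suspension point: other messages run here
    return [manager.get_data_info("session", k) for k in keys]


async def release(manager):
    # Concurrent path (e.g. lifecycle GC or another scale-in round)
    # drops the key while the migration coroutine is suspended.
    manager.delete("session", "key-1")


async def main():
    manager = FakeStorageManager()
    keys = ["key-1"]  # snapshot of the band's keys
    try:
        await asyncio.gather(migrate(manager, keys), release(manager))
    except DataNotExist as e:
        print(f"reproduced: {e}")


asyncio.run(main())
```

If this reading is right, one possible mitigation would be for the migration path to tolerate missing keys (the get_data_info call in the traceback already threads through an error argument), since a key deleted mid-migration no longer needs to be moved; whether that is the intended fix is for the maintainers to confirm.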