ray_lightning
`ddp_sharded` test: `SystemExit` raised in the worker is not caught by `pytest.raises(SystemExit)`
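For context, the test asserts that the `Trainer` resolves to `RayShardedStrategy` and then raises `SystemExit` from an `on_fit_start` callback, expecting `pytest.raises(SystemExit)` on the driver to catch it. A minimal sketch of the test is below, reconstructed from the traceback; the import locations of `BoringModel` and the `ray_start_2_cpus`/`seed` fixtures are assumptions, not copied from the repo:

```python
import pytest
from pytorch_lightning import Callback, Trainer

from ray_lightning import RayShardedStrategy
from ray_lightning.tests.utils import BoringModel  # assumed helper location


def test_ddp_choice_sharded(tmpdir, ray_start_2_cpus, seed):
    """Tests if sharded strategy is properly recognized."""

    class CB(Callback):
        def on_fit_start(self, trainer, pl_module):
            # The resolved strategy should be RayShardedStrategy on the worker.
            assert isinstance(trainer.strategy, RayShardedStrategy)
            # Abort immediately; the driver-side test expects to see this.
            raise SystemExit()

    model = BoringModel()
    trainer = Trainer(
        fast_dev_run=True,
        strategy=RayShardedStrategy(num_workers=2),
        callbacks=[CB()],
    )

    # The SystemExit is raised inside the Ray worker; what actually reaches the
    # driver is a RayActorError (see the log below), so this check fails.
    with pytest.raises(SystemExit):
        trainer.fit(model)
```

The full pytest output: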
(ci) @JiahaoYao ➜ /workspaces/ray_lightning/ray_lightning/tests (main ✗) $ python -m pytest -v --durations=0 -x test_ddp_sharded.py
=========================================================== test session starts ===========================================================
platform linux -- Python 3.7.13, pytest-7.1.2, pluggy-1.0.0 -- /home/codespace/.conda/envs/ci/bin/python
cachedir: .pytest_cache
rootdir: /workspaces/ray_lightning
collected 6 items
test_ddp_sharded.py::test_ddp_choice_sharded FAILED [ 16%]
================================================================ FAILURES =================================================================
_________________________________________________________ test_ddp_choice_sharded _________________________________________________________
tmpdir = local('/tmp/pytest-of-codespace/pytest-5/test_ddp_choice_sharded0')
ray_start_2_cpus = RayContext(dashboard_url='', python_version='3.7.13', ray_version='1.13.0', ray_commit='e4ce38d001dbbe09cd21c497fedd03...2.16.5.4:61562', 'address': '172.16.5.4:61562', 'node_id': '2f09949b7dfab4041d0518bd034dbd8594dfb02225329508f3bff47d'})
seed = None
    def test_ddp_choice_sharded(tmpdir, ray_start_2_cpus, seed):
        """Tests if sharded strategy is properly recognized."""

        class CB(Callback):
            def on_fit_start(self, trainer, pl_module):
                assert isinstance(trainer.strategy, RayShardedStrategy)
                raise SystemExit()

        model = BoringModel()
        trainer = Trainer(
            fast_dev_run=True,
            strategy=RayShardedStrategy(num_workers=2),
            callbacks=[CB()],
        )

        with pytest.raises(SystemExit):
>           trainer.fit(model)
test_ddp_sharded.py:43:
_ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _
/home/codespace/.conda/envs/ci/lib/python3.7/site-packages/pytorch_lightning/trainer/trainer.py:771: in fit
    self._fit_impl, model, train_dataloaders, val_dataloaders, datamodule, ckpt_path
/home/codespace/.conda/envs/ci/lib/python3.7/site-packages/pytorch_lightning/trainer/trainer.py:721: in _call_and_handle_interrupt
    return self.strategy.launcher.launch(trainer_fn, *args, trainer=self, **kwargs)
../launchers/ray_launcher.py:62: in launch
    function, *args, trainer=trainer, **kwargs)
../launchers/ray_launcher.py:214: in run_function_on_workers
    results = process_results(self._futures, self.tune_queue)
../util.py:68: in process_results
    return ray.get(training_result_futures)
/home/codespace/.conda/envs/ci/lib/python3.7/site-packages/ray/_private/client_mode_hook.py:105: in wrapper
    return func(*args, **kwargs)
_ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _
object_refs = [ObjectRef(8849b62d89cb30f9dcf70f191ffc8678d40097d90100000001000000), ObjectRef(80e22aed7718a12575476cfffbde90be272a59800100000001000000)]
    @PublicAPI
    @client_mode_hook(auto_init=True)
    def get(
        object_refs: Union[ray.ObjectRef, List[ray.ObjectRef]],
        *,
        timeout: Optional[float] = None,
    ) -> Union[Any, List[Any]]:
        """Get a remote object or a list of remote objects from the object store.

        This method blocks until the object corresponding to the object ref is
        available in the local object store. If this object is not in the local
        object store, it will be shipped from an object store that has it (once the
        object has been created). If object_refs is a list, then the objects
        corresponding to each object in the list will be returned.

        Ordering for an input list of object refs is preserved for each object
        returned. That is, if an object ref to A precedes an object ref to B in the
        input list, then A will precede B in the returned list.

        This method will issue a warning if it's running inside async context,
        you can use ``await object_ref`` instead of ``ray.get(object_ref)``. For
        a list of object refs, you can use ``await asyncio.gather(*object_refs)``.

        Args:
            object_refs: Object ref of the object to get or a list of object refs
                to get.
            timeout (Optional[float]): The maximum amount of time in seconds to
                wait before returning.

        Returns:
            A Python object or a list of Python objects.

        Raises:
            GetTimeoutError: A GetTimeoutError is raised if a timeout is set and
                the get takes longer than timeout to return.
            Exception: An exception is raised if the task that created the object
                or that created one of the objects raised an exception.
        """
        worker = global_worker
        worker.check_connected()

        if hasattr(worker, "core_worker") and worker.core_worker.current_actor_is_asyncio():
            global blocking_get_inside_async_warned
            if not blocking_get_inside_async_warned:
                logger.warning(
                    "Using blocking ray.get inside async actor. "
                    "This blocks the event loop. Please use `await` "
                    "on object ref with asyncio.gather if you want to "
                    "yield execution to the event loop instead."
                )
                blocking_get_inside_async_warned = True

        with profiling.profile("ray.get"):
            is_individual_id = isinstance(object_refs, ray.ObjectRef)
            if is_individual_id:
                object_refs = [object_refs]

            if not isinstance(object_refs, list):
                raise ValueError(
                    "'object_refs' must either be an object ref "
                    "or a list of object refs."
                )

            # TODO(ujvl): Consider how to allow user to retrieve the ready objects.
            values, debugger_breakpoint = worker.get_objects(object_refs, timeout=timeout)
            for i, value in enumerate(values):
                if isinstance(value, RayError):
                    if isinstance(value, ray.exceptions.ObjectLostError):
                        worker.core_worker.dump_object_store_memory_usage()
                    if isinstance(value, RayTaskError):
                        raise value.as_instanceof_cause()
                    else:
>                       raise value
E ray.exceptions.RayActorError: The actor died unexpectedly before finishing this task.
E class_name: RayExecutor
E actor_id: dcf70f191ffc8678d40097d901000000
E pid: 17370
E namespace: 51ab6b9b-fc30-48d3-a206-9074af78573b
E ip: 172.16.5.4
E The actor is dead because its worker process has died. Worker exit type: SYSTEM_ERROR_EXIT
/home/codespace/.conda/envs/ci/lib/python3.7/site-packages/ray/worker.py:1833: RayActorError
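The driver-side failure above comes out of `ray.get` on the two training futures: `worker.get_objects` hands back a `RayActorError` rather than a `RayTaskError`, so control falls through to the bare `raise value` branch instead of re-raising the original exception type. A standalone sketch of the difference (plain Ray, not ray_lightning code; behavior as observed in this log, exact handling may vary across Ray versions):

```python
import ray

ray.init(num_cpus=2)


@ray.remote
def task_raises_value_error():
    # An ordinary exception in a task is wrapped in RayTaskError and re-raised
    # on the driver as its original type via as_instanceof_cause().
    raise ValueError("boom")


@ray.remote
class Exits:
    def run(self):
        # SystemExit raised inside an actor method takes down the worker
        # process, so the pending result surfaces as RayActorError.
        raise SystemExit()


try:
    ray.get(task_raises_value_error.remote())
except ValueError as err:
    print("task exception surfaced as its original type:", type(err).__name__)

actor = Exits.remote()
try:
    ray.get(actor.run.remote())
except ray.exceptions.RayActorError as err:
    print("actor death surfaced as RayActorError:", type(err).__name__)
```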
---------------------------------------------------------- Captured stderr setup ----------------------------------------------------------
2022-07-02 02:47:15,857 WARNING services.py:2013 -- WARNING: The object store is using /tmp instead of /dev/shm because /dev/shm has only 67108864 bytes available. This will harm performance! You may be able to free up space by deleting files in /dev/shm. If you are inside a Docker container, you can increase /dev/shm size by passing '--shm-size=1.70gb' to 'docker run' (or add it to the run_options list in a Ray cluster config). Make sure to set this to more than 30% of available RAM.
----------------------------------------------------------- Captured log setup ------------------------------------------------------------
INFO pytorch_lightning.utilities.seed:seed.py:71 Global seed set to 0
---------------------------------------------------------- Captured stderr call -----------------------------------------------------------
(RayExecutor pid=17370) Global seed set to 0
(RayExecutor pid=17370) /home/codespace/.conda/envs/ci/lib/python3.7/site-packages/pytorch_lightning/utilities/warnings.py:54: LightningDeprecationWarning: pytorch_lightning.utilities.warnings.rank_zero_deprecation has been deprecated in v1.6 and will be removed in v1.8. Use the equivalent function from the pytorch_lightning.utilities.rank_zero module instead.
(RayExecutor pid=17370) "pytorch_lightning.utilities.warnings.rank_zero_deprecation has been deprecated in v1.6"
(RayExecutor pid=17370) /home/codespace/.conda/envs/ci/lib/python3.7/site-packages/pytorch_lightning/utilities/warnings.py:58: LightningDeprecationWarning: ParallelStrategy.torch_distributed_backend was deprecated in v1.6 and will be removed in v1.8.
(RayExecutor pid=17370) return new_rank_zero_deprecation(*args, **kwargs)
(RayExecutor pid=17370) Initializing distributed: GLOBAL_RANK: 0, MEMBER: 1/2
(RayExecutor pid=17371) Global seed set to 0
(RayExecutor pid=17371) Initializing distributed: GLOBAL_RANK: 1, MEMBER: 2/2
(RayExecutor pid=17371) 2022-07-02 02:47:20,319 ERROR worker.py:451 -- SystemExit was raised from the worker.
(RayExecutor pid=17371) Traceback (most recent call last):
(RayExecutor pid=17371)   File "python/ray/_raylet.pyx", line 799, in ray._raylet.task_execution_handler
(RayExecutor pid=17371)   File "python/ray/_raylet.pyx", line 618, in ray._raylet.execute_task
(RayExecutor pid=17371)   File "python/ray/_raylet.pyx", line 658, in ray._raylet.execute_task
(RayExecutor pid=17371)   File "python/ray/_raylet.pyx", line 665, in ray._raylet.execute_task
(RayExecutor pid=17371)   File "python/ray/_raylet.pyx", line 669, in ray._raylet.execute_task
(RayExecutor pid=17371)   File "python/ray/_raylet.pyx", line 616, in ray._raylet.execute_task.function_executor
(RayExecutor pid=17371)   File "/home/codespace/.conda/envs/ci/lib/python3.7/site-packages/ray/_private/function_manager.py", line 675, in actor_method_executor
(RayExecutor pid=17371)     return method(__ray_actor, *args, **kwargs)
(RayExecutor pid=17371)   File "/home/codespace/.conda/envs/ci/lib/python3.7/site-packages/ray/util/tracing/tracing_helper.py", line 462, in _resume_span
(RayExecutor pid=17371)     return method(self, *_args, **_kwargs)
(RayExecutor pid=17371)   File "/workspaces/ray_lightning/ray_lightning/launchers/ray_launcher.py", line 329, in execute
(RayExecutor pid=17371)     return fn(*args, **kwargs)
(RayExecutor pid=17371)   File "/workspaces/ray_lightning/ray_lightning/launchers/ray_launcher.py", line 235, in _wrapping_function
(RayExecutor pid=17371)     results = function(*args, **kwargs)
(RayExecutor pid=17371)   File "/home/codespace/.conda/envs/ci/lib/python3.7/site-packages/pytorch_lightning/trainer/trainer.py", line 811, in _fit_impl
(RayExecutor pid=17371)     results = self._run(model, ckpt_path=self.ckpt_path)
(RayExecutor pid=17371)   File "/home/codespace/.conda/envs/ci/lib/python3.7/site-packages/pytorch_lightning/trainer/trainer.py", line 1221, in _run
(RayExecutor pid=17371)     self._call_callback_hooks("on_fit_start")
(RayExecutor pid=17371)   File "/home/codespace/.conda/envs/ci/lib/python3.7/site-packages/pytorch_lightning/trainer/trainer.py", line 1636, in _call_callback_hooks
(RayExecutor pid=17371)     fn(self, self.lightning_module, *args, **kwargs)
(RayExecutor pid=17371)   File "/workspaces/ray_lightning/ray_lightning/tests/test_ddp_sharded.py", line 33, in on_fit_start
(RayExecutor pid=17371)     raise SystemExit()
(RayExecutor pid=17371) SystemExit
(RayExecutor pid=17370) ----------------------------------------------------------------------------------------------------
(RayExecutor pid=17370) distributed_backend=gloo
(RayExecutor pid=17370) All distributed processes registered. Starting with 2 processes
(RayExecutor pid=17370) ----------------------------------------------------------------------------------------------------
(RayExecutor pid=17370)
(RayExecutor pid=17370) 2022-07-02 02:47:20,318 ERROR worker.py:451 -- SystemExit was raised from the worker.
(RayExecutor pid=17370) Traceback (most recent call last):
(RayExecutor pid=17370)   File "python/ray/_raylet.pyx", line 799, in ray._raylet.task_execution_handler
(RayExecutor pid=17370)   File "python/ray/_raylet.pyx", line 618, in ray._raylet.execute_task
(RayExecutor pid=17370)   File "python/ray/_raylet.pyx", line 658, in ray._raylet.execute_task
(RayExecutor pid=17370)   File "python/ray/_raylet.pyx", line 665, in ray._raylet.execute_task
(RayExecutor pid=17370)   File "python/ray/_raylet.pyx", line 669, in ray._raylet.execute_task
(RayExecutor pid=17370)   File "python/ray/_raylet.pyx", line 616, in ray._raylet.execute_task.function_executor
(RayExecutor pid=17370)   File "/home/codespace/.conda/envs/ci/lib/python3.7/site-packages/ray/_private/function_manager.py", line 675, in actor_method_executor
(RayExecutor pid=17370)     return method(__ray_actor, *args, **kwargs)
(RayExecutor pid=17370)   File "/home/codespace/.conda/envs/ci/lib/python3.7/site-packages/ray/util/tracing/tracing_helper.py", line 462, in _resume_span
(RayExecutor pid=17370)     return method(self, *_args, **_kwargs)
(RayExecutor pid=17370)   File "/workspaces/ray_lightning/ray_lightning/launchers/ray_launcher.py", line 329, in execute
(RayExecutor pid=17370)     return fn(*args, **kwargs)
(RayExecutor pid=17370)   File "/workspaces/ray_lightning/ray_lightning/launchers/ray_launcher.py", line 235, in _wrapping_function
(RayExecutor pid=17370)     results = function(*args, **kwargs)
(RayExecutor pid=17370)   File "/home/codespace/.conda/envs/ci/lib/python3.7/site-packages/pytorch_lightning/trainer/trainer.py", line 811, in _fit_impl
(RayExecutor pid=17370)     results = self._run(model, ckpt_path=self.ckpt_path)
(RayExecutor pid=17370)   File "/home/codespace/.conda/envs/ci/lib/python3.7/site-packages/pytorch_lightning/trainer/trainer.py", line 1221, in _run
(RayExecutor pid=17370)     self._call_callback_hooks("on_fit_start")
(RayExecutor pid=17370)   File "/home/codespace/.conda/envs/ci/lib/python3.7/site-packages/pytorch_lightning/trainer/trainer.py", line 1636, in _call_callback_hooks
(RayExecutor pid=17370)     fn(self, self.lightning_module, *args, **kwargs)
(RayExecutor pid=17370)   File "/workspaces/ray_lightning/ray_lightning/tests/test_ddp_sharded.py", line 33, in on_fit_start
(RayExecutor pid=17370)     raise SystemExit()
(RayExecutor pid=17370) SystemExit
2022-07-02 02:47:20,382 WARNING worker.py:1404 -- A worker died or was killed while executing a task by an unexpected system error. To troubleshoot the problem, check the logs for the dead worker. RayTask ID: ffffffffffffffffdcf70f191ffc8678d40097d901000000 Worker ID: a3e587add9fc798a4912b8c9243d4d4af5c8bbf0ab59df0331cb3298 Node ID: 2f09949b7dfab4041d0518bd034dbd8594dfb02225329508f3bff47d Worker IP address: 172.16.5.4 Worker port: 36315 Worker PID: 17370
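So the `SystemExit` raised by the test's callback happens inside the `RayExecutor` actor (under `_wrapping_function`), not on the driver: it kills the worker process (`SYSTEM_ERROR_EXIT`), and `trainer.fit` surfaces a `RayActorError` instead of the `SystemExit` that `pytest.raises(SystemExit)` expects. One possible direction is for the launcher to catch `SystemExit` in the function it runs on the workers and hand it back as a result so the driver can re-raise it. The sketch below is illustrative only; the wrapper and result-handling names are hypothetical, not ray_lightning's actual API:

```python
import functools


def catch_system_exit(function):
    """Hypothetical wrapper for the per-worker training function: convert
    SystemExit into a returned value so it does not kill the Ray actor."""

    @functools.wraps(function)
    def wrapped(*args, **kwargs):
        try:
            return function(*args, **kwargs)
        except SystemExit as exc:
            # Hand the exception back as data; the driver decides what to do.
            return {"system_exit": exc.code}

    return wrapped


def reraise_system_exit_on_driver(results):
    """Hypothetical driver-side hook: if any worker reported a SystemExit,
    re-raise it here so pytest.raises(SystemExit) around trainer.fit passes."""
    for result in results:
        if isinstance(result, dict) and "system_exit" in result:
            raise SystemExit(result["system_exit"])
    return results
```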