Error in RayStrategy.root_device when using multi GPU node
Problem statement
When starting a hyperparameter search on a multi-GPU node (4 GPUs) I run into a mismatch of visible CUDA devices. Below is the full code to recreate the error (it is the same as found here, with a modification to use the GPU as well as a change in local_dir).
Code to recreate
from ray import tune
from ray_lightning import RayStrategy
from ray_lightning.examples.ray_ddp_example import MNISTClassifier
from ray_lightning.tune import TuneReportCallback, get_tune_resources
import pytorch_lightning as pl


def train_mnist(config):
    # Create your PTL model.
    model = MNISTClassifier(config)

    # Create the Tune Reporting Callback
    metrics = {"loss": "ptl/val_loss", "acc": "ptl/val_accuracy"}
    callbacks = [TuneReportCallback(metrics, on="validation_end")]

    trainer = pl.Trainer(
        max_epochs=4,
        callbacks=callbacks,
        strategy=RayStrategy(use_gpu=True),
    )
    trainer.fit(model)


config = {
    "layer_1": tune.choice([32, 64, 128]),
    "layer_2": tune.choice([64, 128, 256]),
    "lr": tune.loguniform(1e-4, 1e-1),
    "batch_size": tune.choice([32, 64, 128]),
}

# Make sure to pass in ``resources_per_trial`` using the ``get_tune_resources`` utility.
analysis = tune.run(
    train_mnist,
    metric="loss",
    mode="min",
    config=config,
    num_samples=10,
    resources_per_trial=get_tune_resources(use_gpu=True),
    local_dir='~/scratch/raytune',
    name="tune_mnist")

print("Best hyperparameters found were: ", analysis.best_config)
Modified ray_lightning code
To highlight the error I am experiencing, I have modified the code within ray_lightning.ray_ddp.RayStrategy.root_device
from
if cuda_visible_str and cuda_visible_str != "NoDevFiles":
    cuda_visible_list = [
        int(dev) for dev in cuda_visible_str.split(",")
    ]
    device_id = cuda_visible_list.index(gpu_id)
    return torch.device("cuda", device_id)
to
if cuda_visible_str and cuda_visible_str != "NoDevFiles":
    cuda_visible_list = [
        int(dev) for dev in cuda_visible_str.split(",")
    ]
    try:
        device_id = cuda_visible_list.index(gpu_id)
    except ValueError as err:
        raise ValueError(f'cuda_visible_str -> "{cuda_visible_str}", cuda_visible_list -> "{cuda_visible_list}", gpu_id -> "{gpu_id}"') from err
    return torch.device("cuda", device_id)
Error log output
Below is the output from the error.txt log in one of the trials:
Failure # 1 (occurred at 2022-08-05_17-02-02)
ray::ImplicitFunc.train() (pid=195537, ip=10.10.8.7, repr=train_mnist)
File "/mnt/iusers01/gb01/c38028ml/.conda/envs/torch/lib/python3.9/site-packages/ray/tune/trainable.py", line 360, in train
result = self.step()
File "/mnt/iusers01/gb01/c38028ml/.conda/envs/torch/lib/python3.9/site-packages/ray/tune/function_runner.py", line 404, in step
self._report_thread_runner_error(block=True)
File "/mnt/iusers01/gb01/c38028ml/.conda/envs/torch/lib/python3.9/site-packages/ray/tune/function_runner.py", line 574, in _report_thread_runner_error
raise e
File "/mnt/iusers01/gb01/c38028ml/.conda/envs/torch/lib/python3.9/site-packages/ray/tune/function_runner.py", line 277, in run
self._entrypoint()
File "/mnt/iusers01/gb01/c38028ml/.conda/envs/torch/lib/python3.9/site-packages/ray/tune/function_runner.py", line 349, in entrypoint
return self._trainable_func(
File "/mnt/iusers01/gb01/c38028ml/.conda/envs/torch/lib/python3.9/site-packages/ray/tune/function_runner.py", line 645, in _trainable_func
output = fn()
File "/mnt/iusers01/gb01/c38028ml/Dev/git/phd-torchscripts/PhD/torchscripts/pcconv/tests/raytune_test.py", line 24, in train_mnist
trainer.fit(model)
File "/mnt/iusers01/gb01/c38028ml/.local/lib/python3.9/site-packages/pytorch_lightning/trainer/trainer.py", line 768, in fit
self._call_and_handle_interrupt(
File "/mnt/iusers01/gb01/c38028ml/.local/lib/python3.9/site-packages/pytorch_lightning/trainer/trainer.py", line 719, in _call_and_handle_interrupt
return self.strategy.launcher.launch(trainer_fn, *args, trainer=self, **kwargs)
File "/mnt/iusers01/gb01/c38028ml/.conda/envs/torch/lib/python3.9/site-packages/ray_lightning/launchers/ray_launcher.py", line 55, in launch
ray_output = self.run_function_on_workers(
File "/mnt/iusers01/gb01/c38028ml/.conda/envs/torch/lib/python3.9/site-packages/ray_lightning/launchers/ray_launcher.py", line 229, in run_function_on_workers
results = process_results(self._futures, self.tune_queue)
File "/mnt/iusers01/gb01/c38028ml/.conda/envs/torch/lib/python3.9/site-packages/ray_lightning/util.py", line 64, in process_results
ray.get(ready)
ray.exceptions.RayTaskError(ValueError): ray::RayExecutor.execute() (pid=195612, ip=10.10.8.7, repr=<ray_lightning.launchers.utils.RayExecutor object at 0x2b876ccc0bb0>)
ValueError: '2' is not in list
The above exception was the direct cause of the following exception:
ray::RayExecutor.execute() (pid=195612, ip=10.10.8.7, repr=<ray_lightning.launchers.utils.RayExecutor object at 0x2b876ccc0bb0>)
File "/mnt/iusers01/gb01/c38028ml/.conda/envs/torch/lib/python3.9/site-packages/ray_lightning/launchers/utils.py", line 52, in execute
return fn(*args, **kwargs)
File "/mnt/iusers01/gb01/c38028ml/.conda/envs/torch/lib/python3.9/site-packages/ray_lightning/launchers/ray_launcher.py", line 267, in _wrapping_function
self._strategy._worker_setup(process_idx=global_rank)
File "/mnt/iusers01/gb01/c38028ml/.conda/envs/torch/lib/python3.9/site-packages/ray_lightning/ray_ddp.py", line 155, in _worker_setup
self._process_group_backend = self._get_process_group_backend()
File "/mnt/iusers01/gb01/c38028ml/.local/lib/python3.9/site-packages/pytorch_lightning/strategies/ddp_spawn.py", line 163, in _get_process_group_backend
or get_default_process_group_backend_for_device(self.root_device)
File "/mnt/iusers01/gb01/c38028ml/.conda/envs/torch/lib/python3.9/site-packages/ray_lightning/ray_ddp.py", line 258, in root_device
raise ValueError(f'cuda_visible_str -> "{cuda_visible_str}", cuda_visible_list -> "{cuda_visible_list}", gpu_id -> "{gpu_id}"') from err
ValueError: cuda_visible_str -> "2", cuda_visible_list -> "[2]", gpu_id -> "2"
Other trials fail with a similar error, differing only in the gpu_id present, e.g.
ValueError: cuda_visible_str -> "0", cuda_visible_list -> "[0]", gpu_id -> "0"
Extra information
This code is submitted as a job on an HPC (SGE) cluster, and I can confirm that the main script has the environment variable ${CUDA_VISIBLE_DEVICES} equal to "0,1,2,3".
It seems that for some reason each trial process only has access to one CUDA device, and therefore indexing into cuda_visible_list fails.
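A minimal diagnostic sketch of my own (not part of the example above; report_visible_devices is a made-up helper name) that could be called at the top of the trainable to see what each trial process is actually given:

import os

import ray


def report_visible_devices():
    # Debug-only helper: print the GPU assignment as seen from inside a
    # Ray trial/worker process.
    print("CUDA_VISIBLE_DEVICES:", os.environ.get("CUDA_VISIBLE_DEVICES", ""))
    print("ray.get_gpu_ids():", ray.get_gpu_ids())

Given the error above, this would presumably show a single device per trial (e.g. "2") rather than the full "0,1,2,3".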
To try and find a solution I tried the following change to the code:
Change from:
device_id = cuda_visible_list.index(gpu_id)
to:
if len(cuda_visible_list) == 1:
    device_id = cuda_visible_list[0]
else:
    device_id = cuda_visible_list.index(gpu_id)
This resulted in the following error:
File "/mnt/iusers01/gb01/c38028ml/.conda/envs/torch/lib/python3.9/site-packages/ray_lightning/launchers/utils.py", line 52, in execute
return fn(*args, **kwargs)
File "/mnt/iusers01/gb01/c38028ml/.conda/envs/torch/lib/python3.9/site-packages/ray_lightning/launchers/ray_launcher.py", line 271, in _wrapping_function
set_cuda_device_if_used(trainer.strategy)
File "/mnt/iusers01/gb01/c38028ml/.conda/envs/torch/lib/python3.9/site-packages/ray_lightning/util.py", line 102, in set_cuda_device_if_used
torch.cuda.set_device(strategy.root_device)
File "/opt/apps/apps/binapps/pytorch/1.11.0/python3.9/gpu-cuda11.3/lib/python3.9/site-packages/torch/cuda/__init__.py", line 313, in set_device
torch._C._cuda_setDevice(device)
RuntimeError: CUDA error: invalid device ordinal
CUDA kernel errors might be asynchronously reported at some other API call,so the stacktrace below might be incorrect.
For debugging consider passing CUDA_LAUNCH_BLOCKING=1.
Hi @m-lyon, I am trying to look into the problem. I wonder if it is possible to remove the ${CUDA_VISIBLE_DEVICES}="0,1,2,3" setup from the job?
I am worried that ray is also setting this environment variable for you.
== Status ==
Current time: 2022-08-05 16:13:21 (running for 00:01:46.10)
Memory usage on this node: 22.1/186.6 GiB
Using FIFO scheduling algorithm.
Resources requested: 8.0/48 CPUs, 4.0/4 GPUs, 0.0/120.88 GiB heap, 0.0/55.8 GiB objects (0.0/1.0 accelerator_type:T4)
Result logdir: /home/ray/scratch/raytune/tune_mnist
Number of trials: 10/10 (6 PENDING, 4 RUNNING)
+-------------------------+----------+--------------------+--------------+-----------+-----------+-------------+
| Trial name | status | loc | batch_size | layer_1 | layer_2 | lr |
|-------------------------+----------+--------------------+--------------+-----------+-----------+-------------|
| train_mnist_f3985_00000 | RUNNING | 172.31.91.173:2066 | 64 | 128 | 128 | 0.000301458 |
| train_mnist_f3985_00001 | RUNNING | 172.31.91.173:2132 | 32 | 64 | 128 | 0.0120597 |
| train_mnist_f3985_00002 | RUNNING | 172.31.91.173:2134 | 64 | 128 | 256 | 0.00052504 |
| train_mnist_f3985_00003 | RUNNING | 172.31.91.173:2136 | 32 | 64 | 128 | 0.00301416 |
| train_mnist_f3985_00004 | PENDING | | 128 | 64 | 64 | 0.0749624 |
| train_mnist_f3985_00005 | PENDING | | 64 | 32 | 128 | 0.000440828 |
| train_mnist_f3985_00006 | PENDING | | 32 | 32 | 256 | 0.00521079 |
| train_mnist_f3985_00007 | PENDING | | 32 | 64 | 64 | 0.00171421 |
| train_mnist_f3985_00008 | PENDING | | 64 | 128 | 64 | 0.00174926 |
| train_mnist_f3985_00009 | PENDING | | 128 | 64 | 64 | 0.00142737 |
+-------------------------+----------+--------------------+--------------+-----------+-----------+-------------+
Epoch 0: 12%|█▏ | 108/937 [00:00<00:06, 129.34it/s, loss=0.85, v_num=0]
Epoch 0: 11%|█ | 99/937 [00:00<00:07, 118.54it/s, loss=0.561, v_num=0]
Epoch 0: 7%|▋ | 135/1874 [00:01<00:14, 124.06it/s, loss=0.432, v_num=0]
Epoch 0: 7%|▋ | 133/1874 [00:01<00:14, 121.89it/s, loss=0.466, v_num=0]
Epoch 0: 13%|█▎ | 122/937 [00:00<00:06, 130.38it/s, loss=0.719, v_num=0]
Epoch 0: 12%|█▏ | 113/937 [00:00<00:06, 120.26it/s, loss=0.49, v_num=0]
Epoch 0: 8%|▊ | 150/1874 [00:01<00:13, 125.89it/s, loss=0.446, v_num=0]
Epoch 0: 8%|▊ | 149/1874 [00:01<00:13, 124.88it/s, loss=0.427, v_num=0]
Epoch 0: 15%|█▍ | 136/937 [00:01<00:06, 131.28it/s, loss=0.661, v_num=0]
Epoch 0: 14%|█▎ | 127/937 [00:01<00:06, 122.10it/s, loss=0.425, v_num=0]
Epoch 0: 9%|▉ | 166/1874 [00:01<00:13, 128.13it/s, loss=0.47, v_num=0]
Epoch 0: 9%|▉ | 165/1874 [00:01<00:13, 127.30it/s, loss=0.469, v_num=0]
Epoch 0: 15%|█▍ | 137/937 [00:01<00:06, 131.29it/s, loss=0.666, v_num=0]
Epoch 0: 9%|▉ | 166/1874 [00:01<00:13, 127.45it/s, loss=0.469, v_num=0]
Epoch 0: 9%|▉ | 166/1874 [00:01<00:13, 127.40it/s, loss=0.461, v_num=0]
Epoch 0: 14%|█▎ | 128/937 [00:01<00:06, 122.19it/s, loss=0.432, v_num=0]
Epoch 0: 9%|▉ | 167/1874 [00:01<00:13, 128.27it/s, loss=0.47, v_num=0]
Epoch 0: 9%|▉ | 167/1874 [00:01<00:13, 128.24it/s, loss=0.47, v_num=0]
Epoch 0: 16%|█▌ | 152/937 [00:01<00:05, 132.45it/s, loss=0.594, v_num=0]
Epoch 0: 15%|█▌ | 143/937 [00:01<00:06, 124.40it/s, loss=0.429, v_num=0]
Hi @m-lyon, I can successfully run the jobs.
This is my GPU usage:
(base) ray@ip-172-31-91-173:~/default$ nvidia-smi
Fri Aug 5 16:12:58 2022
+-----------------------------------------------------------------------------+
| NVIDIA-SMI 510.47.03 Driver Version: 510.47.03 CUDA Version: 11.6 |
|-------------------------------+----------------------+----------------------+
| GPU Name Persistence-M| Bus-Id Disp.A | Volatile Uncorr. ECC |
| Fan Temp Perf Pwr:Usage/Cap| Memory-Usage | GPU-Util Compute M. |
| | | MIG M. |
|===============================+======================+======================|
| 0 Tesla T4 On | 00000000:00:1B.0 Off | 0 |
| N/A 42C P0 26W / 70W | 503MiB / 15360MiB | 0% Default |
| | | N/A |
+-------------------------------+----------------------+----------------------+
| 1 Tesla T4 On | 00000000:00:1C.0 Off | 0 |
| N/A 43C P0 27W / 70W | 511MiB / 15360MiB | 0% Default |
| | | N/A |
+-------------------------------+----------------------+----------------------+
| 2 Tesla T4 On | 00000000:00:1D.0 Off | 0 |
| N/A 43C P0 26W / 70W | 501MiB / 15360MiB | 0% Default |
| | | N/A |
+-------------------------------+----------------------+----------------------+
| 3 Tesla T4 On | 00000000:00:1E.0 Off | 0 |
| N/A 42C P0 27W / 70W | 489MiB / 15360MiB | 0% Default |
| | | N/A |
+-------------------------------+----------------------+----------------------+
+-----------------------------------------------------------------------------+
| Processes: |
| GPU GI CI PID Type Process name GPU Memory |
| ID ID Usage |
|=============================================================================|
| 0 N/A N/A 6710 C 501MiB |
| 1 N/A N/A 6708 C 509MiB |
| 2 N/A N/A 6393 C 499MiB |
| 3 N/A N/A 6709 C 487MiB |
+-----------------------------------------------------------------------------+
https://user-images.githubusercontent.com/20907377/183223108-df41db61-83df-4294-95a2-ec481482c311.mp4
Hi @m-lyon, I am trying to look into the problem. I wonder if it is possible to remove the ${CUDA_VISIBLE_DEVICES}="0,1,2,3" setup from the job?
I did a run where I set export CUDA_VISIBLE_DEVICES="", however this led to the script being unable to recognise any GPU resources, and thus all trials were stuck in pending:
2022-08-06 12:21:38,861 WARNING insufficient_resources_manager.py:128 -- Ignore this message if the cluster is autoscaling. You asked for 2.0 cpu and 1.0 gpu per trial, but the cluster only has 32.0 cpu and 0 gpu. Stop the tuning job and adjust the resources requested per trial (possibly via `resources_per_trial` or via `num_workers` for rllib) and/or add more resources to your Ray runtime.
I suppose if you're unable to recreate the error then I'm going to have to do some further digging into the codebase to try and find the cause of the problem. To be honest, I'm unsure why CUDA_VISIBLE_DEVICES is being changed for each process.
Hi @m-lyon, may I ask several questions:
- What is your ray version? Can you try with the nightly wheel?
- Did you try the trainer without Tune? Does that work with the GPU?
The reason for the CUDA_VISIBLE_DEVICES handling is here: https://github.com/ray-project/ray_lightning/blob/main/ray_lightning/ray_ddp.py#L221-L256
In other words, Ray Tune will separate the CUDA-visible environments per trial, say:
CUDA_VISIBLE_DEVICES="0" for trial #1, ray.get_gpu_ids()[0] = 0, should use torch.device('cuda:0')
CUDA_VISIBLE_DEVICES="1" for trial #2, ray.get_gpu_ids()[0] = 1, should use torch.device('cuda:0')
CUDA_VISIBLE_DEVICES="2" for trial #3, ray.get_gpu_ids()[0] = 2, should use torch.device('cuda:0')
CUDA_VISIBLE_DEVICES="3" for trial #4, ray.get_gpu_ids()[0] = 3, should use torch.device('cuda:0')
In each case it should use torch.device('cuda:0'), because the device index is the local rank of the CUDA device within the trial.
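As a concrete sketch of that intended mapping (my own illustration, not the actual ray_lightning source; the values in the comments are assumptions):

import os

import ray
import torch


def intended_root_device():
    # Each trial is assigned one physical GPU and Ray narrows
    # CUDA_VISIBLE_DEVICES to it, so the assigned GPU's position in the
    # visible list is its local index (0 for a single-GPU trial).
    gpu_id = int(ray.get_gpu_ids()[0])  # e.g. physical GPU 2
    visible = [int(d) for d in os.environ["CUDA_VISIBLE_DEVICES"].split(",")]  # e.g. [2]
    return torch.device("cuda", visible.index(gpu_id))  # e.g. torch.device("cuda", 0)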
Also, I wonder if it is possible for you to try a clean, empty AWS or GCP machine. Sometimes an HPC (SGE) server can have a different environment setup.
What is your ray version? Can you try with the nightly wheel?
Using ray==1.13.0; I can try with the nightly wheel of ray and see how I go.
Did you try the trainer without Tune? Does that work with the GPU?
As in, have I just used pytorch_lightning without ray or ray_lightning? If so then yes, this works fine, though I use DDP with 4 GPUs to train one model at a time.
Also, I wonder if it is possible for you to try a clean, empty AWS or GCP machine. Sometimes an HPC (SGE) server can have a different environment setup.
I'm a university student using the HPC system provided, so unfortunately I don't really have the financial means to use a commercial setup to conduct the experiments I'm trying to run.
if len(cuda_visible_list) == 1:
    device_id = cuda_visible_list[0]
else:
    device_id = cuda_visible_list.index(gpu_id)
Your change fails with RuntimeError: CUDA error: invalid device ordinal: if cuda_visible_list is [2] and you use torch.device('cuda:2'), it will be out of bounds. The original code is meant to use torch.device('cuda:0') in this setup.
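To illustrate the ordinal error, here is a sketch assuming a process launched with CUDA_VISIBLE_DEVICES="2", as in the failing trial above; PyTorch renumbers the visible devices from zero, so only cuda:0 exists inside that process:

import torch

# Assume this process was started with CUDA_VISIBLE_DEVICES="2".
print(torch.cuda.device_count())  # 1 -> only one device is visible
torch.cuda.set_device(0)          # OK: local index 0 is physical GPU 2
torch.cuda.set_device(2)          # RuntimeError: CUDA error: invalid device ordinal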
By the way, can you also show ray status?
For the second item (trying the trainer without Tune), I mean train_mnist in this example: https://github.com/ray-project/ray_lightning/blob/main/ray_lightning/examples/ray_ddp_example.py
So would this work?
if len(cuda_visible_list) == 1:
    return torch.device('cuda:0')
I can confirm the above code has fixed this issue.
I guess a question then is: given the code below, if each trial should only see a single device in CUDA_VISIBLE_DEVICES (because ray sets this for each trial), then how does this ever not produce an error for trials running in parallel where NGPUs > 1? Because gpu_id and the single entry in cuda_visible_list are both not 0 when the desired device is torch.device('cuda', 0).
gpu_id = ray.get_gpu_ids()[0]
cuda_visible_str = os.environ.get("CUDA_VISIBLE_DEVICES", "")
cuda_visible_list = [
    int(dev) for dev in cuda_visible_str.split(",")
]
device_id = cuda_visible_list.index(gpu_id)
return torch.device("cuda", device_id)
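For what it's worth, here is a small illustration of what the .index() lookup is meant to do in the single-GPU-per-trial case (the values are assumptions matching the earlier error):

# A trial assigned physical GPU 2 by Ray Tune:
cuda_visible_list = [2]   # parsed from CUDA_VISIBLE_DEVICES="2"
gpu_id = 2                # from ray.get_gpu_ids()[0]
device_id = cuda_visible_list.index(gpu_id)  # -> 0
# i.e. torch.device("cuda", 0): the only GPU the trial can see,
# regardless of its physical ID on the node.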
Can you also run nvidia-smi in the command line during the training, and check whether all the GPUs are being used?
Hi @m-lyon, I wonder whether your issue is solved?
As far as I can tell, setting each device to torch.device('cuda', 0) within the if statement has fixed the issue; however, I can't confirm the GPU activity through ray status or nvidia-smi because of the submission system on the HPC I am using.
Specifically, as it is a batched submission system, I'm unable to open an interactive terminal on the same processing node as the one running the Ray Tune job.
Having said that, training times are indicative of GPU use, so it seems that this is the case (CPU training would obviously be much, much slower).
After some time training with the aforementioned code fix, I ran into this error for some of the trials:
Failure # 1 (occurred at 2022-08-14_19-33-21)
ray::ImplicitFunc.train() (pid=33261, ip=10.10.8.10, repr=train_model_raytune)
File "/mnt/iusers01/gb01/c38028ml/.conda/envs/torch/lib/python3.9/site-packages/ray/tune/trainable.py", line 360, in train
result = self.step()
File "/mnt/iusers01/gb01/c38028ml/.conda/envs/torch/lib/python3.9/site-packages/ray/tune/function_runner.py", line 404, in step
self._report_thread_runner_error(block=True)
File "/mnt/iusers01/gb01/c38028ml/.conda/envs/torch/lib/python3.9/site-packages/ray/tune/function_runner.py", line 574, in _report_thread_runner_error
raise e
File "/mnt/iusers01/gb01/c38028ml/.conda/envs/torch/lib/python3.9/site-packages/ray/tune/function_runner.py", line 277, in run
self._entrypoint()
File "/mnt/iusers01/gb01/c38028ml/.conda/envs/torch/lib/python3.9/site-packages/ray/tune/function_runner.py", line 349, in entrypoint
return self._trainable_func(
File "/mnt/iusers01/gb01/c38028ml/.conda/envs/torch/lib/python3.9/site-packages/ray/tune/function_runner.py", line 645, in _trainable_func
output = fn()
File "/mnt/iusers01/gb01/c38028ml/Dev/git/phd-torchscripts/PhD/torchscripts/pcconv/models/raytune1/raytune1_test2.py", line 141, in train_model_raytune
trainer.fit(model, data_module)
File "/mnt/iusers01/gb01/c38028ml/.local/lib/python3.9/site-packages/pytorch_lightning/trainer/trainer.py", line 768, in fit
self._call_and_handle_interrupt(
File "/mnt/iusers01/gb01/c38028ml/.local/lib/python3.9/site-packages/pytorch_lightning/trainer/trainer.py", line 719, in _call_and_handle_interrupt
return self.strategy.launcher.launch(trainer_fn, *args, trainer=self, **kwargs)
File "/mnt/iusers01/gb01/c38028ml/.conda/envs/torch/lib/python3.9/site-packages/ray_lightning/launchers/ray_launcher.py", line 55, in launch
ray_output = self.run_function_on_workers(
File "/mnt/iusers01/gb01/c38028ml/.conda/envs/torch/lib/python3.9/site-packages/ray_lightning/launchers/ray_launcher.py", line 229, in run_function_on_workers
results = process_results(self._futures, self.tune_queue)
File "/mnt/iusers01/gb01/c38028ml/.conda/envs/torch/lib/python3.9/site-packages/ray_lightning/util.py", line 64, in process_results
ray.get(ready)
ray.exceptions.RayTaskError(RuntimeError): ray::RayExecutor.execute() (pid=33830, ip=10.10.8.10, repr=<ray_lightning.launchers.utils.RayExecutor object at 0x2b480b548bb0>)
File "/mnt/iusers01/gb01/c38028ml/.conda/envs/torch/lib/python3.9/site-packages/ray_lightning/launchers/utils.py", line 52, in execute
return fn(*args, **kwargs)
File "/mnt/iusers01/gb01/c38028ml/.conda/envs/torch/lib/python3.9/site-packages/ray_lightning/launchers/ray_launcher.py", line 273, in _wrapping_function
results = function(*args, **kwargs)
File "/mnt/iusers01/gb01/c38028ml/.local/lib/python3.9/site-packages/pytorch_lightning/trainer/trainer.py", line 809, in _fit_impl
results = self._run(model, ckpt_path=self.ckpt_path)
File "/mnt/iusers01/gb01/c38028ml/.local/lib/python3.9/site-packages/pytorch_lightning/trainer/trainer.py", line 1170, in _run
self.__setup_profiler()
File "/mnt/iusers01/gb01/c38028ml/.local/lib/python3.9/site-packages/pytorch_lightning/trainer/trainer.py", line 1795, in __setup_profiler
self.profiler.setup(stage=self.state.fn._setup_fn, local_rank=local_rank, log_dir=self.log_dir)
File "/mnt/iusers01/gb01/c38028ml/.local/lib/python3.9/site-packages/pytorch_lightning/trainer/trainer.py", line 2232, in log_dir
dirpath = self.strategy.broadcast(dirpath)
File "/mnt/iusers01/gb01/c38028ml/.local/lib/python3.9/site-packages/pytorch_lightning/strategies/ddp_spawn.py", line 208, in broadcast
torch.distributed.broadcast_object_list(obj, src, group=_group.WORLD)
File "/opt/apps/apps/binapps/pytorch/1.11.0/python3.9/gpu-cuda11.3/lib/python3.9/site-packages/torch/distributed/distributed_c10d.py", line 1866, in broadcast_object_list
object_sizes_tensor = object_sizes_tensor.to(current_device)
RuntimeError: CUDA error: all CUDA-capable devices are busy or unavailable
CUDA kernel errors might be asynchronously reported at some other API call,so the stacktrace below might be incorrect.
For debugging consider passing CUDA_LAUNCH_BLOCKING=1.
This was with the additional configuration of using TuneBOHB and HyperBandForBOHB; I'm not sure whether that would affect things though.
Afterwards I thought I would try DDP across the 4 GPUs available on the node, but in this configuration the original error was raised:
Failure # 1 (occurred at 2022-08-15_12-05-00)
ray::ImplicitFunc.train() (pid=250377, ip=10.10.8.9, repr=train_model_raytune)
File "/mnt/iusers01/gb01/c38028ml/.conda/envs/torch/lib/python3.9/site-packages/ray/tune/trainable.py", line 360, in train
result = self.step()
File "/mnt/iusers01/gb01/c38028ml/.conda/envs/torch/lib/python3.9/site-packages/ray/tune/function_runner.py", line 404, in step
self._report_thread_runner_error(block=True)
File "/mnt/iusers01/gb01/c38028ml/.conda/envs/torch/lib/python3.9/site-packages/ray/tune/function_runner.py", line 574, in _report_thread_runner_error
raise e
File "/mnt/iusers01/gb01/c38028ml/.conda/envs/torch/lib/python3.9/site-packages/ray/tune/function_runner.py", line 277, in run
self._entrypoint()
File "/mnt/iusers01/gb01/c38028ml/.conda/envs/torch/lib/python3.9/site-packages/ray/tune/function_runner.py", line 349, in entrypoint
return self._trainable_func(
File "/mnt/iusers01/gb01/c38028ml/.conda/envs/torch/lib/python3.9/site-packages/ray/tune/function_runner.py", line 645, in _trainable_func
output = fn()
File "/mnt/iusers01/gb01/c38028ml/Dev/git/phd-torchscripts/PhD/torchscripts/pcconv/models/raytune1/raytune1_test3.py", line 141, in train_model_raytune
trainer.fit(model, data_module)
File "/mnt/iusers01/gb01/c38028ml/.local/lib/python3.9/site-packages/pytorch_lightning/trainer/trainer.py", line 768, in fit
self._call_and_handle_interrupt(
File "/mnt/iusers01/gb01/c38028ml/.local/lib/python3.9/site-packages/pytorch_lightning/trainer/trainer.py", line 719, in _call_and_handle_interrupt
return self.strategy.launcher.launch(trainer_fn, *args, trainer=self, **kwargs)
File "/mnt/iusers01/gb01/c38028ml/.conda/envs/torch/lib/python3.9/site-packages/ray_lightning/launchers/ray_launcher.py", line 55, in launch
ray_output = self.run_function_on_workers(
File "/mnt/iusers01/gb01/c38028ml/.conda/envs/torch/lib/python3.9/site-packages/ray_lightning/launchers/ray_launcher.py", line 229, in run_function_on_workers
results = process_results(self._futures, self.tune_queue)
File "/mnt/iusers01/gb01/c38028ml/.conda/envs/torch/lib/python3.9/site-packages/ray_lightning/util.py", line 64, in process_results
ray.get(ready)
ray.exceptions.RayTaskError(ValueError): ray::RayExecutor.execute() (pid=250425, ip=10.10.8.9, repr=<ray_lightning.launchers.utils.RayExecutor object at 0x2b69a7be3bb0>)
ValueError: '0' is not in list
The above exception was the direct cause of the following exception:
ray::RayExecutor.execute() (pid=250425, ip=10.10.8.9, repr=<ray_lightning.launchers.utils.RayExecutor object at 0x2b69a7be3bb0>)
File "/mnt/iusers01/gb01/c38028ml/.conda/envs/torch/lib/python3.9/site-packages/ray_lightning/launchers/utils.py", line 52, in execute
return fn(*args, **kwargs)
File "/mnt/iusers01/gb01/c38028ml/.conda/envs/torch/lib/python3.9/site-packages/ray_lightning/launchers/ray_launcher.py", line 267, in _wrapping_function
self._strategy._worker_setup(process_idx=global_rank)
File "/mnt/iusers01/gb01/c38028ml/.conda/envs/torch/lib/python3.9/site-packages/ray_lightning/ray_ddp.py", line 155, in _worker_setup
self._process_group_backend = self._get_process_group_backend()
File "/mnt/iusers01/gb01/c38028ml/.local/lib/python3.9/site-packages/pytorch_lightning/strategies/ddp_spawn.py", line 163, in _get_process_group_backend
or get_default_process_group_backend_for_device(self.root_device)
File "/mnt/iusers01/gb01/c38028ml/.conda/envs/torch/lib/python3.9/site-packages/ray_lightning/ray_ddp.py", line 263, in root_device
raise ValueError(f'cuda_visible_str -> "{cuda_visible_str}", cuda_visible_list -> "{cuda_visible_list}", gpu_id -> "{gpu_id}"') from err
ValueError: cuda_visible_str -> "3,1,0,2", cuda_visible_list -> "[3, 1, 0, 2]", gpu_id -> "0"
To me this seems very odd, as the visible list clearly contains GPU ID 0. In any case, pytorch-lightning works flawlessly with multiple GPUs in DDP, so it's pretty disappointing to see Ray Tune failing with this. Honestly, I'm a bit at a loss with this now.
So the above error hinted at what the problem was here.
From the Ray Tune GPU documentation:
a call to ray.get_gpu_ids() will return a list of strings indicating which GPUs the remote function is allowed to use
Crucially, it is a list of strings, whereas in ray_ddp.RayStrategy.root_device:
cuda_visible_list = [
    int(dev) for dev in cuda_visible_str.split(",")
]
So it's a type mismatch. I wonder how this does not cause an error every time though?
Possible fix: changing gpu_id = ray.get_gpu_ids()[0] to gpu_id = int(ray.get_gpu_ids()[0]).
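A minimal reproduction of the mismatch and the proposed cast (the values are taken from the error above, assuming ray.get_gpu_ids() returns strings as the quoted documentation states):

# gpu_id comes back as a string, while cuda_visible_list holds ints.
cuda_visible_list = [3, 1, 0, 2]   # from CUDA_VISIBLE_DEVICES="3,1,0,2"
gpu_id = "0"                       # from ray.get_gpu_ids()[0]

# cuda_visible_list.index(gpu_id)                 # ValueError: '0' is not in list
device_id = cuda_visible_list.index(int(gpu_id))  # -> 2, i.e. torch.device("cuda", 2)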
Update: after implementing the above fix, I immediately ran into this CUDA error:
Failure # 1 (occurred at 2022-08-15_14-20-47)
ray::ImplicitFunc.train() (pid=237695, ip=10.10.8.0, repr=train_model_raytune)
File "/mnt/iusers01/gb01/c38028ml/.conda/envs/torch/lib/python3.9/site-packages/ray/tune/trainable.py", line 360, in train
result = self.step()
File "/mnt/iusers01/gb01/c38028ml/.conda/envs/torch/lib/python3.9/site-packages/ray/tune/function_runner.py", line 404, in step
self._report_thread_runner_error(block=True)
File "/mnt/iusers01/gb01/c38028ml/.conda/envs/torch/lib/python3.9/site-packages/ray/tune/function_runner.py", line 574, in _report_thread_runner_error
raise e
File "/mnt/iusers01/gb01/c38028ml/.conda/envs/torch/lib/python3.9/site-packages/ray/tune/function_runner.py", line 277, in run
self._entrypoint()
File "/mnt/iusers01/gb01/c38028ml/.conda/envs/torch/lib/python3.9/site-packages/ray/tune/function_runner.py", line 349, in entrypoint
return self._trainable_func(
File "/mnt/iusers01/gb01/c38028ml/.conda/envs/torch/lib/python3.9/site-packages/ray/tune/function_runner.py", line 645, in _trainable_func
output = fn()
File "/mnt/iusers01/gb01/c38028ml/Dev/git/phd-torchscripts/PhD/torchscripts/pcconv/models/raytune1/raytune1_test3.py", line 141, in train_model_raytune
trainer.fit(model, data_module)
File "/mnt/iusers01/gb01/c38028ml/.local/lib/python3.9/site-packages/pytorch_lightning/trainer/trainer.py", line 768, in fit
self._call_and_handle_interrupt(
File "/mnt/iusers01/gb01/c38028ml/.local/lib/python3.9/site-packages/pytorch_lightning/trainer/trainer.py", line 719, in _call_and_handle_interrupt
return self.strategy.launcher.launch(trainer_fn, *args, trainer=self, **kwargs)
File "/mnt/iusers01/gb01/c38028ml/.conda/envs/torch/lib/python3.9/site-packages/ray_lightning/launchers/ray_launcher.py", line 55, in launch
ray_output = self.run_function_on_workers(
File "/mnt/iusers01/gb01/c38028ml/.conda/envs/torch/lib/python3.9/site-packages/ray_lightning/launchers/ray_launcher.py", line 229, in run_function_on_workers
results = process_results(self._futures, self.tune_queue)
File "/mnt/iusers01/gb01/c38028ml/.conda/envs/torch/lib/python3.9/site-packages/ray_lightning/util.py", line 64, in process_results
ray.get(ready)
ray.exceptions.RayTaskError(RuntimeError): ray::RayExecutor.execute() (pid=237728, ip=10.10.8.0, repr=<ray_lightning.launchers.utils.RayExecutor object at 0x2ad319eadbb0>)
File "/mnt/iusers01/gb01/c38028ml/.conda/envs/torch/lib/python3.9/site-packages/ray_lightning/launchers/utils.py", line 52, in execute
return fn(*args, **kwargs)
File "/mnt/iusers01/gb01/c38028ml/.conda/envs/torch/lib/python3.9/site-packages/ray_lightning/launchers/ray_launcher.py", line 273, in _wrapping_function
results = function(*args, **kwargs)
File "/mnt/iusers01/gb01/c38028ml/.local/lib/python3.9/site-packages/pytorch_lightning/trainer/trainer.py", line 809, in _fit_impl
results = self._run(model, ckpt_path=self.ckpt_path)
File "/mnt/iusers01/gb01/c38028ml/.local/lib/python3.9/site-packages/pytorch_lightning/trainer/trainer.py", line 1170, in _run
self.__setup_profiler()
File "/mnt/iusers01/gb01/c38028ml/.local/lib/python3.9/site-packages/pytorch_lightning/trainer/trainer.py", line 1795, in __setup_profiler
self.profiler.setup(stage=self.state.fn._setup_fn, local_rank=local_rank, log_dir=self.log_dir)
File "/mnt/iusers01/gb01/c38028ml/.local/lib/python3.9/site-packages/pytorch_lightning/trainer/trainer.py", line 2232, in log_dir
dirpath = self.strategy.broadcast(dirpath)
File "/mnt/iusers01/gb01/c38028ml/.local/lib/python3.9/site-packages/pytorch_lightning/strategies/ddp_spawn.py", line 208, in broadcast
torch.distributed.broadcast_object_list(obj, src, group=_group.WORLD)
File "/opt/apps/apps/binapps/pytorch/1.11.0/python3.9/gpu-cuda11.3/lib/python3.9/site-packages/torch/distributed/distributed_c10d.py", line 1866, in broadcast_object_list
object_sizes_tensor = object_sizes_tensor.to(current_device)
RuntimeError: CUDA error: all CUDA-capable devices are busy or unavailable
CUDA kernel errors might be asynchronously reported at some other API call,so the stacktrace below might be incorrect.
For debugging consider passing CUDA_LAUNCH_BLOCKING=1.
For reference, here is the tune.run call:
algo = TuneBOHB(metric="loss", mode="min")
bohb = HyperBandForBOHB(
    time_attr="training_iteration",
    metric="loss",
    mode="min",
    max_t=30
)
analysis = tune.run(
    train_model_raytune,
    config=configuration,
    scheduler=bohb,
    search_alg=algo,
    num_samples=30,
    resources_per_trial=get_tune_resources(num_workers=4, use_gpu=True),
    local_dir='~/scratch/raytune',
    name='initial_raytune_test3',
)
and here is the pl.Trainer instance created within the train_model_raytune function:
callbacks = [TuneReportCallback({'loss': 'val_loss'}, on='validation_end')]
trainer = pl.Trainer(
    max_epochs=MAX_EPOCHS,
    callbacks=callbacks,
    strategy=RayStrategy(num_workers=4, use_gpu=True),
)