Example training run fails with CUDA out of memory on CUDA 12.1 and PyTorch 2.1
I'm trying to run the example training from the documentation: https://openfold.readthedocs.io/en/latest/Training_OpenFold.html
Command:
python3 train_openfold.py $DATA_DIR/pdb_data/mmcif_files \
    $DATA_DIR/alignment_data/ $DATA_DIR/pdb_data/mmcif_files $OUTPUT_DIR 2021-10-10 \
    --template_release_dates_cache_path ../../pdb_data/data_caches/mmcif_cache.json \
    --train_chain_data_cache_path ../../pdb_data/data_caches/chain_data_cache.json \
    --seed 1 --num_nodes 1 --config_preset initial_training --gpus 4 --precision bf16-mixed
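
For reference, $DATA_DIR and $OUTPUT_DIR are exported beforehand; the values below are placeholders, not my actual paths:

export DATA_DIR=/path/to/openfold_data      # placeholder
export OUTPUT_DIR=/path/to/training_output  # placeholder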
Epoch 0: 0%| | 0/20 [00:00<?, ?it/s]
Traceback (most recent call last):
File "/data/AF_Training/Public_GitHub/openfold/train_openfold.py", line 703, in <module>
main(args)
File "/data/AF_Training/Public_GitHub/openfold/train_openfold.py", line 452, in main
trainer.fit(
File "micromamba/envs/openfold-pl-2/lib/python3.10/site-packages/pytorch_lightning/trainer/trainer.py", line 538, in fit
call._call_and_handle_interrupt(
File "micromamba/envs/openfold-pl-2/lib/python3.10/site-packages/pytorch_lightning/trainer/call.py", line 46, in _call_and_handle_interrupt
return trainer.strategy.launcher.launch(trainer_fn, *args, trainer=trainer, **kwargs)
File "micromamba/envs/openfold-pl-2/lib/python3.10/site-packages/pytorch_lightning/strategies/launchers/subprocess_script.py", line 105, in launch
return function(*args, **kwargs)
File "micromamba/envs/openfold-pl-2/lib/python3.10/site-packages/pytorch_lightning/trainer/trainer.py", line 574, in _fit_impl
self._run(model, ckpt_path=ckpt_path)
File "micromamba/envs/openfold-pl-2/lib/python3.10/site-packages/pytorch_lightning/trainer/trainer.py", line 981, in _run
results = self._run_stage()
File "micromamba/envs/openfold-pl-2/lib/python3.10/site-packages/pytorch_lightning/trainer/trainer.py", line 1025, in _run_stage
self.fit_loop.run()
File "micromamba/envs/openfold-pl-2/lib/python3.10/site-packages/pytorch_lightning/loops/fit_loop.py", line 205, in run
self.advance()
File "micromamba/envs/openfold-pl-2/lib/python3.10/site-packages/pytorch_lightning/loops/fit_loop.py", line 363, in advance
self.epoch_loop.run(self._data_fetcher)
File "micromamba/envs/openfold-pl-2/lib/python3.10/site-packages/pytorch_lightning/loops/training_epoch_loop.py", line 140, in run
self.advance(data_fetcher)
File "micromamba/envs/openfold-pl-2/lib/python3.10/site-packages/pytorch_lightning/loops/training_epoch_loop.py", line 250, in advance
batch_output = self.automatic_optimization.run(trainer.optimizers[0], batch_idx, kwargs)
File "micromamba/envs/openfold-pl-2/lib/python3.10/site-packages/pytorch_lightning/loops/optimization/automatic.py", line 190, in run
self._optimizer_step(batch_idx, closure)
File "micromamba/envs/openfold-pl-2/lib/python3.10/site-packages/pytorch_lightning/loops/optimization/automatic.py", line 268, in _optimizer_step
call._call_lightning_module_hook(
File "micromamba/envs/openfold-pl-2/lib/python3.10/site-packages/pytorch_lightning/trainer/call.py", line 167, in _call_lightning_module_hook
output = fn(*args, **kwargs)
File "micromamba/envs/openfold-pl-2/lib/python3.10/site-packages/pytorch_lightning/core/module.py", line 1306, in optimizer_step
optimizer.step(closure=optimizer_closure)
File "micromamba/envs/openfold-pl-2/lib/python3.10/site-packages/pytorch_lightning/core/optimizer.py", line 153, in step
step_output = self._strategy.optimizer_step(self._optimizer, closure, **kwargs)
File "micromamba/envs/openfold-pl-2/lib/python3.10/site-packages/pytorch_lightning/strategies/ddp.py", line 270, in optimizer_step
optimizer_output = super().optimizer_step(optimizer, closure, model, **kwargs)
File "micromamba/envs/openfold-pl-2/lib/python3.10/site-packages/pytorch_lightning/strategies/strategy.py", line 238, in optimizer_step
return self.precision_plugin.optimizer_step(optimizer, model=model, closure=closure, **kwargs)
File "micromamba/envs/openfold-pl-2/lib/python3.10/site-packages/pytorch_lightning/plugins/precision/amp.py", line 75, in optimizer_step
return super().optimizer_step(optimizer, model=model, closure=closure, **kwargs)
File "micromamba/envs/openfold-pl-2/lib/python3.10/site-packages/pytorch_lightning/plugins/precision/precision.py", line 122, in optimizer_step
return optimizer.step(closure=closure, **kwargs)
File "micromamba/envs/openfold-pl-2/lib/python3.10/site-packages/torch/optim/lr_scheduler.py", line 68, in wrapper
return wrapped(*args, **kwargs)
File "micromamba/envs/openfold-pl-2/lib/python3.10/site-packages/torch/optim/optimizer.py", line 373, in wrapper
out = func(*args, **kwargs)
File "micromamba/envs/openfold-pl-2/lib/python3.10/site-packages/torch/optim/optimizer.py", line 76, in _use_grad
ret = func(self, *args, **kwargs)
File "micromamba/envs/openfold-pl-2/lib/python3.10/site-packages/torch/optim/adam.py", line 143, in step
loss = closure()
File "micromamba/envs/openfold-pl-2/lib/python3.10/site-packages/pytorch_lightning/plugins/precision/precision.py", line 108, in _wrap_closure
closure_result = closure()
File "micromamba/envs/openfold-pl-2/lib/python3.10/site-packages/pytorch_lightning/loops/optimization/automatic.py", line 144, in __call__
self._result = self.closure(*args, **kwargs)
File "micromamba/envs/openfold-pl-2/lib/python3.10/site-packages/torch/utils/_contextlib.py", line 115, in decorate_context
return func(*args, **kwargs)
File "micromamba/envs/openfold-pl-2/lib/python3.10/site-packages/pytorch_lightning/loops/optimization/automatic.py", line 129, in closure
step_output = self._step_fn()
File "micromamba/envs/openfold-pl-2/lib/python3.10/site-packages/pytorch_lightning/loops/optimization/automatic.py", line 317, in _training_step
training_step_output = call._call_strategy_hook(trainer, "training_step", *kwargs.values())
File "micromamba/envs/openfold-pl-2/lib/python3.10/site-packages/pytorch_lightning/trainer/call.py", line 319, in _call_strategy_hook
output = fn(*args, **kwargs)
File "micromamba/envs/openfold-pl-2/lib/python3.10/site-packages/pytorch_lightning/strategies/strategy.py", line 389, in training_step
return self._forward_redirection(self.model, self.lightning_module, "training_step", *args, **kwargs)
File "micromamba/envs/openfold-pl-2/lib/python3.10/site-packages/pytorch_lightning/strategies/strategy.py", line 640, in __call__
wrapper_output = wrapper_module(*args, **kwargs)
File "micromamba/envs/openfold-pl-2/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1518, in _wrapped_call_impl
return self._call_impl(*args, **kwargs)
File "micromamba/envs/openfold-pl-2/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1527, in _call_impl
return forward_call(*args, **kwargs)
File "micromamba/envs/openfold-pl-2/lib/python3.10/site-packages/torch/nn/parallel/distributed.py", line 1519, in forward
else self._run_ddp_forward(*inputs, **kwargs)
File "micromamba/envs/openfold-pl-2/lib/python3.10/site-packages/torch/nn/parallel/distributed.py", line 1355, in _run_ddp_forward
return self.module(*inputs, **kwargs) # type: ignore[index]
File "micromamba/envs/openfold-pl-2/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1518, in _wrapped_call_impl
return self._call_impl(*args, **kwargs)
File "micromamba/envs/openfold-pl-2/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1527, in _call_impl
return forward_call(*args, **kwargs)
File "micromamba/envs/openfold-pl-2/lib/python3.10/site-packages/pytorch_lightning/strategies/strategy.py", line 633, in wrapped_forward
out = method(*_args, **_kwargs)
File "/data/AF_Training/Public_GitHub/openfold/train_openfold.py", line 104, in training_step
outputs = self(batch)
File "micromamba/envs/openfold-pl-2/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1518, in _wrapped_call_impl
return self._call_impl(*args, **kwargs)
File "micromamba/envs/openfold-pl-2/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1527, in _call_impl
return forward_call(*args, **kwargs)
File "/data/AF_Training/Public_GitHub/openfold/train_openfold.py", line 63, in forward
return self.model(batch)
File "micromamba/envs/openfold-pl-2/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1518, in _wrapped_call_impl
return self._call_impl(*args, **kwargs)
File "micromamba/envs/openfold-pl-2/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1527, in _call_impl
return forward_call(*args, **kwargs)
File "/data/AF_Training/Public_GitHub/openfold/openfold/model/model.py", line 568, in forward
outputs, m_1_prev, z_prev, x_prev, early_stop = self.iteration(
File "/data/AF_Training/Public_GitHub/openfold/openfold/model/model.py", line 393, in iteration
z = self.extra_msa_stack(
File "micromamba/envs/openfold-pl-2/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1518, in _wrapped_call_impl
return self._call_impl(*args, **kwargs)
File "micromamba/envs/openfold-pl-2/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1527, in _call_impl
return forward_call(*args, **kwargs)
File "/data/AF_Training/Public_GitHub/openfold/openfold/model/evoformer.py", line 1217, in forward
m, z = b(m, z)
File "micromamba/envs/openfold-pl-2/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1518, in _wrapped_call_impl
return self._call_impl(*args, **kwargs)
File "micromamba/envs/openfold-pl-2/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1527, in _call_impl
return forward_call(*args, **kwargs)
File "/data/AF_Training/Public_GitHub/openfold/openfold/model/evoformer.py", line 642, in forward
self.msa_att_row(
File "micromamba/envs/openfold-pl-2/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1518, in _wrapped_call_impl
return self._call_impl(*args, **kwargs)
File "micromamba/envs/openfold-pl-2/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1527, in _call_impl
return forward_call(*args, **kwargs)
File "/data/AF_Training/Public_GitHub/openfold/openfold/model/msa.py", line 281, in forward
m = self.mha(
File "micromamba/envs/openfold-pl-2/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1518, in _wrapped_call_impl
return self._call_impl(*args, **kwargs)
File "micromamba/envs/openfold-pl-2/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1527, in _call_impl
return forward_call(*args, **kwargs)
File "/data/AF_Training/Public_GitHub/openfold/openfold/model/primitives.py", line 523, in forward
o = attention_core(q, k, v, *((biases + [None] * 2)[:2]))
File "micromamba/envs/openfold-pl-2/lib/python3.10/site-packages/torch/autograd/function.py", line 539, in apply
return super().apply(*args, **kwargs) # type: ignore[misc]
File "/data/AF_Training/Public_GitHub/openfold/openfold/utils/kernel/attention_core.py", line 38, in forward
attention_logits = torch.matmul(
torch.cuda.OutOfMemoryError: CUDA out of memory. Tried to allocate 128.00 GiB. GPU 3 has a total capacty of 79.22 GiB of which 24.22 GiB is free. Including non-PyTorch memory, this process has 54.99 GiB memory in use. Of the allocated memory 40.00 GiB is allocated by PyTorch, and 13.91 GiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting max_split_size_mb to avoid fragmentation. See documentation for Memory Management and PYTORCH_CUDA_ALLOC_CONF
[rank: 3] Child process with PID 11479 terminated with code 1. Forcefully terminating all other processes to avoid zombies 🧟
Killed
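
The failing allocation is the attention-logits matmul in openfold/utils/kernel/attention_core.py (last frames of the traceback above). The error message suggests tuning the caching allocator, but a fragmentation setting cannot help when a single request is 128 GiB on an 80 GiB card: 128 GiB of fp32 is roughly 34 billion elements, so the logits tensor itself would have to shrink. For completeness, this is how the suggested allocator option would be passed; the value is illustrative, not something I have verified to help here:

PYTORCH_CUDA_ALLOC_CONF=max_split_size_mb:512 python3 train_openfold.py ...  # same arguments as above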
Setup: 4x H100, PyTorch 2.1.2, CUDA Toolkit 12.1, CUDA user-mode driver 12.5 (the version shown in the top-right corner of nvidia-smi).
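
For completeness, the versions above can be double-checked with standard torch introspection (the expected outputs in the comments are assumptions for this setup, not captured output):

python3 -c 'import torch; print(torch.__version__, torch.version.cuda)'  # expect 2.1.2 (or 2.1.2+cu121) and 12.1
python3 -c 'import torch; p = torch.cuda.get_device_properties(0); print(p.name, round(p.total_memory / 2**30, 1), "GiB")'  # expect an H100 with ~79.2 GiB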