Dreambooth-Stable-Diffusion
Dreambooth-Stable-Diffusion copied to clipboard
Training error: RuntimeError: CUDA error: CUBLAS_STATUS_ALLOC_FAILED when calling `cublasCreate(handle)`
While training it always gets stuck after:
Sanity Checking DataLoader 0: 0%| | 0/2 [00:00<?, ?it/s]Here comes the checkpoint...
Training complete. max_training_steps reached or we blew up.
With the following Traceback:
Traceback (most recent call last):
File "main.py", line 848, in <module>
trainer.fit(model, data)
File "/opt/conda/lib/python3.7/site-packages/pytorch_lightning/trainer/trainer.py", line 771, in fit
self._fit_impl, model, train_dataloaders, val_dataloaders, datamodule, ckpt_path
File "/opt/conda/lib/python3.7/site-packages/pytorch_lightning/trainer/trainer.py", line 723, in _call_and_handle_interrupt
return trainer_fn(*args, **kwargs)
File "/opt/conda/lib/python3.7/site-packages/pytorch_lightning/trainer/trainer.py", line 811, in _fit_impl
results = self._run(model, ckpt_path=self.ckpt_path)
File "/opt/conda/lib/python3.7/site-packages/pytorch_lightning/trainer/trainer.py", line 1236, in _run
results = self._run_stage()
File "/opt/conda/lib/python3.7/site-packages/pytorch_lightning/trainer/trainer.py", line 1323, in _run_stage
return self._run_train()
File "/opt/conda/lib/python3.7/site-packages/pytorch_lightning/trainer/trainer.py", line 1345, in _run_train
self._run_sanity_check()
File "/opt/conda/lib/python3.7/site-packages/pytorch_lightning/trainer/trainer.py", line 1413, in _run_sanity_check
val_loop.run()
File "/opt/conda/lib/python3.7/site-packages/pytorch_lightning/loops/base.py", line 204, in run
self.advance(*args, **kwargs)
File "/opt/conda/lib/python3.7/site-packages/pytorch_lightning/loops/dataloader/evaluation_loop.py", line 155, in advance
dl_outputs = self.epoch_loop.run(self._data_fetcher, dl_max_batches, kwargs)
File "/opt/conda/lib/python3.7/site-packages/pytorch_lightning/loops/base.py", line 204, in run
self.advance(*args, **kwargs)
File "/opt/conda/lib/python3.7/site-packages/pytorch_lightning/loops/epoch/evaluation_epoch_loop.py", line 128, in advance
output = self._evaluation_step(**kwargs)
File "/opt/conda/lib/python3.7/site-packages/pytorch_lightning/loops/epoch/evaluation_epoch_loop.py", line 226, in _evaluation_step
output = self.trainer._call_strategy_hook("validation_step", *kwargs.values())
File "/opt/conda/lib/python3.7/site-packages/pytorch_lightning/trainer/trainer.py", line 1765, in _call_strategy_hook
output = fn(*args, **kwargs)
File "/opt/conda/lib/python3.7/site-packages/pytorch_lightning/strategies/strategy.py", line 344, in validation_step
return self.model.validation_step(*args, **kwargs)
File "/opt/conda/lib/python3.7/site-packages/torch/autograd/grad_mode.py", line 27, in decorate_context
return func(*args, **kwargs)
File "/project/Dreambooth-Stable-Diffusion/ldm/models/diffusion/ddpm.py", line 368, in validation_step
_, loss_dict_no_ema = self.shared_step(batch)
File "/project/Dreambooth-Stable-Diffusion/ldm/models/diffusion/ddpm.py", line 907, in shared_step
x, c = self.get_input(batch, self.first_stage_key)
File "/opt/conda/lib/python3.7/site-packages/torch/autograd/grad_mode.py", line 27, in decorate_context
return func(*args, **kwargs)
File "/project/Dreambooth-Stable-Diffusion/ldm/models/diffusion/ddpm.py", line 701, in get_input
encoder_posterior = self.encode_first_stage(x)
File "/opt/conda/lib/python3.7/site-packages/torch/autograd/grad_mode.py", line 27, in decorate_context
return func(*args, **kwargs)
File "/project/Dreambooth-Stable-Diffusion/ldm/models/diffusion/ddpm.py", line 904, in encode_first_stage
return self.first_stage_model.encode(x)
File "/project/Dreambooth-Stable-Diffusion/ldm/models/autoencoder.py", line 325, in encode
h = self.encoder(x)
File "/opt/conda/lib/python3.7/site-packages/torch/nn/modules/module.py", line 1130, in _call_impl
return forward_call(*input, **kwargs)
File "/project/Dreambooth-Stable-Diffusion/ldm/modules/diffusionmodules/model.py", line 452, in forward
h = self.mid.attn_1(h)
File "/opt/conda/lib/python3.7/site-packages/torch/nn/modules/module.py", line 1130, in _call_impl
return forward_call(*input, **kwargs)
File "/project/Dreambooth-Stable-Diffusion/ldm/modules/diffusionmodules/model.py", line 190, in forward
w_ = torch.bmm(q,k) # b,hw,hw w[b,i,j]=sum_c q[b,i,c]k[b,c,j]
RuntimeError: CUDA error: CUBLAS_STATUS_ALLOC_FAILED when calling `cublasCreate(handle)`
Thanks in advance.