pytorch-lightning
pytorch-lightning copied to clipboard
FileNotFoundError: [Errno 2] No such file or directory tfevents file
Bug description
I am working with the Stable Diffusion code base at https://github.com/CompVis/latent-diffusion. I am getting the error below during multi-GPU training, where it cannot find the tfevents file.
trainer.fit(model, data)
File "/home/csgrad/mbhosale/anaconda3/envs/pathldm1/lib/python3.8/site-packages/pytorch_lightning/trainer/trainer.py", line 553, in fit
self._run(model)
File "/home/csgrad/mbhosale/anaconda3/envs/pathldm1/lib/python3.8/site-packages/pytorch_lightning/trainer/trainer.py", line 918, in _run
self._dispatch()
File "/home/csgrad/mbhosale/anaconda3/envs/pathldm1/lib/python3.8/site-packages/pytorch_lightning/trainer/trainer.py", line 986, in _dispatch
self.accelerator.start_training(self)
File "/home/csgrad/mbhosale/anaconda3/envs/pathldm1/lib/python3.8/site-packages/pytorch_lightning/accelerators/accelerator.py", line 92, in start_training
self.training_type_plugin.start_training(trainer)
File "/home/csgrad/mbhosale/anaconda3/envs/pathldm1/lib/python3.8/site-packages/pytorch_lightning/plugins/training_type/training_type_plugin.py", line 161, in start_training
self._results = trainer.run_stage()
File "/home/csgrad/mbhosale/anaconda3/envs/pathldm1/lib/python3.8/site-packages/pytorch_lightning/trainer/trainer.py", line 996, in run_stage
return self._run_train()
File "/home/csgrad/mbhosale/anaconda3/envs/pathldm1/lib/python3.8/site-packages/pytorch_lightning/trainer/trainer.py", line 1058, in _run_train
self.training_type_plugin.reconciliate_processes(traceback.format_exc())
File "/home/csgrad/mbhosale/anaconda3/envs/pathldm1/lib/python3.8/site-packages/pytorch_lightning/plugins/training_type/ddp.py", line 453, in reconciliate_processes
raise DeadlockDetectedException(f"DeadLock detected from rank: {self.global_rank} \n {trace}")
pytorch_lightning.utilities.exceptions.DeadlockDetectedException: DeadLock detected from rank: 0
Traceback (most recent call last):
File "/home/csgrad/mbhosale/anaconda3/envs/pathldm1/lib/python3.8/site-packages/pytorch_lightning/trainer/trainer.py", line 1045, in _run_train
self.fit_loop.run()
File "/home/csgrad/mbhosale/anaconda3/envs/pathldm1/lib/python3.8/site-packages/pytorch_lightning/loops/base.py", line 111, in run
self.advance(*args, **kwargs)
File "/home/csgrad/mbhosale/anaconda3/envs/pathldm1/lib/python3.8/site-packages/pytorch_lightning/loops/fit_loop.py", line 200, in advance
epoch_output = self.epoch_loop.run(train_dataloader)
File "/home/csgrad/mbhosale/anaconda3/envs/pathldm1/lib/python3.8/site-packages/pytorch_lightning/loops/base.py", line 111, in run
self.advance(*args, **kwargs)
File "/home/csgrad/mbhosale/anaconda3/envs/pathldm1/lib/python3.8/site-packages/pytorch_lightning/loops/epoch/training_epoch_loop.py", line 149, in advance
self.trainer.call_hook(
File "/home/csgrad/mbhosale/anaconda3/envs/pathldm1/lib/python3.8/site-packages/pytorch_lightning/trainer/trainer.py", line 1217, in call_hook
trainer_hook(*args, **kwargs)
File "/home/csgrad/mbhosale/anaconda3/envs/pathldm1/lib/python3.8/site-packages/pytorch_lightning/trainer/callback_hook.py", line 189, in on_train_batch_end
callback.on_train_batch_end(self, self.lightning_module, outputs, batch, batch_idx, dataloader_idx)
File "/home/csgrad/mbhosale/phd/Pathdiff/PathLDM/main.py", line 443, in on_train_batch_end
self.log_img(pl_module, batch, batch_idx, split="train")
File "/home/csgrad/mbhosale/phd/Pathdiff/PathLDM/main.py", line 424, in log_img
logger_log_images(pl_module, images, pl_module.global_step, split)
File "/home/csgrad/mbhosale/anaconda3/envs/pathldm1/lib/python3.8/site-packages/pytorch_lightning/utilities/distributed.py", line 48, in wrapped_fn
return fn(*args, **kwargs)
File "/home/csgrad/mbhosale/phd/Pathdiff/PathLDM/main.py", line 363, in _testtube
pl_module.logger.experiment.add_image(tag, grid, global_step=pl_module.global_step)
File "/home/csgrad/mbhosale/anaconda3/envs/pathldm1/lib/python3.8/site-packages/torch/utils/tensorboard/writer.py", line 614, in add_image
self._get_file_writer().add_summary(
File "/home/csgrad/mbhosale/anaconda3/envs/pathldm1/lib/python3.8/site-packages/torch/utils/tensorboard/writer.py", line 113, in add_summary
self.add_event(event, global_step, walltime)
File "/home/csgrad/mbhosale/anaconda3/envs/pathldm1/lib/python3.8/site-packages/torch/utils/tensorboard/writer.py", line 98, in add_event
self.event_writer.add_event(event)
File "/home/csgrad/mbhosale/anaconda3/envs/pathldm1/lib/python3.8/site-packages/tensorboard/summary/writer/event_file_writer.py", line 117, in add_event
self._async_writer.write(event.SerializeToString())
File "/home/csgrad/mbhosale/anaconda3/envs/pathldm1/lib/python3.8/site-packages/tensorboard/summary/writer/event_file_writer.py", line 171, in write
self._check_worker_status()
File "/home/csgrad/mbhosale/anaconda3/envs/pathldm1/lib/python3.8/site-packages/tensorboard/summary/writer/event_file_writer.py", line 212, in _check_worker_status
raise exception
File "/home/csgrad/mbhosale/anaconda3/envs/pathldm1/lib/python3.8/threading.py", line 932, in _bootstrap_inner
self.run()
File "/home/csgrad/mbhosale/anaconda3/envs/pathldm1/lib/python3.8/site-packages/tensorboard/summary/writer/event_file_writer.py", line 244, in run
self._run()
File "/home/csgrad/mbhosale/anaconda3/envs/pathldm1/lib/python3.8/site-packages/tensorboard/summary/writer/event_file_writer.py", line 275, in _run
self._record_writer.write(data)
File "/home/csgrad/mbhosale/anaconda3/envs/pathldm1/lib/python3.8/site-packages/tensorboard/summary/writer/record_writer.py", line 40, in write
self._writer.write(header + header_crc + data + footer_crc)
File "/home/csgrad/mbhosale/anaconda3/envs/pathldm1/lib/python3.8/site-packages/tensorboard/compat/tensorflow_stub/io/gfile.py", line 773, in write
self.fs.append(self.filename, file_content, self.binary_mode)
File "/home/csgrad/mbhosale/anaconda3/envs/pathldm1/lib/python3.8/site-packages/tensorboard/compat/tensorflow_stub/io/gfile.py", line 167, in append
self._write(filename, file_content, "ab" if binary_mode else "a")
File "/home/csgrad/mbhosale/anaconda3/envs/pathldm1/lib/python3.8/site-packages/tensorboard/compat/tensorflow_stub/io/gfile.py", line 171, in _write
with io.open(filename, mode, encoding=encoding) as f:
FileNotFoundError: [Errno 2] No such file or directory: b'logs/06-03T05-49_plip_imagenet_finetune_PanNuke/testtube/version_0/tf/events.out.tfevents.1717408192.deepbull8.818802.0'
I checked the path: there is no folder named tf under version_0. Interestingly, I get this error only when I run on multiple GPUs; with a single GPU the error does not occur. I have no idea how to resolve this issue or where to start debugging it.
What version are you seeing the problem on?
v1.4.2
How to reproduce the bug
Run multi-GPU training using https://github.com/cvlab-stonybrook/PathLDM/blob/main/main.py
Error messages and logs
As shown in the traceback in the bug description above.
Environment
Current environment
# Name Version Build Channel
_libgcc_mutex 0.1 main
_openmp_mutex 5.1 1_gnu
absl-py 2.0.0 pypi_0 pypi
aiohttp 3.9.1 pypi_0 pypi
aiosignal 1.3.1 pypi_0 pypi
albumentations 0.4.3 pypi_0 pypi
altair 5.2.0 pypi_0 pypi
antlr4-python3-runtime 4.8 pypi_0 pypi
appdirs 1.4.4 pypi_0 pypi
asttokens 2.4.1 pyhd8ed1ab_0 conda-forge
async-timeout 4.0.3 pypi_0 pypi
attrs 23.1.0 pypi_0 pypi
backcall 0.2.0 pyh9f0ad1d_0 conda-forge
backports-zoneinfo 0.2.1 pypi_0 pypi
blas 1.0 mkl
blessed 1.20.0 py38h06a4308_0
blinker 1.7.0 pypi_0 pypi
bottleneck 1.3.7 py38ha9d4c09_0
brotli 1.0.9 h5eee18b_7
brotli-bin 1.0.9 h5eee18b_7
brotli-python 1.0.9 py38h6a678d5_7
bzip2 1.0.8 h7b6447c_0
c-ares 1.19.1 h5eee18b_0
ca-certificates 2024.3.11 h06a4308_0
cachetools 5.3.2 pypi_0 pypi
certifi 2024.2.2 py38h06a4308_0
cffi 1.15.1 py38h74dc2b5_0
chardet 5.2.0 pypi_0 pypi
charset-normalizer 3.3.2 pypi_0 pypi
click 8.1.7 pypi_0 pypi
clip 1.0 dev_0 <develop>
comm 0.1.4 pyhd8ed1ab_0 conda-forge
contourpy 1.0.5 py38hdb19cb5_0
cryptography 41.0.3 py38h130f0dd_0
cuda-cudart 11.7.99 0 nvidia
cuda-cupti 11.7.101 0 nvidia
cuda-libraries 11.7.1 0 nvidia
cuda-nvrtc 11.7.99 0 nvidia
cuda-nvtx 11.7.91 0 nvidia
cuda-runtime 11.7.1 0 nvidia
cudatoolkit 11.0.221 h6bb024c_0
cycler 0.11.0 pyhd3eb1b0_0
cyrus-sasl 2.1.28 h9c0eb46_1
dbus 1.13.18 hb2f20db_0
debugpy 1.6.7 py38h6a678d5_0
decorator 5.1.1 pyhd8ed1ab_0 conda-forge
docker-pycreds 0.4.0 pypi_0 pypi
einops 0.3.0 pypi_0 pypi
entrypoints 0.4 pyhd8ed1ab_0 conda-forge
executing 2.0.1 pyhd8ed1ab_0 conda-forge
expat 2.5.0 h6a678d5_0
ffmpeg 4.3 hf484d3e_0 pytorch
filelock 3.13.1 py38h06a4308_0
fontconfig 2.14.1 h4c34cd2_2
fonttools 4.25.0 pyhd3eb1b0_0
freetype 2.12.1 h4a9f257_0
frozenlist 1.4.1 pypi_0 pypi
fsspec 2023.12.2 pypi_0 pypi
ftfy 6.1.3 pypi_0 pypi
future 0.18.3 pypi_0 pypi
giflib 5.2.1 h5eee18b_3
gitdb 4.0.11 pypi_0 pypi
gitpython 3.1.40 pypi_0 pypi
glib 2.69.1 h4ff587b_1
gmp 6.2.1 h295c915_3
gmpy2 2.1.2 py38heeb90bb_0
gnutls 3.6.15 he1e5248_0
google-auth 2.25.2 pypi_0 pypi
google-auth-oauthlib 1.0.0 pypi_0 pypi
gpustat 1.1.1 py38h06a4308_0
grpcio 1.60.0 pypi_0 pypi
gst-plugins-base 1.14.1 h6a678d5_1
gstreamer 1.14.1 h5eee18b_1
h5py 3.9.0 py38he06866b_0
hdf5 1.12.1 h70be1eb_2
huggingface-hub 0.20.1 pypi_0 pypi
icu 73.1 h6a678d5_0
idna 3.6 pypi_0 pypi
imageio 2.9.0 pypi_0 pypi
imageio-ffmpeg 0.4.2 pypi_0 pypi
imgaug 0.2.6 pypi_0 pypi
importlib-metadata 6.11.0 pypi_0 pypi
importlib_resources 6.1.1 py38h06a4308_0
intel-openmp 2021.4.0 h06a4308_3561
ipykernel 6.26.0 pyhf8b6a83_0 conda-forge
ipython 8.12.0 pyh41d4057_0 conda-forge
ipywidgets 8.1.2 pypi_0 pypi
jedi 0.19.1 pyhd8ed1ab_0 conda-forge
jinja2 3.1.2 py38h06a4308_0
joblib 1.3.2 pypi_0 pypi
jpeg 9e h5eee18b_1
jsonschema 4.20.0 pypi_0 pypi
jsonschema-specifications 2023.11.2 pypi_0 pypi
jupyter_client 7.3.4 pyhd8ed1ab_0 conda-forge
jupyter_core 5.6.0 py38h578d9bd_0 conda-forge
jupyterlab-widgets 3.0.10 pypi_0 pypi
kiwisolver 1.4.4 py38h6a678d5_0
krb5 1.20.1 h568e23c_1
lame 3.100 h7b6447c_0
latent-diffusion 0.0.1 dev_0 <develop>
lazy-loader 0.3 pypi_0 pypi
lcms2 2.12 h3be6417_0
ld_impl_linux-64 2.38 h1181459_1
lerc 3.0 h295c915_0
libbrotlicommon 1.0.9 h5eee18b_7
libbrotlidec 1.0.9 h5eee18b_7
libbrotlienc 1.0.9 h5eee18b_7
libclang 14.0.6 default_hc6dbbc7_1
libclang13 14.0.6 default_he11475f_1
libcublas 11.10.3.66 0 nvidia
libcufft 10.7.2.124 h4fbf590_0 nvidia
libcufile 1.8.1.2 0 nvidia
libcups 2.4.2 ha637b67_0
libcurand 10.3.4.101 0 nvidia
libcurl 8.2.1 h91b91d3_0
libcusolver 11.4.0.1 0 nvidia
libcusparse 11.7.4.91 0 nvidia
libdeflate 1.17 h5eee18b_1
libedit 3.1.20230828 h5eee18b_0
libev 4.33 h7f8727e_1
libffi 3.3 he6710b0_2
libgcc-ng 11.2.0 h1234567_1
libgfortran-ng 11.2.0 h00389a5_1
libgfortran5 11.2.0 h1234567_1
libgomp 11.2.0 h1234567_1
libiconv 1.16 h7f8727e_2
libidn2 2.3.4 h5eee18b_0
libllvm14 14.0.6 hef93074_0
libnghttp2 1.52.0 ha637b67_1
libnpp 11.7.4.75 0 nvidia
libnvjpeg 11.8.0.2 0 nvidia
libpng 1.6.39 h5eee18b_0
libpq 12.15 h37d81fd_1
libsodium 1.0.18 h36c2ea0_1 conda-forge
libssh2 1.10.0 h37d81fd_2
libstdcxx-ng 11.2.0 h1234567_1
libtasn1 4.19.0 h5eee18b_0
libtiff 4.5.1 h6a678d5_0
libunistring 0.9.10 h27cfd23_0
libuuid 1.41.5 h5eee18b_0
libuv 1.44.2 h5eee18b_0
libwebp 1.3.2 h11a3e52_0
libwebp-base 1.3.2 h5eee18b_0
libxcb 1.15 h7f8727e_0
libxkbcommon 1.0.1 h5eee18b_1
libxml2 2.10.4 hf1b16e4_1
lightning-utilities 0.10.0 pypi_0 pypi
lz4-c 1.9.4 h6a678d5_0
markdown 3.5.1 pypi_0 pypi
markdown-it-py 3.0.0 pypi_0 pypi
markupsafe 2.1.3 pypi_0 pypi
matplotlib 3.7.2 py38h06a4308_0
matplotlib-base 3.7.2 py38h1128e8f_0
matplotlib-inline 0.1.6 pyhd8ed1ab_0 conda-forge
mdurl 0.1.2 pypi_0 pypi
mkl 2021.4.0 h06a4308_640
mkl-service 2.4.0 py38h7f8727e_0
mkl_fft 1.3.1 py38hd3c417c_0
mkl_random 1.2.2 py38h51133e4_0
mpc 1.1.0 h10f8cd9_1
mpfr 4.0.2 hb69a4c5_1
mpmath 1.3.0 py38h06a4308_0
multidict 6.0.4 pypi_0 pypi
munkres 1.1.4 py_0
mysql 5.7.24 he378463_2
ncurses 6.4 h6a678d5_0
nest-asyncio 1.5.8 pyhd8ed1ab_0 conda-forge
nettle 3.7.3 hbbd107a_1
networkx 3.1 py38h06a4308_0
ninja 1.10.2 h06a4308_5
ninja-base 1.10.2 hd09550d_5
numexpr 2.8.4 py38he184ba9_0
numpy 1.24.4 pypi_0 pypi
numpy-base 1.24.3 py38h31eccc5_0
nvidia-cublas-cu12 12.1.3.1 pypi_0 pypi
nvidia-cuda-cupti-cu12 12.1.105 pypi_0 pypi
nvidia-cuda-nvrtc-cu12 12.1.105 pypi_0 pypi
nvidia-cuda-runtime-cu12 12.1.105 pypi_0 pypi
nvidia-cudnn-cu12 8.9.2.26 pypi_0 pypi
nvidia-cufft-cu12 11.0.2.54 pypi_0 pypi
nvidia-curand-cu12 10.3.2.106 pypi_0 pypi
nvidia-cusolver-cu12 11.4.5.107 pypi_0 pypi
nvidia-cusparse-cu12 12.1.0.106 pypi_0 pypi
nvidia-ml-py 12.535.133 py38h06a4308_0
nvidia-nccl-cu12 2.18.1 pypi_0 pypi
nvidia-nvjitlink-cu12 12.3.101 pypi_0 pypi
nvidia-nvtx-cu12 12.1.105 pypi_0 pypi
oauthlib 3.2.2 pypi_0 pypi
omegaconf 2.1.1 pypi_0 pypi
open-clip-torch 2.23.0 pypi_0 pypi
opencv-python 4.1.2.30 pypi_0 pypi
opencv-python-headless 4.8.1.78 pypi_0 pypi
openh264 2.1.1 h4ff587b_0
openjpeg 2.4.0 h3ad879b_0
openssl 1.1.1w h7f8727e_0
packaging 21.3 pypi_0 pypi
pandas 2.0.3 py38h1128e8f_0
parso 0.8.3 pyhd8ed1ab_0 conda-forge
pcre 8.45 h295c915_0
pexpect 4.8.0 pyh1a96a4e_2 conda-forge
pickleshare 0.7.5 py_1003 conda-forge
pillow 10.0.1 py38ha6cbd5a_0
pip 20.3.3 py38h06a4308_0
pkgutil-resolve-name 1.3.10 pypi_0 pypi
platformdirs 4.1.0 pyhd8ed1ab_0 conda-forge
ply 3.11 py38_0
pooch 1.7.0 py38h06a4308_0
prompt-toolkit 3.0.42 pyha770c72_0 conda-forge
prompt_toolkit 3.0.42 hd8ed1ab_0 conda-forge
protobuf 3.20.1 pypi_0 pypi
psutil 5.9.0 py38h5eee18b_0
ptyprocess 0.7.0 pyhd3deb0d_0 conda-forge
pudb 2019.2 pypi_0 pypi
pure_eval 0.2.2 pyhd8ed1ab_0 conda-forge
pyarrow 14.0.2 pypi_0 pypi
pyasn1 0.5.1 pypi_0 pypi
pyasn1-modules 0.3.0 pypi_0 pypi
pycparser 2.21 pyhd3eb1b0_0
pydeck 0.8.1b0 pypi_0 pypi
pydeprecate 0.3.1 pypi_0 pypi
pygments 2.17.2 pyhd8ed1ab_0 conda-forge
pyopenssl 23.2.0 py38h06a4308_0
pyparsing 3.0.9 py38h06a4308_0
pyqt 5.15.10 py38h6a678d5_0
pyqt5-sip 12.13.0 py38h5eee18b_0
pysocks 1.7.1 py38h06a4308_0
python 3.8.5 h7579374_1
python-dateutil 2.8.2 pyhd8ed1ab_0 conda-forge
python-tzdata 2023.3 pyhd3eb1b0_0
python_abi 3.8 2_cp38 conda-forge
pytorch 2.0.1 py3.8_cuda11.7_cudnn8.5.0_0 pytorch
pytorch-cuda 11.7 h778d358_5 pytorch
pytorch-fid 0.3.0 pypi_0 pypi
pytorch-lightning 1.4.2 pypi_0 pypi
pytorch-mutex 1.0 cuda pytorch
pytz 2023.3.post1 py38h06a4308_0
pywavelets 1.4.1 pypi_0 pypi
pyyaml 6.0.1 pypi_0 pypi
pyzmq 25.1.0 py38h6a678d5_0
qt-main 5.15.2 h110a718_10
readline 8.2 h5eee18b_0
referencing 0.32.0 pypi_0 pypi
regex 2023.12.25 pypi_0 pypi
requests 2.31.0 py38h06a4308_0
requests-oauthlib 1.3.1 pypi_0 pypi
rich 13.7.0 pypi_0 pypi
rpds-py 0.15.2 pypi_0 pypi
rsa 4.9 pypi_0 pypi
sacremoses 0.1.1 pypi_0 pypi
safetensors 0.4.1 pypi_0 pypi
scikit-image 0.20.0 pypi_0 pypi
scikit-learn 1.3.0 py38h1128e8f_0 anaconda
scipy 1.9.1 pypi_0 pypi
seaborn 0.12.2 py38h06a4308_0
sentencepiece 0.1.99 pypi_0 pypi
sentry-sdk 1.39.1 pypi_0 pypi
setproctitle 1.3.3 pypi_0 pypi
setuptools 68.2.2 py38h06a4308_0
sip 6.7.12 py38h6a678d5_0
six 1.16.0 pyhd3eb1b0_1
smmap 5.0.1 pypi_0 pypi
sqlite 3.41.2 h5eee18b_0
stack_data 0.6.2 pyhd8ed1ab_0 conda-forge
streamlit 1.29.0 pypi_0 pypi
sympy 1.12 py38h06a4308_0
taming-transformers 0.0.1 dev_0 <develop>
tenacity 8.2.3 pypi_0 pypi
tensorboard 2.14.0 pypi_0 pypi
tensorboard-data-server 0.7.2 pypi_0 pypi
test-tube 0.7.5 pypi_0 pypi
threadpoolctl 2.2.0 pyh0d69192_0
tifffile 2023.7.10 pypi_0 pypi
timm 0.9.12 pypi_0 pypi
tk 8.6.12 h1ccaba5_0
tokenizers 0.13.3 pypi_0 pypi
toml 0.10.2 pypi_0 pypi
tomli 2.0.1 py38h06a4308_0
toolz 0.12.0 pypi_0 pypi
torch-fidelity 0.3.0 pypi_0 pypi
torchaudio 2.0.2 py38_cu117 pytorch
torchmetrics 0.6.0 pypi_0 pypi
torchtriton 2.0.0 py38 pytorch
torchvision 0.15.2 py38_cu117 pytorch
tornado 6.1 py38h0a891b7_3 conda-forge
tqdm 4.66.1 pypi_0 pypi
traitlets 5.14.0 pyhd8ed1ab_0 conda-forge
transformers 4.28.0 pypi_0 pypi
triton 2.1.0 pypi_0 pypi
typing_extensions 4.7.1 py38h06a4308_0
tzlocal 5.2 pypi_0 pypi
urllib3 2.1.0 pypi_0 pypi
urwid 2.3.4 pypi_0 pypi
validators 0.22.0 pypi_0 pypi
wandb 0.16.1 pypi_0 pypi
watchdog 3.0.0 pypi_0 pypi
wcwidth 0.2.12 pyhd8ed1ab_0 conda-forge
werkzeug 3.0.1 pypi_0 pypi
wheel 0.41.2 py38h06a4308_0
widgetsnbextension 4.0.10 pypi_0 pypi
xz 5.4.5 h5eee18b_0
yarl 1.9.4 pypi_0 pypi
zeromq 4.3.4 h2531618_0
zipp 3.17.0 pypi_0 pypi
zlib 1.2.13 h5eee18b_0
zstd 1.5.5 hc292b87_0
More info
No response