ray_lightning
ray_lightning copied to clipboard
`ray_lightning` checkpoint dir not saving the checkpoint
def get_trainer(dir,
                plugins: List[PLUGIN_INPUT],
                max_epochs: int = 1000,
                limit_train_batches: int = 10,
                limit_val_batches: int = 10,
                callbacks: Optional[List[Callback]] = None,
                checkpoint_callback: bool = True,
                **trainer_kwargs) -> Trainer:
    """Return a PyTorch Lightning Trainer built from the provided arguments.

    Args:
        dir: Forwarded to the Trainer as ``default_root_dir``.
        plugins: Forwarded to the Trainer as ``plugins``.
        max_epochs: Forwarded to the Trainer as ``max_epochs``.
        limit_train_batches: Forwarded as ``limit_train_batches``.
        limit_val_batches: Forwarded as ``limit_val_batches``.
        callbacks: Callbacks to hand to the Trainer; ``None`` or an empty
            list results in an empty callback list being passed.
        checkpoint_callback: Whether checkpointing is enabled. Kept under
            its historical name for callers, but forwarded to the Trainer
            as ``enable_checkpointing``.
        **trainer_kwargs: Any extra keyword arguments, passed to
            ``pl.Trainer`` unchanged.

    Returns:
        The constructed ``pl.Trainer``.
    """
    callbacks = callbacks or []
    # ``checkpoint_callback`` is deprecated as a Trainer argument in the
    # Lightning releases that provide ``enable_progress_bar`` (used below);
    # ``enable_checkpointing`` is its replacement. ``setdefault`` lets a
    # caller who already supplies ``enable_checkpointing`` explicitly win
    # instead of triggering a duplicate-keyword error.
    trainer_kwargs.setdefault("enable_checkpointing", checkpoint_callback)
    return pl.Trainer(
        default_root_dir=dir,
        callbacks=callbacks,
        plugins=plugins,
        max_epochs=max_epochs,
        limit_train_batches=limit_train_batches,
        limit_val_batches=limit_val_batches,
        enable_progress_bar=True,
        **trainer_kwargs)
If we pass a `tmpdir` to `get_trainer`, the checkpoint is not saved under that directory, so loading it afterwards fails.
tmpdir = test_ckpt
Traceback (most recent call last):
File "test_ckpt.py", line 44, in <module>
ckpt()
File "test_ckpt.py", line 38, in ckpt
trained_model = BoringModel.load_from_checkpoint(
File "/home/ray/anaconda3/lib/python3.8/site-packages/pytorch_lightning/core/saving.py", line 134, in load_from_checkpoint
checkpoint = pl_load(checkpoint_path, map_location=lambda storage, loc: storage)
File "/home/ray/anaconda3/lib/python3.8/site-packages/pytorch_lightning/utilities/cloud_io.py", line 37, in load
with fs.open(path_or_url, "rb") as f:
File "/home/ray/anaconda3/lib/python3.8/site-packages/fsspec/spec.py", line 976, in open
f = self._open(
File "/home/ray/anaconda3/lib/python3.8/site-packages/fsspec/implementations/local.py", line 145, in _open
return LocalFileOpener(path, mode, fs=self, **kwargs)
File "/home/ray/anaconda3/lib/python3.8/site-packages/fsspec/implementations/local.py", line 236, in __init__
self._open()
File "/home/ray/anaconda3/lib/python3.8/site-packages/fsspec/implementations/local.py", line 241, in _open
self.f = open(self.path, mode=self.mode)
FileNotFoundError: [Errno 2] No such file or directory: '/home/ray/default/ray_lightning/ray_lightning/tests/test_ckpt/lightning_logs/version_0/checkpoints/epoch=2-step=191.ckpt'
tmpdir = None
(RayExecutor pid=31579) Metric val_loss improved. New best score: 1.000
/home/ray/default/ray_lightning/ray_lightning/tests/lightning_logs/version_0/checkpoints/epoch=2-step=191.ckpt