neuralforecast
neuralforecast copied to clipboard
Use gpu to speed up training
Description
Hello, I am using TFT to train a forecasting model. How can I use a GPU to speed up the training process? This is unclear in the documentation.
from neuralforecast import NeuralForecast
from neuralforecast.losses.pytorch import MQLoss, DistributionLoss, GMM, PMM
from neuralforecast.tsdataset import TimeSeriesDataset
from neuralforecast.utils import AirPassengers, AirPassengersPanel, AirPassengersStatic
import pandas as pd
import pytorch_lightning as pl
import matplotlib.pyplot as plt
from neuralforecast import NeuralForecast
from neuralforecast.models import TFT
from neuralforecast.losses.pytorch import MQLoss, DistributionLoss, GMM, PMM
from neuralforecast.tsdataset import TimeSeriesDataset
from neuralforecast.utils import AirPassengers, AirPassengersPanel, AirPassengersStatic
nf = NeuralForecast(
    models=[TFT(h=h, input_size=6,
                hidden_size=20,
                loss=DistributionLoss(distribution='StudentT', level=[80, 90]),
                learning_rate=0.005,
                #stat_exog_list=['airline1'],
                futr_exog_list=futr_exog_list,
                #hist_exog_list=['trend'],
                max_steps=500,
                val_check_steps=10,
                early_stop_patience_steps=10,
                scaler_type='robust',
                windows_batch_size=None,
                enable_progress_bar=True),
    ],
    freq='MS'
)
nf.fit(df=train, val_size=12)
Y_hat_df = nf.predict(futr_df=Y_test_df)
Link
No response
If you have CUDA installed, then neuralforecast will automatically leverage your GPU to train the models.
Thanks! But I came across an error when training.
<details>
<summary>Stacktrace</summary>
ProcessRaisedException Traceback (most recent call last)
Cell In[23], line 29
8 from neuralforecast.models import TFT
10 nf = NeuralForecast(
11 models=[TFT(h=h, input_size=6,
12 hidden_size=20,
(...)
27 freq='MS'
28 )
---> 29 nf.fit(df=train, val_size=12)
File ~/anaconda3/envs/app/lib/python3.9/site-packages/neuralforecast/core.py:274, in NeuralForecast.fit(self, df, static_df, val_size, sort_df, use_init_models, verbose)
271 print("WARNING: Deleting previously fitted models.")
273 for model in self.models:
--> 274 model.fit(self.dataset, val_size=val_size)
276 self._fitted = True
File ~/anaconda3/envs/app/lib/python3.9/site-packages/neuralforecast/common/_base_windows.py:734, in BaseWindows.fit(self, dataset, val_size, test_size, random_seed)
731 self.trainer_kwargs["check_val_every_n_epoch"] = None
733 trainer = pl.Trainer(**self.trainer_kwargs)
--> 734 trainer.fit(self, datamodule=datamodule)
File ~/anaconda3/envs/app/lib/python3.9/site-packages/pytorch_lightning/trainer/trainer.py:520, in Trainer.fit(self, model, train_dataloaders, val_dataloaders, datamodule, ckpt_path)
518 model = _maybe_unwrap_optimized(model)
519 self.strategy._lightning_module = model
--> 520 call._call_and_handle_interrupt(
521 self, self._fit_impl, model, train_dataloaders, val_dataloaders, datamodule, ckpt_path
522 )
File ~/anaconda3/envs/app/lib/python3.9/site-packages/pytorch_lightning/trainer/call.py:42, in _call_and_handle_interrupt(trainer, trainer_fn, *args, **kwargs)
40 try:
41 if trainer.strategy.launcher is not None:
---> 42 return trainer.strategy.launcher.launch(trainer_fn, *args, trainer=trainer, **kwargs)
43 else:
44 return trainer_fn(*args, **kwargs)
File ~/anaconda3/envs/app/lib/python3.9/site-packages/pytorch_lightning/strategies/launchers/multiprocessing.py:124, in _MultiProcessingLauncher.launch(self, function, trainer, *args, **kwargs)
116 process_context = mp.start_processes(
117 self._wrapping_function,
118 args=process_args,
(...)
121 join=False, # we will join ourselves to get the process references
122 )
123 self.procs = process_context.processes
--> 124 while not process_context.join():
125 pass
127 worker_output = return_queue.get()
File ~/anaconda3/envs/app/lib/python3.9/site-packages/torch/multiprocessing/spawn.py:160, in ProcessContext.join(self, timeout)
158 msg = "\n\n-- Process %d terminated with the following error:\n" % error_index
159 msg += original_trace
--> 160 raise ProcessRaisedException(msg, error_index, failed_process.pid)
ProcessRaisedException:
-- Process 2 terminated with the following error:
Traceback (most recent call last):
File "/home/ckj/anaconda3/envs/app/lib/python3.9/site-packages/torch/multiprocessing/spawn.py", line 69, in _wrap
fn(i, *args)
File "/home/ckj/anaconda3/envs/app/lib/python3.9/site-packages/pytorch_lightning/strategies/launchers/multiprocessing.py", line 147, in _wrapping_function
results = function(*args, **kwargs)
File "/home/ckj/anaconda3/envs/app/lib/python3.9/site-packages/pytorch_lightning/trainer/trainer.py", line 559, in _fit_impl
self._run(model, ckpt_path=ckpt_path)
File "/home/ckj/anaconda3/envs/app/lib/python3.9/site-packages/pytorch_lightning/trainer/trainer.py", line 893, in _run
self.strategy.setup_environment()
File "/home/ckj/anaconda3/envs/app/lib/python3.9/site-packages/pytorch_lightning/strategies/ddp.py", line 143, in setup_environment
super().setup_environment()
File "/home/ckj/anaconda3/envs/app/lib/python3.9/site-packages/pytorch_lightning/strategies/strategy.py", line 127, in setup_environment
self.accelerator.setup_device(self.root_device)
File "/home/ckj/anaconda3/envs/app/lib/python3.9/site-packages/pytorch_lightning/accelerators/cuda.py", line 43, in setup_device
_check_cuda_matmul_precision(device)
File "/home/ckj/anaconda3/envs/app/lib/python3.9/site-packages/lightning_fabric/accelerators/cuda.py", line 345, in _check_cuda_matmul_precision
major, _ = torch.cuda.get_device_capability(device)
File "/home/ckj/anaconda3/envs/app/lib/python3.9/site-packages/torch/cuda/__init__.py", line 381, in get_device_capability
prop = get_device_properties(device)
File "/home/ckj/anaconda3/envs/app/lib/python3.9/site-packages/torch/cuda/__init__.py", line 395, in get_device_properties
_lazy_init() # will define _get_device_properties
File "/home/ckj/anaconda3/envs/app/lib/python3.9/site-packages/torch/cuda/__init__.py", line 235, in _lazy_init
raise RuntimeError(
RuntimeError: Cannot re-initialize CUDA in forked subprocess. To use CUDA with multiprocessing, you must use the 'spawn' start method
</details>
@kkckk1110 Do you have multiple GPUs? If so, PyTorch Lightning will try to use all of them by default. Can you try setting devices=[0] in your model constructor?
This issue has been automatically closed because it has been awaiting a response for too long. When you have time to work with the maintainers to resolve this issue, please post a new comment and it will be re-opened. If the issue has been locked for editing by the time you return to it, please open a new issue and reference this one.