neuralforecast icon indicating copy to clipboard operation
neuralforecast copied to clipboard

Use gpu to speed up training

Open kkckk1110 opened this issue 11 months ago • 3 comments

Description

Hello, I am using TFT to train a forecast model. How can I use a GPU to speed up the training process? This seems unclear in the documentation.

from neuralforecast import NeuralForecast
from neuralforecast.losses.pytorch import MQLoss, DistributionLoss, GMM, PMM
from neuralforecast.tsdataset import TimeSeriesDataset
from neuralforecast.utils import AirPassengers, AirPassengersPanel, AirPassengersStatic
import pandas as pd
import pytorch_lightning as pl
import matplotlib.pyplot as plt

from neuralforecast import NeuralForecast
from neuralforecast.models import TFT
from neuralforecast.losses.pytorch import MQLoss, DistributionLoss, GMM, PMM
from neuralforecast.tsdataset import TimeSeriesDataset
from neuralforecast.utils import AirPassengers, AirPassengersPanel, AirPassengersStatic

nf = NeuralForecast(
    models=[TFT(h=h, input_size=6,
                hidden_size=20,
                loss=DistributionLoss(distribution='StudentT', level=[80, 90]),
                learning_rate=0.005,
                #stat_exog_list=['airline1'],
                futr_exog_list=futr_exog_list,
                #hist_exog_list=['trend'],
                max_steps=500,
                val_check_steps=10,
                early_stop_patience_steps=10,
                scaler_type='robust',
                windows_batch_size=None,
                enable_progress_bar=True),
    ],
    freq='MS'
)
nf.fit(df=train, val_size=12)
Y_hat_df = nf.predict(futr_df=Y_test_df)

Link

No response

kkckk1110 avatar Mar 02 '24 01:03 kkckk1110

If you have CUDA installed, then neuralforecast will automatically leverage your GPU to train the models

flight505 avatar Mar 03 '24 23:03 flight505

Thanks! But I came across an error when training.

<details>
<summary>Stacktrace</summary>
ProcessRaisedException                    Traceback (most recent call last)
Cell In[23], line 29
      8 from neuralforecast.models import TFT
     10 nf = NeuralForecast(
     11     models=[TFT(h=h, input_size=6,
     12                 hidden_size=20,
   (...)
     27     freq='MS'
     28 )
---> 29 nf.fit(df=train, val_size=12)

File ~/anaconda3/envs/app/lib/python3.9/site-packages/neuralforecast/core.py:274, in NeuralForecast.fit(self, df, static_df, val_size, sort_df, use_init_models, verbose)
    271         print("WARNING: Deleting previously fitted models.")
    273 for model in self.models:
--> 274     model.fit(self.dataset, val_size=val_size)
    276 self._fitted = True

File ~/anaconda3/envs/app/lib/python3.9/site-packages/neuralforecast/common/_base_windows.py:734, in BaseWindows.fit(self, dataset, val_size, test_size, random_seed)
    731 self.trainer_kwargs["check_val_every_n_epoch"] = None
    733 trainer = pl.Trainer(**self.trainer_kwargs)
--> 734 trainer.fit(self, datamodule=datamodule)

File ~/anaconda3/envs/app/lib/python3.9/site-packages/pytorch_lightning/trainer/trainer.py:520, in Trainer.fit(self, model, train_dataloaders, val_dataloaders, datamodule, ckpt_path)
    518 model = _maybe_unwrap_optimized(model)
    519 self.strategy._lightning_module = model
--> 520 call._call_and_handle_interrupt(
    521     self, self._fit_impl, model, train_dataloaders, val_dataloaders, datamodule, ckpt_path
    522 )

File ~/anaconda3/envs/app/lib/python3.9/site-packages/pytorch_lightning/trainer/call.py:42, in _call_and_handle_interrupt(trainer, trainer_fn, *args, **kwargs)
     40 try:
     41     if trainer.strategy.launcher is not None:
---> 42         return trainer.strategy.launcher.launch(trainer_fn, *args, trainer=trainer, **kwargs)
     43     else:
     44         return trainer_fn(*args, **kwargs)

File ~/anaconda3/envs/app/lib/python3.9/site-packages/pytorch_lightning/strategies/launchers/multiprocessing.py:124, in _MultiProcessingLauncher.launch(self, function, trainer, *args, **kwargs)
    116 process_context = mp.start_processes(
    117     self._wrapping_function,
    118     args=process_args,
   (...)
    121     join=False,  # we will join ourselves to get the process references
    122 )
    123 self.procs = process_context.processes
--> 124 while not process_context.join():
    125     pass
    127 worker_output = return_queue.get()

File ~/anaconda3/envs/app/lib/python3.9/site-packages/torch/multiprocessing/spawn.py:160, in ProcessContext.join(self, timeout)
    158 msg = "\n\n-- Process %d terminated with the following error:\n" % error_index
    159 msg += original_trace
--> 160 raise ProcessRaisedException(msg, error_index, failed_process.pid)

ProcessRaisedException: 

-- Process 2 terminated with the following error:
Traceback (most recent call last):
  File "/home/ckj/anaconda3/envs/app/lib/python3.9/site-packages/torch/multiprocessing/spawn.py", line 69, in _wrap
    fn(i, *args)
  File "/home/ckj/anaconda3/envs/app/lib/python3.9/site-packages/pytorch_lightning/strategies/launchers/multiprocessing.py", line 147, in _wrapping_function
    results = function(*args, **kwargs)
  File "/home/ckj/anaconda3/envs/app/lib/python3.9/site-packages/pytorch_lightning/trainer/trainer.py", line 559, in _fit_impl
    self._run(model, ckpt_path=ckpt_path)
  File "/home/ckj/anaconda3/envs/app/lib/python3.9/site-packages/pytorch_lightning/trainer/trainer.py", line 893, in _run
    self.strategy.setup_environment()
  File "/home/ckj/anaconda3/envs/app/lib/python3.9/site-packages/pytorch_lightning/strategies/ddp.py", line 143, in setup_environment
    super().setup_environment()
  File "/home/ckj/anaconda3/envs/app/lib/python3.9/site-packages/pytorch_lightning/strategies/strategy.py", line 127, in setup_environment
    self.accelerator.setup_device(self.root_device)
  File "/home/ckj/anaconda3/envs/app/lib/python3.9/site-packages/pytorch_lightning/accelerators/cuda.py", line 43, in setup_device
    _check_cuda_matmul_precision(device)
  File "/home/ckj/anaconda3/envs/app/lib/python3.9/site-packages/lightning_fabric/accelerators/cuda.py", line 345, in _check_cuda_matmul_precision
    major, _ = torch.cuda.get_device_capability(device)
  File "/home/ckj/anaconda3/envs/app/lib/python3.9/site-packages/torch/cuda/__init__.py", line 381, in get_device_capability
    prop = get_device_properties(device)
  File "/home/ckj/anaconda3/envs/app/lib/python3.9/site-packages/torch/cuda/__init__.py", line 395, in get_device_properties
    _lazy_init()  # will define _get_device_properties
  File "/home/ckj/anaconda3/envs/app/lib/python3.9/site-packages/torch/cuda/__init__.py", line 235, in _lazy_init
    raise RuntimeError(
RuntimeError: Cannot re-initialize CUDA in forked subprocess. To use CUDA with multiprocessing, you must use the 'spawn' start method
</details>

kkckk1110 avatar Mar 05 '24 08:03 kkckk1110

@kkckk1110 Do you have multiple GPUs? If you do, PyTorch Lightning will try to use all of them by default. Can you try setting devices=[0] in your model constructor?

jmoralez avatar Mar 05 '24 17:03 jmoralez

This issue has been automatically closed because it has been awaiting a response for too long. When you have time to work with the maintainers to resolve this issue, please post a new comment and it will be re-opened. If the issue has been locked for editing by the time you return to it, please open a new issue and reference this one.

github-actions[bot] avatar Apr 05 '24 04:04 github-actions[bot]