pytorch_tabular icon indicating copy to clipboard operation
pytorch_tabular copied to clipboard

The model trained using multiple GPUs cannot be saved

Open hyxie2023 opened this issue 1 year ago • 0 comments

Describe the bug: Hello, I'm new here and am following a tutorial to train a classifier using FT-Transformer. The issue I'm encountering is that, after training in parallel on multiple GPUs with DDP, my model cannot be saved: `save_model` fails with `TypeError: cannot pickle '_thread.lock' object` (full traceback below).

To Reproduce: Here is my code.

import numpy as np
import pandas as pd
from pytorch_tabular import TabularModel
from pytorch_tabular.config import DataConfig, OptimizerConfig, TrainerConfig, ExperimentConfig
from pytorch_tabular.models import FTTransformerConfig
from pytorch_tabular.models.common.heads import LinearHeadConfig


if __name__ == '__main__':
    # Minimal reproduction script: train an FT-Transformer classifier on
    # synthetic data across four GPUs with DDP, then try to save the model.

    # Synthetic dataset: 50,000 rows of 10 uniform-random features plus a
    # random 0/1 label column appended as the last column.
    data = np.random.rand(50000, 10)
    labels = np.random.randint(0, 2, size=(50000, 1))
    dataset = np.hstack((data, labels))
    feature_cols = [f'feature_{i}' for i in range(10)]
    columns = feature_cols + ['label']
    df = pd.DataFrame(dataset, columns=columns)

    data_config = DataConfig(
        target=["label"],
        # Only the feature columns are model inputs. Passing the full column
        # list here would also feed the target to the model as a feature.
        continuous_cols=feature_cols,
        num_workers=8
    )

    trainer_config = TrainerConfig(
        auto_lr_find=False,  # Do not run the LRFinder; use the configured learning rate
        batch_size=512,
        max_epochs=5,
        checkpoints=None,  # No metric is monitored, so no best checkpoint is saved
        trainer_kwargs={"strategy": 'ddp_find_unused_parameters_true'},
        devices_list=[0, 1, 2, 3]
    )

    optimizer_config = OptimizerConfig()

    head_config = LinearHeadConfig(
        layers="",  # No additional layer in head, just a mapping layer to output_dim
        dropout=0.1,
        initialization="kaiming",
    ).__dict__  # Convert to dict to pass to the model config (OmegaConf doesn't accept objects)

    model_config = FTTransformerConfig(
        task="classification",
        head="LinearHead",  # Linear Head
        head_config=head_config,  # Linear Head Config
    )

    experiment_config = ExperimentConfig(
        project_name="pytorch-tabular",
        run_name="FTTransformer",
        exp_watch="gradients"
    )

    tabular_model = TabularModel(
        data_config=data_config,
        model_config=model_config,
        optimizer_config=optimizer_config,
        trainer_config=trainer_config,
        experiment_config=experiment_config
    )

    tabular_model.fit(train=df)
    # This is the call that fails in the report below: pickling the callbacks
    # raises "TypeError: cannot pickle '_thread.lock' object".
    tabular_model.save_model("/home/processed/tabular_model")

Error log:

Traceback (most recent call last):
  File "/home/new_competition_baseline/multigpu.py", line 60, in <module>
    tabular_model.save_model("/home/processed/tabular_model")
  File "/home/.local/lib/python3.9/site-packages/pytorch_tabular/tabular_model.py", line 1547, in save_model
    joblib.dump(self.callbacks, os.path.join(dir, "callbacks.sav"))
  File "/home/.local/lib/python3.9/site-packages/joblib/numpy_pickle.py", line 553, in dump
    NumpyPickler(f, protocol=protocol).dump(value)
  File "/opt/anaconda3/envs/torch/lib/python3.9/pickle.py", line 487, in dump
    self.save(obj)
  File "/home/.local/lib/python3.9/site-packages/joblib/numpy_pickle.py", line 355, in save
    return Pickler.save(self, obj)
  File "/opt/anaconda3/envs/torch/lib/python3.9/pickle.py", line 560, in save
    f(self, obj)  # Call unbound method with explicit self
  File "/opt/anaconda3/envs/torch/lib/python3.9/pickle.py", line 931, in save_list
    self._batch_appends(obj)
  File "/opt/anaconda3/envs/torch/lib/python3.9/pickle.py", line 955, in _batch_appends
    save(x)
  File "/home/.local/lib/python3.9/site-packages/joblib/numpy_pickle.py", line 355, in save
    return Pickler.save(self, obj)
  File "/opt/anaconda3/envs/torch/lib/python3.9/pickle.py", line 603, in save
    self.save_reduce(obj=obj, *rv)
  File "/opt/anaconda3/envs/torch/lib/python3.9/pickle.py", line 717, in save_reduce
    save(state)
  File "/home/.local/lib/python3.9/site-packages/joblib/numpy_pickle.py", line 355, in save
    return Pickler.save(self, obj)
  File "/opt/anaconda3/envs/torch/lib/python3.9/pickle.py", line 560, in save
    f(self, obj)  # Call unbound method with explicit self
  File "/opt/anaconda3/envs/torch/lib/python3.9/pickle.py", line 971, in save_dict
    self._batch_setitems(obj.items())
  File "/opt/anaconda3/envs/torch/lib/python3.9/pickle.py", line 997, in _batch_setitems
    save(v)
  File "/home/.local/lib/python3.9/site-packages/joblib/numpy_pickle.py", line 355, in save
    return Pickler.save(self, obj)
  File "/opt/anaconda3/envs/torch/lib/python3.9/pickle.py", line 603, in save
    self.save_reduce(obj=obj, *rv)
  File "/opt/anaconda3/envs/torch/lib/python3.9/pickle.py", line 717, in save_reduce
    save(state)
  File "/home/.local/lib/python3.9/site-packages/joblib/numpy_pickle.py", line 355, in save
    return Pickler.save(self, obj)
  File "/opt/anaconda3/envs/torch/lib/python3.9/pickle.py", line 560, in save
    f(self, obj)  # Call unbound method with explicit self
  File "/opt/anaconda3/envs/torch/lib/python3.9/pickle.py", line 971, in save_dict
    self._batch_setitems(obj.items())
  File "/opt/anaconda3/envs/torch/lib/python3.9/pickle.py", line 997, in _batch_setitems
    save(v)
  File "/home/.local/lib/python3.9/site-packages/joblib/numpy_pickle.py", line 355, in save
    return Pickler.save(self, obj)
  File "/opt/anaconda3/envs/torch/lib/python3.9/pickle.py", line 603, in save
    self.save_reduce(obj=obj, *rv)
  File "/opt/anaconda3/envs/torch/lib/python3.9/pickle.py", line 717, in save_reduce
    save(state)
  File "/home/.local/lib/python3.9/site-packages/joblib/numpy_pickle.py", line 355, in save
    return Pickler.save(self, obj)
  File "/opt/anaconda3/envs/torch/lib/python3.9/pickle.py", line 560, in save
    f(self, obj)  # Call unbound method with explicit self
  File "/opt/anaconda3/envs/torch/lib/python3.9/pickle.py", line 971, in save_dict
    self._batch_setitems(obj.items())
  File "/opt/anaconda3/envs/torch/lib/python3.9/pickle.py", line 997, in _batch_setitems
    save(v)
  File "/home/.local/lib/python3.9/site-packages/joblib/numpy_pickle.py", line 355, in save
    return Pickler.save(self, obj)
  File "/opt/anaconda3/envs/torch/lib/python3.9/pickle.py", line 603, in save
    self.save_reduce(obj=obj, *rv)
  File "/opt/anaconda3/envs/torch/lib/python3.9/pickle.py", line 717, in save_reduce
    save(state)
  File "/home/.local/lib/python3.9/site-packages/joblib/numpy_pickle.py", line 355, in save
    return Pickler.save(self, obj)
  File "/opt/anaconda3/envs/torch/lib/python3.9/pickle.py", line 560, in save
    f(self, obj)  # Call unbound method with explicit self
  File "/opt/anaconda3/envs/torch/lib/python3.9/pickle.py", line 971, in save_dict
    self._batch_setitems(obj.items())
  File "/opt/anaconda3/envs/torch/lib/python3.9/pickle.py", line 997, in _batch_setitems
    save(v)
  File "/home/.local/lib/python3.9/site-packages/joblib/numpy_pickle.py", line 355, in save
    return Pickler.save(self, obj)
  File "/opt/anaconda3/envs/torch/lib/python3.9/pickle.py", line 603, in save
    self.save_reduce(obj=obj, *rv)
  File "/opt/anaconda3/envs/torch/lib/python3.9/pickle.py", line 717, in save_reduce
    save(state)
  File "/home/.local/lib/python3.9/site-packages/joblib/numpy_pickle.py", line 355, in save
    return Pickler.save(self, obj)
  File "/opt/anaconda3/envs/torch/lib/python3.9/pickle.py", line 560, in save
    f(self, obj)  # Call unbound method with explicit self
  File "/opt/anaconda3/envs/torch/lib/python3.9/pickle.py", line 971, in save_dict
    self._batch_setitems(obj.items())
  File "/opt/anaconda3/envs/torch/lib/python3.9/pickle.py", line 997, in _batch_setitems
    save(v)
  File "/home/.local/lib/python3.9/site-packages/joblib/numpy_pickle.py", line 355, in save
    return Pickler.save(self, obj)
  File "/opt/anaconda3/envs/torch/lib/python3.9/pickle.py", line 560, in save
    f(self, obj)  # Call unbound method with explicit self
  File "/opt/anaconda3/envs/torch/lib/python3.9/pickle.py", line 931, in save_list
    self._batch_appends(obj)
  File "/opt/anaconda3/envs/torch/lib/python3.9/pickle.py", line 955, in _batch_appends
    save(x)
  File "/home/.local/lib/python3.9/site-packages/joblib/numpy_pickle.py", line 355, in save
    return Pickler.save(self, obj)
  File "/opt/anaconda3/envs/torch/lib/python3.9/pickle.py", line 603, in save
    self.save_reduce(obj=obj, *rv)
  File "/opt/anaconda3/envs/torch/lib/python3.9/pickle.py", line 717, in save_reduce
    save(state)
  File "/home/.local/lib/python3.9/site-packages/joblib/numpy_pickle.py", line 355, in save
    return Pickler.save(self, obj)
  File "/opt/anaconda3/envs/torch/lib/python3.9/pickle.py", line 560, in save
    f(self, obj)  # Call unbound method with explicit self
  File "/opt/anaconda3/envs/torch/lib/python3.9/pickle.py", line 971, in save_dict
    self._batch_setitems(obj.items())
  File "/opt/anaconda3/envs/torch/lib/python3.9/pickle.py", line 997, in _batch_setitems
    save(v)
  File "/home/.local/lib/python3.9/site-packages/joblib/numpy_pickle.py", line 355, in save
    return Pickler.save(self, obj)
  File "/opt/anaconda3/envs/torch/lib/python3.9/pickle.py", line 578, in save
    rv = reduce(self.proto)
TypeError: cannot pickle '_thread.lock' object

Additional context: I am running the program on a server with 4 Tesla P100-PCIE-16GB GPUs; the operating system is Ubuntu 16.04 and the CUDA version is 12.2. Package versions: python==3.9.17, torch==2.0.0, pytorch-tabular==1.1.0.

hyxie2023 avatar Oct 10 '24 10:10 hyxie2023