pytorch_tabular
pytorch_tabular copied to clipboard
The model trained using multiple GPUs cannot be saved
Describe the bug:
Hello, I'm new here and am following a tutorial to train a classifier with FT-Transformer. After training in parallel on multiple GPUs with DDP, calling `save_model` fails, so the trained model cannot be saved.
To Reproduce:
Here is my code.
import numpy as np
import pandas as pd
from pytorch_tabular import TabularModel
from pytorch_tabular.config import DataConfig, OptimizerConfig, TrainerConfig, ExperimentConfig
from pytorch_tabular.models import FTTransformerConfig
from pytorch_tabular.models.common.heads import LinearHeadConfig
if __name__ == '__main__':
    # Minimal reproduction: train an FT-Transformer classifier on synthetic
    # data across four GPUs with DDP, then try to save the fitted model.

    # Synthetic binary-classification data: 10 random features and a 0/1 label.
    data = np.random.rand(50000, 10)
    labels = np.random.randint(0, 2, size=(50000, 1))
    dataset = np.hstack((data, labels))
    feature_cols = [f'feature_{i}' for i in range(10)]
    columns = feature_cols + ['label']
    df = pd.DataFrame(dataset, columns=columns)

    data_config = DataConfig(
        target=["label"],
        # Only the feature columns: listing 'label' here as well would declare
        # the target as an input feature.
        continuous_cols=feature_cols,
        num_workers=8,
    )

    trainer_config = TrainerConfig(
        auto_lr_find=False,  # Do not run the LR finder
        batch_size=512,
        max_epochs=5,
        checkpoints=None,  # No checkpoint saving during training
        trainer_kwargs={"strategy": 'ddp_find_unused_parameters_true'},
        devices_list=[0, 1, 2, 3],
    )

    optimizer_config = OptimizerConfig()

    head_config = LinearHeadConfig(
        layers="",  # No additional layer in head, just a mapping layer to output_dim
        dropout=0.1,
        initialization="kaiming",
    ).__dict__  # Convert to dict to pass to the model config (OmegaConf doesn't accept objects)

    model_config = FTTransformerConfig(
        task="classification",
        head="LinearHead",  # Linear Head
        head_config=head_config,  # Linear Head Config
    )

    experiment_config = ExperimentConfig(
        project_name="pytorch-tabular",
        run_name="FTTransformer",
        exp_watch="gradients",
    )

    tabular_model = TabularModel(
        data_config=data_config,
        model_config=model_config,
        optimizer_config=optimizer_config,
        trainer_config=trainer_config,
        experiment_config=experiment_config,
    )

    tabular_model.fit(train=df)

    # This is the call that fails after multi-GPU (DDP) training.
    tabular_model.save_model("/home/processed/tabular_model")
Error log:
Traceback (most recent call last):
File "/home/new_competition_baseline/multigpu.py", line 60, in <module>
tabular_model.save_model("/home/processed/tabular_model")
File "/home/.local/lib/python3.9/site-packages/pytorch_tabular/tabular_model.py", line 1547, in save_model
joblib.dump(self.callbacks, os.path.join(dir, "callbacks.sav"))
File "/home/.local/lib/python3.9/site-packages/joblib/numpy_pickle.py", line 553, in dump
NumpyPickler(f, protocol=protocol).dump(value)
File "/opt/anaconda3/envs/torch/lib/python3.9/pickle.py", line 487, in dump
self.save(obj)
File "/home/.local/lib/python3.9/site-packages/joblib/numpy_pickle.py", line 355, in save
return Pickler.save(self, obj)
File "/opt/anaconda3/envs/torch/lib/python3.9/pickle.py", line 560, in save
f(self, obj) # Call unbound method with explicit self
File "/opt/anaconda3/envs/torch/lib/python3.9/pickle.py", line 931, in save_list
self._batch_appends(obj)
File "/opt/anaconda3/envs/torch/lib/python3.9/pickle.py", line 955, in _batch_appends
save(x)
File "/home/.local/lib/python3.9/site-packages/joblib/numpy_pickle.py", line 355, in save
return Pickler.save(self, obj)
File "/opt/anaconda3/envs/torch/lib/python3.9/pickle.py", line 603, in save
self.save_reduce(obj=obj, *rv)
File "/opt/anaconda3/envs/torch/lib/python3.9/pickle.py", line 717, in save_reduce
save(state)
File "/home/.local/lib/python3.9/site-packages/joblib/numpy_pickle.py", line 355, in save
return Pickler.save(self, obj)
File "/opt/anaconda3/envs/torch/lib/python3.9/pickle.py", line 560, in save
f(self, obj) # Call unbound method with explicit self
File "/opt/anaconda3/envs/torch/lib/python3.9/pickle.py", line 971, in save_dict
self._batch_setitems(obj.items())
File "/opt/anaconda3/envs/torch/lib/python3.9/pickle.py", line 997, in _batch_setitems
save(v)
File "/home/.local/lib/python3.9/site-packages/joblib/numpy_pickle.py", line 355, in save
return Pickler.save(self, obj)
File "/opt/anaconda3/envs/torch/lib/python3.9/pickle.py", line 603, in save
self.save_reduce(obj=obj, *rv)
File "/opt/anaconda3/envs/torch/lib/python3.9/pickle.py", line 717, in save_reduce
save(state)
File "/home/.local/lib/python3.9/site-packages/joblib/numpy_pickle.py", line 355, in save
return Pickler.save(self, obj)
File "/opt/anaconda3/envs/torch/lib/python3.9/pickle.py", line 560, in save
f(self, obj) # Call unbound method with explicit self
File "/opt/anaconda3/envs/torch/lib/python3.9/pickle.py", line 971, in save_dict
self._batch_setitems(obj.items())
File "/opt/anaconda3/envs/torch/lib/python3.9/pickle.py", line 997, in _batch_setitems
save(v)
File "/home/.local/lib/python3.9/site-packages/joblib/numpy_pickle.py", line 355, in save
return Pickler.save(self, obj)
File "/opt/anaconda3/envs/torch/lib/python3.9/pickle.py", line 603, in save
self.save_reduce(obj=obj, *rv)
File "/opt/anaconda3/envs/torch/lib/python3.9/pickle.py", line 717, in save_reduce
save(state)
File "/home/.local/lib/python3.9/site-packages/joblib/numpy_pickle.py", line 355, in save
return Pickler.save(self, obj)
File "/opt/anaconda3/envs/torch/lib/python3.9/pickle.py", line 560, in save
f(self, obj) # Call unbound method with explicit self
File "/opt/anaconda3/envs/torch/lib/python3.9/pickle.py", line 971, in save_dict
self._batch_setitems(obj.items())
File "/opt/anaconda3/envs/torch/lib/python3.9/pickle.py", line 997, in _batch_setitems
save(v)
File "/home/.local/lib/python3.9/site-packages/joblib/numpy_pickle.py", line 355, in save
return Pickler.save(self, obj)
File "/opt/anaconda3/envs/torch/lib/python3.9/pickle.py", line 603, in save
self.save_reduce(obj=obj, *rv)
File "/opt/anaconda3/envs/torch/lib/python3.9/pickle.py", line 717, in save_reduce
save(state)
File "/home/.local/lib/python3.9/site-packages/joblib/numpy_pickle.py", line 355, in save
return Pickler.save(self, obj)
File "/opt/anaconda3/envs/torch/lib/python3.9/pickle.py", line 560, in save
f(self, obj) # Call unbound method with explicit self
File "/opt/anaconda3/envs/torch/lib/python3.9/pickle.py", line 971, in save_dict
self._batch_setitems(obj.items())
File "/opt/anaconda3/envs/torch/lib/python3.9/pickle.py", line 997, in _batch_setitems
save(v)
File "/home/.local/lib/python3.9/site-packages/joblib/numpy_pickle.py", line 355, in save
return Pickler.save(self, obj)
File "/opt/anaconda3/envs/torch/lib/python3.9/pickle.py", line 603, in save
self.save_reduce(obj=obj, *rv)
File "/opt/anaconda3/envs/torch/lib/python3.9/pickle.py", line 717, in save_reduce
save(state)
File "/home/.local/lib/python3.9/site-packages/joblib/numpy_pickle.py", line 355, in save
return Pickler.save(self, obj)
File "/opt/anaconda3/envs/torch/lib/python3.9/pickle.py", line 560, in save
f(self, obj) # Call unbound method with explicit self
File "/opt/anaconda3/envs/torch/lib/python3.9/pickle.py", line 971, in save_dict
self._batch_setitems(obj.items())
File "/opt/anaconda3/envs/torch/lib/python3.9/pickle.py", line 997, in _batch_setitems
save(v)
File "/home/.local/lib/python3.9/site-packages/joblib/numpy_pickle.py", line 355, in save
return Pickler.save(self, obj)
File "/opt/anaconda3/envs/torch/lib/python3.9/pickle.py", line 560, in save
f(self, obj) # Call unbound method with explicit self
File "/opt/anaconda3/envs/torch/lib/python3.9/pickle.py", line 931, in save_list
self._batch_appends(obj)
File "/opt/anaconda3/envs/torch/lib/python3.9/pickle.py", line 955, in _batch_appends
save(x)
File "/home/.local/lib/python3.9/site-packages/joblib/numpy_pickle.py", line 355, in save
return Pickler.save(self, obj)
File "/opt/anaconda3/envs/torch/lib/python3.9/pickle.py", line 603, in save
self.save_reduce(obj=obj, *rv)
File "/opt/anaconda3/envs/torch/lib/python3.9/pickle.py", line 717, in save_reduce
save(state)
File "/home/.local/lib/python3.9/site-packages/joblib/numpy_pickle.py", line 355, in save
return Pickler.save(self, obj)
File "/opt/anaconda3/envs/torch/lib/python3.9/pickle.py", line 560, in save
f(self, obj) # Call unbound method with explicit self
File "/opt/anaconda3/envs/torch/lib/python3.9/pickle.py", line 971, in save_dict
self._batch_setitems(obj.items())
File "/opt/anaconda3/envs/torch/lib/python3.9/pickle.py", line 997, in _batch_setitems
save(v)
File "/home/.local/lib/python3.9/site-packages/joblib/numpy_pickle.py", line 355, in save
return Pickler.save(self, obj)
File "/opt/anaconda3/envs/torch/lib/python3.9/pickle.py", line 578, in save
rv = reduce(self.proto)
TypeError: cannot pickle '_thread.lock' object
Additional context:
I am running the program on a server with 4 Tesla P100-PCIE-16GB GPUs, running Ubuntu 16.04 with CUDA 12.2. Package versions: python==3.9.17, torch==2.0.0, pytorch-tabular==1.1.0.