-- Process 0 terminated with the following error:
Traceback (most recent call last):
File "/data/home/kw2501/repos/PiPPy/PiPPy/lib/python3.9/site-packages/torch/multiprocessing/spawn.py", line 69, in _wrap
fn(i, *args)
File "/data/home/kw2501/repos/PiPPy/pippy/utils.py", line 107, in run_worker
run_master(pp_ranks_per_dp_group[rank], args, *extra_args)
File "/data/home/kw2501/repos/PiPPy/examples/hf/translation/run_translation.py", line 637, in run_master
trainer.save_model() # Saves the tokenizer too for easy upload
File "/data/home/kw2501/repos/PiPPy/PiPPy/lib/python3.9/site-packages/transformers/trainer.py", line 2613, in save_model
self._save(output_dir)
File "/data/home/kw2501/repos/PiPPy/PiPPy/lib/python3.9/site-packages/transformers/trainer.py", line 2670, in _save
torch.save(self.args, os.path.join(output_dir, TRAINING_ARGS_NAME))
File "/data/home/kw2501/repos/PiPPy/PiPPy/lib/python3.9/site-packages/torch/serialization.py", line 389, in save
_save(obj, opened_zipfile, pickle_module, pickle_protocol)
File "/data/home/kw2501/repos/PiPPy/PiPPy/lib/python3.9/site-packages/torch/serialization.py", line 599, in _save
pickler.dump(obj)
TypeError: cannot pickle 'torch._C._distributed_c10d.ProcessGroupNCCL' object