Hello, the code runs fine on a single GPU, but when I run it on multiple GPUs I get the error below. I hope you can help me resolve it.
File "trainer.py", line 611, in
main()
File "trainer.py", line 605, in main
trainer.fit(model)
File "/home/work/anaconda3/envs/wxp_torch/lib/python3.7/site-packages/pytorch_lightning/trainer/states.py", line 48, in wrapped_fn
result = fn(self, *args, **kwargs)
File "/home/work/anaconda3/envs/wxp_torch/lib/python3.7/site-packages/pytorch_lightning/trainer/trainer.py", line 1064, in fit
results = self.accelerator_backend.train()
File "/home/work/anaconda3/envs/wxp_torch/lib/python3.7/site-packages/pytorch_lightning/accelerators/dp_backend.py", line 97, in train
results = self.trainer.run_pretrain_routine(model)
File "/home/work/anaconda3/envs/wxp_torch/lib/python3.7/site-packages/pytorch_lightning/trainer/trainer.py", line 1224, in run_pretrain_routine
self._run_sanity_check(ref_model, model)
File "/home/work/anaconda3/envs/wxp_torch/lib/python3.7/site-packages/pytorch_lightning/trainer/trainer.py", line 1257, in _run_sanity_check
eval_results = self._evaluate(model, self.val_dataloaders, max_batches, False)
File "/home/work/anaconda3/envs/wxp_torch/lib/python3.7/site-packages/pytorch_lightning/trainer/evaluation_loop.py", line 331, in _evaluate
output = self.evaluation_forward(model, batch, batch_idx, dataloader_idx, test_mode)
File "/home/work/anaconda3/envs/wxp_torch/lib/python3.7/site-packages/pytorch_lightning/trainer/evaluation_loop.py", line 661, in evaluation_forward
output = model(*args)
File "/home/work/anaconda3/envs/wxp_torch/lib/python3.7/site-packages/torch/nn/modules/module.py", line 722, in _call_impl
result = self.forward(*input, **kwargs)
File "/home/work/anaconda3/envs/wxp_torch/lib/python3.7/site-packages/pytorch_lightning/overrides/data_parallel.py", line 83, in forward
outputs = self.parallel_apply(replicas, inputs, kwargs)
File "/home/work/anaconda3/envs/wxp_torch/lib/python3.7/site-packages/pytorch_lightning/overrides/data_parallel.py", line 147, in parallel_apply
return parallel_apply(replicas, inputs, kwargs, self.device_ids[:len(replicas)])
File "/home/work/anaconda3/envs/wxp_torch/lib/python3.7/site-packages/pytorch_lightning/overrides/data_parallel.py", line 288, in parallel_apply
raise output
File "/home/work/anaconda3/envs/wxp_torch/lib/python3.7/site-packages/pytorch_lightning/overrides/data_parallel.py", line 251, in _worker
output = module.validation_step(*input, **kwargs)
File "trainer.py", line 311, in validation_step
all_span_rep = self.forward(loadall,all_span_lens,all_span_idxs_ltoken, tokens, attention_mask, token_type_ids)
File "/home/work/anaconda3/envs/wxp_torch/lib/python3.7/site-packages/torch/cuda/amp/autocast_mode.py", line 135, in decorate_autocast
return func(*args, **kwargs)
File "trainer.py", line 201, in forward
return self.model(loadall,all_span_lens,all_span_idxs_ltoken,input_ids, attention_mask=attention_mask, token_type_ids=token_type_ids)
File "/home/work/anaconda3/envs/wxp_torch/lib/python3.7/site-packages/torch/nn/modules/module.py", line 722, in _call_impl
result = self.forward(*input, **kwargs)
File "/home/work/dev/wxp/SpanNER-main/models/bert_model_spanner.py", line 92, in forward
bert_outputs = self.bert(input_ids, token_type_ids=token_type_ids, attention_mask=attention_mask)
File "/home/work/anaconda3/envs/wxp_torch/lib/python3.7/site-packages/torch/nn/modules/module.py", line 722, in _call_impl
result = self.forward(*input, **kwargs)
File "/home/work/anaconda3/envs/wxp_torch/lib/python3.7/site-packages/transformers/modeling_bert.py", line 753, in forward
input_ids=input_ids, position_ids=position_ids, token_type_ids=token_type_ids, inputs_embeds=inputs_embeds
File "/home/work/anaconda3/envs/wxp_torch/lib/python3.7/site-packages/torch/nn/modules/module.py", line 722, in _call_impl
result = self.forward(*input, **kwargs)
File "/home/work/anaconda3/envs/wxp_torch/lib/python3.7/site-packages/transformers/modeling_bert.py", line 178, in forward
inputs_embeds = self.word_embeddings(input_ids)
File "/home/work/anaconda3/envs/wxp_torch/lib/python3.7/site-packages/torch/nn/modules/module.py", line 722, in _call_impl
result = self.forward(*input, **kwargs)
File "/home/work/anaconda3/envs/wxp_torch/lib/python3.7/site-packages/torch/nn/modules/sparse.py", line 126, in forward
self.norm_type, self.scale_grad_by_freq, self.sparse)
File "/home/work/anaconda3/envs/wxp_torch/lib/python3.7/site-packages/torch/nn/functional.py", line 1814, in embedding
return torch.embedding(weight, input, padding_idx, scale_grad_by_freq, sparse)
RuntimeError: arguments are located on different GPUs at /opt/conda/conda-bld/pytorch_1595629403081/work/aten/src/THC/generic/THCTensorIndex.cu:403
I apologize for the late reply.
To make it easier for my collaborators and others with similar problems to follow, I will answer your question in English.
I have not tried running this code on multiple GPUs. From a quick look at your error message, the problem seems to lie with PyTorch Lightning rather than with the model code. You can try adapting the code according to the official multi-GPU training tutorial: https://pytorch-lightning.readthedocs.io/en/1.4.3/advanced/multi_gpu.html. (As a first step, you can try setting the Trainer's accelerator parameter to a backend that supports multi-GPU operation.)
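For reference, a minimal sketch of what that might look like. This is only an assumption based on the PyTorch Lightning 1.4.x Trainer API, not code from this repository; the `gpus` and `max_epochs` values are placeholders you would replace with your own settings.

```python
import pytorch_lightning as pl

# Your traceback goes through dp_backend.py, i.e. the DataParallel backend.
# Switching to DDP gives each GPU its own process and handles device placement
# per process, which usually avoids "arguments are located on different GPUs".
trainer = pl.Trainer(
    gpus=2,              # number of GPUs to use (placeholder)
    accelerator="ddp",   # DistributedDataParallel instead of the default dp
    max_epochs=10,       # placeholder, not from the original config
)
# trainer.fit(model)    # `model` is your LightningModule from trainer.py
```

If you stay on the DataParallel backend instead, also check that validation_step does not move tensors to a fixed device (e.g. `.cuda(0)`), since each replica must keep its inputs on its own GPU.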