fairseq
Training model with GPU pipeline - RuntimeError Expected all tensors to be on the same device
I am trying to train a model with pipeline model parallelism using the script below:
echo `date`
exp_dir=$1
model_arch=${2:-"transformer_18_18"}
pretrained_ckpt=$3
fairseq-train $exp_dir/final_bin \
--max-source-positions=256 \
--max-target-positions=256 \
--source-lang=SRC \
--target-lang=TGT \
--max-update=1000000 \
--save-interval-updates=1000 \
--arch=$model_arch \
--activation-fn gelu --fp16 \
--criterion=label_smoothed_cross_entropy \
--label-smoothing=0.1 \
--optimizer adam \
--adam-betas "(0.9, 0.98)" \
--lr-scheduler=inverse_sqrt \
--clip-norm 1.0 \
--warmup-init-lr 1e-07 \
--lr 3e-5 \
--warmup-updates 2000 \
--dropout 0.2 \
--save-dir $exp_dir/model \
--keep-last-epochs 5 \
--keep-interval-updates 3 \
--patience 10 \
--skip-invalid-size-inputs-valid-test \
--user-dir model_configs \
--update-freq=1 \
--distributed-world-size 1 \
--num-workers 1 \
--max-tokens 256 \
--eval-bleu \
--eval-bleu-args "{\"beam\": 1, \"lenpen\": 1.0, \"max_len_a\": 1.2, \"max_len_b\": 10}" \
--eval-bleu-detok moses \
--eval-bleu-remove-bpe sentencepiece \
--eval-bleu-print-samples \
--best-checkpoint-metric bleu \
--maximize-best-checkpoint-metric \
--restore-file $pretrained_ckpt \
--reset-lr-scheduler \
--reset-meters \
--reset-dataloader \
--reset-optimizer \
--task translation \
--distributed-no-spawn \
--pipeline-model-parallel --ddp-backend "no_c10d" --pipeline-balance '[18,18]' --pipeline-devices '[0,1]' \
--pipeline-encoder-balance '[18]' \
--pipeline-encoder-devices '[0]' \
--pipeline-decoder-balance '[18]' \
--pipeline-decoder-devices '[1]'
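In case it helps to clarify what the --pipeline-* flags are supposed to achieve, here is a minimal, fairseq-independent sketch of splitting an encoder-decoder across two GPUs (the module sizes and device ids are made up for illustration, and this is not fairseq's actual pipeline implementation): each stage lives on its own GPU, and every tensor handed to a stage must already be on that stage's device.

# Illustration only: manual two-stage model split, not fairseq's pipeline code.
import torch
import torch.nn as nn

# Encoder stage on cuda:0, decoder stage on cuda:1 (made-up sizes).
enc = nn.TransformerEncoder(nn.TransformerEncoderLayer(d_model=64, nhead=4), num_layers=2).to("cuda:0")
dec = nn.TransformerDecoder(nn.TransformerDecoderLayer(d_model=64, nhead=4), num_layers=2).to("cuda:1")

src = torch.randn(10, 8, 64, device="cuda:0")   # inputs must start on the encoder's device
tgt = torch.randn(12, 8, 64, device="cuda:1")

memory = enc(src)                # runs entirely on cuda:0
memory = memory.to("cuda:1")     # activations are copied across before the decoder stage
out = dec(tgt, memory)           # runs entirely on cuda:1
print(out.device)                # cuda:1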
This is inspired by https://github.com/facebookresearch/fairseq/tree/main/examples/m2m_100#generation-for-the-12b-model. However, I am getting the following error:
Traceback (most recent call last):
File "/Data/baban/anaconda3/envs/it2/bin/fairseq-train", line 8, in <module>
sys.exit(cli_main())
File "/Data/baban/scripts/it2/fairseq/fairseq_cli/train.py", line 574, in cli_main
distributed_utils.call_main(cfg, main)
File "/Data/baban/scripts/it2/fairseq/fairseq/distributed/utils.py", line 404, in call_main
main(cfg, **kwargs)
File "/Data/baban/scripts/it2/fairseq/fairseq_cli/train.py", line 205, in main
valid_losses, should_stop = train(cfg, trainer, task, epoch_itr)
File "/Data/baban/anaconda3/envs/it2/lib/python3.8/contextlib.py", line 75, in inner
return func(*args, **kwds)
File "/Data/baban/scripts/it2/fairseq/fairseq_cli/train.py", line 331, in train
log_output = trainer.train_step(samples)
File "/Data/baban/anaconda3/envs/it2/lib/python3.8/contextlib.py", line 75, in inner
return func(*args, **kwds)
File "/Data/baban/scripts/it2/fairseq/fairseq/trainer.py", line 868, in train_step
raise e
File "/Data/baban/scripts/it2/fairseq/fairseq/trainer.py", line 843, in train_step
loss, sample_size_i, logging_output = self.task.train_step(
File "/Data/baban/scripts/it2/fairseq/fairseq/tasks/fairseq_task.py", line 532, in train_step
loss, sample_size, logging_output = criterion(model, sample)
File "/Data/baban/anaconda3/envs/it2/lib/python3.8/site-packages/torch/nn/modules/module.py", line 1501, in _call_impl
return forward_call(*args, **kwargs)
File "/Data/baban/scripts/it2/fairseq/fairseq/criterions/label_smoothed_cross_entropy.py", line 80, in forward
net_output = model(**sample["net_input"])
File "/Data/baban/anaconda3/envs/it2/lib/python3.8/site-packages/torch/nn/modules/module.py", line 1501, in _call_impl
return forward_call(*args, **kwargs)
File "/Data/baban/scripts/it2/fairseq/fairseq/models/transformer/transformer_base.py", line 164, in forward
encoder_out = self.encoder(
File "/Data/baban/anaconda3/envs/it2/lib/python3.8/site-packages/torch/nn/modules/module.py", line 1501, in _call_impl
return forward_call(*args, **kwargs)
File "/Data/baban/scripts/it2/fairseq/fairseq/models/transformer/transformer_encoder.py", line 166, in forward
return self.forward_scriptable(
File "/Data/baban/scripts/it2/fairseq/fairseq/models/transformer/transformer_encoder.py", line 216, in forward_scriptable
x, encoder_embedding = self.forward_embedding(src_tokens, token_embeddings)
File "/Data/baban/scripts/it2/fairseq/fairseq/models/transformer/transformer_encoder.py", line 125, in forward_embedding
token_embedding = self.embed_tokens(src_tokens)
File "/Data/baban/anaconda3/envs/it2/lib/python3.8/site-packages/torch/nn/modules/module.py", line 1501, in _call_impl
return forward_call(*args, **kwargs)
File "/Data/baban/anaconda3/envs/it2/lib/python3.8/site-packages/torch/nn/modules/sparse.py", line 162, in forward
return F.embedding(
File "/Data/baban/anaconda3/envs/it2/lib/python3.8/site-packages/torch/nn/functional.py", line 2210, in embedding
return torch.embedding(weight, input, padding_idx, scale_grad_by_freq, sparse)
RuntimeError: Expected all tensors to be on the same device, but found at least two devices, cpu and cuda:0! (when checking argument for argument index in method wrapper_CUDA__index_select)
I am using two GPUs for training. Any help with this would be greatly appreciated.
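For reference, the traceback ends in the encoder's token embedding lookup (self.embed_tokens(src_tokens)), so the embedding weight and src_tokens are on different devices. A minimal, fairseq-independent sketch that reproduces the same kind of RuntimeError (sizes and device ids are made up; in the actual run either side could be the one left behind):

# Illustration only: embedding weight on GPU, token batch left on CPU.
import torch
import torch.nn as nn

embed_tokens = nn.Embedding(32000, 1024, padding_idx=1).to("cuda:0")  # weight on cuda:0
src_tokens = torch.randint(0, 32000, (2, 16))                          # batch still on CPU

try:
    embed_tokens(src_tokens)
except RuntimeError as e:
    print(e)  # device-mismatch error like the one in the traceback above

# The lookup only succeeds once both tensors agree on a device:
x = embed_tokens(src_tokens.to("cuda:0"))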
Have you figured out how to solve this problem? I have the same issue. Thank you. @babangain
@wangpichao @babangain Were you guys able to resolve this? I am also facing this issue. Thanks