GPU error with DataParallel
I am wrapping the model with DataParallel and getting the following error:
File "/data/anaconda3/envs/otc_orf/lib/python3.9/site-packages/torch/nn/parallel/parallel_apply.py", line 64, in _worker output = module(*input, **kwargs) File "/data/anaconda3/envs/otc_orf/lib/python3.9/site-packages/torch/nn/modules/module.py", line 1501, in _call_impl return forward_call(*args, **kwargs) File "/data/RNA_Attribute_Prediction/scripts/Lib33_hl_data.py", line 282, in forward outputs = self.evo_model(input_ids, attention_mask=attention_mask,use_cache=False, return_dict=False) File "/data/anaconda3/envs/otc_orf/lib/python3.9/site-packages/torch/nn/modules/module.py", line 1501, in _call_impl return forward_call(*args, **kwargs) File "/home/ec2-user/.cache/huggingface/modules/transformers_modules/togethercomputer/evo-1-131k-base/567369e9825aa08b3de4b122fca34fac6a890602/modeling_hyena.py", line 109, in forward logits, past_key_values = self.backbone( File "/data/anaconda3/envs/otc_orf/lib/python3.9/site-packages/torch/nn/modules/module.py", line 1501, in _call_impl return forward_call(*args, **kwargs) File "/home/ec2-user/.cache/huggingface/modules/transformers_modules/togethercomputer/evo-1-131k-base/567369e9825aa08b3de4b122fca34fac6a890602/model.py", line 363, in forward x, inference_params_dict_out = self.stateless_forward(x, padding_mask=padding_mask) File "/home/ec2-user/.cache/huggingface/modules/transformers_modules/togethercomputer/evo-1-131k-base/567369e9825aa08b3de4b122fca34fac6a890602/model.py", line 382, in stateless_forward x, _ = block(x, inference_params=None, padding_mask=padding_mask) File "/data/anaconda3/envs/otc_orf/lib/python3.9/site-packages/torch/nn/modules/module.py", line 1501, in _call_impl return forward_call(*args, **kwargs) File "/home/ec2-user/.cache/huggingface/modules/transformers_modules/togethercomputer/evo-1-131k-base/567369e9825aa08b3de4b122fca34fac6a890602/model.py", line 304, in forward z = self.proj_norm_fn(u) File "/home/ec2-user/.cache/huggingface/modules/transformers_modules/togethercomputer/evo-1-131k-base/567369e9825aa08b3de4b122fca34fac6a890602/model.py", line 298, in proj_norm return self.projections(self.pre_norm(x)) File "/data/anaconda3/envs/otc_orf/lib/python3.9/site-packages/torch/nn/modules/module.py", line 1501, in _call_impl return forward_call(*args, **kwargs) File "/home/ec2-user/.cache/huggingface/modules/transformers_modules/togethercomputer/evo-1-131k-base/567369e9825aa08b3de4b122fca34fac6a890602/layers.py", line 40, in forward return self.scale * y RuntimeError: Expected all tensors to be on the same device, but found at least two devices, cuda:0 and cuda:1!
The scale parameter does not seem to be copied to the other GPUs, which causes the error.
Any solution for this?
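For anyone debugging this: one common way this kind of error arises in general PyTorch code (a sketch of the failure mode, not Evo's actual layers.py) is a tensor stored as a plain attribute rather than a registered Parameter or buffer. DataParallel's replicate() only copies parameters and buffers to the other GPUs; a plain tensor attribute is shared by reference and stays on cuda:0, so the replica running on cuda:1 multiplies tensors on two devices:

import torch
import torch.nn as nn

class PlainScale(nn.Module):
    """Hypothetical layer illustrating the failure mode, not Evo's code."""
    def __init__(self, dim):
        super().__init__()
        # Plain tensor attribute: DataParallel replicas share this object,
        # so it stays on cuda:0 even when the replica runs on cuda:1.
        self.scale = torch.ones(dim)
        # Registering it instead would make replicate() copy it per device:
        # self.scale = nn.Parameter(torch.ones(dim))

    def forward(self, y):
        # raises the "Expected all tensors to be on the same device" error
        # on every replica that is not on cuda:0
        return self.scale * y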
I'm also running into this problem and haven't been able to solve it. Can anyone share a solution? Thanks a lot.
I have solved the problem. In my case, the data was not transferred to the other GPUs automatically, so I revised Evo's code to move it manually, and then it worked.
I was able to fix this by using accelerate launch --num_processes=8 myscript.py to do multi-GPU training instead of the built-in Hugging Face implementation.
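In case it helps, the general shape of a script driven by accelerate launch looks roughly like this (a minimal sketch; the tiny linear model and random data are placeholders, not the Evo setup):

import torch
from torch.utils.data import DataLoader, TensorDataset
from accelerate import Accelerator

accelerator = Accelerator()

model = torch.nn.Linear(16, 2)  # stand-in for the real model
optimizer = torch.optim.AdamW(model.parameters(), lr=1e-4)
dataset = TensorDataset(torch.randn(64, 16), torch.randint(0, 2, (64,)))
dataloader = DataLoader(dataset, batch_size=8)

# accelerate moves the model, optimizer state, and batches to the
# right device/process for each of the launched workers
model, optimizer, dataloader = accelerator.prepare(model, optimizer, dataloader)

for x, y in dataloader:
    loss = torch.nn.functional.cross_entropy(model(x), y)
    accelerator.backward(loss)
    optimizer.step()
    optimizer.zero_grad()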
I don't know why transformers can't train automatically with
model = AutoModelForCausalLM.from_pretrained(
    model_path,
    config=model_config,
    trust_remote_code=True,
    device_map="auto",
)
According to the debug information, I found that the data on cuda:0 is not automatically transferred to cuda:1, so you have to finish the transfer manually.
The following is my revision. Note that you need to set device_map="auto" on your own; ref: https://github.com/evo-design/evo/issues/61
def forward(self, x, inference_params_dict=None, padding_mask=None):
    L = x.shape[1]
    x = self.embedding_layer.embed(x)
    if inference_params_dict is not None:
        x, inference_params_dict_out = self.stateful_forward(
            x,
            inference_params_dict=inference_params_dict,
        )
    else:
        x, inference_params_dict_out = self.stateless_forward(x, padding_mask=padding_mask)
    # the final norm/unembed layers live on cuda:0, so move x back first
    x = x.to("cuda:0")
    x = self.norm(x)
    x = self.unembed.unembed(x)
    return x, inference_params_dict_out
def stateful_forward(self, x, inference_params_dict=None):
    pre_device = 0
    # 32 = number of blocks in the model; device_num is set manually (see below)
    split_size = math.ceil(32 / self.device_num)
    for block_idx, block in enumerate(self.blocks):
        # when crossing a device boundary, move the activations along with it
        if block_idx // split_size != pre_device:
            pre_device += 1
            x = x.to(f"cuda:{block_idx // split_size}")
        block_name = "mha" if block_idx in self.config.attn_layer_idxs else "hyena"
        inference_params = inference_params_dict[block_name]
        x, _ = block(x, inference_params=inference_params)
    return x, inference_params_dict
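For context: device_num in the revision above is not an attribute of the stock Evo backbone, so it has to be attached manually before running a forward pass. A minimal sketch of that setup (my assumption about the wiring, not something stated in the original comment):

import torch

# the revised stateful_forward splits the 32 blocks evenly across
# device_num GPUs; here we simply use every visible GPU
model.backbone.device_num = torch.cuda.device_count()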
Same error here, with the scale nn.Parameter not being distributed to the other GPUs. Even the weights of linear layers had the same problem when I tested with the scale parameter completely removed.
I didn't want to do too much hacking of the original code, but using DistributedDataParallel instead of DataParallel seems to be working for me.
@clabornd I was wondering if you could share the code setup with DistributedDataParallel to parallelize Evo across GPUs?
@alexandre239 Sorry, this slipped off my radar. I'm not using DDP to train anymore, but in case it's still useful, something similar to the code below was how I was implementing it, which pretty much follows the PyTorch DDP tutorial I linked to.
import os
import torch.nn as nn
import torch.distributed as dist
from torch.nn.parallel import DistributedDataParallel as DDP
import torch.multiprocessing as mp
from transformers import AutoConfig, AutoModelForCausalLM, AutoTokenizer

def setup(rank, world_size):
    os.environ['MASTER_ADDR'] = 'localhost'
    os.environ['MASTER_PORT'] = '12355'
    dist.init_process_group("nccl", rank=rank, world_size=world_size)

def cleanup():
    dist.destroy_process_group()

def main(rank, world_size, args):
    setup(rank, world_size)

    ### LOAD PROTEIN MODEL ###
    model_name = 'togethercomputer/evo-1-8k-base'
    config = AutoConfig.from_pretrained(model_name, trust_remote_code=True, revision="1.1_fix")
    config.use_cache = True

    protein_tokenizer = AutoTokenizer.from_pretrained(model_name, trust_remote_code=True)
    protein_tokenizer.pad_token = "X"
    protein_tokenizer.padding_side = "left"

    # dummy layer to return logits instead of probabilities
    class CustomEmbedding(nn.Module):
        def unembed(self, u):
            return u

    protein_model = AutoModelForCausalLM.from_pretrained(
        model_name,
        config=config,
        trust_remote_code=True,
        revision="1.1_fix",
        # device_map=device_map
    )
    protein_model.unembed = CustomEmbedding()

    # DDP expects the module to already live on this process's GPU
    protein_model = protein_model.to(rank)

    protein_model = DDP(
        protein_model, device_ids=[rank],
        output_device=rank,
        find_unused_parameters=True
    )

    # do training stuff

    cleanup()

if __name__ == "__main__":
    # args from argparse here
    world_size = 3
    mp.spawn(
        main,
        args=(world_size, args),
        nprocs=world_size
    )
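One thing the "# do training stuff" placeholder hides: under DDP each rank should see a different shard of the data, which is usually handled with a DistributedSampler. A minimal sketch (dataset and num_epochs are placeholders, not from the original comment):

from torch.utils.data import DataLoader, DistributedSampler

# each rank draws a disjoint shard of the dataset
sampler = DistributedSampler(dataset, num_replicas=world_size, rank=rank)
loader = DataLoader(dataset, batch_size=8, sampler=sampler)

for epoch in range(num_epochs):
    sampler.set_epoch(epoch)  # reshuffle the shards each epoch
    for batch in loader:
        ...  # forward/backward on protein_model as usual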