not able to launch multi-node training
I can run single-node training without any problem, but when I switch to multi-node training it fails immediately.
my command:

```bash
accelerate launch --config_file=/opt/tiger/alignment/accelerate_configs/deepspeed_zero3.yaml \
  --num_machines 2 --machine_rank 0 --num_processes 16 \
  --main_process_ip 10.124.167.213 --main_process_port 9686 \
  sft/sft.py \
  --model_name_or_path=model_dir --dataset_name=data_dir \
  --per_device_train_batch_size=1 --output_dir=model_save --bf16 \
  --save_total_limit=5 --warmup_steps=500 --save_steps=1000 \
  --max_seq_length=2048 --attn_implementation=flash_attention_2 \
  --neftune_noise_alpha=5
```
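For reference, an `accelerate` multi-node launch needs the same command to be run on every machine, with only `--machine_rank` changed; this is a sketch of the rank-1 command, assuming the config file, script, and data paths exist at the same locations on the second node:

```bash
# On the second machine (machine_rank 1). Everything else stays identical,
# including --main_process_ip/--main_process_port, which must point to the
# rank-0 machine and be reachable from this node.
accelerate launch --config_file=/opt/tiger/alignment/accelerate_configs/deepspeed_zero3.yaml \
  --num_machines 2 --machine_rank 1 --num_processes 16 \
  --main_process_ip 10.124.167.213 --main_process_port 9686 \
  sft/sft.py \
  --model_name_or_path=model_dir --dataset_name=data_dir \
  --per_device_train_batch_size=1 --output_dir=model_save --bf16 \
  --save_total_limit=5 --warmup_steps=500 --save_steps=1000 \
  --max_seq_length=2048 --attn_implementation=flash_attention_2 \
  --neftune_noise_alpha=5
```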
error messages (every rank prints the same traceback, interleaved in the log):

```
Traceback (most recent call last):
  File "/opt/tiger/alignment/sft/sft.py", line 75, in <module>
    model = AutoModelForCausalLM.from_pretrained(model_config.model_name_or_path, **model_kwargs)
  File "/usr/local/lib/python3.9/dist-packages/transformers/models/auto/auto_factory.py", line 566, in from_pretrained
    return model_class.from_pretrained(
  File "/usr/local/lib/python3.9/dist-packages/transformers/modeling_utils.py", line 3462, in from_pretrained
    model = cls(config, *model_args, **model_kwargs)
  File "/usr/local/lib/python3.9/dist-packages/deepspeed/runtime/zero/partition_parameters.py", line 459, in wrapper
    f(module, *args, **kwargs)
  File "/usr/local/lib/python3.9/dist-packages/transformers/models/llama/modeling_llama.py", line 1109, in __init__
    self.model = LlamaModel(config)
  File "/usr/local/lib/python3.9/dist-packages/deepspeed/runtime/zero/partition_parameters.py", line 459, in wrapper
    f(module, *args, **kwargs)
  File "/usr/local/lib/python3.9/dist-packages/transformers/models/llama/modeling_llama.py", line 954, in __init__
    self.embed_tokens = nn.Embedding(config.vocab_size, config.hidden_size, self.padding_idx)
  File "/usr/local/lib/python3.9/dist-packages/deepspeed/runtime/zero/partition_parameters.py", line 466, in wrapper
    self._post_init_method(module)
  File "/usr/local/lib/python3.9/dist-packages/deepspeed/runtime/zero/partition_parameters.py", line 1000, in _post_init_method
    self._zero_init_param(param)
  File "/usr/local/lib/python3.9/dist-packages/deepspeed/runtime/zero/partition_parameters.py", line 956, in _zero_init_param
    dist.broadcast(param, 0, self.get_dp_process_group())
  File "/usr/local/lib/python3.9/dist-packages/deepspeed/comm/comm.py", line 117, in log_wrapper
    return func(*args, **kwargs)
  File "/usr/local/lib/python3.9/dist-packages/deepspeed/comm/comm.py", line 224, in broadcast
    return cdb.broadcast(tensor=tensor, src=src, group=group, async_op=async_op)
  File "/usr/local/lib/python3.9/dist-packages/deepspeed/comm/torch.py", line 196, in broadcast
    return torch.distributed.broadcast(tensor=tensor, src=src, group=group, async_op=async_op)
  File "/usr/local/lib/python3.9/dist-packages/torch/distributed/c10d_logger.py", line 47, in wrapper
    return func(*args, **kwargs)
  File "/usr/local/lib/python3.9/dist-packages/torch/distributed/distributed_c10d.py", line 1895, in broadcast
    work = default_pg.broadcast([tensor], opts)
```
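The trace ends inside `torch.distributed.broadcast` while DeepSpeed's ZeRO-3 `zero.Init` is partitioning parameters, which typically points at cross-node NCCL connectivity rather than the training script itself. One hedged first diagnostic, assuming the same launch environment, is to rerun with NCCL's own debug logging enabled on both nodes:

```bash
# Not from the original report: NCCL_DEBUG / NCCL_DEBUG_SUBSYS make NCCL log
# which network interface and transport the inter-node broadcast is using,
# which usually reveals a wrong interface, blocked port, or firewall issue.
NCCL_DEBUG=INFO NCCL_DEBUG_SUBSYS=INIT,NET \
accelerate launch --config_file=/opt/tiger/alignment/accelerate_configs/deepspeed_zero3.yaml \
  --num_machines 2 --machine_rank 0 --num_processes 16 \
  --main_process_ip 10.124.167.213 --main_process_port 9686 \
  sft/sft.py ...  # same script arguments as above
```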
Is that the full error trace? It seems like some of it may be cut off.
This issue has been automatically marked as stale because it has not had recent activity. If you think this still needs to be addressed please comment on this thread.
Please note that issues that do not follow the contributing guidelines are likely to be ignored.