LLaVA
LLaVA copied to clipboard
[Usage] deepspeed-chat training error on V100 * 8: RuntimeError: output tensor must have the same type as input tensor
Describe the issue
My zero3.json config is:
{
"fp16": {
"enabled": "auto",
"loss_scale": 0,
"loss_scale_window": 1000,
"initial_scale_power": 16,
"hysteresis": 2,
"min_loss_scale": 1
},
"bf16": {
"enabled": "auto"
},
"train_micro_batch_size_per_gpu": "auto",
"train_batch_size": "auto",
"gradient_accumulation_steps": "auto",
"zero_optimization": {
"stage": 3,
"overlap_comm": true,
"contiguous_gradients": true,
"sub_group_size": 1e9,
"reduce_bucket_size": "auto",
"stage3_prefetch_bucket_size": "auto",
"stage3_param_persistence_threshold": "auto",
"stage3_max_live_parameters": 1e9,
"stage3_max_reuse_distance": 1e9,
"stage3_gather_16bit_weights_on_model_save": true
}
}
my script is:
deepspeed llava/train/train_xformers.py \
    --lora_enable True --lora_r 128 --lora_alpha 256 --mm_projector_lr 2e-5 \
    --deepspeed ./scripts/zero3.json \
    --model_name_or_path liuhaotian/llava-v1.5-7b \
    --version v1 \
    --data_path xxx \
    --image_folder xxx \
    --vision_tower openai/clip-vit-large-patch14-336 \
    --mm_projector_type mlp2x_gelu \
    --mm_vision_select_layer -2 \
    --mm_use_im_start_end False \
    --mm_use_im_patch_token False \
    --image_aspect_ratio pad \
    --group_by_modality_length True \
    --bf16 false \
    --output_dir ./checkpoints/llava-v1.5-7b-task-lora \
    --num_train_epochs 1 \
    --per_device_train_batch_size 16 \
    --per_device_eval_batch_size 4 \
    --gradient_accumulation_steps 1 \
    --evaluation_strategy "no" \
    --save_strategy "steps" \
    --save_steps 50000 \
    --save_total_limit 1 \
    --learning_rate 2e-4 \
    --weight_decay 0. \
    --warmup_ratio 0.03 \
    --lr_scheduler_type "cosine" \
    --logging_steps 1 \
    --tf32 false \
    --model_max_length 2048 \
    --gradient_checkpointing True \
    --dataloader_num_workers 4 \
    --lazy_preprocess True \
    --report_to wandb
the bug is:
Traceback (most recent call last):
File "/mnt/afs/liwenhao/wuzhongze/mPLUG-Owl-main/mPLUG-Owl2/mplug_owl2/train/train_xformers.py", line 13, in
train()
File "/mnt/afs/liwenhao/wuzhongze/mPLUG-Owl-main/mPLUG-Owl2/mplug_owl2/train/train.py", line 778, in train
trainer.train()
File "/mnt/afs/liwenhao/mplug_owl/lib/python3.10/site-packages/transformers/trainer.py", line 1539, in train
return inner_training_loop(
File "/mnt/afs/liwenhao/mplug_owl/lib/python3.10/site-packages/transformers/trainer.py", line 1809, in _inner_training_loop
tr_loss_step = self.training_step(model, inputs)
File "/mnt/afs/liwenhao/mplug_owl/lib/python3.10/site-packages/transformers/trainer.py", line 2654, in training_step
ret_val = func(*args, **kwargs)
File "/mnt/afs/liwenhao/mplug_owl/lib/python3.10/site-packages/deepspeed/runtime/zero/partitioned_param_coordinator.py", line 386, in __all_gather_params
ret_val = func(*args, **kwargs)
File "/mnt/afs/liwenhao/mplug_owl/lib/python3.10/site-packages/deepspeed/comm/comm.py", line 312, in allgather_fn
loss = self.compute_loss(model, inputs)
File "/mnt/afs/liwenhao/mplug_owl/lib/python3.10/site-packages/transformers/trainer.py", line 2679, in compute_loss
return all_gather_into_tensor(output_tensor, input_tensor, group=group, async_op=async_op, debug=debug)
File "/mnt/afs/liwenhao/mplug_owl/lib/python3.10/site-packages/deepspeed/comm/comm.py", line 116, in log_wrapper
handle = partitioned_params[0].all_gather_coalesced(partitioned_params)
File "/mnt/afs/liwenhao/mplug_owl/lib/python3.10/site-packages/deepspeed/utils/nvtx.py", line 15, in wrapped_fn
outputs = model(**inputs)
File "/mnt/afs/liwenhao/mplug_owl/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1501, in _call_impl
return func(*args, **kwargs)
File "/mnt/afs/liwenhao/mplug_owl/lib/python3.10/site-packages/deepspeed/comm/comm.py", line 297, in all_gather_into_tensor
return cdb.all_gather_into_tensor(output_tensor=output_tensor, input_tensor=tensor, group=group, async_op=async_op)
File "/mnt/afs/liwenhao/mplug_owl/lib/python3.10/site-packages/deepspeed/comm/torch.py", line 136, in all_gather_into_tensor
return forward_call(*args, **kwargs)
File "/mnt/afs/liwenhao/mplug_owl/lib/python3.10/site-packages/deepspeed/utils/nvtx.py", line 15, in wrapped_fn
return self.all_gather_function(output_tensor=output_tensor,
File "/mnt/afs/liwenhao/mplug_owl/lib/python3.10/site-packages/torch/distributed/distributed_c10d.py", line 1451, in wrapper
return func(*args, **kwargs)
File "/mnt/afs/liwenhao/mplug_owl/lib/python3.10/site-packages/torch/distributed/distributed_c10d.py", line 2532, in all_gather_into_tensor
work = group._allgather_base(output_tensor, input_tensor)
RuntimeError: output tensor must have the same type as input tensor
handle = _dist_allgather_fn(param.ds_tensor.to(get_accelerator().current_device_name()), param_buffer,
File "/mnt/afs/liwenhao/mplug_owl/lib/python3.10/site-packages/deepspeed/runtime/zero/partition_parameters.py", line 83, in _dist_allgather_fn
return instrument_w_nvtx(dist.allgather_fn)(output_tensor, input_tensor, group=group, async_op=True)
File "/mnt/afs/liwenhao/mplug_owl/lib/python3.10/site-packages/deepspeed/utils/nvtx.py", line 15, in wrapped_fn
ret_val = func(*args, **kwargs)ret_val = func(*args, **kwargs)
File "/mnt/afs/liwenhao/mplug_owl/lib/python3.10/site-packages/deepspeed/runtime/zero/partition_parameters.py", line 935, in all_gather_coalesced File "/mnt/afs/liwenhao/mplug_owl/lib/python3.10/site-packages/deepspeed/runtime/engine.py", line 1735, in forward ret_val = func(*args, **kwargs) File "/mnt/afs/liwenhao/mplug_owl/lib/python3.10/site-packages/deepspeed/comm/comm.py", line 312, in allgather_fn return all_gather_into_tensor(output_tensor, input_tensor, group=group, async_op=async_op, debug=debug) File "/mnt/afs/liwenhao/mplug_owl/lib/python3.10/site-packages/deepspeed/comm/comm.py", line 116, in log_wrapper handle = _dist_allgather_fn(param.ds_tensor.to(get_accelerator().current_device_name()), param_buffer, File "/mnt/afs/liwenhao/mplug_owl/lib/python3.10/site-packages/deepspeed/runtime/zero/partition_parameters.py", line 83, in _dist_allgather_fn loss = self.module(*inputs, **kwargs) File "/mnt/afs/liwenhao/mplug_owl/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1538, in _call_impl return func(*args, **kwargs) File "/mnt/afs/liwenhao/mplug_owl/lib/python3.10/site-packages/deepspeed/comm/comm.py", line 297, in all_gather_into_tensor return instrument_w_nvtx(dist.allgather_fn)(output_tensor, input_tensor, group=group, async_op=True) File "/mnt/afs/liwenhao/mplug_owl/lib/python3.10/site-packages/deepspeed/utils/nvtx.py", line 15, in wrapped_fn return cdb.all_gather_into_tensor(output_tensor=output_tensor, input_tensor=tensor, group=group, async_op=async_op) File "/mnt/afs/liwenhao/mplug_owl/lib/python3.10/site-packages/deepspeed/comm/torch.py", line 136, in all_gather_into_tensor result = forward_call(*args, **kwargs) File "/mnt/afs/liwenhao/mplug_owl/lib/python3.10/site-packages/peft/peft_model.py", line 922, in forward return self.all_gather_function(output_tensor=output_tensor, File "/mnt/afs/liwenhao/mplug_owl/lib/python3.10/site-packages/torch/distributed/distributed_c10d.py", line 1451, in wrapper return self.base_model( File 
"/mnt/afs/liwenhao/mplug_owl/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1538, in _call_impl return func(*args, **kwargs) File "/mnt/afs/liwenhao/mplug_owl/lib/python3.10/site-packages/torch/distributed/distributed_c10d.py", line 2532, in all_gather_into_tensor result = forward_call(*args, **kwargs) File "/mnt/afs/liwenhao/wuzhongze/mPLUG-Owl-main/mPLUG-Owl2/mplug_owl2/model/modeling_mplug_owl2.py", line 239, in forward work = group._allgather_base(output_tensor, input_tensor) RuntimeError: output tensor must have the same type as input tensor self.prepare_inputs_labels_for_multimodal(input_ids, attention_mask, past_key_values, labels, images) File "/mnt/afs/liwenhao/wuzhongze/mPLUG-Owl-main/mPLUG-Owl2/mplug_owl2/model/modeling_mplug_owl2.py", line 80, in prepare_inputs_labels_for_multimodal image_features = self.encode_images(images) File "/mnt/afs/liwenhao/wuzhongze/mPLUG-Owl-main/mPLUG-Owl2/mplug_owl2/model/modeling_mplug_owl2.py", line 60, in encode_images image_features = self.get_model().vision_model(images).last_hidden_state File "/mnt/afs/liwenhao/mplug_owl/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1538, in _call_impl result = forward_call(*args, **kwargs) File "/mnt/afs/liwenhao/wuzhongze/mPLUG-Owl-main/mPLUG-Owl2/mplug_owl2/model/visual_encoder.py", line 419, in forward hidden_states = self.embeddings(pixel_values) File "/mnt/afs/liwenhao/mplug_owl/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1538, in _call_impl result = forward_call(*args, **kwargs) File "/mnt/afs/liwenhao/wuzhongze/mPLUG-Owl-main/mPLUG-Owl2/mplug_owl2/model/visual_encoder.py", line 110, in forward image_embeds = self.patch_embed(pixel_values) File "/mnt/afs/liwenhao/mplug_owl/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1527, in _call_impl result = hook(self, args) File "/mnt/afs/liwenhao/mplug_owl/lib/python3.10/site-packages/deepspeed/utils/nvtx.py", line 15, in wrapped_fn ret_val = func(*args, **kwargs) 
ret_val = func(*args, **kwargs) File "/mnt/afs/liwenhao/mplug_owl/lib/python3.10/site-packages/deepspeed/comm/comm.py", line 312, in allgather_fn File "/mnt/afs/liwenhao/mplug_owl/lib/python3.10/site-packages/deepspeed/runtime/zero/parameter_offload.py", line 371, in _pre_forward_module_hook self.pre_sub_module_forward_function(module) File "/mnt/afs/liwenhao/mplug_owl/lib/python3.10/site-packages/torch/utils/_contextlib.py", line 115, in decorate_context return all_gather_into_tensor(output_tensor, input_tensor, group=group, async_op=async_op, debug=debug) File "/mnt/afs/liwenhao/mplug_owl/lib/python3.10/site-packages/deepspeed/comm/comm.py", line 116, in log_wrapper return func(*args, **kwargs) File "/mnt/afs/liwenhao/mplug_owl/lib/python3.10/site-packages/deepspeed/runtime/zero/parameter_offload.py", line 483, in pre_sub_module_forward_function return func(*args, **kwargs) File "/mnt/afs/liwenhao/mplug_owl/lib/python3.10/site-packages/deepspeed/comm/comm.py", line 297, in all_gather_into_tensor param_coordinator.fetch_sub_module(sub_module) File "/mnt/afs/liwenhao/mplug_owl/lib/python3.10/site-packages/deepspeed/utils/nvtx.py", line 15, in wrapped_fn return cdb.all_gather_into_tensor(output_tensor=output_tensor, input_tensor=tensor, group=group, async_op=async_op) File "/mnt/afs/liwenhao/mplug_owl/lib/python3.10/site-packages/deepspeed/comm/torch.py", line 136, in all_gather_into_tensor return self.all_gather_function(output_tensor=output_tensor, File "/mnt/afs/liwenhao/mplug_owl/lib/python3.10/site-packages/torch/distributed/distributed_c10d.py", line 1451, in wrapper return func(*args, **kwargs) File "/mnt/afs/liwenhao/mplug_owl/lib/python3.10/site-packages/torch/distributed/distributed_c10d.py", line 2532, in all_gather_into_tensor work = group._allgather_base(output_tensor, input_tensor) RuntimeError: output tensor must have the same type as input tensor ret_val = func(*args, **kwargs) File 
"/mnt/afs/liwenhao/mplug_owl/lib/python3.10/site-packages/torch/utils/_contextlib.py", line 115, in decorate_context return func(*args, **kwargs) File "/mnt/afs/liwenhao/mplug_owl/lib/python3.10/site-packages/deepspeed/runtime/zero/partitioned_param_coordinator.py", line 254, in fetch_sub_module self.__all_gather_params(params_to_fetch) File "/mnt/afs/liwenhao/mplug_owl/lib/python3.10/site-packages/deepspeed/utils/nvtx.py", line 15, in wrapped_fn ret_val = func(*args, **kwargs) File "/mnt/afs/liwenhao/mplug_owl/lib/python3.10/site-packages/deepspeed/runtime/zero/partitioned_param_coordinator.py", line 386, in __all_gather_params handle = partitioned_params[0].all_gather_coalesced(partitioned_params) File "/mnt/afs/liwenhao/mplug_owl/lib/python3.10/site-packages/deepspeed/utils/nvtx.py", line 15, in wrapped_fn ret_val = func(*args, **kwargs) File "/mnt/afs/liwenhao/mplug_owl/lib/python3.10/site-packages/deepspeed/runtime/zero/partition_parameters.py", line 935, in all_gather_coalesced handle = _dist_allgather_fn(param.ds_tensor.to(get_accelerator().current_device_name()), param_buffer, File "/mnt/afs/liwenhao/mplug_owl/lib/python3.10/site-packages/deepspeed/runtime/zero/partition_parameters.py", line 83, in _dist_allgather_fn return instrument_w_nvtx(dist.allgather_fn)(output_tensor, input_tensor, group=group, async_op=True) File "/mnt/afs/liwenhao/mplug_owl/lib/python3.10/site-packages/deepspeed/utils/nvtx.py", line 15, in wrapped_fn ret_val = func(*args, **kwargs) File "/mnt/afs/liwenhao/mplug_owl/lib/python3.10/site-packages/deepspeed/comm/comm.py", line 312, in allgather_fn return all_gather_into_tensor(output_tensor, input_tensor, group=group, async_op=async_op, debug=debug) File "/mnt/afs/liwenhao/mplug_owl/lib/python3.10/site-packages/deepspeed/comm/comm.py", line 116, in log_wrapper return func(*args, **kwargs) File "/mnt/afs/liwenhao/mplug_owl/lib/python3.10/site-packages/deepspeed/comm/comm.py", line 297, in all_gather_into_tensor return 
cdb.all_gather_into_tensor(output_tensor=output_tensor, input_tensor=tensor, group=group, async_op=async_op) File "/mnt/afs/liwenhao/mplug_owl/lib/python3.10/site-packages/deepspeed/comm/torch.py", line 136, in all_gather_into_tensor return self.all_gather_function(output_tensor=output_tensor, File "/mnt/afs/liwenhao/mplug_owl/lib/python3.10/site-packages/torch/distributed/distributed_c10d.py", line 1451, in wrapper return func(*args, **kwargs) File "/mnt/afs/liwenhao/mplug_owl/lib/python3.10/site-packages/torch/distributed/distributed_c10d.py", line 2532, in all_gather_into_tensor work = group._allgather_base(output_tensor, input_tensor) RuntimeError: output tensor must have the same type as input tensor 0% 0/598 [00:04<?, ?it/s]
me too
请问您解决这个问题了吗?
--bf16 False \
--fp16 True \
Fixed it for me
Thank you, faced the same problem and solved it with your solution
--bf16 False --fp16 True \
Fixed it for me