LLaVA [Usage] deepspeed-chat training error on 8 × V100: RuntimeError: output tensor must have the same type as input tensor

Describe the issue

My zero3.json config is: { "fp16": { "enabled": "auto", "loss_scale": 0, "loss_scale_window": 1000, "initial_scale_power": 16, "hysteresis": 2, "min_loss_scale": 1 }, "bf16": { "enabled": "auto" }, "train_micro_batch_size_per_gpu": "auto", "train_batch_size": "auto", "gradient_accumulation_steps": "auto", "zero_optimization": { "stage": 3, "overlap_comm": true, "contiguous_gradients": true, "sub_group_size": 1e9, "reduce_bucket_size": "auto", "stage3_prefetch_bucket_size": "auto", "stage3_param_persistence_threshold": "auto", "stage3_max_live_parameters": 1e9, "stage3_max_reuse_distance": 1e9, "stage3_gather_16bit_weights_on_model_save": true } }

My script is:
deepspeed llava/train/train_xformers.py
--lora_enable True --lora_r 128 --lora_alpha 256 --mm_projector_lr 2e-5
--deepspeed ./scripts/zero3.json
--model_name_or_path liuhaotian/llava-v1.5-7b
--version v1
--data_path xxx
--image_folder xxx
--vision_tower openai/clip-vit-large-patch14-336
--mm_projector_type mlp2x_gelu
--mm_vision_select_layer -2
--mm_use_im_start_end False
--mm_use_im_patch_token False
--image_aspect_ratio pad
--group_by_modality_length True
--bf16 false
--output_dir ./checkpoints/llava-v1.5-7b-task-lora
--num_train_epochs 1
--per_device_train_batch_size 16
--per_device_eval_batch_size 4
--gradient_accumulation_steps 1
--evaluation_strategy "no"
--save_strategy "steps"
--save_steps 50000
--save_total_limit 1
--learning_rate 2e-4
--weight_decay 0.
--warmup_ratio 0.03
--lr_scheduler_type "cosine"
--logging_steps 1
--tf32 false
--model_max_length 2048
--gradient_checkpointing True
--dataloader_num_workers 4
--lazy_preprocess True
--report_to wandb

The bug is:
Traceback (most recent call last): File "/mnt/afs/liwenhao/wuzhongze/mPLUG-Owl-main/mPLUG-Owl2/mplug_owl2/train/train_xformers.py", line 13, in train() File "/mnt/afs/liwenhao/wuzhongze/mPLUG-Owl-main/mPLUG-Owl2/mplug_owl2/train/train.py", line 778, in train trainer.train() File "/mnt/afs/liwenhao/mplug_owl/lib/python3.10/site-packages/transformers/trainer.py", line 1539, in train return inner_training_loop( File "/mnt/afs/liwenhao/mplug_owl/lib/python3.10/site-packages/transformers/trainer.py", line 1809, in _inner_training_loop tr_loss_step = self.training_step(model, inputs) File "/mnt/afs/liwenhao/mplug_owl/lib/python3.10/site-packages/transformers/trainer.py", line 2654, in training_step ret_val = func(*args, **kwargs) File "/mnt/afs/liwenhao/mplug_owl/lib/python3.10/site-packages/deepspeed/runtime/zero/partitioned_param_coordinator.py", line 386, in __all_gather_params ret_val = func(*args, **kwargs) File "/mnt/afs/liwenhao/mplug_owl/lib/python3.10/site-packages/deepspeed/comm/comm.py", line 312, in allgather_fn loss = self.compute_loss(model, inputs) File "/mnt/afs/liwenhao/mplug_owl/lib/python3.10/site-packages/transformers/trainer.py", line 2679, in compute_loss return all_gather_into_tensor(output_tensor, input_tensor, group=group, async_op=async_op, debug=debug) File "/mnt/afs/liwenhao/mplug_owl/lib/python3.10/site-packages/deepspeed/comm/comm.py", line 116, in log_wrapper handle = partitioned_params[0].all_gather_coalesced(partitioned_params) File "/mnt/afs/liwenhao/mplug_owl/lib/python3.10/site-packages/deepspeed/utils/nvtx.py", line 15, in wrapped_fn outputs = model(**inputs) File "/mnt/afs/liwenhao/mplug_owl/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1501, in _call_impl return func(*args, **kwargs) File "/mnt/afs/liwenhao/mplug_owl/lib/python3.10/site-packages/deepspeed/comm/comm.py", line 297, in all_gather_into_tensor return cdb.all_gather_into_tensor(output_tensor=output_tensor, 
input_tensor=tensor, group=group, async_op=async_op) File "/mnt/afs/liwenhao/mplug_owl/lib/python3.10/site-packages/deepspeed/comm/torch.py", line 136, in all_gather_into_tensor return forward_call(*args, **kwargs) File "/mnt/afs/liwenhao/mplug_owl/lib/python3.10/site-packages/deepspeed/utils/nvtx.py", line 15, in wrapped_fn return self.all_gather_function(output_tensor=output_tensor, File "/mnt/afs/liwenhao/mplug_owl/lib/python3.10/site-packages/torch/distributed/distributed_c10d.py", line 1451, in wrapper return func(*args, **kwargs) File "/mnt/afs/liwenhao/mplug_owl/lib/python3.10/site-packages/torch/distributed/distributed_c10d.py", line 2532, in all_gather_into_tensor work = group._allgather_base(output_tensor, input_tensor) RuntimeError: output tensor must have the same type as input tensor handle = _dist_allgather_fn(param.ds_tensor.to(get_accelerator().current_device_name()), param_buffer, File "/mnt/afs/liwenhao/mplug_owl/lib/python3.10/site-packages/deepspeed/runtime/zero/partition_parameters.py", line 83, in _dist_allgather_fn return instrument_w_nvtx(dist.allgather_fn)(output_tensor, input_tensor, group=group, async_op=True) File "/mnt/afs/liwenhao/mplug_owl/lib/python3.10/site-packages/deepspeed/utils/nvtx.py", line 15, in wrapped_fn ret_val = func(*args, **kwargs)ret_val = func(*args, **kwargs)

File "/mnt/afs/liwenhao/mplug_owl/lib/python3.10/site-packages/deepspeed/runtime/zero/partition_parameters.py", line 935, in all_gather_coalesced File "/mnt/afs/liwenhao/mplug_owl/lib/python3.10/site-packages/deepspeed/runtime/engine.py", line 1735, in forward ret_val = func(*args, **kwargs) File "/mnt/afs/liwenhao/mplug_owl/lib/python3.10/site-packages/deepspeed/comm/comm.py", line 312, in allgather_fn return all_gather_into_tensor(output_tensor, input_tensor, group=group, async_op=async_op, debug=debug) File "/mnt/afs/liwenhao/mplug_owl/lib/python3.10/site-packages/deepspeed/comm/comm.py", line 116, in log_wrapper handle = _dist_allgather_fn(param.ds_tensor.to(get_accelerator().current_device_name()), param_buffer, File "/mnt/afs/liwenhao/mplug_owl/lib/python3.10/site-packages/deepspeed/runtime/zero/partition_parameters.py", line 83, in _dist_allgather_fn loss = self.module(*inputs, **kwargs) File "/mnt/afs/liwenhao/mplug_owl/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1538, in _call_impl return func(*args, **kwargs) File "/mnt/afs/liwenhao/mplug_owl/lib/python3.10/site-packages/deepspeed/comm/comm.py", line 297, in all_gather_into_tensor return instrument_w_nvtx(dist.allgather_fn)(output_tensor, input_tensor, group=group, async_op=True) File "/mnt/afs/liwenhao/mplug_owl/lib/python3.10/site-packages/deepspeed/utils/nvtx.py", line 15, in wrapped_fn return cdb.all_gather_into_tensor(output_tensor=output_tensor, input_tensor=tensor, group=group, async_op=async_op) File "/mnt/afs/liwenhao/mplug_owl/lib/python3.10/site-packages/deepspeed/comm/torch.py", line 136, in all_gather_into_tensor result = forward_call(*args, **kwargs) File "/mnt/afs/liwenhao/mplug_owl/lib/python3.10/site-packages/peft/peft_model.py", line 922, in forward return self.all_gather_function(output_tensor=output_tensor, File "/mnt/afs/liwenhao/mplug_owl/lib/python3.10/site-packages/torch/distributed/distributed_c10d.py", line 1451, in wrapper return self.base_model( File 
"/mnt/afs/liwenhao/mplug_owl/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1538, in _call_impl return func(*args, **kwargs) File "/mnt/afs/liwenhao/mplug_owl/lib/python3.10/site-packages/torch/distributed/distributed_c10d.py", line 2532, in all_gather_into_tensor result = forward_call(*args, **kwargs) File "/mnt/afs/liwenhao/wuzhongze/mPLUG-Owl-main/mPLUG-Owl2/mplug_owl2/model/modeling_mplug_owl2.py", line 239, in forward work = group._allgather_base(output_tensor, input_tensor) RuntimeError: output tensor must have the same type as input tensor self.prepare_inputs_labels_for_multimodal(input_ids, attention_mask, past_key_values, labels, images) File "/mnt/afs/liwenhao/wuzhongze/mPLUG-Owl-main/mPLUG-Owl2/mplug_owl2/model/modeling_mplug_owl2.py", line 80, in prepare_inputs_labels_for_multimodal image_features = self.encode_images(images) File "/mnt/afs/liwenhao/wuzhongze/mPLUG-Owl-main/mPLUG-Owl2/mplug_owl2/model/modeling_mplug_owl2.py", line 60, in encode_images image_features = self.get_model().vision_model(images).last_hidden_state File "/mnt/afs/liwenhao/mplug_owl/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1538, in _call_impl result = forward_call(*args, **kwargs) File "/mnt/afs/liwenhao/wuzhongze/mPLUG-Owl-main/mPLUG-Owl2/mplug_owl2/model/visual_encoder.py", line 419, in forward hidden_states = self.embeddings(pixel_values) File "/mnt/afs/liwenhao/mplug_owl/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1538, in _call_impl result = forward_call(*args, **kwargs) File "/mnt/afs/liwenhao/wuzhongze/mPLUG-Owl-main/mPLUG-Owl2/mplug_owl2/model/visual_encoder.py", line 110, in forward image_embeds = self.patch_embed(pixel_values) File "/mnt/afs/liwenhao/mplug_owl/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1527, in _call_impl result = hook(self, args) File "/mnt/afs/liwenhao/mplug_owl/lib/python3.10/site-packages/deepspeed/utils/nvtx.py", line 15, in wrapped_fn ret_val = func(*args, **kwargs) 
ret_val = func(*args, **kwargs) File "/mnt/afs/liwenhao/mplug_owl/lib/python3.10/site-packages/deepspeed/comm/comm.py", line 312, in allgather_fn File "/mnt/afs/liwenhao/mplug_owl/lib/python3.10/site-packages/deepspeed/runtime/zero/parameter_offload.py", line 371, in _pre_forward_module_hook self.pre_sub_module_forward_function(module) File "/mnt/afs/liwenhao/mplug_owl/lib/python3.10/site-packages/torch/utils/_contextlib.py", line 115, in decorate_context return all_gather_into_tensor(output_tensor, input_tensor, group=group, async_op=async_op, debug=debug) File "/mnt/afs/liwenhao/mplug_owl/lib/python3.10/site-packages/deepspeed/comm/comm.py", line 116, in log_wrapper return func(*args, **kwargs) File "/mnt/afs/liwenhao/mplug_owl/lib/python3.10/site-packages/deepspeed/runtime/zero/parameter_offload.py", line 483, in pre_sub_module_forward_function return func(*args, **kwargs) File "/mnt/afs/liwenhao/mplug_owl/lib/python3.10/site-packages/deepspeed/comm/comm.py", line 297, in all_gather_into_tensor param_coordinator.fetch_sub_module(sub_module) File "/mnt/afs/liwenhao/mplug_owl/lib/python3.10/site-packages/deepspeed/utils/nvtx.py", line 15, in wrapped_fn return cdb.all_gather_into_tensor(output_tensor=output_tensor, input_tensor=tensor, group=group, async_op=async_op) File "/mnt/afs/liwenhao/mplug_owl/lib/python3.10/site-packages/deepspeed/comm/torch.py", line 136, in all_gather_into_tensor return self.all_gather_function(output_tensor=output_tensor, File "/mnt/afs/liwenhao/mplug_owl/lib/python3.10/site-packages/torch/distributed/distributed_c10d.py", line 1451, in wrapper return func(*args, **kwargs) File "/mnt/afs/liwenhao/mplug_owl/lib/python3.10/site-packages/torch/distributed/distributed_c10d.py", line 2532, in all_gather_into_tensor work = group._allgather_base(output_tensor, input_tensor) RuntimeError: output tensor must have the same type as input tensor ret_val = func(*args, **kwargs) File 
"/mnt/afs/liwenhao/mplug_owl/lib/python3.10/site-packages/torch/utils/_contextlib.py", line 115, in decorate_context return func(*args, **kwargs) File "/mnt/afs/liwenhao/mplug_owl/lib/python3.10/site-packages/deepspeed/runtime/zero/partitioned_param_coordinator.py", line 254, in fetch_sub_module self.__all_gather_params(params_to_fetch) File "/mnt/afs/liwenhao/mplug_owl/lib/python3.10/site-packages/deepspeed/utils/nvtx.py", line 15, in wrapped_fn ret_val = func(*args, **kwargs) File "/mnt/afs/liwenhao/mplug_owl/lib/python3.10/site-packages/deepspeed/runtime/zero/partitioned_param_coordinator.py", line 386, in __all_gather_params handle = partitioned_params[0].all_gather_coalesced(partitioned_params) File "/mnt/afs/liwenhao/mplug_owl/lib/python3.10/site-packages/deepspeed/utils/nvtx.py", line 15, in wrapped_fn ret_val = func(*args, **kwargs) File "/mnt/afs/liwenhao/mplug_owl/lib/python3.10/site-packages/deepspeed/runtime/zero/partition_parameters.py", line 935, in all_gather_coalesced handle = _dist_allgather_fn(param.ds_tensor.to(get_accelerator().current_device_name()), param_buffer, File "/mnt/afs/liwenhao/mplug_owl/lib/python3.10/site-packages/deepspeed/runtime/zero/partition_parameters.py", line 83, in _dist_allgather_fn return instrument_w_nvtx(dist.allgather_fn)(output_tensor, input_tensor, group=group, async_op=True) File "/mnt/afs/liwenhao/mplug_owl/lib/python3.10/site-packages/deepspeed/utils/nvtx.py", line 15, in wrapped_fn ret_val = func(*args, **kwargs) File "/mnt/afs/liwenhao/mplug_owl/lib/python3.10/site-packages/deepspeed/comm/comm.py", line 312, in allgather_fn return all_gather_into_tensor(output_tensor, input_tensor, group=group, async_op=async_op, debug=debug) File "/mnt/afs/liwenhao/mplug_owl/lib/python3.10/site-packages/deepspeed/comm/comm.py", line 116, in log_wrapper return func(*args, **kwargs) File "/mnt/afs/liwenhao/mplug_owl/lib/python3.10/site-packages/deepspeed/comm/comm.py", line 297, in all_gather_into_tensor return 
cdb.all_gather_into_tensor(output_tensor=output_tensor, input_tensor=tensor, group=group, async_op=async_op) File "/mnt/afs/liwenhao/mplug_owl/lib/python3.10/site-packages/deepspeed/comm/torch.py", line 136, in all_gather_into_tensor return self.all_gather_function(output_tensor=output_tensor, File "/mnt/afs/liwenhao/mplug_owl/lib/python3.10/site-packages/torch/distributed/distributed_c10d.py", line 1451, in wrapper return func(*args, **kwargs) File "/mnt/afs/liwenhao/mplug_owl/lib/python3.10/site-packages/torch/distributed/distributed_c10d.py", line 2532, in all_gather_into_tensor work = group._allgather_base(output_tensor, input_tensor) RuntimeError: output tensor must have the same type as input tensor 0% 0/598 [00:04<?, ?it/s]