[Bug] In step3, a runtime error will be thrown when inference_tp_size>1
Description: In DeepSpeed-Chat step 3, a runtime error (The size of tensor a (4) must match the size of tensor b (8) at non-singleton dimension 0) is thrown when inference_tp_size > 1 and the hybrid engine is enabled. I hit this bug with the provided 13b training script but not with the 1.3b one. The main differences between the two scripts are that the 13b script sets the ZeRO stage to 3 and inference_tp_size to a value larger than 1, while the 1.3b script uses ZeRO stage 2 and leaves inference_tp_size at its default.
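For reference, the flags I am referring to differ roughly as follows (the exact values are my reading of the shipped example scripts and may not match them precisely; only inference_tp_size=2 is what I actually used, see the reproduction script below):

# 1.3b script (works for me): hybrid engine with ZeRO stage 2, inference_tp_size left at its default of 1
deepspeed main.py ... --enable_hybrid_engine --actor_zero_stage 2 ...

# 13b script (fails for me): hybrid engine with ZeRO stage 3 plus tensor parallelism for generation
deepspeed main.py ... --enable_hybrid_engine --actor_zero_stage 3 --inference_tp_size 2 ...   # any value > 1 reproduces the error for me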
The exception looks like:
╭─────────────────────────────── Traceback (most recent call last) ────────────────────────────────╮
│ /workspace/DeepSpeedExamples/applications/DeepSpeed-Chat/training/step3_rlhf_finetuning/main.py: │
│ 521 in <module> │
│ │
│ 518 │
│ 519 │
│ 520 if __name__ == "__main__": │
│ ❱ 521 │ main() │
│ 522 │
│ │
│ /workspace/DeepSpeedExamples/applications/DeepSpeed-Chat/training/step3_rlhf_finetuning/main.py: │
│ 429 in main │
│ │
│ 426 │ │ │ # prompts = prompts[:, length - args.max_prompt_seq_len:] │
│ 427 │ │ │ # raise ValueError("Prompt length is too long") │
│ 428 │ │ │ │
│ ❱ 429 │ │ │ out = trainer.generate_experience(batch_prompt['prompt'], │
│ 430 │ │ │ │ │ │ │ │ │ │ │ batch_prompt['prompt_att_mask']) │
│ 431 │ │ │ exp_dataset = exp_mini_dataset.add(out) │
│ 432 │
│ │
│ /workspace/DeepSpeedExamples/applications/DeepSpeed-Chat/training/step3_rlhf_finetuning/ppo_trai │
│ ner.py:98 in generate_experience │
│ │
│ 95 │ │
│ 96 │ def generate_experience(self, prompts, mask): │
│ 97 │ │ self.eval() │
│ ❱ 98 │ │ seq = self._generate_sequence(prompts, mask) │
│ 99 │ │ self.train() │
│ 100 │ │ │
│ 101 │ │ pad_token_id = self.tokenizer.pad_token_id │
│ │
│ /workspace/DeepSpeedExamples/applications/DeepSpeed-Chat/training/step3_rlhf_finetuning/ppo_trai │
│ ner.py:73 in _generate_sequence │
│ │
│ 70 │ │ max_min_length = self.max_answer_seq_len + prompts.shape[1] │
│ 71 │ │ │
│ 72 │ │ with torch.no_grad(): │
│ ❱ 73 │ │ │ seq = self.actor_model.module.generate(prompts, │
│ 74 │ │ │ │ │ │ │ │ │ │ │ │ attention_mask=mask, │
│ 75 │ │ │ │ │ │ │ │ │ │ │ │ max_length=max_min_length, │
│ 76 │ │ │ │ │ │ │ │ │ │ │ │ min_length=max_min_length) │
│ │
│ /opt/conda/lib/python3.8/site-packages/deepspeed/runtime/hybrid_engine.py:234 in generate │
│ │
│ 231 │ │ │ │ │
│ 232 │ │ │ │ non_active_params = get_inactive_params(non_tp_params) │
│ 233 │ │ │ │ with GatheredParameters(non_active_params): │
│ ❱ 234 │ │ │ │ │ generate_ret_vals = self._generate(*inputs, **kwargs) │
│ 235 │ │ │ │ │
│ 236 │ │ │ │ for layer_id in range(len(self.layer_params)): │
│ 237 │ │ │ │ │ self._inference_containers[layer_id].release_memory() │
│ │
│ /opt/conda/lib/python3.8/site-packages/torch/autograd/grad_mode.py:27 in decorate_context │
│ │
│ 24 │ │ @functools.wraps(func) │
│ 25 │ │ def decorate_context(*args, **kwargs): │
│ 26 │ │ │ with self.clone(): │
│ ❱ 27 │ │ │ │ return func(*args, **kwargs) │
│ 28 │ │ return cast(F, decorate_context) │
│ 29 │ │
│ 30 │ def _wrap_generator(self, func): │
│ │
│ /opt/conda/lib/python3.8/site-packages/transformers/generation/utils.py:1532 in generate │
│ │
│ 1529 │ │ │ │ ) │
│ 1530 │ │ │ │
│ 1531 │ │ │ # 11. run greedy search │
│ ❱ 1532 │ │ │ return self.greedy_search( │
│ 1533 │ │ │ │ input_ids, │
│ 1534 │ │ │ │ logits_processor=logits_processor, │
│ 1535 │ │ │ │ stopping_criteria=stopping_criteria, │
│ │
│ /opt/conda/lib/python3.8/site-packages/transformers/generation/utils.py:2356 in greedy_search │
│ │
│ 2353 │ │ │ model_inputs = self.prepare_inputs_for_generation(input_ids, **model_kwargs) │
│ 2354 │ │ │ │
│ 2355 │ │ │ # forward pass to get next token │
│ ❱ 2356 │ │ │ outputs = self( │
│ 2357 │ │ │ │ **model_inputs, │
│ 2358 │ │ │ │ return_dict=True, │
│ 2359 │ │ │ │ output_attentions=output_attentions, │
│ │
│ /opt/conda/lib/python3.8/site-packages/torch/nn/modules/module.py:1148 in _call_impl │
│ │
│ 1145 │ │ │ bw_hook = hooks.BackwardHook(self, full_backward_hooks) │
│ 1146 │ │ │ input = bw_hook.setup_input_hook(input) │
│ 1147 │ │ │
│ ❱ 1148 │ │ result = forward_call(*input, **kwargs) │
│ 1149 │ │ if _global_forward_hooks or self._forward_hooks: │
│ 1150 │ │ │ for hook in (*_global_forward_hooks.values(), *self._forward_hooks.values()) │
│ 1151 │ │ │ │ hook_result = hook(self, input, result) │
│ │
│ /opt/conda/lib/python3.8/site-packages/transformers/models/opt/modeling_opt.py:944 in forward │
│ │
│ 941 │ │ return_dict = return_dict if return_dict is not None else self.config.use_return │
│ 942 │ │ │
│ 943 │ │ # decoder outputs consists of (dec_features, layer_state, dec_hidden, dec_attn) │
│ ❱ 944 │ │ outputs = self.model.decoder( │
│ 945 │ │ │ input_ids=input_ids, │
│ 946 │ │ │ attention_mask=attention_mask, │
│ 947 │ │ │ head_mask=head_mask, │
│ │
│ /opt/conda/lib/python3.8/site-packages/torch/nn/modules/module.py:1148 in _call_impl │
│ │
│ 1145 │ │ │ bw_hook = hooks.BackwardHook(self, full_backward_hooks) │
│ 1146 │ │ │ input = bw_hook.setup_input_hook(input) │
│ 1147 │ │ │
│ ❱ 1148 │ │ result = forward_call(*input, **kwargs) │
│ 1149 │ │ if _global_forward_hooks or self._forward_hooks: │
│ 1150 │ │ │ for hook in (*_global_forward_hooks.values(), *self._forward_hooks.values()) │
│ 1151 │ │ │ │ hook_result = hook(self, input, result) │
│ │
│ /opt/conda/lib/python3.8/site-packages/transformers/models/opt/modeling_opt.py:650 in forward │
│ │
│ 647 │ │ │ │ f"The provided attention mask has length {attention_mask.shape[1]}, but │
│ 648 │ │ │ │ f"{mask_seq_length} (sum of the lengths of current and past inputs)" │
│ 649 │ │ │ ) │
│ ❱ 650 │ │ causal_attention_mask = self._prepare_decoder_attention_mask( │
│ 651 │ │ │ attention_mask, input_shape, inputs_embeds, past_key_values_length │
│ 652 │ │ ) │
│ 653 │ │ pos_embeds = self.embed_positions(attention_mask, past_key_values_length) │
│ │
│ /opt/conda/lib/python3.8/site-packages/transformers/models/opt/modeling_opt.py:551 in │
│ _prepare_decoder_attention_mask │
│ │
│ 548 │ │ │ │ inputs_embeds.device │
│ 549 │ │ │ ) │
│ 550 │ │ │ combined_attention_mask = ( │
│ ❱ 551 │ │ │ │ expanded_attn_mask if combined_attention_mask is None else expanded_attn │
│ 552 │ │ │ ) │
│ 553 │ │ │
│ 554 │ │ return combined_attention_mask │
╰──────────────────────────────────────────────────────────────────────────────────────────────────╯
RuntimeError: The size of tensor a (4) must match the size of tensor b (8) at non-singleton dimension 0
Environment: torch 1.12, deepspeed 0.10.0+d6f62217, DeepSpeedExamples f9c3ae05, transformers 4.30.0
Minimal reproduction for me: actor opt-1.3b + critic opt-350m, GPUs: 8x 40GB A100.
Script:
#!/bin/bash
# Copyright (c) Microsoft Corporation.
# SPDX-License-Identifier: Apache-2.0
# DeepSpeed Team
ACTOR_MODEL_PATH=$1
CRITIC_MODEL_PATH=$2
DATA_PATH=$3
ACTOR_ZERO_STAGE=$4
CRITIC_ZERO_STAGE=$5
OUTPUT=$6
if [ "$OUTPUT" == "" ]; then
OUTPUT=./output
fi
if [ "$ACTOR_ZERO_STAGE" == "" ]; then
ACTOR_ZERO_STAGE=3
fi
if [ "$CRITIC_ZERO_STAGE" == "" ]; then
CRITIC_ZERO_STAGE=3
fi
mkdir -p $OUTPUT
Num_Padding_at_Beginning=1 # this is model related
Actor_Lr=9.65e-6
Critic_Lr=5e-6
INFERENCE_TP_SIZE=2
deepspeed --master_port 12346 main.py \
--data_path $DATA_PATH \
--data_split 2,4,4 \
--actor_model_name_or_path $ACTOR_MODEL_PATH \
--critic_model_name_or_path $CRITIC_MODEL_PATH \
--num_padding_at_beginning ${Num_Padding_at_Beginning} \
--per_device_train_batch_size 4 \
--per_device_mini_train_batch_size 4 \
--generation_batch_numbers 1 \
--ppo_epochs 1 \
--max_answer_seq_len 256 \
--max_prompt_seq_len 256 \
--actor_learning_rate ${Actor_Lr} \
--critic_learning_rate ${Critic_Lr} \
--num_train_epochs 1 \
--lr_scheduler_type cosine \
--gradient_accumulation_steps 1 \
--disable_actor_dropout \
--num_warmup_steps 100 \
--deepspeed --seed 1234 \
--enable_hybrid_engine \
--actor_zero_stage $ACTOR_ZERO_STAGE \
--critic_zero_stage $CRITIC_ZERO_STAGE \
--inference_tp_size ${INFERENCE_TP_SIZE} \
--tp_gather_partition_size 2 \
--enable_ema \
--output_dir $OUTPUT \
&> $OUTPUT/training.log
It would be of great help if anyone could look into this bug and fix it.