Question about Supervised instructs tuning in ColossalAI/applications/Chat/

Open chaojiewang94 opened this issue 1 year ago • 12 comments

🐛 Describe the bug

When I set lora_rank in example/train_sft.sh to 8, the following error occurs:

Traceback (most recent call last):
  File "/home/chaojiewang/NeurIPS_2023/Chatgpt/coati/train_sft.py", line 185, in <module>
    train(args)
  File "/home/chaojiewang/NeurIPS_2023/Chatgpt/coati/train_sft.py", line 156, in train
    trainer.fit(logger=logger, log_interval=args.log_interval)
  File "/home/chaojiewang/NeurIPS_2023/Chatgpt/coati/coati/trainer/sft.py", line 110, in fit
    self.strategy.optimizer_step(self.optimizer)
  File "/home/chaojiewang/NeurIPS_2023/Chatgpt/coati/coati/trainer/strategies/colossalai.py", line 154, in optimizer_step
    optimizer.step()
  File "/home/chaojiewang/anaconda3/lib/python3.10/site-packages/torch/optim/lr_scheduler.py", line 68, in wrapper
    return wrapped(*args, **kwargs)
  File "/home/chaojiewang/anaconda3/lib/python3.10/site-packages/colossalai/zero/sharded_optim/low_level_optim.py", line 467, in step
    assert param_shape == flat_fp32_avg_grads.shape, \
AssertionError: fp32 param and grad have different shape torch.Size([5069312]) vs torch.Size([72192])
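
The assertion fires inside ColossalAI's low-level ZeRO-2 optimizer: the flattened fp32 master copy of the parameters (5,069,312 elements) does not match the flattened gradients gathered at step time (72,192 elements). A plausible reading is that the optimizer's flat fp32 buffer was built over one parameter set while gradients only flow to the much smaller LoRA parameter set. A minimal sketch of that idea, assuming ColossalAI's HybridAdam optimizer (build_optimizer here is a hypothetical helper, not part of coati):

    # Minimal sketch, not coati's actual code: construct the optimizer only
    # after LoRA conversion, and only over parameters that still require
    # gradients, so the fp32 master copy flattened by the ZeRO-2 optimizer
    # has the same total size as the gradients produced at step().
    import torch.nn as nn
    from colossalai.nn.optimizer import HybridAdam

    def build_optimizer(model: nn.Module, lr: float = 2e-5):
        trainable = [p for p in model.parameters() if p.requires_grad]
        return HybridAdam(trainable, lr=lr)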

Environment

torchrun --standalone --nproc_per_node=1 train_sft.py \
    --pretrain "decapoda-research/llama-7b-hf" \
    --model 'llama' \
    --strategy colossalai_zero2 \
    --log_interval 10 \
    --save_path ./Coati-7B \
    --dataset ./data/instinwild_en.json \
    --batch_size 1 \
    --accimulation_steps 8 \
    --lr 2e-5 \
    --max_datasets_size 512 \
    --max_epochs 1 \
    --lora_rank 8


chaojiewang94 avatar Mar 29 '23 10:03 chaojiewang94

@ht-zhou I get the same error.

akk-123 avatar Mar 29 '23 11:03 akk-123

@ht-zhou I get the same error.

ZHENG518 avatar Mar 29 '23 13:03 ZHENG518

Thanks for your feedback. I will try to reproduce the bug and respond soon.

ht-zhou avatar Mar 29 '23 15:03 ht-zhou

Use --strategy ddp instead of colossalai_zero2.
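
For reference, that is the original launch command with only the strategy flag changed (all other flags, including --lora_rank 8, stay the same):

torchrun --standalone --nproc_per_node=1 train_sft.py \
    --pretrain "decapoda-research/llama-7b-hf" \
    --model 'llama' \
    --strategy ddp \
    --log_interval 10 \
    --save_path ./Coati-7B \
    --dataset ./data/instinwild_en.json \
    --batch_size 1 \
    --accimulation_steps 8 \
    --lr 2e-5 \
    --max_datasets_size 512 \
    --max_epochs 1 \
    --lora_rank 8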

suc16 avatar Mar 30 '23 06:03 suc16

@ht-zhou I get the same error.

sunlylorn avatar Mar 31 '23 09:03 sunlylorn

@ht-zhou I get the same error.

summerbuild avatar Mar 31 '23 11:03 summerbuild

Bot detected the issue body's language is not English; translated it automatically:

instinwild_en

Hi, may I ask where you downloaded your pretrained model and data?

Issues-translate-bot avatar Apr 02 '23 10:04 Issues-translate-bot

I get the same error.

TBBTfans avatar Apr 07 '23 09:04 TBBTfans

I get the same error.

GongCQ avatar Apr 11 '23 10:04 GongCQ

@ht-zhou I get the same error.

kkangjiawei avatar Apr 23 '23 08:04 kkangjiawei
