🐛 Describe the bug
I use the Gemini plugin with tp > 1; the error occurs when saving the optimizer:
Traceback (most recent call last):
  File "/mnt/lustre/tangyang2/wangzihan/ColossalAI/applications/Colossal-LLaMA-2/pretrain_np.py", line 480, in <module>
    main()
  File "/mnt/lustre/tangyang2/wangzihan/ColossalAI/applications/Colossal-LLaMA-2/pretrain_np.py", line 440, in main
    save_checkpoint(
  File "/mnt/lustre/tangyang2/wangzihan/ColossalAI/applications/Colossal-LLaMA-2/colossal_llama2/utils/ckpt_io.py", line 56, in save_checkpoint
    booster.save_optimizer(optimizer, os.path.join(save_dir, "optimizer"), shard=True)
  File "/mnt/lustre/tangyang2/wangzihan/ColossalAI/colossalai/booster/booster.py", line 307, in save_optimizer
    self.checkpoint_io.save_optimizer(optimizer, checkpoint, shard, gather_dtensor, prefix, size_per_shard)
  File "/mnt/lustre/tangyang2/wangzihan/ColossalAI/colossalai/checkpoint_io/checkpoint_io_base.py", line 197, in save_optimizer
    self.save_sharded_optimizer(optimizer, checkpoint, gather_dtensor, prefix, size_per_shard)
  File "/mnt/lustre/tangyang2/wangzihan/ColossalAI/colossalai/booster/plugin/gemini_plugin.py", line 191, in save_sharded_optimizer
    total_size = save_state_dict_shards(
  File "/mnt/lustre/tangyang2/wangzihan/ColossalAI/colossalai/checkpoint_io/utils.py", line 234, in save_state_dict_shards
    for idx, shard_pair in enumerate(sharded_state_dict):
  File "/mnt/lustre/tangyang2/wangzihan/ColossalAI/colossalai/zero/gemini/gemini_optimizer.py", line 799, in state_shard
    state = self.collect_states(param_id=param_id, only_rank_0=only_rank_0)
  File "/mnt/lustre/tangyang2/wangzihan/ColossalAI/colossalai/zero/gemini/gemini_optimizer.py", line 547, in collect_states
    state_tensor = gather_distributed_param(state_tensor, keep_vars=False).cpu()
  File "/mnt/lustre/tangyang2/wangzihan/ColossalAI/colossalai/checkpoint_io/utils.py", line 201, in gather_distributed_param
    return to_global(param_)
  File "/mnt/lustre/tangyang2/wangzihan/ColossalAI/colossalai/tensor/d_tensor/api.py", line 181, in to_global
    global_tensor = layout_converter.apply(dtensor, dtensor.dist_layout, global_layout)
  File "/mnt/lustre/tangyang2/wangzihan/ColossalAI/colossalai/tensor/d_tensor/layout_converter.py", line 607, in apply
    _, comm_action_sequence = self.layout_converting(source_layout, target_layout)
  File "/mnt/lustre/tangyang2/wangzihan/ColossalAI/colossalai/tensor/d_tensor/layout_converter.py", line 496, in layout_converting
    if source_spec.spec_diff(target_spec) == 0:
  File "/mnt/lustre/tangyang2/wangzihan/ColossalAI/colossalai/tensor/d_tensor/sharding_spec.py", line 240, in spec_diff
    assert len(self.sharding_sequence) == len(
AssertionError: Cannot compare difference for two sharding specs with different length.
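For context, here is a minimal sketch of the setup that leads to the failing call in the traceback (hypothetical and condensed, not the actual pretrain_np.py; `tp_size` is assumed to be the GeminiPlugin argument that enables tensor parallelism, and the launch/precision arguments may differ across versions):

```python
import os

import torch
import torch.nn as nn

import colossalai
from colossalai.booster import Booster
from colossalai.booster.plugin import GeminiPlugin
from colossalai.nn.optimizer import HybridAdam

# launched via torchrun; the config argument may be unnecessary on newer releases
colossalai.launch_from_torch(config={})

# tp_size > 1 is the condition under which the save fails (assumed argument name)
plugin = GeminiPlugin(precision="bf16", tp_size=2)
booster = Booster(plugin=plugin)

# toy model standing in for the LLaMA-2 model in the pretrain script
model = nn.Sequential(nn.Linear(1024, 1024), nn.ReLU(), nn.Linear(1024, 1024)).cuda()
optimizer = HybridAdam(model.parameters(), lr=1e-4)
model, optimizer, *_ = booster.boost(model, optimizer)

# one dummy step so that optimizer states exist before checkpointing
out = model(torch.randn(4, 1024, device="cuda"))
booster.backward(out.sum(), optimizer)
optimizer.step()

# same call as in ckpt_io.py line 56; this is where the AssertionError is raised
booster.save_optimizer(optimizer, os.path.join("checkpoint", "optimizer"), shard=True)
```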
Environment
cuda=11.8
python=3.10.8
pytorch=2.0.0
Thanks for the issue. Could you try running it again? I pushed a fix a few weeks back.