DeepSpeed model engine cannot load a saved checkpoint: "RuntimeError: The size of tensor a x must match the size of tensor b y at non-singleton dimension z"
During training, I would periodically save a checkpoint using model_engine.save_checkpoint.
However, model_engine.load_checkpoint fails with the following output:
[2021-07-08 19:55:42,454] [INFO] [state_dict_factory.py:165:check_ckpt_list] checkpoint file list: ['/home/santosh/deepspeed_checkpoints/secondTest/global_step18825/zero_pp_rank_0_mp_rank_00_model_states.pt']
[2021-07-08 19:55:42,468] [INFO] [state_dict_factory.py:55:load] mp_world_size: 1, mp_rank: 0, module_key: auto
[2021-07-08 19:55:42,469] [INFO] [state_dict_factory.py:85:load] rank: 0 loading checkpoint: /home/santosh/deepspeed_checkpoints/secondTest/global_step18825/zero_pp_rank_0_mp_rank_00_model_states.pt
successfully loaded 4 ZeRO state_dicts for rank 0
---------------------------------------------------------------------------
RuntimeError Traceback (most recent call last)
<ipython-input-2-2fbc82a1028e> in <module>
----> 1 model_engine.load_checkpoint('/home/santosh/deepspeed_checkpoints/secondTest')
/opt/conda/lib/python3.8/site-packages/deepspeed/runtime/engine.py in load_checkpoint(self, load_dir, tag, load_module_strict, load_optimizer_states, load_lr_scheduler_states)
1675
1676 if self.zero_optimization() and load_path is not None:
-> 1677 self._load_zero_checkpoint(load_dir,
1678 tag,
1679 load_optimizer_states=load_optimizer_states)
/opt/conda/lib/python3.8/site-packages/deepspeed/runtime/engine.py in _load_zero_checkpoint(self, load_dir, tag, load_optimizer_states)
1749 return
1750
-> 1751 self.optimizer.load_state_dict(
1752 state_dict_list=zero_sd_list,
1753 load_optimizer_states=load_optimizer_states,
/opt/conda/lib/python3.8/site-packages/deepspeed/runtime/zero/stage3.py in load_state_dict(self, state_dict_list, load_optimizer_states, load_from_fp32_weights)
3240 )
3241
-> 3242 self._rigid_load_state_dict(
3243 state_dict_list[dist.get_rank(group=self.dp_process_group)],
3244 load_optimizer_states=load_optimizer_states)
/opt/conda/lib/python3.8/site-packages/deepspeed/runtime/zero/stage3.py in _rigid_load_state_dict(self, state_dict, load_optimizer_states)
3183 # restore fp32 partitions
3184 for curr_param, saved_param in zip(self.fp32_partitioned_groups_flat, state_dict['fp32_flat_groups']):
-> 3185 curr_param.data.copy_(saved_param.data)
3186
3187 # restore fp16 partitions from fp32
RuntimeError: The size of tensor a (882700288) must match the size of tensor b (220675072) at non-singleton dimension 0
This is the main code I use for training (imports and project-specific helpers such as Config, get_vocab, and the dataset classes are omitted):
def get_optimizer(config, model):
no_decay = ['bias', 'LayerNorm.weight']
optimizer_grouped_parameters = [
{
'params': [
p for n, p in model.named_parameters()
if not any(nd in n for nd in no_decay)
],
'weight_decay': config.weight_decay},
{
'params': [
p for n, p in model.named_parameters()
if any(nd in n for nd in no_decay)
],
'weight_decay': 0.0
}
]
return torch.optim.AdamW(optimizer_grouped_parameters, lr=0.0005,
betas=(0.9, 0.98), eps=config.adam_eps,
weight_decay=config.weight_decay), optimizer_grouped_parameters
def get_scheduler(config, optimizer):
return torch.optim.lr_scheduler.OneCycleLR(optimizer, max_lr=config.max_lr,
steps_per_epoch=config.total_steps, epochs=num_epochs, #steps_per_epoch=100_000, epochs=1,
anneal_strategy='linear',cycle_momentum=False,
div_factor=(config.max_lr/config.init_lr),
final_div_factor=(config.init_lr/config.final_lr))
def get_kepler_config():
global config
vocab = get_vocab()
config = Config(
vocab_size_or_config_json_file=len(vocab['tokens'].vocab),
entity_size=len(vocab['entity'].vocab),
entity_type_size=len(vocab['entity_type'].vocab),
)
return config
def get_kepler_large_config():
global config
vocab = get_vocab()
config = Config(
total_steps=int(26516300/(4*batch_size)),
model_size='large',
hidden_size=1024,
max_lr=1.5e-4,
num_hidden_layers_per_gpu0=24,
num_attention_heads=16,
intermediate_size=4096,
vocab_size_or_config_json_file=len(vocab['tokens'].vocab),
entity_size=len(vocab['entity'].vocab),
entity_type_size=len(vocab['entity_type'].vocab),
)
return config
def load_model(config):
model = ModelParallel_Large_Deepspeed(config)
optimizer, optimizer_grouped_parameters = get_optimizer(config, model)
scheduler = get_scheduler(config, optimizer)
return model, optimizer, optimizer_grouped_parameters, scheduler
def train(args):
torch.distributed.init_process_group(backend="nccl")
config = get_kepler_large_config()
model, optimizer, optimizer_grouped_parameters, scheduler = load_model(config)
deepspeed_config = {
"train_micro_batch_size_per_gpu": batch_size,
"gradient_accumulation_steps": 64, #128 for original kepler large,
"optimizer": {
"type": "Adam",
"params": {
"lr": 3e-5,
}
},
"gradient_clipping": 1.0,
"fp16": {
"enabled": True
},
"zero_optimization": {
"stage": 3
},
"steps_per_print": 2000,
"output_file": "/home/santosh/output_files/"
}
model_engine, optimizer, ds_training_dataloader, lr_scheduler = deepspeed.initialize(model=model,
model_parameters=optimizer_grouped_parameters,
lr_scheduler=scheduler, #automatically executed at model_engine.step()
training_data=AugmentedDataset(
DiskDataset()
),
collate_fn=collate_ds,
config_params=deepspeed_config)
def gpu_map(inn):
return inn.to(model_engine.local_rank)
for ii in range(num_epochs):
for iii, model_batch in enumerate(tqdm(ds_training_dataloader)):
model_batch = valmap(gpu_map, model_batch)
loss = model_engine(model_batch)
model_engine.backward(loss)
model_engine.step()
model_engine.save_checkpoint('/home/santosh/deepspeed_checkpoints/secondTest/')
print('Training Finished')
if __name__ == "__main__":
parser = argparse.ArgumentParser()
parser.add_argument("--local_rank",
type=int,
default=-1,
help="local_rank for distributed training on gpus")
parser = deepspeed.add_config_arguments(parser)
args = parser.parse_args()
train(args)
And this is the code where I load a checkpoint. I am using a Jupyter notebook, so the output printed above only contains the output for this line:
model_engine.load_checkpoint('/home/santosh/deepspeed_checkpoints/secondTest')
Here is the full code, which is basically the same as the training code plus the checkpoint loading. I also had to comment out deepspeed.init_distributed(dist_backend='nccl') because it raises an error, probably from not using a deepspeed launcher, but I don't believe it is necessary since I am only doing inference.
def get_optimizer(config, model):
no_decay = ['bias', 'LayerNorm.weight']
optimizer_grouped_parameters = [
{
'params': [
p for n, p in model.named_parameters()
if not any(nd in n for nd in no_decay)
],
'weight_decay': config.weight_decay},
{
'params': [
p for n, p in model.named_parameters()
if any(nd in n for nd in no_decay)
],
'weight_decay': 0.0
}
]
return torch.optim.AdamW(optimizer_grouped_parameters, lr=0.0005,
betas=(0.9, 0.98), eps=config.adam_eps,
weight_decay=config.weight_decay), optimizer_grouped_parameters
def get_scheduler(config, optimizer):
return torch.optim.lr_scheduler.OneCycleLR(optimizer, max_lr=config.max_lr,
steps_per_epoch=config.total_steps, epochs=num_epochs, #steps_per_epoch=100_000, epochs=1,
anneal_strategy='linear',cycle_momentum=False,
div_factor=(config.max_lr/config.init_lr),
final_div_factor=(config.init_lr/config.final_lr))
def get_kepler_config():
global config
vocab = get_vocab()
config = Config(
vocab_size_or_config_json_file=len(vocab['tokens'].vocab),
entity_size=len(vocab['entity'].vocab),
entity_type_size=len(vocab['entity_type'].vocab),
)
return config
def get_kepler_large_config():
global config
vocab = get_vocab()
config = Config(
log_steps=100,
total_steps=int(26516300/(4*batch_size)),
model_size='large',
hidden_size=1024,
max_lr=1.5e-4,
num_hidden_layers_per_gpu0=24,
num_attention_heads=16,
intermediate_size=4096,
vocab_size_or_config_json_file=len(vocab['tokens'].vocab),
entity_size=len(vocab['entity'].vocab),
entity_type_size=len(vocab['entity_type'].vocab),
)
return config
def load_model(config):
model = ModelParallel_Large_Deepspeed(config)
optimizer, optimizer_grouped_parameters = get_optimizer(config, model)
scheduler = get_scheduler(config, optimizer)
return model, optimizer, optimizer_grouped_parameters, scheduler
# deepspeed.init_distributed(dist_backend='nccl')
config = get_kepler_large_config()
model, optimizer, optimizer_grouped_parameters, scheduler = load_model(config)
deepspeed_config = {
"train_micro_batch_size_per_gpu": batch_size, #22 working for deepspeed 20 layers
"gradient_accumulation_steps": 64, #128 for original kepler large,
"optimizer": {
"type": "Adam",
"params": {
"lr": 3e-5,
}
},
"gradient_clipping": 1.0,
"fp16": {
"enabled": True
},
"zero_optimization": {
"stage": 3
},
"steps_per_print": 2000,
"output_file": "/home/santosh/output_files/"
}
model_engine, optimizer, ds_training_dataloader, lr_scheduler = deepspeed.initialize(model=model,
model_parameters=optimizer_grouped_parameters,
lr_scheduler=scheduler, #automatically executed at model_engine.step()
training_data=AugmentedDataset(
DiskDataset('/workspace/dataset/deepspeed-pubmed_spans_2021_03_02-9b1905')
),
# collate_fn=lambda x: x,
collate_fn=collate_ds,
config_params=deepspeed_config)
model_engine.load_checkpoint('/home/santosh/deepspeed_checkpoints/secondTest')
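For reference, here is a quick check of what the checkpoint tag directory actually contains (a small inspection snippet; the directory name is taken from the log output above, and the model-states glob pattern matches the file name DeepSpeed printed there):
import glob
import os

# Count the per-rank ZeRO partition files that save_checkpoint wrote for this tag.
ckpt_dir = '/home/santosh/deepspeed_checkpoints/secondTest/global_step18825'
model_states = sorted(glob.glob(os.path.join(ckpt_dir, 'zero_pp_rank_*_mp_rank_00_model_states.pt')))
optim_states = sorted(glob.glob(os.path.join(ckpt_dir, '*optim_states.pt')))
print(f'{len(model_states)} model-state partitions, {len(optim_states)} optimizer-state partitions')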
I met the same problem. Have you found any solution?
I trained and called save_checkpoint on 4 GPUs, but when I tried load_checkpoint on 1 GPU, I hit the same issue.
I suspect that ZeRO-3 partitions the model and optimizer states across ranks when saving, since 882700288 / 220675072 = 4 matches my GPU count.
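If the goal is only inference on a different number of GPUs, one workaround is to skip model_engine.load_checkpoint and instead merge the partitioned ZeRO checkpoint into a single fp32 state_dict. A minimal sketch, assuming a DeepSpeed version that ships deepspeed.utils.zero_to_fp32 (recent versions also drop a standalone zero_to_fp32.py script into the checkpoint folder); I have not tested this against this exact model:
from deepspeed.utils.zero_to_fp32 import get_fp32_state_dict_from_zero_checkpoint

# Merge the per-rank ZeRO-3 partitions into one full fp32 state_dict on CPU.
# The tag defaults to whatever the 'latest' file points at, e.g. global_step18825.
state_dict = get_fp32_state_dict_from_zero_checkpoint(
    '/home/santosh/deepspeed_checkpoints/secondTest')

# Load the merged weights into a plain (non-DeepSpeed) model for inference.
model = ModelParallel_Large_Deepspeed(config)  # same constructor as in the posts above
model.load_state_dict(state_dict)
model = model.half().eval()
Otherwise, as far as I understand, the straightforward fix is to call load_checkpoint with the same data-parallel world size the checkpoint was saved with (here, 4 GPUs).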