DeepSpeedExamples
DeepSpeedExamples copied to clipboard
RecursionError: maximum recursion depth exceeded while calling a Python object
I was trying to run Megatron with a ZeRO 2 config when I encountered this error:
> finished creating GPT2 datasets ...
setting training data start iteration to 0
setting validation data start iteration to 0
done with setups ...
time (ms) | model and optimizer: 1894.21 | train/valid/test data iterators: 357.88
training ...
Traceback (most recent call last):
File "pretrain_gpt2.py", line 156, in <module>
pretrain(train_valid_test_datasets_provider, model_provider, forward_step,
File "/root/megatron-3d/megatron/training.py", line 97, in pretrain
iteration = train(forward_step_func,
File "/root/megatron-3d/megatron/training.py", line 481, in train
Traceback (most recent call last):
File "pretrain_gpt2.py", line 156, in <module>
loss_dict, skipped_iter = train_step(forward_step_func,
File "/root/megatron-3d/megatron/training.py", line 324, in train_step
return train_step_pipe(model, data_iterator)
File "/root/megatron-3d/megatron/training.py", line 358, in train_step_pipe
loss = model.train_batch(data_iter=data_iterator)
File "/root/anaconda3/lib/python3.8/site-packages/deepspeed/runtime/pipe/engine.py", line 273, in train_batch
self._exec_schedule(sched)
File "/root/anaconda3/lib/python3.8/site-packages/deepspeed/runtime/pipe/engine.py", line 1162, in _exec_schedule
File "/root/megatron-3d/megatron/training.py", line 97, in pretrain
self._exec_instr(**cmd.kwargs)
File "/root/anaconda3/lib/python3.8/site-packages/deepspeed/runtime/pipe/engine.py", line 621, in _exec_load_micro_batch
iteration = train(forward_step_func,
File "/root/megatron-3d/megatron/training.py", line 481, in train
batch = self._next_batch()
File "/root/anaconda3/lib/python3.8/site-packages/deepspeed/runtime/pipe/engine.py", line 480, in _next_batch
return self._next_batch()
File "/root/anaconda3/lib/python3.8/site-packages/deepspeed/runtime/pipe/engine.py", line 480, in _next_batch
return self._next_batch()
File "/root/anaconda3/lib/python3.8/site-packages/deepspeed/runtime/pipe/engine.py", line 480, in _next_batch
loss_dict, skipped_iter = train_step(forward_step_func,
File "/root/megatron-3d/megatron/training.py", line 324, in train_step
return self._next_batch()
[Previous line repeated 978 more times]
File "/root/anaconda3/lib/python3.8/site-packages/deepspeed/runtime/pipe/engine.py", line 469, in _next_batch
return train_step_pipe(model, data_iterator)
File "/root/megatron-3d/megatron/training.py", line 358, in train_step_pipe
batch = self.batch_fn(batch)
File "pretrain_gpt2.py", line 110, in get_batch_pipe
return fp32_to_fp16((tokens, position_ids, attention_mask)), fp32_to_fp16((labels, loss_mask))
File "/root/megatron-3d/megatron/fp16/fp16.py", line 53, in fp32_to_fp16
return conversion_helper(val, half_conversion)
File "/root/megatron-3d/megatron/fp16/fp16.py", line 38, in conversion_helper
loss = model.train_batch(data_iter=data_iterator)
File "/root/anaconda3/lib/python3.8/site-packages/deepspeed/runtime/pipe/engine.py", line 273, in train_batch
rtn = [conversion_helper(v, conversion) for v in val]
File "/root/megatron-3d/megatron/fp16/fp16.py", line 38, in <listcomp>
rtn = [conversion_helper(v, conversion) for v in val]
File "/root/megatron-3d/megatron/fp16/fp16.py", line 37, in conversion_helper
return conversion(val)
File "/root/megatron-3d/megatron/fp16/fp16.py", line 48, in half_conversion
if isinstance(val_typecheck, (Parameter, Variable)):
File "/root/anaconda3/lib/python3.8/site-packages/torch/autograd/variable.py", line 7, in __instancecheck__
return isinstance(other, torch.Tensor)
RecursionError: maximum recursion depth exceeded while calling a Python object
This doesn't occur with the following config:
{
"train_batch_size": 224,
"train_micro_batch_size_per_gpu": 4,
"steps_per_print": 10,
"optimizer": {
"type": "Adam",
"params": {
"lr": 0.00015,
"max_grad_norm": 1.0,
"betas": [0.9, 0.95]
}
},
"gradient_clipping": 1.0,
"fp16": {
"enabled": true,
"loss_scale": 0,
"loss_scale_window": 1000,
"hysteresis": 2,
"min_loss_scale": 1
},
"wall_clock_breakdown": true,
"zero_allow_untested_optimizer": false
}