DeepSpeedExamples
HelloDeepSpeed not reproducible
Hi DeepSpeed community,
I was trying to run the HelloDeepSpeed example on an AWS p3.16xlarge instance (8 V100 GPUs). However, I was hitting this issue:
deepspeed train_bert_ds.py --checkpoint_dir .
  File "train_bert_ds.py", line 809, in <module>
    fire.Fire(train)
  File "/home/ubuntu/anaconda3/envs/pytorch_latest_p37/lib/python3.7/site-packages/fire/core.py", line 141, in Fire
    component_trace = _Fire(component, args, parsed_flag_args, context, name)
  File "/home/ubuntu/anaconda3/envs/pytorch_latest_p37/lib/python3.7/site-packages/fire/core.py", line 471, in _Fire
    target=component.__name__)
  File "/home/ubuntu/anaconda3/envs/pytorch_latest_p37/lib/python3.7/site-packages/fire/core.py", line 681, in _CallAndUpdateTrace
    component = fn(*varargs, **kwargs)
  File "train_bert_ds.py", line 783, in train
    model.backward(loss)
  File "/home/ubuntu/anaconda3/envs/pytorch_latest_p37/lib/python3.7/site-packages/deepspeed/runtime/engine.py", line 1722, in backward
    self.allreduce_gradients()
  File "/home/ubuntu/anaconda3/envs/pytorch_latest_p37/lib/python3.7/site-packages/deepspeed/runtime/engine.py", line 1651, in allreduce_gradients
    pipeline_parallel=self.pipeline_parallelism)
  File "/home/ubuntu/anaconda3/envs/pytorch_latest_p37/lib/python3.7/site-packages/deepspeed/runtime/zero/stage_1_and_2.py", line 601, in reduce_gradients
    self.overlapping_partition_gradients_reduce_epilogue()
  File "/home/ubuntu/anaconda3/envs/pytorch_latest_p37/lib/python3.7/site-packages/deepspeed/runtime/zero/stage_1_and_2.py", line 756, in overlapping_partition_gradients_reduce_epilogue
    self.independent_gradient_partition_epilogue()
  File "/home/ubuntu/anaconda3/envs/pytorch_latest_p37/lib/python3.7/site-packages/deepspeed/runtime/zero/stage_1_and_2.py", line 643, in independent_gradient_partition_epilogue
    self.reduce_ipg_grads()
  File "/home/ubuntu/anaconda3/envs/pytorch_latest_p37/lib/python3.7/site-packages/deepspeed/runtime/zero/stage_1_and_2.py", line 1240, in reduce_ipg_grads
    self.copy_grads_in_partition(param)
  File "/home/ubuntu/anaconda3/envs/pytorch_latest_p37/lib/python3.7/site-packages/deepspeed/runtime/zero/stage_1_and_2.py", line 1158, in copy_grads_in_partition
    self.set_norm_for_param_grad_in_gpu(param)
  File "/home/ubuntu/anaconda3/envs/pytorch_latest_p37/lib/python3.7/site-packages/deepspeed/runtime/zero/stage_1_and_2.py", line 1080, in set_norm_for_param_grad_in_gpu
    [i, source_offset, dest_offset, num_elements] = self.grad_position[param_id]
KeyError: 44
DeepSpeed was pip installed, and the torch version does not seem to matter. Would appreciate any insights!
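For reference, the exact versions in the environment can be captured with DeepSpeed's ds_report tool or with a quick snippet like the following (a hypothetical check, not part of the original report):

# Print the installed versions so they can be attached to the bug report.
import torch
import deepspeed

print("torch:", torch.__version__)
print("deepspeed:", deepspeed.__version__)
print("cuda available:", torch.cuda.is_available(), "device count:", torch.cuda.device_count())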
If I catch the KeyError then I can train, but I suppose the model parameters will be all messed up. I wonder what the root cause is. This is the edit I made:
vim /home/ubuntu/anaconda3/envs/pytorch_latest_p37/lib/python3.7/site-packages/deepspeed/runtime/zero/stage_1_and_2.py

def set_norm_for_param_grad_in_gpu(self, param):
    param_id = self.get_param_id(param)
    accumulated_grad = param.grad
    try:
        [i, source_offset, dest_offset, num_elements] = self.grad_position[param_id]
        start = source_offset
        accumulated_grad = accumulated_grad.view(-1).narrow(0, start, num_elements)
        self.norm_for_param_grads[param_id] = accumulated_grad.data.double().norm(2)
    except KeyError:
        pass
        # print("id is", param_id)
        # print("map is", self.grad_position)

def async_inplace_copy_grad_to_fp32_buffer_from_gpu(self, param):
    param_id = self.get_param_id(param)
    try:
        [i, source_offset, dest_offset, num_elements] = self.grad_position[param_id]
        dest_tensor = self.single_partition_of_fp32_groups[i].grad.view(-1).narrow(
            0,
            dest_offset,
            num_elements)
        src_tensor = param.grad.view(-1).narrow(0, source_offset, num_elements)
        if not self.fp16_master_weights_and_gradients:
            src_tensor = src_tensor.float()
        dest_tensor.copy_(src_tensor, non_blocking=True)
        param.grad = None  # offload only
    except KeyError:
        pass
        # print("id~~ is", param_id)
        # print("map~~ is", self.grad_position)
@Zha0q1, thanks for reporting this issue. It is really strange. Do you observe the same behavior with a single-GPU run?
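If it helps, a single-GPU run can be launched with the DeepSpeed launcher's --num_gpus flag, e.g. (same script and arguments as above, assuming the default config in the example):

deepspeed --num_gpus=1 train_bert_ds.py --checkpoint_dir .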