🐛 Describe the bug
An illegal memory access is encountered when fine-tuning a BLOOM model.
Traceback (most recent call last):
File "/data/juicefs_beijing_ai/public_data/11101470/colossal_fine_tuning/run_clm.py", line 1109, in
main()
File "/data/juicefs_beijing_ai/public_data/11101470/colossal_fine_tuning/run_clm.py", line 785, in main
loss = criterion(outputs['logits'], batch['input_ids'], batch['token_type_ids'])
File "/opt/conda/lib/python3.9/site-packages/torch/nn/modules/module.py", line 1130, in _call_impl
return forward_call(*input, **kwargs)
File "/data/juicefs_beijing_ai/public_data/11101470/colossal_fine_tuning/run_clm.py", line 114, in forward
loss = self.loss_fn(shift_logits.view(-1, shift_logits.size(-1)), shift_labels.view(-1))
File "/opt/conda/lib/python3.9/site-packages/torch/nn/modules/module.py", line 1130, in _call_impl
return forward_call(*input, **kwargs)
File "/opt/conda/lib/python3.9/site-packages/torch/nn/modules/loss.py", line 1164, in forward
return F.cross_entropy(input, target, weight=self.weight,
File "/opt/conda/lib/python3.9/site-packages/torch/nn/functional.py", line 3000, in cross_entropy
return handle_torch_function(
File "/opt/conda/lib/python3.9/site-packages/torch/overrides.py", line 1498, in handle_torch_function
result = torch_func_method(public_api, types, args, kwargs)
File "/opt/conda/lib/python3.9/site-packages/colossalai/tensor/colo_tensor.py", line 184, in torch_function
ret = func(*args, **kwargs)
File "/opt/conda/lib/python3.9/site-packages/colossalai/nn/_ops/loss.py", line 28, in colo_cross_entropy
output = F.cross_entropy(input_tensor,
File "/opt/conda/lib/python3.9/site-packages/torch/nn/functional.py", line 3014, in cross_entropy
return torch._C._nn.cross_entropy_loss(input, target, weight, _Reduction.get_enum(reduction), ignore_index, label_smoothing)
RuntimeError: CUDA error: an illegal memory access was encountered
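For context, the failing call is the shifted cross-entropy in the custom criterion at run_clm.py:114. The sketch below is an assumed reconstruction, not the actual script: only `shift_logits`, `shift_labels`, `self.loss_fn` (an `nn.CrossEntropyLoss`) and the call signature `(logits, input_ids, token_type_ids)` come from the traceback; the class name, `ignore_index`, and the added range check are hypothetical. Target indices outside `[0, vocab_size)` are one common culprit for CUDA-side failures inside `cross_entropy` (often a device-side assert, sometimes an illegal access), so they are worth ruling out first.

```python
# Hedged sketch of the criterion around run_clm.py:114. The class name,
# ignore_index handling, and the label-range check are assumptions added
# for debugging; the shift/view logic mirrors the traceback.
import torch
import torch.nn as nn


class CausalLMLoss(nn.Module):  # hypothetical name
    def __init__(self, ignore_index: int = -100):
        super().__init__()
        self.ignore_index = ignore_index
        self.loss_fn = nn.CrossEntropyLoss(ignore_index=ignore_index)

    def forward(self, logits, labels, token_type_ids=None):
        # Shift so that tokens < n predict token n (standard causal-LM loss).
        shift_logits = logits[..., :-1, :].contiguous()
        shift_labels = labels[..., 1:].contiguous()

        # Debugging aid: any target outside [0, vocab_size) other than the
        # ignore_index makes the CUDA cross-entropy kernel fail; this check
        # raises a readable Python error instead of an asynchronous CUDA one.
        vocab_size = shift_logits.size(-1)
        flat_labels = shift_labels.view(-1)
        bad = (flat_labels != self.ignore_index) & (
            (flat_labels < 0) | (flat_labels >= vocab_size)
        )
        if bool(bad.any()):
            raise ValueError(f"label ids out of range for vocab_size={vocab_size}")

        return self.loss_fn(shift_logits.view(-1, vocab_size), flat_labels)
```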
Environment
No response
It seems there is some issue in Colossal-AI.
ColoTensor([13.8750], device='cuda:0', dtype=torch.float16,
grad_fn=<AliasBackward0>)
ProcessGroup(ranks=[0],
rank=0, dp=1, tp=1)
DistSpec(placement=DistPlacementPattern.REPLICATE)
torch.Size([1])
ColoTensor([15.4531], device='cuda:0', dtype=torch.float16,
grad_fn=<AliasBackward0>)
ProcessGroup(ranks=[0],
rank=0, dp=1, tp=1)
DistSpec(placement=DistPlacementPattern.REPLICATE)
torch.Size([1])
ColoTensor(1.7656, device='cuda:0', dtype=torch.float16,
grad_fn=<AliasBackward0>)
ProcessGroup(ranks=[0],
rank=0, dp=1, tp=1)
DistSpec(placement=DistPlacementPattern.REPLICATE)
Traceback (most recent call last):
File "/data/juicefs_beijing_ai/public_data/11101470/colossal_fine_tuning/run_RM.py", line 817, in
main()
File "/data/juicefs_beijing_ai/public_data/11101470/colossal_fine_tuning/run_RM.py", line 544, in main
optimizer.backward(loss)
File "/opt/conda/lib/python3.9/site-packages/colossalai/nn/optimizer/zero_optimizer.py", line 240, in backward
self.module.backward(loss)
File "/opt/conda/lib/python3.9/site-packages/colossalai/nn/parallel/data_parallel.py", line 323, in backward
loss.backward()
File "/opt/conda/lib/python3.9/site-packages/torch/_tensor.py", line 388, in backward
return handle_torch_function(
File "/opt/conda/lib/python3.9/site-packages/torch/overrides.py", line 1498, in handle_torch_function
result = torch_func_method(public_api, types, args, kwargs)
File "/opt/conda/lib/python3.9/site-packages/colossalai/tensor/colo_tensor.py", line 181, in torch_function
return backward_tensor.backward(**tensor_kwargs)
File "/opt/conda/lib/python3.9/site-packages/torch/_tensor.py", line 396, in backward
torch.autograd.backward(self, gradient, retain_graph, create_graph, inputs=inputs)
File "/opt/conda/lib/python3.9/site-packages/torch/autograd/init.py", line 173, in backward
Variable._execution_engine.run_backward( # Calls into the C++ engine to run the backward pass
File "/opt/conda/lib/python3.9/site-packages/torch/autograd/function.py", line 253, in apply
return user_fn(self, *args)
File "/opt/conda/lib/python3.9/site-packages/torch/utils/checkpoint.py", line 146, in backward
torch.autograd.backward(outputs_with_grad, args_with_grad)
File "/opt/conda/lib/python3.9/site-packages/torch/autograd/init.py", line 173, in backward
Variable._execution_engine.run_backward( # Calls into the C++ engine to run the backward pass
File "/opt/conda/lib/python3.9/site-packages/torch/autograd/function.py", line 253, in apply
return user_fn(self, *args)
File "/opt/conda/lib/python3.9/site-packages/colossalai/tensor/param_op_hook.py", line 133, in backward
ColoParamOpHookManager._trigger_pre_backward(ctx.params)
File "/opt/conda/lib/python3.9/site-packages/colossalai/tensor/param_op_hook.py", line 75, in _trigger_pre_backward
hook.pre_backward(params)
File "/opt/conda/lib/python3.9/site-packages/colossalai/zero/utils/gemini_hook.py", line 53, in pre_backward
self.pre_op(params)
File "/opt/conda/lib/python3.9/site-packages/colossalai/zero/utils/gemini_hook.py", line 35, in pre_op
self._chunk_manager.access_chunk(chunk)
File "/opt/conda/lib/python3.9/site-packages/colossalai/gemini/chunk/manager.py", line 103, in access_chunk
chunk.shard_move(get_current_device())
File "/opt/conda/lib/python3.9/site-packages/colossalai/gemini/chunk/chunk.py", line 332, in shard_move
self.cuda_shard = self.cpu_shard.to(get_current_device())
RuntimeError: CUDA error: an illegal memory access was encountered
CUDA kernel errors might be asynchronously reported at some other API call, so the stacktrace below might be incorrect.
For debugging consider passing CUDA_LAUNCH_BLOCKING=1.
0%| | 0/12000 [00:08<?, ?it/s]
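As the note in the log says, CUDA errors are reported asynchronously, so the frame where the error surfaces (here `chunk.shard_move` in the Gemini pre-backward hook) may not be where the bad access actually happened. A minimal way to get a synchronous, trustworthy stack trace, assuming the script is launched as a normal Python entry point, is to set `CUDA_LAUNCH_BLOCKING=1` before any CUDA work:

```python
# Make kernel launches synchronous so the traceback points at the kernel
# that actually faulted. The variable must be set before the first CUDA
# call; exporting it on the shell command line that launches run_clm.py /
# run_RM.py (CUDA_LAUNCH_BLOCKING=1 python run_clm.py ...) works equally well.
import os
os.environ["CUDA_LAUNCH_BLOCKING"] = "1"

import torch  # imported only after the environment variable is in place
```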
Hi @zhanghaoie, could you please provide more details about your script? The available information is not sufficient for us to reproduce the issue. Thanks.
We have made a lot of updates since this was reported. This issue is being closed due to inactivity. Thanks.