[BUG]: Got an NCCL timeout when using the save_checkpoint function
🐛 Describe the bug
When I use 1D tensor parallelism and set the size to n > 1, I get an NCCL timeout:

opt_config = dict(parallel=dict(tensor=dict(mode='1d', size=8)), fp16=dict(mode=AMP_TYPE.TORCH))
colossalai.launch_from_torch(config=opt_config)

But when I set the size to 1, it works:

opt_config = dict(parallel=dict(tensor=dict(mode='1d', size=1)), fp16=dict(mode=AMP_TYPE.TORCH))
colossalai.launch_from_torch(config=opt_config)
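For context, here is a hedged sketch of how the failing config above plugs into a training run and where the hang surfaces. The actual model, optimizer, and checkpoint routine in train_opt.py are not shown in the report, so the toy Linear model and the torch.save step below are placeholders, not the original code:

```python
import torch
import torch.distributed as dist
import colossalai
from colossalai.amp import AMP_TYPE

def main():
    # Failing config from the report: 1D tensor parallelism across 8 ranks.
    opt_config = dict(
        parallel=dict(tensor=dict(mode='1d', size=8)),
        fp16=dict(mode=AMP_TYPE.TORCH),
    )
    colossalai.launch_from_torch(config=opt_config)

    # Placeholders: the real script builds an OPT model and its optimizer.
    model = torch.nn.Linear(1024, 1024)
    optimizer = torch.optim.AdamW(model.parameters(), lr=1e-4)
    engine, *_ = colossalai.initialize(model, optimizer)

    # ... training steps ...

    # state_dict() is the call the traceback below enters: every rank must
    # reach it, because it gathers sharded parameters with collective ops.
    state = engine.model.state_dict()
    if dist.get_rank() == 0:
        torch.save(state, "opt_checkpoint.pt")

if __name__ == "__main__":
    main()
```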
RuntimeError: NCCL communicator was aborted on rank 0. Original reason for failure was: [Rank 0] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=691, OpType=ALLGATHER, Timeout(ms)=1800000) ran for 1801053 milliseconds before timing out.
╭─────────────────────────────── Traceback (most recent call last) ────────────────────────────────╮
│ /workspace/opt/train_opt.py:619 in state_dict                                                    │
│                                                                                                  │
│ /opt/conda/lib/python3.9/site-packages/colossalai/nn/parallel/data_parallel.py:352 in state_dict │
│                                                                                                  │
│    351 │ │ │ assert keep_vars is False, "state_dict with parameter, keep_vars=True, i          │
│ ❱  352 │ │ │ torch_model = get_static_torch_model(zero_ddp_model=self, only_rank_0=only_r      │
│    353 │ │ │ return torch_model.state_dict(destination=destination, prefix=prefix, keep_v      │
│    354 │ │ return self._non_strict_state_dict(destination=destination,                          │
│    355 │ │ │ │ │ │ │ │ │ │ prefix=prefix,                                                      │
│ │
│ /opt/conda/lib/python3.9/site-packages/colossalai/nn/parallel/utils.py:83 in │
│ get_static_torch_model │
│ │
│ 80 │ from colossalai.nn.parallel import ZeroDDP │
│ 81 │ assert isinstance(zero_ddp_model, ZeroDDP) │
│ 82 │ │
│ ❱ 83 │ state_dict = zero_ddp_model.state_dict(only_rank_0=only_rank_0, strict=False) │
│ 84 │ colo_model = zero_ddp_model.module │
│ 85 │ torch_model = _get_shallow_copy_model(colo_model) │
│ 86 │
│ │
│ /opt/conda/lib/python3.9/site-packages/colossalai/nn/parallel/data_parallel.py:354 in state_dict │
│ │
│ 351 │ │ │ assert keep_vars is False, "state_dict with parameter, keep_vars=True, i │
│ 352 │ │ │ torch_model = get_static_torch_model(zero_ddp_model=self, only_rank_0=only_r │
│ 353 │ │ │ return torch_model.state_dict(destination=destination, prefix=prefix, keep_v │
│ ❱ 354 │ │ return self._non_strict_state_dict(destination=destination, │
│ 355 │ │ │ │ │ │ │ │ │ │ prefix=prefix, │
│ 356 │ │ │ │ │ │ │ │ │ │ keep_vars=keep_vars, │
│ 357 │ │ │ │ │ │ │ │ │ │ only_rank_0=only_rank_0) │
│ │
│ /opt/conda/lib/python3.9/site-packages/colossalai/nn/parallel/data_parallel.py:378 in │
│ _non_strict_state_dict │
│ │
│ 375 │ │ │ destination = OrderedDict() │
│ 376 │ │ │ destination._metadata = OrderedDict() │
│ 377 │ │ destination._metadata[prefix[:-1]] = local_metadata = dict(version=self._version │
│ ❱ 378 │ │ self._save_to_state_dict(destination, prefix, keep_vars, only_rank_0) │
│ 379 │ │ │
│ 380 │ │ for hook in self._state_dict_hooks.values(): │
│ 381 │ │ │ hook_result = hook(self, destination, prefix, local_metadata) │
│ │
│ /opt/conda/lib/python3.9/site-packages/colossalai/nn/parallel/data_parallel.py:430 in │
│ _save_to_state_dict │
│ │
│ 427 │ │ """ │
│ 428 │ │ assert keep_vars is False, "state_dict with parameter, keep_vars=True, is no │
│ 429 │ │ │
│ ❱ 430 │ │ param_to_save_data = self._get_param_to_save_data(self.fp32_params, only_rank_0) │
│ 431 │ │ # TODO: (HELSON) deal with ddp ignored parameters │
│ 432 │ │ for (name, p), fp32_p in zip(self.named_parameters(), self.fp32_params): │
│ 433 │ │ │ if p is not None: │
│ │
│ /opt/conda/lib/python3.9/site-packages/colossalai/nn/parallel/data_parallel.py:401 in │
│ _get_param_to_save_data │
│ │
│ 398 │ │ param_to_save_data = dict() │
│ 399 │ │ chunk_list = self.chunk_manager.get_chunks(param_list) │
│ 400 │ │ for chunk in chunk_list: │
│ ❱ 401 │ │ │ temp_chunk = get_temp_total_chunk_on_cuda(chunk) │
│ 402 │ │ │ │
│ 403 │ │ │ for tensor, tensor_info in chunk.tensors_info.items(): │
│ 404 │ │ │ │ record_tensor = torch.empty([0]) │
│ │
│ /opt/conda/lib/python3.9/site-packages/colossalai/nn/parallel/utils.py:24 in │
│ get_temp_total_chunk_on_cuda │
│ │
│ 21 │ │
│ 22 │ total_temp = torch.zeros(chunk.chunk_size, dtype=chunk.dtype, device=get_current_dev │
│ 23 │ gather_list = list(torch.chunk(input=total_temp, chunks=chunk.pg_size, dim=0)) │
│ ❱ 24 │ dist.all_gather(tensor_list=gather_list, tensor=shard_temp, group=chunk.torch_pg) │
│ 25 │ │
│ 26 │ return total_temp │
│ 27 │
│ │
│ /opt/conda/lib/python3.9/site-packages/torch/distributed/distributed_c10d.py:2070 in all_gather │
│ │
│ 2067 │ │ default_pg = _get_default_group() │
│ 2068 │ │ work = default_pg.allgather([tensor_list], [tensor]) │
│ 2069 │ else: │
│ ❱ 2070 │ │ work = group.allgather([tensor_list], [tensor]) │
│ 2071 │ │
│ 2072 │ if async_op: │
│ 2073 │ │ return work │
╰──────────────────────────────────────────────────────────────────────────────────────────────────╯
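For what it's worth, the watchdog message above is the generic symptom of a collective call that not every member of the process group enters: dist.all_gather blocks until all ranks in chunk.torch_pg join it, so any rank that skips the state_dict/save path leaves the others stuck until the 30-minute NCCL timeout fires. Below is a minimal, self-contained illustration of that failure mode, written against plain PyTorch with the gloo backend on CPU; it is not ColossalAI code and not necessarily the exact root cause here:

```python
import torch
import torch.distributed as dist
import torch.multiprocessing as mp

def worker(rank: int, world_size: int):
    dist.init_process_group(
        "gloo", init_method="tcp://127.0.0.1:29500",
        rank=rank, world_size=world_size,
    )
    shard = torch.full((4,), float(rank))
    gathered = [torch.empty(4) for _ in range(world_size)]

    # Buggy pattern: if only rank 0 entered the collective, it would block
    # forever waiting for the other ranks (with NCCL, the watchdog would
    # eventually abort it, as in the log above):
    #   if rank == 0:
    #       dist.all_gather(gathered, shard)

    # Correct pattern: every rank in the group makes the same call.
    dist.all_gather(gathered, shard)

    if rank == 0:
        print(torch.cat(gathered))  # tensor([0., 0., 0., 0., 1., 1., 1., 1.])
    dist.destroy_process_group()

if __name__ == "__main__":
    mp.spawn(worker, args=(2,), nprocs=2)
```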
Environment
CUDA 11.3, Docker 0.22
Hi @TexasRangers86 , we do not recommend using a tensor parallelism size of 1, since that case does not need tensor parallelism at all. You could just set tensor as an empty dictionary. We will also check the settings more carefully in our code to avoid such bugs in the future. Thank you so much for reporting the bug anyway.
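As a concrete sketch of that suggestion (using the same launch call as in the report; per the advice above, leaving tensor empty disables tensor parallelism rather than creating a degenerate size-1 group):

```python
import colossalai
from colossalai.amp import AMP_TYPE

# Suggested config: when tensor parallelism is not needed, leave `tensor`
# as an empty dict instead of setting mode='1d', size=1.
opt_config = dict(
    parallel=dict(tensor=dict()),
    fp16=dict(mode=AMP_TYPE.TORCH),
)
colossalai.launch_from_torch(config=opt_config)
```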
We have updated a lot since then; please check the latest code.
This issue was closed due to inactivity. Thanks.