
All tensors must be on devices[0]: 0

Open · JFJ-Bin opened this issue 1 year ago · 1 comment

Hello, I encountered an error during the reproduction process. Could you help me? Training with my own dataset reports an error:

(ID2) ubuntu@s414g1:~/cssegmentation$ bash scripts/dist_train.sh 4 /home/ubuntu/cssegmentation/csseg/configs/mib/mib_r101iabnd16_aspp_512x512_tile2-1_overlap.py


Setting OMP_NUM_THREADS environment variable for each process to be 1 in default, to avoid your system being overloaded, please further tune the variable for optimal performance in your application as needed.


Filtering Images: 100%|█████████████████████████████████████████████| 274/274 [00:00<00:00, 391.28it/s]
Filtering Images: 100%|███████████████████████████████████████████████| 39/39 [00:00<00:00, 411.89it/s]
Selected optimization level O1: Insert automatic casts around Pytorch functions and Tensor methods.
Defaults for this optimization level are:
  enabled               : True
  opt_level             : O1
  cast_model_type       : None
  patch_torch_functions : True
  keep_batchnorm_fp32   : None
  master_weights        : None
  loss_scale            : dynamic
Processing user overrides (additional kwargs that are not None)...
After processing overrides, optimization options are:
  enabled               : True
  opt_level             : O1
  cast_model_type       : None
  patch_torch_functions : True
  keep_batchnorm_fp32   : None
  master_weights        : None
  loss_scale            : dynamic
Warning: multi_tensor_applier fused unscale kernel is unavailable, possibly because apex was installed without --cuda_ext --cpp_ext. Using Python fallback. Original ImportError was: ModuleNotFoundError("No module named 'amp_C'")

Traceback (most recent call last):
  File "/home/ubuntu/cssegmentation/csseg/train.py", line 62, in <module>
    trainer_client.start()
  File "/home/ubuntu/cssegmentation/csseg/train.py", line 54, in start
    runner_client = BuildRunner(mode='TRAIN', cmd_args=cmd_args, runner_cfg=runner_cfg_task)
  File "/home/ubuntu/cssegmentation/csseg/modules/runners/builder.py", line 29, in build
    return super().build(module_cfg)
  File "/home/ubuntu/cssegmentation/csseg/modules/utils/modulebuilder.py", line 26, in build
    module = self.REGISTERED_MODULES[module_type](**module_cfg)
  File "/home/ubuntu/cssegmentation/csseg/modules/runners/mib.py", line 18, in __init__
    super(MIBRunner, self).__init__(
  File "/home/ubuntu/cssegmentation/csseg/modules/runners/base.py", line 120, in __init__
    self.segmentor = BuildDistributedModel(model=self.segmentor, model_cfg=parallel_cfg['model_cfg'])
  File "/home/ubuntu/cssegmentation/csseg/modules/parallel/model.py", line 12, in BuildDistributedModel
    return nn.parallel.DistributedDataParallel(model, **model_cfg)
  File "/home/ubuntu/miniconda3/envs/ID2/lib/python3.9/site-packages/torch/nn/parallel/distributed.py", line 448, in __init__
    self._ddp_init_helper()
  File "/home/ubuntu/miniconda3/envs/ID2/lib/python3.9/site-packages/torch/nn/parallel/distributed.py", line 503, in _ddp_init_helper
    self._module_copies = replicate(self.module, self.device_ids, detach=True)
  File "/home/ubuntu/miniconda3/envs/ID2/lib/python3.9/site-packages/torch/nn/parallel/replicate.py", line 91, in replicate
    param_copies = _broadcast_coalesced_reshape(params, devices, detach)
  File "/home/ubuntu/miniconda3/envs/ID2/lib/python3.9/site-packages/torch/nn/parallel/replicate.py", line 67, in _broadcast_coalesced_reshape
    return comm.broadcast_coalesced(tensors, devices)
  File "/home/ubuntu/miniconda3/envs/ID2/lib/python3.9/site-packages/torch/nn/parallel/comm.py", line 58, in broadcast_coalesced
    return torch._C._broadcast_coalesced(tensors, devices, buffer_size)
RuntimeError: All tensors must be on devices[0]: 0

(The other worker processes print the same traceback, interleaved, also ending in "RuntimeError: All tensors must be on devices[0]: 0".)

Killing subprocess 1686028
Killing subprocess 1686029
Killing subprocess 1686030
Killing subprocess 1686031
Traceback (most recent call last):
  File "/home/ubuntu/miniconda3/envs/ID2/lib/python3.9/runpy.py", line 197, in _run_module_as_main
    return _run_code(code, main_globals, None,
  File "/home/ubuntu/miniconda3/envs/ID2/lib/python3.9/runpy.py", line 87, in _run_code
    exec(code, run_globals)
  File "/home/ubuntu/miniconda3/envs/ID2/lib/python3.9/site-packages/torch/distributed/launch.py", line 340, in <module>
    main()
  File "/home/ubuntu/miniconda3/envs/ID2/lib/python3.9/site-packages/torch/distributed/launch.py", line 326, in main
    sigkill_handler(signal.SIGTERM, None)  # not coming back
  File "/home/ubuntu/miniconda3/envs/ID2/lib/python3.9/site-packages/torch/distributed/launch.py", line 301, in sigkill_handler
    raise subprocess.CalledProcessError(returncode=last_return_code, cmd=cmd)
subprocess.CalledProcessError: Command '['/home/ubuntu/miniconda3/envs/ID2/bin/python', '-u', 'csseg/train.py', '--local_rank=3', '--nproc_per_node', '4', '--cfgfilepath', '/home/ubuntu/cssegmentation/csseg/configs/mib/mib_r101iabnd16_aspp_512x512_tile2-1_overlap.py']' returned non-zero exit status 1.

JFJ-Bin · Feb 23 '24 08:02