CenterNet
ConnectionResetError: [Errno 104] Connection reset by peer
training start...
0%| | 0/480000 [00:00<?, ?it/s]
Traceback (most recent call last):
File "train.py", line 203, in <module>
Exception in thread Thread-3:
Traceback (most recent call last):
File "/home/rencong/anaconda3/envs/CenterNet/lib/python3.6/threading.py", line 916, in _bootstrap_inner
self.run()
File "/home/rencong/anaconda3/envs/CenterNet/lib/python3.6/threading.py", line 864, in run
self._target(*self._args, **self._kwargs)
File "train.py", line 51, in pin_memory
data = data_queue.get()
File "/home/rencong/anaconda3/envs/CenterNet/lib/python3.6/multiprocessing/queues.py", line 113, in get
return _ForkingPickler.loads(res)
File "/home/rencong/anaconda3/envs/CenterNet/lib/python3.6/site-packages/torch/multiprocessing/reductions.py", line 151, in rebuild_storage_fd
fd = df.detach()
File "/home/rencong/anaconda3/envs/CenterNet/lib/python3.6/multiprocessing/resource_sharer.py", line 57, in detach
with _resource_sharer.get_connection(self._id) as conn:
File "/home/rencong/anaconda3/envs/CenterNet/lib/python3.6/multiprocessing/resource_sharer.py", line 87, in get_connection
c = Client(address, authkey=process.current_process().authkey)
File "/home/rencong/anaconda3/envs/CenterNet/lib/python3.6/multiprocessing/connection.py", line 493, in Client
answer_challenge(c, authkey)
File "/home/rencong/anaconda3/envs/CenterNet/lib/python3.6/multiprocessing/connection.py", line 737, in answer_challenge
response = connection.recv_bytes(256)        # reject large message
File "/home/rencong/anaconda3/envs/CenterNet/lib/python3.6/multiprocessing/connection.py", line 216, in recv_bytes
buf = self._recv_bytes(maxlength)
File "/home/rencong/anaconda3/envs/CenterNet/lib/python3.6/multiprocessing/connection.py", line 407, in _recv_bytes
buf = self._recv(4)
File "/home/rencong/anaconda3/envs/CenterNet/lib/python3.6/multiprocessing/connection.py", line 379, in _recv
chunk = read(handle, remaining)
ConnectionResetError: [Errno 104] Connection reset by peer
Can I see your full log?
Have you solved this error?
I have the same issue as @Kangzf1996 and @MichaelCong: a multiprocessing ConnectionResetError triggered by a RuntimeError in CUDAStream_setStreamOnDevice from PyTorch. I have included my full log below.

My only changes to the default CenterNet-52.json are decreasing the batch size to 4, decreasing the chunk sizes accordingly to [2, 2], and updating my data_dir (see the sketch below). I went with a small batch size to try to avoid memory issues, since my system only has 6 GB of GPU memory and 16 GB of CPU memory.

This doesn't seem like something that should happen out of the box. Any thoughts?
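For reference, here is a minimal sketch of the fields I edited in config/CenterNet-52.json. The key names are taken from the system config dump in the log below; the surrounding structure is approximate, and everything not shown is left at the repo defaults:

{
    "system": {
        "batch_size": 4,
        "chunk_sizes": [2, 2],
        "data_dir": "./data"
    }
}

As I understand it, chunk_sizes has one entry per GPU and the entries should sum to batch_size, which is why I went to [2, 2] when I dropped the batch size to 4.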
loading all datasets...
using 4 threads
loading from cache file: cache/coco_trainval2014.pkl
loading annotations into memory...
Done (t=9.31s)
creating index...
index created!
loading from cache file: cache/coco_trainval2014.pkl
loading annotations into memory...
Done (t=9.47s)
creating index...
index created!
loading from cache file: cache/coco_trainval2014.pkl
loading annotations into memory...
Done (t=10.42s)
creating index...
index created!
loading from cache file: cache/coco_trainval2014.pkl
loading annotations into memory...
Done (t=9.14s)
creating index...
index created!
loading from cache file: cache/coco_minival2014.pkl
loading annotations into memory...
Done (t=0.29s)
creating index...
index created!
system config...
{'batch_size': 4,
'cache_dir': 'cache',
'chunk_sizes': [2, 2],
'config_dir': 'config',
'data_dir': './data',
'data_rng': RandomState(MT19937) at 0x7F7B50916258,
'dataset': 'MSCOCO',
'decay_rate': 10,
'display': 5,
'learning_rate': 0.00025,
'max_iter': 480000,
'nnet_rng': RandomState(MT19937) at 0x7F7B50916150,
'opt_algo': 'adam',
'prefetch_size': 6,
'pretrain': None,
'result_dir': 'results',
'sampling_function': 'kp_detection',
'snapshot': 5000,
'snapshot_name': 'CenterNet-52',
'stepsize': 450000,
'test_split': 'testdev',
'train_split': 'trainval',
'val_iter': 500,
'val_split': 'minival',
'weight_decay': False,
'weight_decay_rate': 1e-05,
'weight_decay_type': 'l2'}
db config...
{'ae_threshold': 0.5,
'border': 128,
'categories': 80,
'data_aug': True,
'gaussian_bump': True,
'gaussian_iou': 0.7,
'gaussian_radius': -1,
'input_size': [511, 511],
'kp_categories': 1,
'lighting': True,
'max_per_image': 100,
'merge_bbox': False,
'nms_algorithm': 'exp_soft_nms',
'nms_kernel': 3,
'nms_threshold': 0.5,
'output_sizes': [[128, 128]],
'rand_color': True,
'rand_crop': True,
'rand_pushes': False,
'rand_samples': False,
'rand_scale_max': 1.4,
'rand_scale_min': 0.6,
'rand_scale_step': 0.1,
'rand_scales': array([0.6, 0.7, 0.8, 0.9, 1. , 1.1, 1.2, 1.3]),
'special_crop': False,
'test_scales': [1],
'top_k': 70,
'weight_exp': 8}
len of db: 118287
start prefetching data...
shuffling indices...
start prefetching data...
shuffling indices...
start prefetching data...
shuffling indices...
start prefetching data...
shuffling indices...
building model...
module_file: models.CenterNet-52
start prefetching data...
shuffling indices...
total parameters: 104844152
setting learning rate to: 0.00025
training start...
0%| | 0/480000 [00:00<?, ?it/s]
Traceback (most recent call last):
File "train.py", line 203, in <module>
train(training_dbs, validation_db, args.start_iter)
File "train.py", line 138, in train
training_loss, focal_loss, pull_loss, push_loss, regr_loss = nnet.train(**training)
File "/storage/projects/alpr/centernet/nnet/py_factory.py", line 82, in train
loss_kp = self.network(xs, ys)
File "/home/addison/miniconda3/envs/centernet/lib/python3.6/site-packages/torch/nn/modules/module.py", line 477, in __call__
result = self.forward(*input, **kwargs)
File "/storage/projects/alpr/centernet/models/py_utils/data_parallel.py", line 66, in forward
inputs, kwargs = self.scatter(inputs, kwargs, self.device_ids, self.chunk_sizes)
File "/storage/projects/alpr/centernet/models/py_utils/data_parallel.py", line 77, in scatter
return scatter_kwargs(inputs, kwargs, device_ids, dim=self.dim, chunk_sizes=self.chunk_sizes)
File "/storage/projects/alpr/centernet/models/py_utils/scatter_gather.py", line 30, in scatter_kwargs
inputs = scatter(inputs, target_gpus, dim, chunk_sizes) if inputs else []
File "/storage/projects/alpr/centernet/models/py_utils/scatter_gather.py", line 25, in scatter
return scatter_map(inputs)
File "/storage/projects/alpr/centernet/models/py_utils/scatter_gather.py", line 18, in scatter_map
return list(zip(*map(scatter_map, obj)))
File "/storage/projects/alpr/centernet/models/py_utils/scatter_gather.py", line 20, in scatter_map
return list(map(list, zip(*map(scatter_map, obj))))
File "/storage/projects/alpr/centernet/models/py_utils/scatter_gather.py", line 15, in scatter_map
return Scatter.apply(target_gpus, chunk_sizes, dim, obj)
File "/home/addison/miniconda3/envs/centernet/lib/python3.6/site-packages/torch/nn/parallel/_functions.py", line 87, in forward
outputs = comm.scatter(input, ctx.target_gpus, ctx.chunk_sizes, ctx.dim, streams)
File "/home/addison/miniconda3/envs/centernet/lib/python3.6/site-packages/torch/cuda/comm.py", line 142, in scatter
return tuple(torch._C._scatter(tensor, devices, chunk_sizes, dim, streams))
RuntimeError: (CUDAStream_setStreamOnDevice at /opt/conda/conda-bld/pytorch_1532581333611/work/aten/src/ATen/CUDAStream.cpp:123)
frame #0: at::detail::CUDAStream_setStream(CUDAStreamInternals*) + 0x23 (0x7f7b2eac2663 in /home/addison/miniconda3/envs/centernet/lib/python3.6/site-packages/torch/lib/libcaffe2.so)
frame #1: torch::cuda::scatter(at::Tensor const&, at::ArrayRef<long>, at::optional<std::vector<long, std::allocator<long> > > const&, long, at::optional<std::vector<CUDAStreamInternals*, std::allocator<CUDAStreamInternals*> > > const&) + 0x526 (0x7f7b307bca56 in /home/addison/miniconda3/envs/centernet/lib/python3.6/site-packages/torch/_C.cpython-36m-x86_64-linux-gnu.so)
frame #2: <unknown function> + 0xc42bab (0x7f7b307c4bab in /home/addison/miniconda3/envs/centernet/lib/python3.6/site-packages/torch/_C.cpython-36m-x86_64-linux-gnu.so)
frame #3: <unknown function> + 0x38a52b (0x7f7b2ff0c52b in /home/addison/miniconda3/envs/centernet/lib/python3.6/site-packages/torch/_C.cpython-36m-x86_64-linux-gnu.so)
<omitting python frames>
frame #14: THPFunction_apply(_object*, _object*) + 0x38f (0x7f7b302eabcf in /home/addison/miniconda3/envs/centernet/lib/python3.6/site-packages/torch/_C.cpython-36m-x86_64-linux-gnu.so)
Exception in thread Thread-1:
Traceback (most recent call last):
File "/home/addison/miniconda3/envs/centernet/lib/python3.6/threading.py", line 916, in _bootstrap_inner
self.run()
File "/home/addison/miniconda3/envs/centernet/lib/python3.6/threading.py", line 864, in run
self._target(*self._args, **self._kwargs)
File "train.py", line 51, in pin_memory
data = data_queue.get()
File "/home/addison/miniconda3/envs/centernet/lib/python3.6/multiprocessing/queues.py", line 113, in get
return _ForkingPickler.loads(res)
File "/home/addison/miniconda3/envs/centernet/lib/python3.6/site-packages/torch/multiprocessing/reductions.py", line 151, in rebuild_storage_fd
fd = df.detach()
File "/home/addison/miniconda3/envs/centernet/lib/python3.6/multiprocessing/resource_sharer.py", line 57, in detach
with _resource_sharer.get_connection(self._id) as conn:
File "/home/addison/miniconda3/envs/centernet/lib/python3.6/multiprocessing/resource_sharer.py", line 87, in get_connection
c = Client(address, authkey=process.current_process().authkey)
File "/home/addison/miniconda3/envs/centernet/lib/python3.6/multiprocessing/connection.py", line 493, in Client
answer_challenge(c, authkey)
File "/home/addison/miniconda3/envs/centernet/lib/python3.6/multiprocessing/connection.py", line 732, in answer_challenge
message = connection.recv_bytes(256) # reject large message
File "/home/addison/miniconda3/envs/centernet/lib/python3.6/multiprocessing/connection.py", line 216, in recv_bytes
buf = self._recv_bytes(maxlength)
File "/home/addison/miniconda3/envs/centernet/lib/python3.6/multiprocessing/connection.py", line 407, in _recv_bytes
buf = self._recv(4)
File "/home/addison/miniconda3/envs/centernet/lib/python3.6/multiprocessing/connection.py", line 379, in _recv
chunk = read(handle, remaining)
ConnectionResetError: [Errno 104] Connection reset by peer