bonnetal
bonnetal copied to clipboard
Can't Fix RuntimeError: cuDNN error: CUDNN_STATUS_NOT_INITIALIZED ?
I am finding the same error but do not seem to be able to solve it. I have changes the labels and preprocessed the label file (changed labels.py and ran python createTrainIdLabelImgs.py ) but the code still exits before completing
File ../../tasks/segmentation/modules/trainer.py, line 488, in train_epoch loss.backward()
Do you have any idea what I could do to solve this issue?
My labels.py file in cityscapes:
labels = [
# name id trainId category catId hasInstances ignoreInEval color
Label( 'unlabeled' , 0 , 19 , 'void' , 0 , False , True , ( 0, 0, 0) ),
Label( 'ego vehicle' , 1 , 19 , 'void' , 0 , False , True , ( 0, 0, 0) ),
Label( 'rectification border' , 2 , 19 , 'void' , 0 , False , True , ( 0, 0, 0) ),
Label( 'out of roi' , 3 , 19 , 'void' , 0 , False , True , ( 0, 0, 0) ),
Label( 'static' , 4 , 19 , 'void' , 0 , False , True , ( 0, 0, 0) ),
Label( 'dynamic' , 5 , 19 , 'void' , 0 , False , True , (111, 74, 0) ),
Label( 'ground' , 6 , 19 , 'void' , 0 , False , True , ( 81, 0, 81) ),
Label( 'road' , 7 , 0 , 'flat' , 1 , False , False , (128, 64,128) ),
Label( 'sidewalk' , 8 , 1 , 'flat' , 1 , False , False , (244, 35,232) ),
Label( 'parking' , 9 , 19 , 'flat' , 1 , False , True , (250,170,160) ),
Label( 'rail track' , 10 , 19 , 'flat' , 1 , False , True , (230,150,140) ),
Label( 'building' , 11 , 2 , 'construction' , 2 , False , False , ( 70, 70, 70) ),
Label( 'wall' , 12 , 3 , 'construction' , 2 , False , False , (102,102,156) ),
Label( 'fence' , 13 , 4 , 'construction' , 2 , False , False , (190,153,153) ),
Label( 'guard rail' , 14 , 19 , 'construction' , 2 , False , True , (180,165,180) ),
Label( 'bridge' , 15 , 19 , 'construction' , 2 , False , True , (150,100,100) ),
Label( 'tunnel' , 16 , 19 , 'construction' , 2 , False , True , (150,120, 90) ),
Label( 'pole' , 17 , 5 , 'object' , 3 , False , False , (153,153,153) ),
Label( 'polegroup' , 18 , 19 , 'object' , 3 , False , True , (153,153,153) ),
Label( 'traffic light' , 19 , 6 , 'object' , 3 , False , False , (250,170, 30) ),
Label( 'traffic sign' , 20 , 7 , 'object' , 3 , False , False , (220,220, 0) ),
Label( 'vegetation' , 21 , 8 , 'nature' , 4 , False , False , (107,142, 35) ),
Label( 'terrain' , 22 , 9 , 'nature' , 4 , False , False , (152,251,152) ),
Label( 'sky' , 23 , 10 , 'sky' , 5 , False , False , ( 70,130,180) ),
Label( 'person' , 24 , 11 , 'human' , 6 , True , False , (220, 20, 60) ),
Label( 'rider' , 25 , 12 , 'human' , 6 , True , False , (255, 0, 0) ),
Label( 'car' , 26 , 13 , 'vehicle' , 7 , True , False , ( 0, 0,142) ),
Label( 'truck' , 27 , 14 , 'vehicle' , 7 , True , False , ( 0, 0, 70) ),
Label( 'bus' , 28 , 15 , 'vehicle' , 7 , True , False , ( 0, 60,100) ),
Label( 'caravan' , 29 , 19 , 'vehicle' , 7 , True , True , ( 0, 0, 90) ),
Label( 'trailer' , 30 , 19 , 'vehicle' , 7 , True , True , ( 0, 0,110) ),
Label( 'train' , 31 , 16 , 'vehicle' , 7 , True , False , ( 0, 80,100) ),
Label( 'motorcycle' , 32 , 17 , 'vehicle' , 7 , True , False , ( 0, 0,230) ),
Label( 'bicycle' , 33 , 18 , 'vehicle' , 7 , True , False , (119, 11, 32) ),
Label( 'license plate' , -1 , 19 , 'vehicle' , 7 , False , True , ( 0, 0,142) ),
]
Traceback:
./train.py -c ~/bonnetal/train/tasks/segmentation/config/cityscapes/ERFNet.yaml -l ~/bonnetal/train/tasks/segmentation/log1
/home/cris/.local/lib/python3.6/site-packages/tensorflow/python/framework/dtypes.py:516: FutureWarning: Passing (type, 1) or '1type' as a synonym of type is deprecated; in a future version of numpy, it will be understood as (type, (1,)) / '(1,)type'.
_np_qint8 = np.dtype([("qint8", np.int8, 1)])
/home/cris/.local/lib/python3.6/site-packages/tensorflow/python/framework/dtypes.py:517: FutureWarning: Passing (type, 1) or '1type' as a synonym of type is deprecated; in a future version of numpy, it will be understood as (type, (1,)) / '(1,)type'.
_np_quint8 = np.dtype([("quint8", np.uint8, 1)])
/home/cris/.local/lib/python3.6/site-packages/tensorflow/python/framework/dtypes.py:518: FutureWarning: Passing (type, 1) or '1type' as a synonym of type is deprecated; in a future version of numpy, it will be understood as (type, (1,)) / '(1,)type'.
_np_qint16 = np.dtype([("qint16", np.int16, 1)])
/home/cris/.local/lib/python3.6/site-packages/tensorflow/python/framework/dtypes.py:519: FutureWarning: Passing (type, 1) or '1type' as a synonym of type is deprecated; in a future version of numpy, it will be understood as (type, (1,)) / '(1,)type'.
_np_quint16 = np.dtype([("quint16", np.uint16, 1)])
/home/cris/.local/lib/python3.6/site-packages/tensorflow/python/framework/dtypes.py:520: FutureWarning: Passing (type, 1) or '1type' as a synonym of type is deprecated; in a future version of numpy, it will be understood as (type, (1,)) / '(1,)type'.
_np_qint32 = np.dtype([("qint32", np.int32, 1)])
/home/cris/.local/lib/python3.6/site-packages/tensorflow/python/framework/dtypes.py:525: FutureWarning: Passing (type, 1) or '1type' as a synonym of type is deprecated; in a future version of numpy, it will be understood as (type, (1,)) / '(1,)type'.
np_resource = np.dtype([("resource", np.ubyte, 1)])
/home/cris/.local/lib/python3.6/site-packages/tensorboard/compat/tensorflow_stub/dtypes.py:541: FutureWarning: Passing (type, 1) or '1type' as a synonym of type is deprecated; in a future version of numpy, it will be understood as (type, (1,)) / '(1,)type'.
_np_qint8 = np.dtype([("qint8", np.int8, 1)])
/home/cris/.local/lib/python3.6/site-packages/tensorboard/compat/tensorflow_stub/dtypes.py:542: FutureWarning: Passing (type, 1) or '1type' as a synonym of type is deprecated; in a future version of numpy, it will be understood as (type, (1,)) / '(1,)type'.
_np_quint8 = np.dtype([("quint8", np.uint8, 1)])
/home/cris/.local/lib/python3.6/site-packages/tensorboard/compat/tensorflow_stub/dtypes.py:543: FutureWarning: Passing (type, 1) or '1type' as a synonym of type is deprecated; in a future version of numpy, it will be understood as (type, (1,)) / '(1,)type'.
_np_qint16 = np.dtype([("qint16", np.int16, 1)])
/home/cris/.local/lib/python3.6/site-packages/tensorboard/compat/tensorflow_stub/dtypes.py:544: FutureWarning: Passing (type, 1) or '1type' as a synonym of type is deprecated; in a future version of numpy, it will be understood as (type, (1,)) / '(1,)type'.
_np_quint16 = np.dtype([("quint16", np.uint16, 1)])
/home/cris/.local/lib/python3.6/site-packages/tensorboard/compat/tensorflow_stub/dtypes.py:545: FutureWarning: Passing (type, 1) or '1type' as a synonym of type is deprecated; in a future version of numpy, it will be understood as (type, (1,)) / '(1,)type'.
_np_qint32 = np.dtype([("qint32", np.int32, 1)])
/home/cris/.local/lib/python3.6/site-packages/tensorboard/compat/tensorflow_stub/dtypes.py:550: FutureWarning: Passing (type, 1) or '1type' as a synonym of type is deprecated; in a future version of numpy, it will be understood as (type, (1,)) / '(1,)type'.
np_resource = np.dtype([("resource", np.ubyte, 1)])
----------
INTERFACE:
config yaml: /home/cris/bonnetal/train/tasks/segmentation/config/cityscapes/ERFNet.yaml
log dir /home/cris/bonnetal/train/tasks/segmentation/log1
model path None
eval only False
No batchnorm False
----------
Commit hash (training version): b'5368eed'
----------
Opening config file /home/cris/bonnetal/train/tasks/segmentation/config/cityscapes/ERFNet.yaml
No pretrained directory found.
Copying files to /home/cris/bonnetal/train/tasks/segmentation/log1 for further reference.
WARNING:tensorflow:From ../../common/logger.py:16: The name tf.summary.FileWriter is deprecated. Please use tf.compat.v1.summary.FileWriter instead.
Images from: ~/bonnetal/cityscapes/leftImg8bit/train
Labels from: ~/bonnetal/cityscapes/gtFine/train
LENGTH 2975 2975
Inference batch size: 4
Images from: ~/bonnetal/cityscapes/leftImg8bit/val
Labels from: ~/bonnetal/cityscapes/gtFine/val
LENGTH 500 500
Original OS: 8
New OS: 8
Trying to get backbone weights online from Bonnetal server.
Using pretrained weights from bonnetal server for backbone
OS: 1 , channels: 16
OS: 2 , channels: 16
OS: 4 , channels: 64
[Decoder] os: 4 in: 128 skip: 64 out: 64
[Decoder] os: 2 in: 64 skip: 16 out: 16
[Decoder] os: 1 in: 16 skip: 3 out: 16
Using normalized weights as bias for head.
No path to pretrained, using bonnetal Imagenet backbone weights and random decoder.
Total number of parameters: 2252148
Total number of parameters requires_grad: 2252148
Param encoder 1913168
Param decoder 338640
Param head 340
Training in device: cuda
/home/cris/.local/lib/python3.6/site-packages/torch/optim/lr_scheduler.py:100: UserWarning: Detected call of `lr_scheduler.step()` before `optimizer.step()`. In PyTorch 1.1.0 and later, you should call them in the opposite order: `optimizer.step()` before `lr_scheduler.step()`. Failure to do this will result in PyTorch skipping the first value of the learning rate schedule.See more details at https://pytorch.org/docs/stable/optim.html#how-to-adjust-learning-rate
"https://pytorch.org/docs/stable/optim.html#how-to-adjust-learning-rate", UserWarning)
Ignoring class 19 in IoU evaluation
[IOU EVAL] IGNORE: tensor([19])
[IOU EVAL] INCLUDE: tensor([ 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17,
18])
Let's see if it finishes this
/pytorch/aten/src/THCUNN/SpatialClassNLLCriterion.cu:104: void cunn_SpatialClassNLLCriterion_updateOutput_kernel(T *, T *, T *, long *, T *, int, int, int, int, int, long) [with T = float, AccumT = float]: block: [1,0,0], thread: [576,0,0] Assertion `t >= 0 && t < n_classes` failed.
/pytorch/aten/src/THCUNN/SpatialClassNLLCriterion.cu:104: void cunn_SpatialClassNLLCriterion_updateOutput_kernel(T *, T *, T *, long *, T *, int, int, int, int, int, long) [with T = float, AccumT = float]: block: [1,0,0], thread: [577,0,0] Assertion `t >= 0 && t < n_classes` failed.
/pytorch/aten/src/THCUNN/SpatialClassNLLCriterion.cu:104: void cunn_SpatialClassNLLCriterion_updateOutput_kernel(T *, T *, T *, long *, T *, int, int, int, int, int, long) [with T = float, AccumT = float]: block: [1,0,0], thread: [578,0,0] Assertion `t >= 0 && t < n_classes` failed.
/pytorch/aten/src/THCUNN/SpatialClassNLLCriterion.cu:104: void cunn_SpatialClassNLLCriterion_updateOutput_kernel(T *, T *, T *, long *, T *, int, int, int, int, int, long) [with T = float, AccumT = float]: block: [1,0,0], thread: [579,0,0] Assertion `t >= 0 && t < n_classes` failed.
Traceback (most recent call last):
File "./train.py", line 117, in <module>
trainer.train()
File "../../tasks/segmentation/modules/trainer.py", line 302, in train
scheduler=self.scheduler)
File "../../tasks/segmentation/modules/trainer.py", line 488, in train_epoch
loss.backward()
File "/home/cris/.local/lib/python3.6/site-packages/torch/tensor.py", line 166, in backward
torch.autograd.backward(self, gradient, retain_graph, create_graph)
File "/home/cris/.local/lib/python3.6/site-packages/torch/autograd/__init__.py", line 99, in backward
allow_unreachable=True) # allow_unreachable flag
RuntimeError: cuDNN error: CUDNN_STATUS_NOT_INITIALIZED