inference error

Open P3ngLiu opened this issue 3 years ago • 1 comment

The command I used is:
python train_net.py --config-file configs/sparse_inst_r50_base.yaml --num-gpus 8 OUTPUT_DIR output/sparse_inst_r50_base2

Traceback (most recent call last):
  File "/hd-4t/peng/anaconda3/envs/sparseInst/lib/python3.8/site-packages/torch/multiprocessing/spawn.py", line 59, in _wrap
    fn(i, *args)
  File "/hd-4t/peng/anaconda3/envs/sparseInst/lib/python3.8/site-packages/detectron2/engine/launch.py", line 126, in _distributed_worker
    main_func(*args)
  File "/hd-4t/fcx/SparseInst/train_net.py", line 178, in main
    return trainer.train()
  File "/hd-4t/peng/anaconda3/envs/sparseInst/lib/python3.8/site-packages/detectron2/engine/defaults.py", line 484, in train
    super().train(self.start_iter, self.max_iter)
  File "/hd-4t/peng/anaconda3/envs/sparseInst/lib/python3.8/site-packages/detectron2/engine/train_loop.py", line 150, in train
    self.after_step()
  File "/hd-4t/peng/anaconda3/envs/sparseInst/lib/python3.8/site-packages/detectron2/engine/train_loop.py", line 180, in after_step
    h.after_step()
  File "/hd-4t/peng/anaconda3/envs/sparseInst/lib/python3.8/site-packages/detectron2/engine/hooks.py", line 552, in after_step
    self._do_eval()
  File "/hd-4t/peng/anaconda3/envs/sparseInst/lib/python3.8/site-packages/detectron2/engine/hooks.py", line 525, in _do_eval
    results = self._func()
  File "/hd-4t/peng/anaconda3/envs/sparseInst/lib/python3.8/site-packages/detectron2/engine/defaults.py", line 453, in test_and_save_results
    self._last_eval_results = self.test(self.cfg, self.model)
  File "/hd-4t/peng/anaconda3/envs/sparseInst/lib/python3.8/site-packages/detectron2/engine/defaults.py", line 608, in test
    results_i = inference_on_dataset(model, data_loader, evaluator)
  File "/hd-4t/peng/anaconda3/envs/sparseInst/lib/python3.8/site-packages/detectron2/evaluation/evaluator.py", line 158, in inference_on_dataset
    outputs = model(inputs)
  File "/hd-4t/peng/anaconda3/envs/sparseInst/lib/python3.8/site-packages/torch/nn/modules/module.py", line 889, in _call_impl
    result = self.forward(*input, **kwargs)
  File "/hd-4t/peng/anaconda3/envs/sparseInst/lib/python3.8/site-packages/torch/nn/parallel/distributed.py", line 705, in forward
    output = self.module(*inputs[0], **kwargs[0])
  File "/hd-4t/peng/anaconda3/envs/sparseInst/lib/python3.8/site-packages/torch/nn/modules/module.py", line 889, in _call_impl
    result = self.forward(*input, **kwargs)
  File "/hd-4t/fcx/SparseInst/sparseinst/sparseinst.py", line 110, in forward
    results = self.inference(
  File "/hd-4t/fcx/SparseInst/sparseinst/sparseinst.py", line 161, in inference
    scores = rescoring_mask(
RuntimeError: nvrtc: error: failed to open libnvrtc-builtins.so.11.1.
Make sure that libnvrtc-builtins.so.11.1 is installed correctly.
nvrtc compilation failed:

#define NAN __int_as_float(0x7fffffff)
#define POS_INFINITY __int_as_float(0x7f800000)
#define NEG_INFINITY __int_as_float(0xff800000)

template<typename T> __device__ T maximum(T a, T b) { return isnan(a) ? a : (a > b ? a : b); }

template<typename T> __device__ T minimum(T a, T b) { return isnan(a) ? a : (a < b ? a : b); }

extern "C" __global__ void fused_to_mul(float* t0, bool* t1, float* aten_mul, float* aten_to) {
{
  bool t1_1 = t1[(((512 * blockIdx.x + threadIdx.x) / 31104) * 31104 + (512 * blockIdx.x + threadIdx.x) % 216) + 216 * (((512 * blockIdx.x + threadIdx.x) / 216) % 144)];
  aten_to[(((512 * blockIdx.x + threadIdx.x) / 31104) * 31104 + (512 * blockIdx.x + threadIdx.x) % 216) + 216 * (((512 * blockIdx.x + threadIdx.x) / 216) % 144)] = (float)(t1_1);
  float v = __ldg(t0 + (((512 * blockIdx.x + threadIdx.x) / 31104) * 31104 + (512 * blockIdx.x + threadIdx.x) % 216) + 216 * (((512 * blockIdx.x + threadIdx.x) / 216) % 144));
  aten_mul[(((512 * blockIdx.x + threadIdx.x) / 31104) * 31104 + (512 * blockIdx.x + threadIdx.x) % 216) + 216 * (((512 * blockIdx.x + threadIdx.x) / 216) % 144)] = v * (float)(t1_1);
}
}

@wondervictor

Jul 20 '22 03:07 P3ngLiu

Hi @P3ngLiu, thanks for your interest in SparseInst! It seems that the NVRTC library in your environment is not set up correctly: the error says libnvrtc-builtins.so.11.1 cannot be opened. You may try the methods mentioned in https://github.com/pytorch/pytorch/issues/69689 or https://github.com/chainer/chainer/issues/4813. Hope these solutions help!

Jul 26 '22 12:07 wondervictor