libtorch-yolov3

Could run with CPU but not with CUDA

Open nsjiow opened this issue 5 years ago • 2 comments

Thanks for your great work. I could run it with both CPU and GPU on my PC, but when I ported the code from the PC to an NVIDIA TX2, errors appeared: it runs with CPU but not with CUDA. The details are:

terminate called after throwing an instance of 'std::runtime_error'
  what():
Tensor for argument #4 'running_mean' is on CPU, but expected it to be on GPU (while checking arguments for cudnn_batch_norm) (checkSameGPU at /home/nvidia/workspace/wangh/libraries/pytorch/aten/src/ATen/TensorUtils.cpp:122)
frame #0: c10::Error::Error(c10::SourceLocation, std::__cxx11::basic_string<char, std::char_traits<char>, std::allocator<char> > const&) + 0x7c (0x7f7dc40374 in /home/nvidia/workspace/wangh/libraries/pytorch/torch/lib/libc10.so)
frame #1: at::checkSameGPU(char const*, at::TensorArg const&, at::TensorArg const&) + 0xbec (0x7f904c771c in /home/nvidia/workspace/wangh/libraries/pytorch/torch/lib/libcaffe2.so)
frame #2: at::checkAllSameGPU(char const*, c10::ArrayRef<at::TensorArg>) + 0x44 (0x7f904c8374 in /home/nvidia/workspace/wangh/libraries/pytorch/torch/lib/libcaffe2.so)
frame #3: at::native::cudnn_batch_norm(at::Tensor const&, at::Tensor const&, at::Tensor const&, at::Tensor const&, at::Tensor const&, bool, double, double) + 0x3b8 (0x7f7e9ef918 in /home/nvidia/workspace/wangh/libraries/pytorch/torch/lib/libcaffe2_gpu.so)
frame #4: at::CUDAFloatType::cudnn_batch_norm(at::Tensor const&, at::Tensor const&, at::Tensor const&, at::Tensor const&, at::Tensor const&, bool, double, double) const + 0xd8 (0x7f7eadd968 in /home/nvidia/workspace/wangh/libraries/pytorch/torch/lib/libcaffe2_gpu.so)
frame #5: torch::autograd::VariableType::cudnn_batch_norm(at::Tensor const&, at::Tensor const&, at::Tensor const&, at::Tensor const&, at::Tensor const&, bool, double, double) const + 0x568 (0x7f93da2160 in /home/nvidia/workspace/wangh/libraries/pytorch/torch/lib/libtorch.so.1)
frame #6: at::native::_batch_norm_impl_index(at::Tensor const&, at::Tensor const&, at::Tensor const&, at::Tensor const&, at::Tensor const&, bool, double, double, bool) + 0x3a0 (0x7f905463e0 in /home/nvidia/workspace/wangh/libraries/pytorch/torch/lib/libcaffe2.so)
frame #7: at::TypeDefault::_batch_norm_impl_index(at::Tensor const&, at::Tensor const&, at::Tensor const&, at::Tensor const&, at::Tensor const&, bool, double, double, bool) const + 0xe4 (0x7f90976f74 in /home/nvidia/workspace/wangh/libraries/pytorch/torch/lib/libcaffe2.so)
frame #8: torch::autograd::VariableType::_batch_norm_impl_index(at::Tensor const&, at::Tensor const&, at::Tensor const&, at::Tensor const&, at::Tensor const&, bool, double, double, bool) const + 0x2a4 (0x7f93d1d1ac in /home/nvidia/workspace/wangh/libraries/pytorch/torch/lib/libtorch.so.1)
frame #9: at::native::batch_norm(at::Tensor const&, at::Tensor const&, at::Tensor const&, at::Tensor const&, at::Tensor const&, bool, double, double, bool) + 0x8c (0x7f90544dac in /home/nvidia/workspace/wangh/libraries/pytorch/torch/lib/libcaffe2.so)
frame #10: at::TypeDefault::batch_norm(at::Tensor const&, at::Tensor const&, at::Tensor const&, at::Tensor const&, at::Tensor const&, bool, double, double, bool) const + 0xe4 (0x7f90976e04 in /home/nvidia/workspace/wangh/libraries/pytorch/torch/lib/libcaffe2.so)
frame #11: torch::autograd::VariableType::batch_norm(at::Tensor const&, at::Tensor const&, at::Tensor const&, at::Tensor const&, at::Tensor const&, bool, double, double, bool) const + 0x21c (0x7f93d1ecdc in /home/nvidia/workspace/wangh/libraries/pytorch/torch/lib/libtorch.so.1)
frame #12: + 0x7cc8ac (0x7f93e6d8ac in /home/nvidia/workspace/wangh/libraries/pytorch/torch/lib/libtorch.so.1)
frame #13: + 0x88a660 (0x7f93f2b660 in /home/nvidia/workspace/wangh/libraries/pytorch/torch/lib/libtorch.so.1)
frame #14: torch::jit::InterpreterState::run(std::vector<c10::IValue, std::allocator<c10::IValue> >&) + 0x38 (0x7f93f26838 in /home/nvidia/workspace/wangh/libraries/pytorch/torch/lib/libtorch.so.1)
frame #15: torch::jit::GraphExecutor::run(std::vector<c10::IValue, std::allocator<c10::IValue> >&) + 0x188 (0x7f93f06cd0 in /home/nvidia/workspace/wangh/libraries/pytorch/torch/lib/libtorch.so.1)
frame #16: torch::jit::script::Method::run(std::vector<c10::IValue, std::allocator<c10::IValue> >&) + 0x98 (0x4dfe64 in ./yolo-app)
frame #17: torch::jit::script::Method::operator()(std::vector<c10::IValue, std::allocator<c10::IValue> >) + 0x30 (0x4dfec0 in ./yolo-app)
frame #18: torch::jit::script::Module::forward(std::vector<c10::IValue, std::allocator<c10::IValue> >) + 0x84 (0x4e0fa4 in ./yolo-app)
frame #19: classifier(std::shared_ptr<torch::jit::script::Module>, at::Tensor) + 0xfc (0x4d6760 in ./yolo-app)
frame #20: main + 0xc88 (0x4d8954 in ./yolo-app)
frame #21: __libc_start_main + 0xe0 (0x7f7cf718a0 in /lib/aarch64-linux-gnu/libc.so.6)
:
operation failed in interpreter:
op_version_set = 0
def forward(self, input_1: Tensor) -> Tensor:
    input_2 = torch._convolution(input_1, getattr(self.layer1, "0").weight, None, [1, 3], [0, 0], [1, 1], False, [0, 0], 1, False, False, True)
    _0 = torch.add(getattr(self.layer1, "1").num_batches_tracked, CONSTANTS.c0, alpha=1)
    input_3 = torch.batch_norm(input_2, getattr(self.layer1, "1").weight, getattr(self.layer1, "1").bias, getattr(self.layer1, "1").running_mean, getattr(self.layer1, "1").running_var, True, 0.10000000000000001, 1.0000000000000001e-05, True)
              ~~~~~~~~~~~~~~~~ <--- HERE
    input_4 = torch.threshold(input_3, 0., 0.)
    input_5, _1 = torch.max_pool2d_with_indices(input_4, [2, 2], [2, 2], [0, 0], [1, 1], False)
    input_6 = torch._convolution(input_5, getattr(self.layer2, "0").weight, None, [1, 1], [2, 0], [1, 1], False, [0, 0], 2, False, False, True)
    _2 = torch.add(getattr(self.layer2, "1").num_batches_tracked, CONSTANTS.c0, alpha=1)
    input_7 = torch.batch_norm(input_6, getattr(self.layer2, "1").weight, getattr(self.layer2, "1").bias, getattr(self.layer2, "1").running_mean, getattr(self.layer2, "1").running_var, True, 0.10000000000000001, 1.0000000000000001e-05, True)
    input_8 = torch.threshold(input_7, 0., 0.)
    input_9, _3 = torch.max_pool2d_with_indices(input_8, [3, 3], [2, 2], [1, 0], [1, 1], False)
    input_10 = torch._convolution(input_9, getattr(self.layer3, "0").weight, None, [1, 1], [1, 1], [1, 1], False, [0, 0], 1, False, False, True)
    _4 = torch.add(getattr(self.layer3, "1").num_batches_tracked, CONSTANTS.c0, alpha=1)
Aborted (core dumped)

Why does this happen?

nsjiow · Mar 20 '19
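The trace above says the input tensor reached the GPU while the model's batch-norm buffers (running_mean/running_var) stayed on the CPU, so both the scripted module and the input have to be on CUDA before forward() is called. A minimal sketch of that device placement, assuming the pre-1.2 libtorch API used here (torch::jit::load returning a std::shared_ptr<torch::jit::script::Module>) and a hypothetical 416x416 input:

#include <torch/script.h>
#include <memory>
#include <vector>

int main(int argc, char** argv) {
    // Load the serialized TorchScript model (path passed on the command line).
    std::shared_ptr<torch::jit::script::Module> module = torch::jit::load(argv[1]);

    // Hypothetical input; the real code feeds a preprocessed image tensor.
    torch::Tensor input = torch::rand({1, 3, 416, 416});

    if (torch::cuda::is_available()) {
        module->to(torch::kCUDA);        // moves weights and batch-norm buffers to the GPU
        input = input.to(torch::kCUDA);  // the input must follow, or the device mismatch above appears
    }

    std::vector<torch::jit::IValue> inputs{input};
    torch::Tensor output = module->forward(inputs).toTensor();
    return 0;
}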

I encountered almost the same problem. Have you solved it? If so, how? @nsjiow, thanks very much.

asa008 · Apr 18 '19

You can try loading the model directly onto the GPU, like this:

std::shared_ptr<torch::jit::script::Module> module = torch::jit::load(argv[1], torch::kCUDA);

asa008 · Apr 19 '19
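A fuller sketch of that suggestion, assuming the same pre-1.2 libtorch API: loading directly onto the GPU keeps the weights and batch-norm buffers off the CPU from the start, but the input tensor still has to live on the same device.

#include <torch/script.h>
#include <memory>
#include <vector>

int main(int argc, char** argv) {
    // Load straight onto the GPU so running_mean/running_var never sit on the CPU.
    std::shared_ptr<torch::jit::script::Module> module =
        torch::jit::load(argv[1], torch::kCUDA);

    // Hypothetical input; it must also be moved to CUDA before forward().
    torch::Tensor img = torch::rand({1, 3, 416, 416}).to(torch::kCUDA);

    std::vector<torch::jit::IValue> inputs{img};
    torch::Tensor out = module->forward(inputs).toTensor();
    return 0;
}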