I am training an ASR model with a batch size of 32, and the input shapes are as follows:
input size: torch.Size([386, 32, 1426]) target size: torch.Size([32, 22])
The ASG module tries to allocate 93 GiB of memory. Even when I set the batch size to 2, it still runs out of memory.
Traceback (most recent call last):
File "/home/sid/code/masr/train.py", line 157, in <module>
train(model)
File "/home/sid/code/masr/train.py", line 86, in train
loss = asg_loss.forward(out, y_reshape, out_lens, y_lens)
File "/home/sid/software/conda3/envs/torch/lib/python3.6/site-packages/torch_asg/asg.py", line 135, in forward
target_lengths)
File "/home/sid/software/conda3/envs/torch/lib/python3.6/site-packages/torch_asg/asg.py", line 77, in forward
batch_input_len, num_batches, num_labels, batch_output_len)
RuntimeError: CUDA out of memory. Tried to allocate 93.33 GiB (GPU 0; 10.92 GiB total capacity; 522.15 MiB already allocated; 9.42 GiB free; 99.85 MiB cached) (malloc at /pytorch/c10/cuda/CUDACachingAllocator.cpp:267)
frame #0: std::function<std::string ()>::operator()() const + 0x11 (0x7facd45ea441 in /home/sid/software/conda3/envs/torch/lib/python3.6/site-packages/torch/lib/libc10.so)
frame #1: c10::Error::Error(c10::SourceLocation, std::string const&) + 0x2a (0x7facd45e9d7a in /home/sid/software/conda3/envs/torch/lib/python3.6/site-packages/torch/lib/libc10.so)
frame #2: + 0x15bc0 (0x7fac78874bc0 in /home/sid/software/conda3/envs/torch/lib/python3.6/site-packages/torch/lib/libc10_cuda.so)
frame #3: + 0x16247 (0x7fac78875247 in /home/sid/software/conda3/envs/torch/lib/python3.6/site-packages/torch/lib/libc10_cuda.so)
frame #4: at::native::empty_cuda(c10::ArrayRef, c10::TensorOptions const&) + 0x121 (0x7fac875d5761 in /home/sid/software/conda3/envs/torch/lib/python3.6/site-packages/torch/lib/libcaffe2_gpu.so)
frame #5: at::CUDAType::empty(c10::ArrayRef, c10::TensorOptions const&) const + 0x19b (0x7fac860e297b in /home/sid/software/conda3/envs/torch/lib/python3.6/site-packages/torch/lib/libcaffe2_gpu.so)
frame #6: torch::autograd::VariableType::empty(c10::ArrayRef, c10::TensorOptions const&) const + 0x284 (0x7fac7974b094 in /home/sid/software/conda3/envs/torch/lib/python3.6/site-packages/torch/lib/libtorch.so.1)
frame #7: at::native::zeros(c10::ArrayRef, c10::TensorOptions const&) + 0x40 (0x7fac7b1eba40 in /home/sid/software/conda3/envs/torch/lib/python3.6/site-packages/torch/lib/libcaffe2.so)
frame #8: at::TypeDefault::zeros(c10::ArrayRef, c10::TensorOptions const&) const + 0x49 (0x7fac7b48e319 in /home/sid/software/conda3/envs/torch/lib/python3.6/site-packages/torch/lib/libcaffe2.so)
frame #9: torch::autograd::VariableType::zeros(c10::ArrayRef, c10::TensorOptions const&) const + 0x213 (0x7fac79732603 in /home/sid/software/conda3/envs/torch/lib/python3.6/site-packages/torch/lib/libtorch.so.1)
frame #10: torch_asg::fast_asg_gpu_forward(at::Tensor&, at::Tensor&, at::Tensor&, at::Tensor&, at::Tensor&, long, long, long, long) + 0x4ef (0x7fac627d94ef in /home/sid/software/conda3/envs/torch/lib/python3.6/site-packages/torch_asg_native.cpython-36m-x86_64-linux-gnu.so)
frame #11: + 0x214e9 (0x7fac627d74e9 in /home/sid/software/conda3/envs/torch/lib/python3.6/site-packages/torch_asg_native.cpython-36m-x86_64-linux-gnu.so)
frame #12: + 0x1ddb1 (0x7fac627d3db1 in /home/sid/software/conda3/envs/torch/lib/python3.6/site-packages/torch_asg_native.cpython-36m-x86_64-linux-gnu.so)
frame #13: _PyCFunction_FastCallDict + 0x154 (0x55e57cc95744 in /home/sid/software/conda3/envs/torch/bin/python)
frame #14: + 0x19842c (0x55e57cd1c42c in /home/sid/software/conda3/envs/torch/bin/python)
frame #15: _PyEval_EvalFrameDefault + 0x30a (0x55e57cd4138a in /home/sid/software/conda3/envs/torch/bin/python)
frame #16: PyEval_EvalCodeEx + 0x329 (0x55e57cd17289 in /home/sid/software/conda3/envs/torch/bin/python)
frame #17: + 0x194094 (0x55e57cd18094 in /home/sid/software/conda3/envs/torch/bin/python)
frame #18: PyObject_Call + 0x3e (0x55e57cc9554e in /home/sid/software/conda3/envs/torch/bin/python)
frame #19: THPFunction_apply(_object*, _object*) + 0x6b1 (0x7facd4df1481 in /home/sid/software/conda3/envs/torch/lib/python3.6/site-packages/torch/lib/libtorch_python.so)
frame #20: _PyCFunction_FastCallDict + 0x91 (0x55e57cc95681 in /home/sid/software/conda3/envs/torch/bin/python)
frame #21: + 0x19842c (0x55e57cd1c42c in /home/sid/software/conda3/envs/torch/bin/python)
frame #22: _PyEval_EvalFrameDefault + 0x30a (0x55e57cd4138a in /home/sid/software/conda3/envs/torch/bin/python)
frame #23: + 0x1918e4 (0x55e57cd158e4 in /home/sid/software/conda3/envs/torch/bin/python)
frame #24: + 0x192771 (0x55e57cd16771 in /home/sid/software/conda3/envs/torch/bin/python)
frame #25: + 0x198505 (0x55e57cd1c505 in /home/sid/software/conda3/envs/torch/bin/python)
frame #26: _PyEval_EvalFrameDefault + 0x30a (0x55e57cd4138a in /home/sid/software/conda3/envs/torch/bin/python)
frame #27: + 0x1918e4 (0x55e57cd158e4 in /home/sid/software/conda3/envs/torch/bin/python)
frame #28: + 0x192771 (0x55e57cd16771 in /home/sid/software/conda3/envs/torch/bin/python)
frame #29: + 0x198505 (0x55e57cd1c505 in /home/sid/software/conda3/envs/torch/bin/python)
frame #30: _PyEval_EvalFrameDefault + 0x30a (0x55e57cd4138a in /home/sid/software/conda3/envs/torch/bin/python)
frame #31: PyEval_EvalCodeEx + 0x329 (0x55e57cd17289 in /home/sid/software/conda3/envs/torch/bin/python)
frame #32: PyEval_EvalCode + 0x1c (0x55e57cd1801c in /home/sid/software/conda3/envs/torch/bin/python)
frame #33: + 0x2163c4 (0x55e57cd9a3c4 in /home/sid/software/conda3/envs/torch/bin/python)
frame #34: PyRun_FileExFlags + 0xa1 (0x55e57cd9a7c1 in /home/sid/software/conda3/envs/torch/bin/python)
frame #35: PyRun_SimpleFileExFlags + 0x1c3 (0x55e57cd9a9c3 in /home/sid/software/conda3/envs/torch/bin/python)
frame #36: Py_Main + 0x613 (0x55e57cd9e4b3 in /home/sid/software/conda3/envs/torch/bin/python)
frame #37: main + 0xee (0x55e57cc6702e in /home/sid/software/conda3/envs/torch/bin/python)
frame #38: __libc_start_main + 0xe7 (0x7facd968ab97 in /lib/x86_64-linux-gnu/libc.so.6)
frame #39: + 0x1c3e0e (0x55e57cd47e0e in /home/sid/software/conda3/envs/torch/bin/python)