taichi
taichi copied to clipboard
CUDA Error on RTX 5090
Describe the bug
Got CUDA_ERROR_ILLEGAL_ADDRESS on an NVIDIA GeForce RTX 5090 (32 GB GPU memory). The same code works fine on the CPU or on an RTX 4090 (24 GB GPU memory).
To Reproduce
import taichi as ti
@ti.data_oriented
class ConstraintSolver:
    """Minimal batched solver state used to reproduce the reported CUDA error.

    Every field is allocated through ``_batch_shape``, which places the batch
    size ``_B`` in the leading dimension, followed by any per-entry dimensions.
    """

    def __init__(self):
        """Allocate all Taichi fields and zero the per-batch constraint count."""
        self._B = 4098  # batch size (leading dimension of every field)
        self.n_dofs = 9
        self.len_constraints_ = 5513  # allocated constraint slots per batch entry

        # Fields of shape (B, len_constraints_, n_dofs).
        self.jac = ti.field(
            dtype=ti.f32, shape=self._batch_shape((self.len_constraints_, self.n_dofs))
        )
        self.jac_relevant_dofs = ti.field(
            ti.i32, shape=self._batch_shape((self.len_constraints_, self.n_dofs))
        )

        # Fields of shape (B, len_constraints_).
        self.diag = ti.field(dtype=ti.f32, shape=self._batch_shape(self.len_constraints_))
        self.aref = ti.field(dtype=ti.f32, shape=self._batch_shape(self.len_constraints_))
        self.jac_n_relevant_dofs = ti.field(ti.i32, shape=self._batch_shape(self.len_constraints_))
        self.Jaref = ti.field(dtype=ti.f32, shape=self._batch_shape(self.len_constraints_))
        self.prev_active = ti.field(dtype=ti.i32, shape=self._batch_shape(self.len_constraints_))

        # Field of shape (B, n_dofs).
        self.qfrc_constraint = ti.field(dtype=ti.f32, shape=self._batch_shape(self.n_dofs))

        #######################################
        # Fields that the kernels below actually read or write.
        self.n_constraints = ti.field(ti.i32, shape=self._batch_shape())
        self.active = ti.field(dtype=ti.i32, shape=self._batch_shape(self.len_constraints_))
        self.cost_ws = ti.field(ti.f32, shape=self._batch_shape())
        self.gauss = ti.field(ti.f32, shape=self._batch_shape())
        #######################################

        # With every count at zero, the i_c loop in _func_init_solver has no
        # iterations.
        self.n_constraints.fill(0)

    def _batch_shape(self, shape=None):
        """Return a field shape with the batch size as the leading dimension.

        Args:
            shape: ``None`` for a per-batch scalar, a single dimension, or a
                list/tuple of dimensions.

        Returns:
            ``(B,)`` when ``shape`` is ``None``, ``(B, *shape)`` for a list or
            tuple, and ``(B, shape)`` for any other value.
        """
        B = self._B
        if shape is None:
            return (B,)
        if isinstance(shape, (list, tuple)):
            # Unpack instead of ``(B,) + shape``: concatenating a tuple with a
            # list raises TypeError even though lists are accepted here.
            return (B, *shape)
        return (B, shape)

    # Resets cost_ws and gauss for every batch entry and marks the first
    # n_constraints[i_b] constraint slots of each entry as active.
    # NOTE(review): the loop nesting was reconstructed from a paste that lost
    # its indentation; the i_c and i_d loops are assumed to be siblings.
    @ti.kernel
    def _func_init_solver(self):
        for i_b in range(self._B):
            self.cost_ws[i_b] = ti.f32(0.0)
            self.gauss[i_b] = ti.f32(0.0)
            for i_c in range(self.n_constraints[i_b]):
                # Batch index first, matching the (B, len_constraints_) shape
                # that `active` is allocated with in __init__.
                self.active[i_b, i_c] = 1
            for i_d in range(self.n_dofs):
                # Reads gauss and writes the same value back; kept as is
                # because it is part of the reproducer.
                k = self.gauss[i_b]
                self.gauss[i_b] = k

    # Empty per-batch loop; the traceback in this report is raised when this
    # kernel is launched.
    @ti.kernel
    def _func_solve(self):
        for i_b in range(self._B):
            pass
# Taichi must be initialised before the solver allocates its fields.
ti.init(arch=ti.gpu)

# Allocate all fields, then launch the two kernels back to back.
c = ConstraintSolver()
c._func_init_solver()
c._func_solve()  # the reported traceback points at this call
Log/Screenshots
[Taichi] version 1.8.0, llvm 15.0.4, commit e2b5e9d1, linux, python 3.12.2
[Taichi] Starting on arch=cuda
[E 05/23/25 13:41:36.335 1157121] [cuda_driver.h:operator()@92] CUDA Error CUDA_ERROR_ILLEGAL_ADDRESS: an illegal memory access was encountered while calling module_load_data_ex (cuModuleLoadDataEx)
Traceback (most recent call last):
File "/home/yiling/github/Genesis/debug/250523_memory/memory_5090.py", line 65, in <module>
c._func_solve()
File "/home/yiling/miniconda3/envs/gs312/lib/python3.12/site-packages/taichi/lang/kernel_impl.py", line 1174, in __call__
return self._primal(self._kernel_owner, *args, **kwargs)
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
File "/home/yiling/miniconda3/envs/gs312/lib/python3.12/site-packages/taichi/lang/kernel_impl.py", line 1045, in __call__
return self.launch_kernel(kernel_cpp, *args)
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
File "/home/yiling/miniconda3/envs/gs312/lib/python3.12/site-packages/taichi/lang/kernel_impl.py", line 976, in launch_kernel
raise e from None
File "/home/yiling/miniconda3/envs/gs312/lib/python3.12/site-packages/taichi/lang/kernel_impl.py", line 971, in launch_kernel
prog.launch_kernel(compiled_kernel_data, launch_ctx)
RuntimeError: [cuda_driver.h:operator()@92] CUDA Error CUDA_ERROR_ILLEGAL_ADDRESS: an illegal memory access was encountered while calling module_load_data_ex (cuModuleLoadDataEx)
[E 05/23/25 13:41:36.346 1157121] [cuda_driver.h:operator()@92] CUDA Error CUDA_ERROR_ILLEGAL_ADDRESS: an illegal memory access was encountered while calling stream_synchronize (cuStreamSynchronize)
terminate called after throwing an instance of 'std::__cxx11::basic_string<char, std::char_traits<char>, std::allocator<char> >'
Aborted (core dumped)
Additional comments
Ubuntu 24
Python 3.12
Driver Version: 575.51.03 CUDA Version: 12.9