taichi icon indicating copy to clipboard operation
taichi copied to clipboard

CUDA Error on RTX 5090

Open YilingQiao opened this issue 7 months ago • 0 comments

Describe the bug

Got CUDA_ERROR_ILLEGAL_ADDRESS on an NVIDIA GeForce RTX 5090 (32 GB GPU memory). The same code works fine on the CPU and on an RTX 4090 (24 GB GPU memory).

To Reproduce

import taichi as ti

@ti.data_oriented
class ConstraintSolver:
    """Batched constraint-solver skeleton holding a set of Taichi fields.

    Every field is allocated batch-first: axis 0 has ``self._B`` entries and
    any remaining axes come from the shape handed to ``_batch_shape``.
    """

    def __init__(self):
        """Allocate all fields and zero the per-batch constraint counter."""
        self._B = 4098  # extent of the leading (batch) axis of every field
        self.n_dofs = 9
        self.len_constraints_ = 5513  # extent of the constraint axis

        # Shapes shared by several fields below. The allocation order of the
        # fields themselves is deliberately left exactly as it was.
        batch_only = self._batch_shape()
        per_constraint = self._batch_shape(self.len_constraints_)
        per_dof = self._batch_shape(self.n_dofs)
        per_constraint_dof = self._batch_shape(
            (self.len_constraints_, self.n_dofs)
        )

        self.jac = ti.field(dtype=ti.f32, shape=per_constraint_dof)
        self.diag = ti.field(dtype=ti.f32, shape=per_constraint)
        self.aref = ti.field(dtype=ti.f32, shape=per_constraint)
        self.jac_relevant_dofs = ti.field(dtype=ti.i32, shape=per_constraint_dof)
        self.jac_n_relevant_dofs = ti.field(dtype=ti.i32, shape=per_constraint)
        self.Jaref = ti.field(dtype=ti.f32, shape=per_constraint)

        self.prev_active = ti.field(dtype=ti.i32, shape=per_constraint)
        self.qfrc_constraint = ti.field(dtype=ti.f32, shape=per_dof)

        # The four fields below are the ones the kernels actually touch.
        self.n_constraints = ti.field(dtype=ti.i32, shape=batch_only)
        self.active = ti.field(dtype=ti.i32, shape=per_constraint)
        self.cost_ws = ti.field(dtype=ti.f32, shape=batch_only)
        self.gauss = ti.field(dtype=ti.f32, shape=batch_only)

        self.n_constraints.fill(0)

    def _batch_shape(self, shape=None):
        """Return ``shape`` with the batch extent ``self._B`` prepended.

        Args:
            shape: ``None`` for a batch-only shape, a list/tuple of trailing
                extents, or a single trailing extent.

        Returns:
            A tuple whose first entry is ``self._B``.
        """
        B = self._B
        if shape is None:
            return (B,)
        if isinstance(shape, (list, tuple)):
            # Convert first: concatenating a tuple with a list raises
            # TypeError, so a list argument used to crash here.
            return (B,) + tuple(shape)
        return (B, shape)

    @ti.kernel
    def _func_init_solver(self):
        # For every batch entry: zero cost_ws and gauss, then flag the first
        # n_constraints[i_b] constraints of that entry as active.
        for i_b in range(self._B):
            self.cost_ws[i_b] = ti.f32(0.0)
            self.gauss[i_b] = ti.f32(0.0)

            for i_c in range(self.n_constraints[i_b]):
                # Batch index goes first to match the (B, len_constraints_)
                # shape `active` was allocated with; the previous
                # [i_c, i_b] order was transposed relative to that shape.
                self.active[i_b, i_c] = 1

            for i_d in range(self.n_dofs):
                # Reads gauss[i_b] and stores the same value back.
                k = self.gauss[i_b]
                self.gauss[i_b] = k

    @ti.kernel
    def _func_solve(self):
        # Empty loop over the batch axis; the kernel performs no field access.
        for i_b in range(self._B):
            pass

# Ask Taichi for a GPU backend (the log below shows it starting on arch=cuda).
ti.init(arch=ti.gpu)

# Fields are allocated inside the constructor, so it has to run after ti.init().
c = ConstraintSolver()
c._func_init_solver()
# The traceback below is raised from this second kernel launch.
c._func_solve()

Log/Screenshots

[Taichi] version 1.8.0, llvm 15.0.4, commit e2b5e9d1, linux, python 3.12.2                 
[Taichi] Starting on arch=cuda                                                                                            
[E 05/23/25 13:41:36.335 1157121] [cuda_driver.h:operator()@92] CUDA Error CUDA_ERROR_ILLEGAL_ADDRESS: an illegal memory a
ccess was encountered while calling module_load_data_ex (cuModuleLoadDataEx)               
                                                                                                                          
                                                                                                                          
Traceback (most recent call last):                                                                                        
  File "/home/yiling/github/Genesis/debug/250523_memory/memory_5090.py", line 65, in <module>
    c._func_solve()                                                                                                       
  File "/home/yiling/miniconda3/envs/gs312/lib/python3.12/site-packages/taichi/lang/kernel_impl.py", line 1174, in __call_
_                                                                                                                         
    return self._primal(self._kernel_owner, *args, **kwargs)                                                              
           ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^                                                              
  File "/home/yiling/miniconda3/envs/gs312/lib/python3.12/site-packages/taichi/lang/kernel_impl.py", line 1045, in __call_
_                                                                                                                         
    return self.launch_kernel(kernel_cpp, *args)                                                                          
           ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^                                                                          
  File "/home/yiling/miniconda3/envs/gs312/lib/python3.12/site-packages/taichi/lang/kernel_impl.py", line 976, in launch_k
ernel                                                                                                                     
    raise e from None                                                                                                     
  File "/home/yiling/miniconda3/envs/gs312/lib/python3.12/site-packages/taichi/lang/kernel_impl.py", line 971, in launch_k
ernel                                                                                                                     
    prog.launch_kernel(compiled_kernel_data, launch_ctx)                                                                  
RuntimeError: [cuda_driver.h:operator()@92] CUDA Error CUDA_ERROR_ILLEGAL_ADDRESS: an illegal memory access was encountere
d while calling module_load_data_ex (cuModuleLoadDataEx)                                                                  
[E 05/23/25 13:41:36.346 1157121] [cuda_driver.h:operator()@92] CUDA Error CUDA_ERROR_ILLEGAL_ADDRESS: an illegal memory a
ccess was encountered while calling stream_synchronize (cuStreamSynchronize)


terminate called after throwing an instance of 'std::__cxx11::basic_string<char, std::char_traits<char>, std::allocator<ch
ar> >'
Aborted (core dumped)

Additional comments

Ubuntu 24
Python 3.12
Driver Version: 575.51.03      CUDA Version: 12.9

YilingQiao avatar May 23 '25 20:05 YilingQiao