cuRobo: "RuntimeError: CUDA error: invalid argument" while running the "Motion Generation" example

Hi, I encountered the error "RuntimeError: CUDA error: invalid argument" while running the "Motion Generation" example from the webpage https://curobo.org/get_started/2a_python_examples.html. I would appreciate your help.

1. Basic environment:
Ubuntu 22.04, NVIDIA driver 550.90.07, CUDA 11.8, cuDNN 8.9, Python 3.10.12, PyTorch 2.0.1+cu118, cuRobo 0.7.4.post1.dev0+dirty, g++ 11.4.0, gcc 11.4.0

I also tried torch 2.0.0 and 2.2; both failed with the same error.

The code I tried is the example provided on the "Using in Python" page.

# Third Party
import torch

# cuRobo
from curobo.types.math import Pose
from curobo.types.robot import JointState
from curobo.wrap.reacher.motion_gen import MotionGen, MotionGenConfig, MotionGenPlanConfig

world_config = {
    "mesh": {
        "base_scene": {
            "pose": [10.5, 0.080, 1.6, 0.043, -0.471, 0.284, 0.834],
            "file_path": "scene/nvblox/srl_ur10_bins.obj",
        },
    },
    "cuboid": {
        "table": {
            "dims": [5.0, 5.0, 0.2],  # x, y, z
            "pose": [0.0, 0.0, -0.1, 1, 0, 0, 0.0],  # x, y, z, qw, qx, qy, qz
        },
    },
}

motion_gen_config = MotionGenConfig.load_from_robot_config(
    "ur5e.yml",
    world_config,
    interpolation_dt=0.01,
)
motion_gen = MotionGen(motion_gen_config)
motion_gen.warmup()

retract_cfg = motion_gen.get_retract_config()

state = motion_gen.rollout_fn.compute_kinematics(
    JointState.from_position(retract_cfg.view(1, -1))
)

goal_pose = Pose.from_list([-0.4, 0.0, 0.4, 1.0, 0.0, 0.0, 0.0])  # x, y, z, qw, qx, qy, qz
start_state = JointState.from_position(
    torch.zeros(1, 6).cuda(),
    joint_names=[
        "shoulder_pan_joint",
        "shoulder_lift_joint",
        "elbow_joint",
        "wrist_1_joint",
        "wrist_2_joint",
        "wrist_3_joint",
    ],
)

result = motion_gen.plan_single(start_state, goal_pose, MotionGenPlanConfig(max_attempts=1))
traj = result.get_interpolated_plan()  # result.interpolation_dt has the dt between timesteps
print("Trajectory Generated: ", result.success)

Detailed Error Info:

{
	"name": "RuntimeError",
	"message": "CUDA error: invalid argument
CUDA kernel errors might be asynchronously reported at some other API call, so the stacktrace below might be incorrect.
For debugging consider passing CUDA_LAUNCH_BLOCKING=1.
Compile with `TORCH_USE_CUDA_DSA` to enable device-side assertions.
",
	"stack": "---------------------------------------------------------------------------
RuntimeError                              Traceback (most recent call last)
Cell In[7], line 30
     24 motion_gen_config = MotionGenConfig.load_from_robot_config(
     25     \"ur5e.yml\",
     26     world_config,
     27     interpolation_dt=0.01,
     28 )
     29 motion_gen = MotionGen(motion_gen_config)
---> 30 motion_gen.warmup()
     32 retract_cfg = motion_gen.get_retract_config()
     34 state = motion_gen.rollout_fn.compute_kinematics(
     35     JointState.from_position(retract_cfg.view(1, -1))
     36 )

File ~/桌面/lhy/code/curobo/src/curobo/wrap/reacher/motion_gen.py:1863, in MotionGen.warmup(self, enable_graph, batch, warmup_js_trajopt, batch_env_mode, parallel_finetune, n_goalset, warmup_joint_index, warmup_joint_delta)
   1861     goal_state.position[..., warmup_joint_index] += warmup_joint_delta
   1862     for _ in range(3):
-> 1863         self.plan_single_js(
   1864             start_state.clone(),
   1865             goal_state.clone(),
   1866             MotionGenPlanConfig(max_attempts=1, enable_finetune_trajopt=True),
   1867         )
   1869 if enable_graph:
   1870     start_state = JointState.from_position(
   1871         self.rollout_fn.dynamics_model.retract_config.view(1, -1).clone(),
   1872         joint_names=self.rollout_fn.joint_names,
   1873     )

File ~/桌面/lhy/code/curobo/src/curobo/wrap/reacher/motion_gen.py:2073, in MotionGen.plan_single_js(self, start_state, goal_state, plan_config)
   2070         return result
   2072 for n in range(plan_config.max_attempts):
-> 2073     result = self._plan_js_from_solve_state(
   2074         solve_state, start_state, goal_state, plan_config=plan_config
   2075     )
   2076     time_dict[\"trajopt_time\"] += result.trajopt_time
   2077     time_dict[\"graph_time\"] += result.graph_time

File ~/桌面/lhy/code/curobo/src/curobo/wrap/reacher/motion_gen.py:3734, in MotionGen._plan_js_from_solve_state(self, solve_state, start_state, goal_state, plan_config)
   3732 if self.optimize_dt:
   3733     self.finetune_js_trajopt_solver.update_solver_dt(scaled_dt.item())
-> 3734 traj_result = self._solve_trajopt_from_solve_state(
   3735     goal,
   3736     solve_state,
   3737     seed_traj,
   3738     trajopt_instance=self.finetune_js_trajopt_solver,
   3739     num_seeds_override=solve_state.num_trajopt_seeds,
   3740     newton_iters=newton_iters,
   3741     return_all_solutions=False,
   3742 )
   3744 finetune_time += traj_result.solve_time
   3745 if torch.count_nonzero(traj_result.success) > 0 or not self.optimize_dt:

File /usr/lib/python3.10/contextlib.py:79, in ContextDecorator.__call__.<locals>.inner(*args, **kwds)
     76 @wraps(func)
     77 def inner(*args, **kwds):
     78     with self._recreate_cm():
---> 79         return func(*args, **kwds)

File ~/桌面/lhy/code/curobo/src/curobo/wrap/reacher/motion_gen.py:2820, in MotionGen._solve_trajopt_from_solve_state(self, goal, solve_state, act_seed, use_nn_seed, return_all_solutions, seed_success, newton_iters, trajopt_instance, num_seeds_override)
   2818 if num_seeds_override is None:
   2819     num_seeds_override = solve_state.num_trajopt_seeds
-> 2820 traj_result = trajopt_instance.solve_any(
   2821     solve_state.solve_type,
   2822     goal,
   2823     act_seed,
   2824     use_nn_seed,
   2825     return_all_solutions,
   2826     num_seeds_override,
   2827     seed_success,
   2828     newton_iters=newton_iters,
   2829 )
   2830 return traj_result

File ~/桌面/lhy/code/curobo/src/curobo/wrap/reacher/trajopt.py:805, in TrajOptSolver.solve_any(self, solve_type, goal, seed_traj, use_nn_seed, return_all_solutions, num_seeds, seed_success, newton_iters)
    783 \"\"\"Solve trajectory optimization problem with any solve type.
    784 
    785 Args:
   (...)
    802     TrajOptResult: Result of the trajectory optimization.
    803 \"\"\"
    804 if solve_type == ReacherSolveType.SINGLE:
--> 805     return self.solve_single(
    806         goal,
    807         seed_traj,
    808         use_nn_seed,
    809         return_all_solutions,
    810         num_seeds,
    811         newton_iters=newton_iters,
    812     )
    813 elif solve_type == ReacherSolveType.GOALSET:
    814     return self.solve_goalset(
    815         goal,
    816         seed_traj,
   (...)
    820         newton_iters=newton_iters,
    821     )

File ~/桌面/lhy/code/curobo/src/curobo/wrap/reacher/trajopt.py:978, in TrajOptSolver.solve_single(self, goal, seed_traj, use_nn_seed, return_all_solutions, num_seeds, newton_iters)
    969     num_seeds = self.num_seeds
    970 solve_state = ReacherSolveState(
    971     ReacherSolveType.SINGLE,
    972     num_trajopt_seeds=num_seeds,
   (...)
    975     n_goalset=1,
    976 )
--> 978 return self._solve_from_solve_state(
    979     solve_state,
    980     goal,
    981     seed_traj,
    982     use_nn_seed,
    983     return_all_solutions,
    984     num_seeds,
    985     newton_iters=newton_iters,
    986 )

File ~/桌面/lhy/code/curobo/src/curobo/wrap/reacher/trajopt.py:918, in TrajOptSolver._solve_from_solve_state(self, solve_state, goal, seed_traj, use_nn_seed, return_all_solutions, num_seeds, seed_success, newton_iters)
    916     goal_buffer.goal_state = None
    917 self.solver.reset()
--> 918 result = self.solver.solve(goal_buffer, seed_traj)
    919 log_info(\"Ran TO\")
    920 traj_result = self._get_result(
    921     result,
    922     return_all_solutions,
   (...)
    926     solve_state.batch_mode,
    927 )

File ~/桌面/lhy/code/curobo/src/curobo/wrap/wrap_base.py:155, in WrapBase.solve(self, goal, seed)
    153     log_info(\"Solver was not initialized, warming up solver\")
    154     for _ in range(2):
--> 155         act_seq = self.optimize(seed, shift_steps=0)
    156     self._init_solver = True
    157 act_seq = self.optimize(seed, shift_steps=0)

File ~/桌面/lhy/code/curobo/src/curobo/wrap/wrap_base.py:77, in WrapBase.optimize(self, act_seq, shift_steps)
     75 def optimize(self, act_seq: torch.Tensor, shift_steps: int = 0) -> torch.Tensor:
     76     for opt in self.optimizers:
---> 77         act_seq = opt.optimize(act_seq, shift_steps)
     78     return act_seq

File ~/桌面/lhy/code/curobo/src/curobo/opt/opt_base.py:171, in Optimizer.optimize(self, opt_tensor, shift_steps, n_iters)
    169     self.COLD_START = False
    170 st_time = time.time()
--> 171 out = self._optimize(opt_tensor, shift_steps, n_iters)
    172 if self.sync_cuda_time:
    173     torch.cuda.synchronize(device=self.tensor_args.device)

File ~/桌面/lhy/code/curobo/src/curobo/opt/newton/newton_base.py:144, in NewtonOptBase._optimize(self, q, shift_steps, n_iters)
    142 # run opt graph
    143 if not self.cu_opt_init:
--> 144     self._initialize_opt_iters_graph(q, grad_q, shift_steps=shift_steps)
    145 for i in range(self.outer_iters):
    146     best_q, best_cost, q, grad_q = self._call_opt_iters_graph(q, grad_q)

File ~/桌面/lhy/code/curobo/src/curobo/opt/newton/newton_base.py:499, in NewtonOptBase._initialize_opt_iters_graph(self, q, grad_q, shift_steps)
    497 def _initialize_opt_iters_graph(self, q, grad_q, shift_steps):
    498     if self.use_cuda_graph:
--> 499         self._create_opt_iters_graph(q, grad_q, shift_steps)
    500     self.cu_opt_init = True

File ~/桌面/lhy/code/curobo/src/curobo/opt/newton/newton_base.py:541, in NewtonOptBase._create_opt_iters_graph(self, q, grad_q, shift_steps)
    539 with torch.cuda.stream(s):
    540     for _ in range(3):
--> 541         self._cu_opt_q, self._cu_opt_cost, self._cu_q, self._cu_gq = self._opt_iters(
    542             self._cu_opt_q_in, self._cu_opt_gq_in, shift_steps
    543         )
    544 torch.cuda.current_stream(device=self.tensor_args.device).wait_stream(s)
    545 self.reset()

File ~/桌面/lhy/code/curobo/src/curobo/opt/newton/newton_base.py:173, in NewtonOptBase._opt_iters(self, q, grad_q, shift_steps)
    171 for _ in range(self.inner_iters):
    172     self.i += 1
--> 173     cost_n, q, grad_q = self._opt_step(q.detach(), grad_q.detach())
    174 if self.store_debug:
    175     self.debug.append(self.best_q.view(-1, self.action_horizon, self.d_action).clone())

File ~/桌面/lhy/code/curobo/src/curobo/opt/newton/newton_base.py:184, in NewtonOptBase._opt_step(self, q, grad_q)
    182     q_n, cost_n, grad_q_n = self._approx_line_search(q, grad_q)
    183 with profiler.record_function(\"newton/step_direction\"):
--> 184     grad_q = self._get_step_direction(cost_n, q_n, grad_q_n)
    185 with profiler.record_function(\"newton/update_best\"):
    186     self._update_best(q_n, grad_q_n, cost_n)

File ~/.local/lib/python3.10/site-packages/torch/utils/_contextlib.py:115, in context_decorator.<locals>.decorate_context(*args, **kwargs)
    112 @functools.wraps(func)
    113 def decorate_context(*args, **kwargs):
    114     with ctx_factory():
--> 115         return func(*args, **kwargs)

File ~/桌面/lhy/code/curobo/src/curobo/opt/newton/lbfgs.py:167, in LBFGSOpt._get_step_direction(self, cost, q, grad_q)
    165 if self.use_cuda_kernel:
    166     with profiler.record_function(\"lbfgs/fused\"):
--> 167         dq = LBFGScu.apply(
    168             self.step_q_buffer,
    169             self.rho_buffer,
    170             self.y_buffer,
    171             self.s_buffer,
    172             q,
    173             grad_q,
    174             self.x_0,
    175             self.grad_0,
    176             self.epsilon,
    177             self.stable_mode,
    178             self.use_shared_buffers_kernel,
    179         )
    181 else:
    183     self._update_buffers(q, grad_q)

File ~/.local/lib/python3.10/site-packages/torch/autograd/function.py:506, in Function.apply(cls, *args, **kwargs)
    503 if not torch._C._are_functorch_transforms_active():
    504     # See NOTE: [functorch vjp and autograd interaction]
    505     args = _functorch.utils.unwrap_dead_wrappers(args)
--> 506     return super().apply(*args, **kwargs)  # type: ignore[misc]
    508 if cls.setup_context == _SingleLevelFunction.setup_context:
    509     raise RuntimeError(
    510         'In order to use an autograd.Function with functorch transforms '
    511         '(vmap, grad, jvp, jacrev, ...), it must override the setup_context '
    512         'staticmethod. For more details, please see '
    513         'https://pytorch.org/docs/master/notes/extending.func.html')

File ~/桌面/lhy/code/curobo/src/curobo/curobolib/opt.py:58, in LBFGScu.forward(ctx, step_vec, rho_buffer, y_buffer, s_buffer, q, grad_q, x_0, grad_0, epsilon, stable_mode, use_shared_buffers)
     41 @staticmethod
     42 def forward(
     43     ctx,
   (...)
     54     use_shared_buffers=True,
     55 ):
     56     m, b, v_dim, _ = y_buffer.shape
---> 58     R = lbfgs_step_cu.forward(
     59         step_vec,  # .view(-1),
     60         rho_buffer,  # .view(-1),
     61         y_buffer,  # .view(-1),
     62         s_buffer,  # .view(-1),
     63         q,
     64         grad_q,  # .view(-1),
     65         x_0,
     66         grad_0,
     67         epsilon,
     68         b,
     69         m,
     70         v_dim,
     71         stable_mode,
     72         use_shared_buffers,
     73     )
     74     step_v = R[0].view(step_vec.shape)
     76     # ctx.save_for_backward(batch_spheres, robot_spheres, link_mats, link_sphere_map)

RuntimeError: CUDA error: invalid argument
CUDA kernel errors might be asynchronously reported at some other API call, so the stacktrace below might be incorrect.
For debugging consider passing CUDA_LAUNCH_BLOCKING=1.
Compile with `TORCH_USE_CUDA_DSA` to enable device-side assertions.
"
}

Jul 31 '24 10:07 linhy97

Did you install git lfs? My guess is that the mesh file was not cloned. Try `sudo apt install git-lfs` followed by `cd curobo && git lfs pull`.

If that still doesn't work, does the below example work?

# Third Party
import torch

# cuRobo
from curobo.types.math import Pose
from curobo.types.robot import JointState
from curobo.wrap.reacher.motion_gen import MotionGen, MotionGenConfig, MotionGenPlanConfig

world_config = {
    "cuboid": {
        "table": {
            "dims": [5.0, 5.0, 0.2],  # x, y, z
            "pose": [0.0, 0.0, -0.1, 1, 0, 0, 0.0],  # x, y, z, qw, qx, qy, qz
        },
    },
}

motion_gen_config = MotionGenConfig.load_from_robot_config(
    "ur5e.yml",
    world_config,
    interpolation_dt=0.01,
)
motion_gen = MotionGen(motion_gen_config)
motion_gen.warmup()

retract_cfg = motion_gen.get_retract_config()

state = motion_gen.rollout_fn.compute_kinematics(
    JointState.from_position(retract_cfg.view(1, -1))
)

goal_pose = Pose.from_list([-0.4, 0.0, 0.4, 1.0, 0.0, 0.0, 0.0])  # x, y, z, qw, qx, qy, qz
start_state = JointState.from_position(
    torch.zeros(1, 6).cuda(),
    joint_names=[
        "shoulder_pan_joint",
        "shoulder_lift_joint",
        "elbow_joint",
        "wrist_1_joint",
        "wrist_2_joint",
        "wrist_3_joint",
    ],
)

result = motion_gen.plan_single(start_state, goal_pose, MotionGenPlanConfig(max_attempts=1))
traj = result.get_interpolated_plan()  # result.interpolation_dt has the dt between timesteps
print("Trajectory Generated: ", result.success)

Jul 31 '24 21:07 balakumar-s

I checked git-lfs and it was installed. I then tried the code you supplied; however, it also doesn't work. The error is the same and occurs while executing motion_gen.warmup(). It does not seem to be caused by "srl_ur10_bins.obj" (87.2MB).

Detailed Error Info:

{
	"name": "RuntimeError",
	"message": "CUDA error: invalid argument
CUDA kernel errors might be asynchronously reported at some other API call, so the stacktrace below might be incorrect.
For debugging consider passing CUDA_LAUNCH_BLOCKING=1.
Compile with `TORCH_USE_CUDA_DSA` to enable device-side assertions.
",
	"stack": "---------------------------------------------------------------------------
RuntimeError                              Traceback (most recent call last)
Cell In[1], line 24
     18 motion_gen_config = MotionGenConfig.load_from_robot_config(
     19     \"ur5e.yml\",
     20     world_config,
     21     interpolation_dt=0.01,
     22 )
     23 motion_gen = MotionGen(motion_gen_config)
---> 24 motion_gen.warmup()
     26 retract_cfg = motion_gen.get_retract_config()
     28 state = motion_gen.rollout_fn.compute_kinematics(
     29     JointState.from_position(retract_cfg.view(1, -1))
     30 )

File ~/桌面/lhy/code/curobo/src/curobo/wrap/reacher/motion_gen.py:1863, in MotionGen.warmup(self, enable_graph, batch, warmup_js_trajopt, batch_env_mode, parallel_finetune, n_goalset, warmup_joint_index, warmup_joint_delta)
   1861     goal_state.position[..., warmup_joint_index] += warmup_joint_delta
   1862     for _ in range(3):
-> 1863         self.plan_single_js(
   1864             start_state.clone(),
   1865             goal_state.clone(),
   1866             MotionGenPlanConfig(max_attempts=1, enable_finetune_trajopt=True),
   1867         )
   1869 if enable_graph:
   1870     start_state = JointState.from_position(
   1871         self.rollout_fn.dynamics_model.retract_config.view(1, -1).clone(),
   1872         joint_names=self.rollout_fn.joint_names,
   1873     )

File ~/桌面/lhy/code/curobo/src/curobo/wrap/reacher/motion_gen.py:2073, in MotionGen.plan_single_js(self, start_state, goal_state, plan_config)
   2070         return result
   2072 for n in range(plan_config.max_attempts):
-> 2073     result = self._plan_js_from_solve_state(
   2074         solve_state, start_state, goal_state, plan_config=plan_config
   2075     )
   2076     time_dict[\"trajopt_time\"] += result.trajopt_time
   2077     time_dict[\"graph_time\"] += result.graph_time

File ~/桌面/lhy/code/curobo/src/curobo/wrap/reacher/motion_gen.py:3734, in MotionGen._plan_js_from_solve_state(self, solve_state, start_state, goal_state, plan_config)
   3732 if self.optimize_dt:
   3733     self.finetune_js_trajopt_solver.update_solver_dt(scaled_dt.item())
-> 3734 traj_result = self._solve_trajopt_from_solve_state(
   3735     goal,
   3736     solve_state,
   3737     seed_traj,
   3738     trajopt_instance=self.finetune_js_trajopt_solver,
   3739     num_seeds_override=solve_state.num_trajopt_seeds,
   3740     newton_iters=newton_iters,
   3741     return_all_solutions=False,
   3742 )
   3744 finetune_time += traj_result.solve_time
   3745 if torch.count_nonzero(traj_result.success) > 0 or not self.optimize_dt:

File /usr/lib/python3.10/contextlib.py:79, in ContextDecorator.__call__.<locals>.inner(*args, **kwds)
     76 @wraps(func)
     77 def inner(*args, **kwds):
     78     with self._recreate_cm():
---> 79         return func(*args, **kwds)

File ~/桌面/lhy/code/curobo/src/curobo/wrap/reacher/motion_gen.py:2820, in MotionGen._solve_trajopt_from_solve_state(self, goal, solve_state, act_seed, use_nn_seed, return_all_solutions, seed_success, newton_iters, trajopt_instance, num_seeds_override)
   2818 if num_seeds_override is None:
   2819     num_seeds_override = solve_state.num_trajopt_seeds
-> 2820 traj_result = trajopt_instance.solve_any(
   2821     solve_state.solve_type,
   2822     goal,
   2823     act_seed,
   2824     use_nn_seed,
   2825     return_all_solutions,
   2826     num_seeds_override,
   2827     seed_success,
   2828     newton_iters=newton_iters,
   2829 )
   2830 return traj_result

File ~/桌面/lhy/code/curobo/src/curobo/wrap/reacher/trajopt.py:805, in TrajOptSolver.solve_any(self, solve_type, goal, seed_traj, use_nn_seed, return_all_solutions, num_seeds, seed_success, newton_iters)
    783 \"\"\"Solve trajectory optimization problem with any solve type.
    784 
    785 Args:
   (...)
    802     TrajOptResult: Result of the trajectory optimization.
    803 \"\"\"
    804 if solve_type == ReacherSolveType.SINGLE:
--> 805     return self.solve_single(
    806         goal,
    807         seed_traj,
    808         use_nn_seed,
    809         return_all_solutions,
    810         num_seeds,
    811         newton_iters=newton_iters,
    812     )
    813 elif solve_type == ReacherSolveType.GOALSET:
    814     return self.solve_goalset(
    815         goal,
    816         seed_traj,
   (...)
    820         newton_iters=newton_iters,
    821     )

File ~/桌面/lhy/code/curobo/src/curobo/wrap/reacher/trajopt.py:978, in TrajOptSolver.solve_single(self, goal, seed_traj, use_nn_seed, return_all_solutions, num_seeds, newton_iters)
    969     num_seeds = self.num_seeds
    970 solve_state = ReacherSolveState(
    971     ReacherSolveType.SINGLE,
    972     num_trajopt_seeds=num_seeds,
   (...)
    975     n_goalset=1,
    976 )
--> 978 return self._solve_from_solve_state(
    979     solve_state,
    980     goal,
    981     seed_traj,
    982     use_nn_seed,
    983     return_all_solutions,
    984     num_seeds,
    985     newton_iters=newton_iters,
    986 )

File ~/桌面/lhy/code/curobo/src/curobo/wrap/reacher/trajopt.py:918, in TrajOptSolver._solve_from_solve_state(self, solve_state, goal, seed_traj, use_nn_seed, return_all_solutions, num_seeds, seed_success, newton_iters)
    916     goal_buffer.goal_state = None
    917 self.solver.reset()
--> 918 result = self.solver.solve(goal_buffer, seed_traj)
    919 log_info(\"Ran TO\")
    920 traj_result = self._get_result(
    921     result,
    922     return_all_solutions,
   (...)
    926     solve_state.batch_mode,
    927 )

File ~/桌面/lhy/code/curobo/src/curobo/wrap/wrap_base.py:155, in WrapBase.solve(self, goal, seed)
    153     log_info(\"Solver was not initialized, warming up solver\")
    154     for _ in range(2):
--> 155         act_seq = self.optimize(seed, shift_steps=0)
    156     self._init_solver = True
    157 act_seq = self.optimize(seed, shift_steps=0)

File ~/桌面/lhy/code/curobo/src/curobo/wrap/wrap_base.py:77, in WrapBase.optimize(self, act_seq, shift_steps)
     75 def optimize(self, act_seq: torch.Tensor, shift_steps: int = 0) -> torch.Tensor:
     76     for opt in self.optimizers:
---> 77         act_seq = opt.optimize(act_seq, shift_steps)
     78     return act_seq

File ~/桌面/lhy/code/curobo/src/curobo/opt/opt_base.py:171, in Optimizer.optimize(self, opt_tensor, shift_steps, n_iters)
    169     self.COLD_START = False
    170 st_time = time.time()
--> 171 out = self._optimize(opt_tensor, shift_steps, n_iters)
    172 if self.sync_cuda_time:
    173     torch.cuda.synchronize(device=self.tensor_args.device)

File ~/桌面/lhy/code/curobo/src/curobo/opt/newton/newton_base.py:144, in NewtonOptBase._optimize(self, q, shift_steps, n_iters)
    142 # run opt graph
    143 if not self.cu_opt_init:
--> 144     self._initialize_opt_iters_graph(q, grad_q, shift_steps=shift_steps)
    145 for i in range(self.outer_iters):
    146     best_q, best_cost, q, grad_q = self._call_opt_iters_graph(q, grad_q)

File ~/桌面/lhy/code/curobo/src/curobo/opt/newton/newton_base.py:499, in NewtonOptBase._initialize_opt_iters_graph(self, q, grad_q, shift_steps)
    497 def _initialize_opt_iters_graph(self, q, grad_q, shift_steps):
    498     if self.use_cuda_graph:
--> 499         self._create_opt_iters_graph(q, grad_q, shift_steps)
    500     self.cu_opt_init = True

File ~/桌面/lhy/code/curobo/src/curobo/opt/newton/newton_base.py:541, in NewtonOptBase._create_opt_iters_graph(self, q, grad_q, shift_steps)
    539 with torch.cuda.stream(s):
    540     for _ in range(3):
--> 541         self._cu_opt_q, self._cu_opt_cost, self._cu_q, self._cu_gq = self._opt_iters(
    542             self._cu_opt_q_in, self._cu_opt_gq_in, shift_steps
    543         )
    544 torch.cuda.current_stream(device=self.tensor_args.device).wait_stream(s)
    545 self.reset()

File ~/桌面/lhy/code/curobo/src/curobo/opt/newton/newton_base.py:173, in NewtonOptBase._opt_iters(self, q, grad_q, shift_steps)
    171 for _ in range(self.inner_iters):
    172     self.i += 1
--> 173     cost_n, q, grad_q = self._opt_step(q.detach(), grad_q.detach())
    174 if self.store_debug:
    175     self.debug.append(self.best_q.view(-1, self.action_horizon, self.d_action).clone())

File ~/桌面/lhy/code/curobo/src/curobo/opt/newton/newton_base.py:184, in NewtonOptBase._opt_step(self, q, grad_q)
    182     q_n, cost_n, grad_q_n = self._approx_line_search(q, grad_q)
    183 with profiler.record_function(\"newton/step_direction\"):
--> 184     grad_q = self._get_step_direction(cost_n, q_n, grad_q_n)
    185 with profiler.record_function(\"newton/update_best\"):
    186     self._update_best(q_n, grad_q_n, cost_n)

File ~/.local/lib/python3.10/site-packages/torch/utils/_contextlib.py:115, in context_decorator.<locals>.decorate_context(*args, **kwargs)
    112 @functools.wraps(func)
    113 def decorate_context(*args, **kwargs):
    114     with ctx_factory():
--> 115         return func(*args, **kwargs)

File ~/桌面/lhy/code/curobo/src/curobo/opt/newton/lbfgs.py:167, in LBFGSOpt._get_step_direction(self, cost, q, grad_q)
    165 if self.use_cuda_kernel:
    166     with profiler.record_function(\"lbfgs/fused\"):
--> 167         dq = LBFGScu.apply(
    168             self.step_q_buffer,
    169             self.rho_buffer,
    170             self.y_buffer,
    171             self.s_buffer,
    172             q,
    173             grad_q,
    174             self.x_0,
    175             self.grad_0,
    176             self.epsilon,
    177             self.stable_mode,
    178             self.use_shared_buffers_kernel,
    179         )
    181 else:
    183     self._update_buffers(q, grad_q)

File ~/.local/lib/python3.10/site-packages/torch/autograd/function.py:506, in Function.apply(cls, *args, **kwargs)
    503 if not torch._C._are_functorch_transforms_active():
    504     # See NOTE: [functorch vjp and autograd interaction]
    505     args = _functorch.utils.unwrap_dead_wrappers(args)
--> 506     return super().apply(*args, **kwargs)  # type: ignore[misc]
    508 if cls.setup_context == _SingleLevelFunction.setup_context:
    509     raise RuntimeError(
    510         'In order to use an autograd.Function with functorch transforms '
    511         '(vmap, grad, jvp, jacrev, ...), it must override the setup_context '
    512         'staticmethod. For more details, please see '
    513         'https://pytorch.org/docs/master/notes/extending.func.html')

File ~/桌面/lhy/code/curobo/src/curobo/curobolib/opt.py:58, in LBFGScu.forward(ctx, step_vec, rho_buffer, y_buffer, s_buffer, q, grad_q, x_0, grad_0, epsilon, stable_mode, use_shared_buffers)
     41 @staticmethod
     42 def forward(
     43     ctx,
   (...)
     54     use_shared_buffers=True,
     55 ):
     56     m, b, v_dim, _ = y_buffer.shape
---> 58     R = lbfgs_step_cu.forward(
     59         step_vec,  # .view(-1),
     60         rho_buffer,  # .view(-1),
     61         y_buffer,  # .view(-1),
     62         s_buffer,  # .view(-1),
     63         q,
     64         grad_q,  # .view(-1),
     65         x_0,
     66         grad_0,
     67         epsilon,
     68         b,
     69         m,
     70         v_dim,
     71         stable_mode,
     72         use_shared_buffers,
     73     )
     74     step_v = R[0].view(step_vec.shape)
     76     # ctx.save_for_backward(batch_spheres, robot_spheres, link_mats, link_sphere_map)

RuntimeError: CUDA error: invalid argument
CUDA kernel errors might be asynchronously reported at some other API call, so the stacktrace below might be incorrect.
For debugging consider passing CUDA_LAUNCH_BLOCKING=1.
Compile with `TORCH_USE_CUDA_DSA` to enable device-side assertions.
"
}

Aug 01 '24 02:08 linhy97

Is it possible for you to run this inside a Docker container? That would help us reproduce the issue on our end.

Aug 09 '24 06:08 balakumar-s

I encountered the same issue on a cluster. My workaround was to manually set LBFGSOpt.use_cuda_kernel to False, regardless of the conditions in the code:

https://github.com/NVlabs/curobo/blob/2fbffc35225398cf9d5f382804faa9de2608753b/src/curobo/opt/newton/lbfgs.py#L116

In theory, it is compiled using nvcc 11.8.0 and gcc 11.3.0. When I check with nvidia-smi, the system reports:

+-----------------------------------------------------------------------------------------+
| NVIDIA-SMI 560.35.03              Driver Version: 560.35.03      CUDA Version: 12.6     |
|-----------------------------------------+------------------------+----------------------+
| GPU  Name                 Persistence-M | Bus-Id          Disp.A | Volatile Uncorr. ECC |
| Fan  Temp   Perf          Pwr:Usage/Cap |           Memory-Usage | GPU-Util  Compute M. |
|                                         |                        |               MIG M. |
|=========================================+========================+======================|
|   0  Tesla V100-PCIE-32GB           Off |   00000000:3B:00.0 Off |                    0 |
| N/A   28C    P0             36W /  250W |       1MiB /  32768MiB |      1%      Default |
|                                         |                        |                  N/A |
+-----------------------------------------+------------------------+----------------------+
                                                                                         
+-----------------------------------------------------------------------------------------+
| Processes:                                                                              |
|  GPU   GI   CI        PID   Type   Process name                              GPU Memory |
|        ID   ID                                                               Usage      |
|=========================================================================================|
|  No running processes found                                                             |
+-----------------------------------------------------------------------------------------+

However, since this is a cluster setup, there are additional details that I don’t fully understand.

Feb 13 '25 07:02 hesic73

We improved error handling for the CUDA kernels, which we hope resolves this issue. I am closing for now as we don't have a way to reproduce. Please re-open if the issue persists and we can figure out a setup to reproduce.

Apr 25 '25 19:04 balakumar-s