Encountered "RuntimeError: CUDA error: invalid argument" while trying the "Motion Generation" example
Hi, I encountered the error "RuntimeError: CUDA error: invalid argument" while trying the "Motion Generation" example from the webpage https://curobo.org/get_started/2a_python_examples.html. I hope you can help.
1. Basic environment:
Ubuntu 22.04,
NVIDIA_DRIVER 550.90.07
CUDA 11.8, cuDNN 8.9,
python=3.10.12
pytorch=2.0.1+cu118,
curobo=0.7.4.post1.dev0+dirty
g++ 11.4.0
gcc 11.4.0
I also tried torch 2.0.0 and 2.2; both failed with the same error.
- The code I tried is the example supplied on the "Using in Python" page.
# Third Party
import torch
# cuRobo
from curobo.types.math import Pose
from curobo.types.robot import JointState
from curobo.wrap.reacher.motion_gen import MotionGen, MotionGenConfig, MotionGenPlanConfig
# Collision world: one mesh obstacle plus a cuboid table.
# Every "pose" is [x, y, z, qw, qx, qy, qz]; the cuboid "dims" are [x, y, z].
bins_mesh = {
    "pose": [10.5, 0.080, 1.6, 0.043, -0.471, 0.284, 0.834],
    "file_path": "scene/nvblox/srl_ur10_bins.obj",
}
table_cuboid = {
    "dims": [5.0, 5.0, 0.2],
    "pose": [0.0, 0.0, -0.1, 1, 0, 0, 0.0],
}
world_config = {
    "mesh": {"base_scene": bins_mesh},
    "cuboid": {"table": table_cuboid},
}

# Build the planner from the "ur5e.yml" robot config and the world above,
# then warm it up before the first planning query.
motion_gen_config = MotionGenConfig.load_from_robot_config(
    "ur5e.yml", world_config, interpolation_dt=0.01
)
motion_gen = MotionGen(motion_gen_config)
motion_gen.warmup()

# Run kinematics on the retract configuration, reshaped to a single row.
retract_cfg = motion_gen.get_retract_config()
retract_joint_state = JointState.from_position(retract_cfg.view(1, -1))
state = motion_gen.rollout_fn.compute_kinematics(retract_joint_state)

# Goal pose given as [x, y, z, qw, qx, qy, qz].
goal_pose = Pose.from_list([-0.4, 0.0, 0.4, 1.0, 0.0, 0.0, 0.0])

# Start state: a (1, 6) tensor of zeros moved to the GPU, one value per named joint.
ur5e_joint_names = [
    "shoulder_pan_joint",
    "shoulder_lift_joint",
    "elbow_joint",
    "wrist_1_joint",
    "wrist_2_joint",
    "wrist_3_joint",
]
start_state = JointState.from_position(
    torch.zeros(1, 6).cuda(),
    joint_names=ur5e_joint_names,
)

# Plan with a single attempt and report whether it succeeded.
single_attempt = MotionGenPlanConfig(max_attempts=1)
result = motion_gen.plan_single(start_state, goal_pose, single_attempt)
traj = result.get_interpolated_plan()  # result.interpolation_dt has the dt between timesteps
print("Trajectory Generated: ", result.success)
- Detailed Error Info:
{
"name": "RuntimeError",
"message": "CUDA error: invalid argument
CUDA kernel errors might be asynchronously reported at some other API call, so the stacktrace below might be incorrect.
For debugging consider passing CUDA_LAUNCH_BLOCKING=1.
Compile with `TORCH_USE_CUDA_DSA` to enable device-side assertions.
",
"stack": "---------------------------------------------------------------------------
RuntimeError Traceback (most recent call last)
Cell In[7], line 30
24 motion_gen_config = MotionGenConfig.load_from_robot_config(
25 \"ur5e.yml\",
26 world_config,
27 interpolation_dt=0.01,
28 )
29 motion_gen = MotionGen(motion_gen_config)
---> 30 motion_gen.warmup()
32 retract_cfg = motion_gen.get_retract_config()
34 state = motion_gen.rollout_fn.compute_kinematics(
35 JointState.from_position(retract_cfg.view(1, -1))
36 )
File ~/桌面/lhy/code/curobo/src/curobo/wrap/reacher/motion_gen.py:1863, in MotionGen.warmup(self, enable_graph, batch, warmup_js_trajopt, batch_env_mode, parallel_finetune, n_goalset, warmup_joint_index, warmup_joint_delta)
1861 goal_state.position[..., warmup_joint_index] += warmup_joint_delta
1862 for _ in range(3):
-> 1863 self.plan_single_js(
1864 start_state.clone(),
1865 goal_state.clone(),
1866 MotionGenPlanConfig(max_attempts=1, enable_finetune_trajopt=True),
1867 )
1869 if enable_graph:
1870 start_state = JointState.from_position(
1871 self.rollout_fn.dynamics_model.retract_config.view(1, -1).clone(),
1872 joint_names=self.rollout_fn.joint_names,
1873 )
File ~/桌面/lhy/code/curobo/src/curobo/wrap/reacher/motion_gen.py:2073, in MotionGen.plan_single_js(self, start_state, goal_state, plan_config)
2070 return result
2072 for n in range(plan_config.max_attempts):
-> 2073 result = self._plan_js_from_solve_state(
2074 solve_state, start_state, goal_state, plan_config=plan_config
2075 )
2076 time_dict[\"trajopt_time\"] += result.trajopt_time
2077 time_dict[\"graph_time\"] += result.graph_time
File ~/桌面/lhy/code/curobo/src/curobo/wrap/reacher/motion_gen.py:3734, in MotionGen._plan_js_from_solve_state(self, solve_state, start_state, goal_state, plan_config)
3732 if self.optimize_dt:
3733 self.finetune_js_trajopt_solver.update_solver_dt(scaled_dt.item())
-> 3734 traj_result = self._solve_trajopt_from_solve_state(
3735 goal,
3736 solve_state,
3737 seed_traj,
3738 trajopt_instance=self.finetune_js_trajopt_solver,
3739 num_seeds_override=solve_state.num_trajopt_seeds,
3740 newton_iters=newton_iters,
3741 return_all_solutions=False,
3742 )
3744 finetune_time += traj_result.solve_time
3745 if torch.count_nonzero(traj_result.success) > 0 or not self.optimize_dt:
File /usr/lib/python3.10/contextlib.py:79, in ContextDecorator.__call__.<locals>.inner(*args, **kwds)
76 @wraps(func)
77 def inner(*args, **kwds):
78 with self._recreate_cm():
---> 79 return func(*args, **kwds)
File ~/桌面/lhy/code/curobo/src/curobo/wrap/reacher/motion_gen.py:2820, in MotionGen._solve_trajopt_from_solve_state(self, goal, solve_state, act_seed, use_nn_seed, return_all_solutions, seed_success, newton_iters, trajopt_instance, num_seeds_override)
2818 if num_seeds_override is None:
2819 num_seeds_override = solve_state.num_trajopt_seeds
-> 2820 traj_result = trajopt_instance.solve_any(
2821 solve_state.solve_type,
2822 goal,
2823 act_seed,
2824 use_nn_seed,
2825 return_all_solutions,
2826 num_seeds_override,
2827 seed_success,
2828 newton_iters=newton_iters,
2829 )
2830 return traj_result
File ~/桌面/lhy/code/curobo/src/curobo/wrap/reacher/trajopt.py:805, in TrajOptSolver.solve_any(self, solve_type, goal, seed_traj, use_nn_seed, return_all_solutions, num_seeds, seed_success, newton_iters)
783 \"\"\"Solve trajectory optimization problem with any solve type.
784
785 Args:
(...)
802 TrajOptResult: Result of the trajectory optimization.
803 \"\"\"
804 if solve_type == ReacherSolveType.SINGLE:
--> 805 return self.solve_single(
806 goal,
807 seed_traj,
808 use_nn_seed,
809 return_all_solutions,
810 num_seeds,
811 newton_iters=newton_iters,
812 )
813 elif solve_type == ReacherSolveType.GOALSET:
814 return self.solve_goalset(
815 goal,
816 seed_traj,
(...)
820 newton_iters=newton_iters,
821 )
File ~/桌面/lhy/code/curobo/src/curobo/wrap/reacher/trajopt.py:978, in TrajOptSolver.solve_single(self, goal, seed_traj, use_nn_seed, return_all_solutions, num_seeds, newton_iters)
969 num_seeds = self.num_seeds
970 solve_state = ReacherSolveState(
971 ReacherSolveType.SINGLE,
972 num_trajopt_seeds=num_seeds,
(...)
975 n_goalset=1,
976 )
--> 978 return self._solve_from_solve_state(
979 solve_state,
980 goal,
981 seed_traj,
982 use_nn_seed,
983 return_all_solutions,
984 num_seeds,
985 newton_iters=newton_iters,
986 )
File ~/桌面/lhy/code/curobo/src/curobo/wrap/reacher/trajopt.py:918, in TrajOptSolver._solve_from_solve_state(self, solve_state, goal, seed_traj, use_nn_seed, return_all_solutions, num_seeds, seed_success, newton_iters)
916 goal_buffer.goal_state = None
917 self.solver.reset()
--> 918 result = self.solver.solve(goal_buffer, seed_traj)
919 log_info(\"Ran TO\")
920 traj_result = self._get_result(
921 result,
922 return_all_solutions,
(...)
926 solve_state.batch_mode,
927 )
File ~/桌面/lhy/code/curobo/src/curobo/wrap/wrap_base.py:155, in WrapBase.solve(self, goal, seed)
153 log_info(\"Solver was not initialized, warming up solver\")
154 for _ in range(2):
--> 155 act_seq = self.optimize(seed, shift_steps=0)
156 self._init_solver = True
157 act_seq = self.optimize(seed, shift_steps=0)
File ~/桌面/lhy/code/curobo/src/curobo/wrap/wrap_base.py:77, in WrapBase.optimize(self, act_seq, shift_steps)
75 def optimize(self, act_seq: torch.Tensor, shift_steps: int = 0) -> torch.Tensor:
76 for opt in self.optimizers:
---> 77 act_seq = opt.optimize(act_seq, shift_steps)
78 return act_seq
File ~/桌面/lhy/code/curobo/src/curobo/opt/opt_base.py:171, in Optimizer.optimize(self, opt_tensor, shift_steps, n_iters)
169 self.COLD_START = False
170 st_time = time.time()
--> 171 out = self._optimize(opt_tensor, shift_steps, n_iters)
172 if self.sync_cuda_time:
173 torch.cuda.synchronize(device=self.tensor_args.device)
File ~/桌面/lhy/code/curobo/src/curobo/opt/newton/newton_base.py:144, in NewtonOptBase._optimize(self, q, shift_steps, n_iters)
142 # run opt graph
143 if not self.cu_opt_init:
--> 144 self._initialize_opt_iters_graph(q, grad_q, shift_steps=shift_steps)
145 for i in range(self.outer_iters):
146 best_q, best_cost, q, grad_q = self._call_opt_iters_graph(q, grad_q)
File ~/桌面/lhy/code/curobo/src/curobo/opt/newton/newton_base.py:499, in NewtonOptBase._initialize_opt_iters_graph(self, q, grad_q, shift_steps)
497 def _initialize_opt_iters_graph(self, q, grad_q, shift_steps):
498 if self.use_cuda_graph:
--> 499 self._create_opt_iters_graph(q, grad_q, shift_steps)
500 self.cu_opt_init = True
File ~/桌面/lhy/code/curobo/src/curobo/opt/newton/newton_base.py:541, in NewtonOptBase._create_opt_iters_graph(self, q, grad_q, shift_steps)
539 with torch.cuda.stream(s):
540 for _ in range(3):
--> 541 self._cu_opt_q, self._cu_opt_cost, self._cu_q, self._cu_gq = self._opt_iters(
542 self._cu_opt_q_in, self._cu_opt_gq_in, shift_steps
543 )
544 torch.cuda.current_stream(device=self.tensor_args.device).wait_stream(s)
545 self.reset()
File ~/桌面/lhy/code/curobo/src/curobo/opt/newton/newton_base.py:173, in NewtonOptBase._opt_iters(self, q, grad_q, shift_steps)
171 for _ in range(self.inner_iters):
172 self.i += 1
--> 173 cost_n, q, grad_q = self._opt_step(q.detach(), grad_q.detach())
174 if self.store_debug:
175 self.debug.append(self.best_q.view(-1, self.action_horizon, self.d_action).clone())
File ~/桌面/lhy/code/curobo/src/curobo/opt/newton/newton_base.py:184, in NewtonOptBase._opt_step(self, q, grad_q)
182 q_n, cost_n, grad_q_n = self._approx_line_search(q, grad_q)
183 with profiler.record_function(\"newton/step_direction\"):
--> 184 grad_q = self._get_step_direction(cost_n, q_n, grad_q_n)
185 with profiler.record_function(\"newton/update_best\"):
186 self._update_best(q_n, grad_q_n, cost_n)
File ~/.local/lib/python3.10/site-packages/torch/utils/_contextlib.py:115, in context_decorator.<locals>.decorate_context(*args, **kwargs)
112 @functools.wraps(func)
113 def decorate_context(*args, **kwargs):
114 with ctx_factory():
--> 115 return func(*args, **kwargs)
File ~/桌面/lhy/code/curobo/src/curobo/opt/newton/lbfgs.py:167, in LBFGSOpt._get_step_direction(self, cost, q, grad_q)
165 if self.use_cuda_kernel:
166 with profiler.record_function(\"lbfgs/fused\"):
--> 167 dq = LBFGScu.apply(
168 self.step_q_buffer,
169 self.rho_buffer,
170 self.y_buffer,
171 self.s_buffer,
172 q,
173 grad_q,
174 self.x_0,
175 self.grad_0,
176 self.epsilon,
177 self.stable_mode,
178 self.use_shared_buffers_kernel,
179 )
181 else:
183 self._update_buffers(q, grad_q)
File ~/.local/lib/python3.10/site-packages/torch/autograd/function.py:506, in Function.apply(cls, *args, **kwargs)
503 if not torch._C._are_functorch_transforms_active():
504 # See NOTE: [functorch vjp and autograd interaction]
505 args = _functorch.utils.unwrap_dead_wrappers(args)
--> 506 return super().apply(*args, **kwargs) # type: ignore[misc]
508 if cls.setup_context == _SingleLevelFunction.setup_context:
509 raise RuntimeError(
510 'In order to use an autograd.Function with functorch transforms '
511 '(vmap, grad, jvp, jacrev, ...), it must override the setup_context '
512 'staticmethod. For more details, please see '
513 'https://pytorch.org/docs/master/notes/extending.func.html')
File ~/桌面/lhy/code/curobo/src/curobo/curobolib/opt.py:58, in LBFGScu.forward(ctx, step_vec, rho_buffer, y_buffer, s_buffer, q, grad_q, x_0, grad_0, epsilon, stable_mode, use_shared_buffers)
41 @staticmethod
42 def forward(
43 ctx,
(...)
54 use_shared_buffers=True,
55 ):
56 m, b, v_dim, _ = y_buffer.shape
---> 58 R = lbfgs_step_cu.forward(
59 step_vec, # .view(-1),
60 rho_buffer, # .view(-1),
61 y_buffer, # .view(-1),
62 s_buffer, # .view(-1),
63 q,
64 grad_q, # .view(-1),
65 x_0,
66 grad_0,
67 epsilon,
68 b,
69 m,
70 v_dim,
71 stable_mode,
72 use_shared_buffers,
73 )
74 step_v = R[0].view(step_vec.shape)
76 # ctx.save_for_backward(batch_spheres, robot_spheres, link_mats, link_sphere_map)
RuntimeError: CUDA error: invalid argument
CUDA kernel errors might be asynchronously reported at some other API call, so the stacktrace below might be incorrect.
For debugging consider passing CUDA_LAUNCH_BLOCKING=1.
Compile with `TORCH_USE_CUDA_DSA` to enable device-side assertions.
"
}
Did you install git lfs? My guess is that the mesh file was not cloned. Try `sudo apt install git-lfs` followed by `cd curobo && git lfs pull`.
If that still doesn't work, does the below example work?
# Third Party
import torch
# cuRobo
from curobo.types.math import Pose
from curobo.types.robot import JointState
from curobo.wrap.reacher.motion_gen import MotionGen, MotionGenConfig, MotionGenPlanConfig
# Collision world with no mesh: a single cuboid table.
# "dims" are [x, y, z]; "pose" is [x, y, z, qw, qx, qy, qz].
table_cuboid = {
    "dims": [5.0, 5.0, 0.2],
    "pose": [0.0, 0.0, -0.1, 1, 0, 0, 0.0],
}
world_config = {"cuboid": {"table": table_cuboid}}

# Build the planner from the "ur5e.yml" robot config and the world above,
# then warm it up before the first planning query.
motion_gen_config = MotionGenConfig.load_from_robot_config(
    "ur5e.yml", world_config, interpolation_dt=0.01
)
motion_gen = MotionGen(motion_gen_config)
motion_gen.warmup()

# Run kinematics on the retract configuration, reshaped to a single row.
retract_cfg = motion_gen.get_retract_config()
retract_joint_state = JointState.from_position(retract_cfg.view(1, -1))
state = motion_gen.rollout_fn.compute_kinematics(retract_joint_state)

# Goal pose given as [x, y, z, qw, qx, qy, qz].
goal_pose = Pose.from_list([-0.4, 0.0, 0.4, 1.0, 0.0, 0.0, 0.0])

# Start state: a (1, 6) tensor of zeros moved to the GPU, one value per named joint.
ur5e_joint_names = [
    "shoulder_pan_joint",
    "shoulder_lift_joint",
    "elbow_joint",
    "wrist_1_joint",
    "wrist_2_joint",
    "wrist_3_joint",
]
start_state = JointState.from_position(
    torch.zeros(1, 6).cuda(),
    joint_names=ur5e_joint_names,
)

# Plan with a single attempt and report whether it succeeded.
single_attempt = MotionGenPlanConfig(max_attempts=1)
result = motion_gen.plan_single(start_state, goal_pose, single_attempt)
traj = result.get_interpolated_plan()  # result.interpolation_dt has the dt between timesteps
print("Trajectory Generated: ", result.success)
I checked git-lfs and it was already installed.
I then tried the code you supplied, but it does not work either.
The error is the same and is raised while executing motion_gen.warmup().
So the problem does not seem to be caused by "srl_ur10_bins.obj" (87.2 MB).
Error Info detail:
{
"name": "RuntimeError",
"message": "CUDA error: invalid argument
CUDA kernel errors might be asynchronously reported at some other API call, so the stacktrace below might be incorrect.
For debugging consider passing CUDA_LAUNCH_BLOCKING=1.
Compile with `TORCH_USE_CUDA_DSA` to enable device-side assertions.
",
"stack": "---------------------------------------------------------------------------
RuntimeError Traceback (most recent call last)
Cell In[1], line 24
18 motion_gen_config = MotionGenConfig.load_from_robot_config(
19 \"ur5e.yml\",
20 world_config,
21 interpolation_dt=0.01,
22 )
23 motion_gen = MotionGen(motion_gen_config)
---> 24 motion_gen.warmup()
26 retract_cfg = motion_gen.get_retract_config()
28 state = motion_gen.rollout_fn.compute_kinematics(
29 JointState.from_position(retract_cfg.view(1, -1))
30 )
File ~/桌面/lhy/code/curobo/src/curobo/wrap/reacher/motion_gen.py:1863, in MotionGen.warmup(self, enable_graph, batch, warmup_js_trajopt, batch_env_mode, parallel_finetune, n_goalset, warmup_joint_index, warmup_joint_delta)
1861 goal_state.position[..., warmup_joint_index] += warmup_joint_delta
1862 for _ in range(3):
-> 1863 self.plan_single_js(
1864 start_state.clone(),
1865 goal_state.clone(),
1866 MotionGenPlanConfig(max_attempts=1, enable_finetune_trajopt=True),
1867 )
1869 if enable_graph:
1870 start_state = JointState.from_position(
1871 self.rollout_fn.dynamics_model.retract_config.view(1, -1).clone(),
1872 joint_names=self.rollout_fn.joint_names,
1873 )
File ~/桌面/lhy/code/curobo/src/curobo/wrap/reacher/motion_gen.py:2073, in MotionGen.plan_single_js(self, start_state, goal_state, plan_config)
2070 return result
2072 for n in range(plan_config.max_attempts):
-> 2073 result = self._plan_js_from_solve_state(
2074 solve_state, start_state, goal_state, plan_config=plan_config
2075 )
2076 time_dict[\"trajopt_time\"] += result.trajopt_time
2077 time_dict[\"graph_time\"] += result.graph_time
File ~/桌面/lhy/code/curobo/src/curobo/wrap/reacher/motion_gen.py:3734, in MotionGen._plan_js_from_solve_state(self, solve_state, start_state, goal_state, plan_config)
3732 if self.optimize_dt:
3733 self.finetune_js_trajopt_solver.update_solver_dt(scaled_dt.item())
-> 3734 traj_result = self._solve_trajopt_from_solve_state(
3735 goal,
3736 solve_state,
3737 seed_traj,
3738 trajopt_instance=self.finetune_js_trajopt_solver,
3739 num_seeds_override=solve_state.num_trajopt_seeds,
3740 newton_iters=newton_iters,
3741 return_all_solutions=False,
3742 )
3744 finetune_time += traj_result.solve_time
3745 if torch.count_nonzero(traj_result.success) > 0 or not self.optimize_dt:
File /usr/lib/python3.10/contextlib.py:79, in ContextDecorator.__call__.<locals>.inner(*args, **kwds)
76 @wraps(func)
77 def inner(*args, **kwds):
78 with self._recreate_cm():
---> 79 return func(*args, **kwds)
File ~/桌面/lhy/code/curobo/src/curobo/wrap/reacher/motion_gen.py:2820, in MotionGen._solve_trajopt_from_solve_state(self, goal, solve_state, act_seed, use_nn_seed, return_all_solutions, seed_success, newton_iters, trajopt_instance, num_seeds_override)
2818 if num_seeds_override is None:
2819 num_seeds_override = solve_state.num_trajopt_seeds
-> 2820 traj_result = trajopt_instance.solve_any(
2821 solve_state.solve_type,
2822 goal,
2823 act_seed,
2824 use_nn_seed,
2825 return_all_solutions,
2826 num_seeds_override,
2827 seed_success,
2828 newton_iters=newton_iters,
2829 )
2830 return traj_result
File ~/桌面/lhy/code/curobo/src/curobo/wrap/reacher/trajopt.py:805, in TrajOptSolver.solve_any(self, solve_type, goal, seed_traj, use_nn_seed, return_all_solutions, num_seeds, seed_success, newton_iters)
783 \"\"\"Solve trajectory optimization problem with any solve type.
784
785 Args:
(...)
802 TrajOptResult: Result of the trajectory optimization.
803 \"\"\"
804 if solve_type == ReacherSolveType.SINGLE:
--> 805 return self.solve_single(
806 goal,
807 seed_traj,
808 use_nn_seed,
809 return_all_solutions,
810 num_seeds,
811 newton_iters=newton_iters,
812 )
813 elif solve_type == ReacherSolveType.GOALSET:
814 return self.solve_goalset(
815 goal,
816 seed_traj,
(...)
820 newton_iters=newton_iters,
821 )
File ~/桌面/lhy/code/curobo/src/curobo/wrap/reacher/trajopt.py:978, in TrajOptSolver.solve_single(self, goal, seed_traj, use_nn_seed, return_all_solutions, num_seeds, newton_iters)
969 num_seeds = self.num_seeds
970 solve_state = ReacherSolveState(
971 ReacherSolveType.SINGLE,
972 num_trajopt_seeds=num_seeds,
(...)
975 n_goalset=1,
976 )
--> 978 return self._solve_from_solve_state(
979 solve_state,
980 goal,
981 seed_traj,
982 use_nn_seed,
983 return_all_solutions,
984 num_seeds,
985 newton_iters=newton_iters,
986 )
File ~/桌面/lhy/code/curobo/src/curobo/wrap/reacher/trajopt.py:918, in TrajOptSolver._solve_from_solve_state(self, solve_state, goal, seed_traj, use_nn_seed, return_all_solutions, num_seeds, seed_success, newton_iters)
916 goal_buffer.goal_state = None
917 self.solver.reset()
--> 918 result = self.solver.solve(goal_buffer, seed_traj)
919 log_info(\"Ran TO\")
920 traj_result = self._get_result(
921 result,
922 return_all_solutions,
(...)
926 solve_state.batch_mode,
927 )
File ~/桌面/lhy/code/curobo/src/curobo/wrap/wrap_base.py:155, in WrapBase.solve(self, goal, seed)
153 log_info(\"Solver was not initialized, warming up solver\")
154 for _ in range(2):
--> 155 act_seq = self.optimize(seed, shift_steps=0)
156 self._init_solver = True
157 act_seq = self.optimize(seed, shift_steps=0)
File ~/桌面/lhy/code/curobo/src/curobo/wrap/wrap_base.py:77, in WrapBase.optimize(self, act_seq, shift_steps)
75 def optimize(self, act_seq: torch.Tensor, shift_steps: int = 0) -> torch.Tensor:
76 for opt in self.optimizers:
---> 77 act_seq = opt.optimize(act_seq, shift_steps)
78 return act_seq
File ~/桌面/lhy/code/curobo/src/curobo/opt/opt_base.py:171, in Optimizer.optimize(self, opt_tensor, shift_steps, n_iters)
169 self.COLD_START = False
170 st_time = time.time()
--> 171 out = self._optimize(opt_tensor, shift_steps, n_iters)
172 if self.sync_cuda_time:
173 torch.cuda.synchronize(device=self.tensor_args.device)
File ~/桌面/lhy/code/curobo/src/curobo/opt/newton/newton_base.py:144, in NewtonOptBase._optimize(self, q, shift_steps, n_iters)
142 # run opt graph
143 if not self.cu_opt_init:
--> 144 self._initialize_opt_iters_graph(q, grad_q, shift_steps=shift_steps)
145 for i in range(self.outer_iters):
146 best_q, best_cost, q, grad_q = self._call_opt_iters_graph(q, grad_q)
File ~/桌面/lhy/code/curobo/src/curobo/opt/newton/newton_base.py:499, in NewtonOptBase._initialize_opt_iters_graph(self, q, grad_q, shift_steps)
497 def _initialize_opt_iters_graph(self, q, grad_q, shift_steps):
498 if self.use_cuda_graph:
--> 499 self._create_opt_iters_graph(q, grad_q, shift_steps)
500 self.cu_opt_init = True
File ~/桌面/lhy/code/curobo/src/curobo/opt/newton/newton_base.py:541, in NewtonOptBase._create_opt_iters_graph(self, q, grad_q, shift_steps)
539 with torch.cuda.stream(s):
540 for _ in range(3):
--> 541 self._cu_opt_q, self._cu_opt_cost, self._cu_q, self._cu_gq = self._opt_iters(
542 self._cu_opt_q_in, self._cu_opt_gq_in, shift_steps
543 )
544 torch.cuda.current_stream(device=self.tensor_args.device).wait_stream(s)
545 self.reset()
File ~/桌面/lhy/code/curobo/src/curobo/opt/newton/newton_base.py:173, in NewtonOptBase._opt_iters(self, q, grad_q, shift_steps)
171 for _ in range(self.inner_iters):
172 self.i += 1
--> 173 cost_n, q, grad_q = self._opt_step(q.detach(), grad_q.detach())
174 if self.store_debug:
175 self.debug.append(self.best_q.view(-1, self.action_horizon, self.d_action).clone())
File ~/桌面/lhy/code/curobo/src/curobo/opt/newton/newton_base.py:184, in NewtonOptBase._opt_step(self, q, grad_q)
182 q_n, cost_n, grad_q_n = self._approx_line_search(q, grad_q)
183 with profiler.record_function(\"newton/step_direction\"):
--> 184 grad_q = self._get_step_direction(cost_n, q_n, grad_q_n)
185 with profiler.record_function(\"newton/update_best\"):
186 self._update_best(q_n, grad_q_n, cost_n)
File ~/.local/lib/python3.10/site-packages/torch/utils/_contextlib.py:115, in context_decorator.<locals>.decorate_context(*args, **kwargs)
112 @functools.wraps(func)
113 def decorate_context(*args, **kwargs):
114 with ctx_factory():
--> 115 return func(*args, **kwargs)
File ~/桌面/lhy/code/curobo/src/curobo/opt/newton/lbfgs.py:167, in LBFGSOpt._get_step_direction(self, cost, q, grad_q)
165 if self.use_cuda_kernel:
166 with profiler.record_function(\"lbfgs/fused\"):
--> 167 dq = LBFGScu.apply(
168 self.step_q_buffer,
169 self.rho_buffer,
170 self.y_buffer,
171 self.s_buffer,
172 q,
173 grad_q,
174 self.x_0,
175 self.grad_0,
176 self.epsilon,
177 self.stable_mode,
178 self.use_shared_buffers_kernel,
179 )
181 else:
183 self._update_buffers(q, grad_q)
File ~/.local/lib/python3.10/site-packages/torch/autograd/function.py:506, in Function.apply(cls, *args, **kwargs)
503 if not torch._C._are_functorch_transforms_active():
504 # See NOTE: [functorch vjp and autograd interaction]
505 args = _functorch.utils.unwrap_dead_wrappers(args)
--> 506 return super().apply(*args, **kwargs) # type: ignore[misc]
508 if cls.setup_context == _SingleLevelFunction.setup_context:
509 raise RuntimeError(
510 'In order to use an autograd.Function with functorch transforms '
511 '(vmap, grad, jvp, jacrev, ...), it must override the setup_context '
512 'staticmethod. For more details, please see '
513 'https://pytorch.org/docs/master/notes/extending.func.html')
File ~/桌面/lhy/code/curobo/src/curobo/curobolib/opt.py:58, in LBFGScu.forward(ctx, step_vec, rho_buffer, y_buffer, s_buffer, q, grad_q, x_0, grad_0, epsilon, stable_mode, use_shared_buffers)
41 @staticmethod
42 def forward(
43 ctx,
(...)
54 use_shared_buffers=True,
55 ):
56 m, b, v_dim, _ = y_buffer.shape
---> 58 R = lbfgs_step_cu.forward(
59 step_vec, # .view(-1),
60 rho_buffer, # .view(-1),
61 y_buffer, # .view(-1),
62 s_buffer, # .view(-1),
63 q,
64 grad_q, # .view(-1),
65 x_0,
66 grad_0,
67 epsilon,
68 b,
69 m,
70 v_dim,
71 stable_mode,
72 use_shared_buffers,
73 )
74 step_v = R[0].view(step_vec.shape)
76 # ctx.save_for_backward(batch_spheres, robot_spheres, link_mats, link_sphere_map)
RuntimeError: CUDA error: invalid argument
CUDA kernel errors might be asynchronously reported at some other API call, so the stacktrace below might be incorrect.
For debugging consider passing CUDA_LAUNCH_BLOCKING=1.
Compile with `TORCH_USE_CUDA_DSA` to enable device-side assertions.
"
}
Is it possible for you to run this inside a docker? That would help us reproduce the issue on our end.
I encountered the same issue on a cluster. My workaround was to manually set LBFGSOpt.use_cuda_kernel to False, regardless of the conditions in the code:
https://github.com/NVlabs/curobo/blob/2fbffc35225398cf9d5f382804faa9de2608753b/src/curobo/opt/newton/lbfgs.py#L116
In theory, it is compiled using nvcc 11.8.0 and gcc 11.3.0. When I check with nvidia-smi, the system reports:
+-----------------------------------------------------------------------------------------+
| NVIDIA-SMI 560.35.03 Driver Version: 560.35.03 CUDA Version: 12.6 |
|-----------------------------------------+------------------------+----------------------+
| GPU Name Persistence-M | Bus-Id Disp.A | Volatile Uncorr. ECC |
| Fan Temp Perf Pwr:Usage/Cap | Memory-Usage | GPU-Util Compute M. |
| | | MIG M. |
|=========================================+========================+======================|
| 0 Tesla V100-PCIE-32GB Off | 00000000:3B:00.0 Off | 0 |
| N/A 28C P0 36W / 250W | 1MiB / 32768MiB | 1% Default |
| | | N/A |
+-----------------------------------------+------------------------+----------------------+
+-----------------------------------------------------------------------------------------+
| Processes: |
| GPU GI CI PID Type Process name GPU Memory |
| ID ID Usage |
|=========================================================================================|
| No running processes found |
+-----------------------------------------------------------------------------------------+
However, since this is a cluster setup, there are additional details that I don’t fully understand.
We improved error handling for the CUDA kernels, which we hope resolves this issue. I am closing this for now as we don't have a way to reproduce it. Please re-open if the issue persists and we can figure out a setup to reproduce it.