Fixes Action.*_async futures never complete
Replaces #1125, since the original contributor went inactive.
Closes #1123
If an action client and its action server are running in separate executors, the future given to the ActionClient may never complete, due to a race condition on the rcl handles.
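The failure mode can be modeled outside of ROS entirely. Below is a minimal, self-contained Python sketch (not rclpy code; names are illustrative) of the interleaving: the response handler fires between sending the request and registering the pending future, so the response is dropped and the future never completes.

```python
import threading
import time

pending_result_requests = {}  # sequence_number -> future, normally lock-protected


def send_result_request():
    seq = 1  # pretend rcl assigned this sequence number
    time.sleep(0.1)  # the response arrives during this window
    pending_result_requests[seq] = "future"  # registered too late


def handle_result_response(seq=1):
    if seq not in pending_result_requests:
        # Mirrors rclpy's "Ignoring unexpected result response" warning:
        # the response is dropped and the future is never completed.
        print("Ignoring unexpected result response")


sender = threading.Thread(target=send_result_request)
receiver = threading.Thread(target=handle_result_response)
sender.start()
receiver.start()
sender.join()
receiver.join()
```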
Tested using this example from @apockill, adapted to match rolling, and then run with and without the locks.
To reproduce the initial issue, remove the lock/sleep calls in RaceyActionClient.
Adapted example code:
client.py

```python
import rclpy
from rclpy.action import ActionClient
from rclpy.action.client import ClientGoalHandle
from rclpy.callback_groups import MutuallyExclusiveCallbackGroup
from rclpy.executors import MultiThreadedExecutor
from rclpy.node import Node
from rclpy.task import Future
from test_msgs.action import Fibonacci as SomeMsg
from time import sleep


class RaceyActionClient(ActionClient):

    def _get_result_async(self, goal_handle):
        """
        Request the result for an active goal asynchronously.

        :param goal_handle: Handle to the goal to get the result for.
        :type goal_handle: :class:`ClientGoalHandle`
        :return: a Future instance that completes when the get result request has been processed.
        :rtype: :class:`rclpy.task.Future` instance
        """
        if not isinstance(goal_handle, ClientGoalHandle):
            raise TypeError(
                'Expected type ClientGoalHandle but received {}'.format(type(goal_handle)))

        result_request = self._action_type.Impl.GetResultService.Request()
        result_request.goal_id = goal_handle.goal_id
        future = Future()
        with self._lock:
            # The sleeps widen the window between sending the request and
            # registering the pending future; without the lock they make the
            # race easy to hit.
            sleep(1)
            sequence_number = self._client_handle.send_result_request(result_request)
            if sequence_number in self._pending_result_requests:
                raise RuntimeError(
                    'Sequence ({}) conflicts with pending result request'.format(sequence_number))
            sleep(1)
            self._pending_result_requests[sequence_number] = future
            sleep(1)
            self._result_sequence_number_to_goal_id[sequence_number] = result_request.goal_id
            sleep(1)
            future.add_done_callback(self._remove_pending_result_request)
            sleep(1)
            # Add future so executor is aware
            self.add_future(future)
        return future


class SomeActionClient(Node):

    def __init__(self):
        super().__init__("action_client")
        self._action_client = RaceyActionClient(
            self,
            SomeMsg,
            "some_action",
            callback_group=MutuallyExclusiveCallbackGroup(),
        )
        self.call_action_timer = self.create_timer(
            timer_period_sec=10,
            callback=self.send_goal,
            callback_group=MutuallyExclusiveCallbackGroup(),
        )

    def send_goal(self):
        goal_msg = SomeMsg.Goal()
        # self._action_client.wait_for_server()
        result = self._action_client.send_goal(goal_msg)
        print(result)


def main(args=None):
    rclpy.init(args=args)
    action_client = SomeActionClient()
    executor = MultiThreadedExecutor()
    rclpy.spin(action_client, executor)


if __name__ == "__main__":
    main()
```
server.py

```python
import rclpy
from rclpy.action import ActionServer
from rclpy.node import Node
from test_msgs.action import Fibonacci as SomeMsg


class SomeActionServer(Node):

    def __init__(self):
        super().__init__("fibonacci_action_server")
        self._action_server = ActionServer(
            self, SomeMsg, "some_action", self.execute_callback
        )

    def execute_callback(self, goal_handle):
        self.get_logger().info("Executing goal...")
        result = SomeMsg.Result()
        goal_handle.succeed()
        return result


def main(args=None):
    rclpy.init(args=args)
    action_server = SomeActionServer()
    rclpy.spin(action_server)


if __name__ == "__main__":
    main()
```
Before and after log results

Before

client output:

```
[WARN] [1720739707.125040450] [action_client.action_client]: Ignoring unexpected result response. There may be more than one action server for the action 'some_action'
^C
```

server output:

```
[INFO] [1720739706.054013974] [fibonacci_action_server]: Executing goal...
^C
```

After

server output:

```
[INFO] [1720739524.873096289] [fibonacci_action_server]: Executing goal...
[INFO] [1720739534.791491183] [fibonacci_action_server]: Executing goal...
[INFO] [1720739544.789914632] [fibonacci_action_server]: Executing goal...
[INFO] [1720739554.791130534] [fibonacci_action_server]: Executing goal...
[INFO] [1720739564.791638162] [fibonacci_action_server]: Executing goal...
[INFO] [1720739574.793290290] [fibonacci_action_server]: Executing goal...
[INFO] [1720739584.790164903] [fibonacci_action_server]: Executing goal...
[INFO] [1720739594.794427458] [fibonacci_action_server]: Executing goal...
^C
```

client output:

```
test_msgs.action.Fibonacci_GetResult_Response(status=4, result=test_msgs.action.Fibonacci_Result(sequence=[]))
test_msgs.action.Fibonacci_GetResult_Response(status=4, result=test_msgs.action.Fibonacci_Result(sequence=[]))
test_msgs.action.Fibonacci_GetResult_Response(status=4, result=test_msgs.action.Fibonacci_Result(sequence=[]))
test_msgs.action.Fibonacci_GetResult_Response(status=4, result=test_msgs.action.Fibonacci_Result(sequence=[]))
test_msgs.action.Fibonacci_GetResult_Response(status=4, result=test_msgs.action.Fibonacci_Result(sequence=[]))
test_msgs.action.Fibonacci_GetResult_Response(status=4, result=test_msgs.action.Fibonacci_Result(sequence=[]))
test_msgs.action.Fibonacci_GetResult_Response(status=4, result=test_msgs.action.Fibonacci_Result(sequence=[]))
test_msgs.action.Fibonacci_GetResult_Response(status=4, result=test_msgs.action.Fibonacci_Result(sequence=[]))
^C
```
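As a side note on reading the output: status=4 in the responses above corresponds to GoalStatus.STATUS_SUCCEEDED from action_msgs, so each goal is now completing. A quick check:

```python
from action_msgs.msg import GoalStatus

# status=4 in the Fibonacci_GetResult_Response output above
assert GoalStatus.STATUS_SUCCEEDED == 4
print("status 4 == STATUS_SUCCEEDED")
```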
I also initially tried to add locks to just the action client, but I was getting test failures in test_action_graph on the get_names_and_types function, so I added them to the action server as well.
I would add @aditya2592 as a co-author, since this borrows the code from #1125.
Done
Throwing this back to draft because it still doesn't fix all conditions where this breaks. If others have ideas on how to more reliably reproduce this that would be appreciated.
> Throwing this back to draft because it still doesn't fix all conditions where this breaks. If others have ideas on how to more reliably reproduce this that would be appreciated.
What kinds of failures are you still seeing? This is the "fix" I've been using in production, and at least with our use cases of services we haven't seen failures since:
```python
from threading import RLock
from typing import Any

from rclpy.task import Future


# RobustActionClient is our in-house ActionClient subclass; the patch applies
# just as well to a plain rclpy ActionClient.
class PatchRclpyIssue1123(RobustActionClient):

    _lock: RLock = None  # type: ignore

    @property
    def _cpp_client_handle_lock(self) -> RLock:
        if self._lock is None:
            self._lock = RLock()
        return self._lock

    async def execute(self, *args: Any, **kwargs: Any) -> None:
        # This is ugly- holding on to a lock in an async environment feels gross
        with self._cpp_client_handle_lock:
            return await super().execute(*args, **kwargs)  # type: ignore

    def send_goal_async(self, *args: Any, **kwargs: Any) -> Future:
        with self._cpp_client_handle_lock:
            return super().send_goal_async(*args, **kwargs)

    def _cancel_goal_async(self, *args: Any, **kwargs: Any) -> Future:
        with self._cpp_client_handle_lock:
            return super()._cancel_goal_async(*args, **kwargs)

    def _get_result_async(self, *args: Any, **kwargs: Any) -> Future:
        with self._cpp_client_handle_lock:
            return super()._get_result_async(*args, **kwargs)
```
I'm not sure if these matter, but here are the differences I can note between the above fix and this PR (see the sketch below the list):

- In my implementation I lock the async `execute`, since multithreaded executors are still running async tasks "concurrently", leading to possible race conditions with the client handle.
- I use an `RLock` vs a `Lock`. Not sure if this matters in this case, but I figured I'd note it.
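To illustrate the first bullet: even without extra threads, async tasks interleave at every await point, so a check-then-act on shared client state can be split across tasks just like the thread race. A minimal asyncio model (not rclpy code; names are illustrative):

```python
import asyncio

pending = {}  # models _pending_result_requests


async def register(name, seq):
    if seq in pending:
        print(f"{name}: conflict detected up front")
        return
    await asyncio.sleep(0)  # suspension point: the other task runs here
    if seq in pending:
        print(f"{name}: lost the race, overwriting {pending[seq]!r}")
    pending[seq] = name


async def main():
    # Both tasks pass the initial check before either registers.
    await asyncio.gather(register("task_a", 1), register("task_b", 1))


asyncio.run(main())  # prints: task_b: lost the race, overwriting 'task_a'
```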
I was defining a Python action client/server pair in two separate terminals, and the custom server also contained a Python client to another C++ nav2 action. After calling the custom Python action server once, it would work correctly the first time, but afterwards it wouldn't even accept the goal request on any subsequent calls. If I removed the nav2 action client from my custom action server, it wouldn't deadlock. I suspect a lock wasn't being released correctly during the call to the other action client inside the action server. I'd like to look into whether the issue persists when either locking calls to the entire execute function as you've suggested, or using RLocks.
> After calling the custom Python action server once, it would work correctly the first time, but afterwards it wouldn't even accept the goal request on any subsequent calls.
Ahh understood. I would hazard a guess that an RLock would work in this situation.
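For reference, the difference that matters here (a generic Python sketch, unrelated to rclpy internals): a plain Lock deadlocks if the same thread tries to acquire it again, which is exactly what can happen when a locked server callback calls back into a locked client on the same thread, whereas an RLock allows re-entry.

```python
from threading import Lock, RLock

plain = Lock()
reentrant = RLock()

with reentrant:
    with reentrant:  # the same thread may re-acquire an RLock freely
        print("RLock: no deadlock")

with plain:
    # Uncommenting the next two lines would block this thread forever:
    # with plain:
    #     pass
    print("Lock: re-acquiring from the same thread would deadlock")
```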
I tried to recreate an example outside of my environment, but have been unsuccessful
> I tried to recreate an example outside of my environment, but have been unsuccessful
I was able to recreate this issue outside my environment now (#1313). It looks like that one involves an rclcpp server with an rclpy action client nested inside an rclpy action server, so it seems unrelated. I'll pull this back out of draft.
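For context, the nested pattern from that issue looks roughly like this (a hypothetical sketch with illustrative names, not the code from #1313): an rclpy action server whose execute callback blocks on a second action client.

```python
from rclpy.action import ActionClient, ActionServer
from rclpy.node import Node
from test_msgs.action import Fibonacci


class NestedActionServer(Node):
    """Hypothetical sketch of an action server that calls another action."""

    def __init__(self):
        super().__init__("nested_action_server")
        self._server = ActionServer(self, Fibonacci, "outer_action", self.execute_callback)
        # e.g. a client to a C++ nav2 action, as described above
        self._inner_client = ActionClient(self, Fibonacci, "inner_action")

    def execute_callback(self, goal_handle):
        # Blocking on another action from inside an executing goal: if the
        # executor thread servicing this callback is also the one that must
        # process the inner client's responses, or a non-reentrant lock is
        # held across the call, the goal hangs.
        self._inner_client.send_goal(Fibonacci.Goal())
        goal_handle.succeed()
        return Fibonacci.Result()
```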
@apockill if you are willing to do a second review on this, that would be really appreciated.
Whoever wants to test the patch on Iron: I cherry-picked it onto Iron in this fork: https://github.com/firesurfer/rclpy/tree/iron
Btw, from our tests today it seems to work fine on Iron with our setup.
Just bumping this to see if other maintainers can review/merge it.
@jmblixt3 can you rebase this on rolling? Then I will start the CI.
@fujitatomoya Rebased
Anything else needed from me?
@sloretz @clalancette @adityapande-1995 can either of you take a look at this?
I'm experiencing the same issue on ROS 2 Humble. Could you backport this fix to humble? Thank you!
> I'm experiencing the same issue on ROS 2 Humble. Could you backport this fix to humble? Thank you!
+1
I am also having this issue with Humble. Is there a timeframe for when the fix will be merged? Thanks!
Just rebased this again. Are there any issues that still need to be resolved?
@jmblixt3 no, we are just waiting for the 2nd review.
@clalancette @ahcorde @sloretz friendly ping.
Ship it? :eyes::pleading_face:
Can you resolve the conflicts?
Rebased
Pulls: ros2/rclpy#1308
Gist: https://gist.githubusercontent.com/fujitatomoya/ab875209528c930c1b3326c2dd17ff3d/raw/6cc6ea2542fc247416e3772cb4af9bbd307c539f/ros2.repos
BUILD args: --packages-above-and-dependencies rclpy
TEST args: --packages-above rclpy
ROS Distro: rolling
Job: ci_launcher
ci_launcher ran: https://ci.ros2.org/job/ci_launcher/14878
Is anything still missing to get this merged, @fujitatomoya and @apockill? It would be nice not to have to run a custom fork of rclpy with this patch on industrial machines.
LGTM. I think we're waiting on @ahcorde's approval?
https://ci.ros2.org/job/ci_linux-rhel/1711/testReport/junit/(root)/ros2cli/pytest_missing_result/ and https://ci.ros2.org/job/ci_windows/22940/ are unrelated to this PR. @ahcorde I will leave this to you; can you merge this after your approval?
We just got a very strange error related to this patch (as of bc4025273892a471564ebd16f1f92d149a43c54e). I suspect it's a very rare race condition, as we've been using this for a long time without any issues till now.
When it occurred, we were either creating, using, or deleting an ActionClient, with using being the most likely.
```
Traceback (most recent call last):
  File "my_node.py", line XXX, in main
    executor.spin()
  File "/lib/python3.12/site-packages/rclpy/executors.py", line 308, in spin
    self.spin_once()
  File "/lib/python3.12/site-packages/rclpy/executors.py", line 899, in spin_once
    self._spin_once_impl(timeout_sec)
  File "/lib/python3.12/site-packages/rclpy/executors.py", line 878, in _spin_once_impl
    handler, entity, node = self.wait_for_ready_callbacks(
                            ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/lib/python3.12/site-packages/rclpy/executors.py", line 781, in wait_for_ready_callbacks
    return next(self._cb_iter)
           ^^^^^^^^^^^^^^^^^^^
  File "/lib/python3.12/site-packages/rclpy/executors.py", line 655, in _wait_for_ready_callbacks
    entity_count += waitable.get_num_entities()
                    ^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/lib/python3.12/site-packages/rclpy/action/client.py", line 365, in get_num_entities
    with self._lock:
         ^^^^^^^^^^
AttributeError: 'ActionClient' object has no attribute '_lock'
```
> We just got a very strange error related to this patch (as of bc40252). [...] AttributeError: 'ActionClient' object has no attribute '_lock'
That is certainly quite strange, considering that variable is instantiated in the constructor. My only thought is that the object is being instantiated as a different type of Waitable and then interpreted as an ActionClient, causing the call to get_num_entities to fail. That is a wild guess, but with the information given, it's my best theory.
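If an executor really is polling a half-constructed client, one defensive workaround would be to tolerate the missing attribute. A hypothetical sketch, not the merged fix; GuardedActionClient is an illustrative name:

```python
from rclpy.action import ActionClient
from rclpy.waitable import NumberOfEntities


class GuardedActionClient(ActionClient):
    """Hypothetical workaround, not the merged fix."""

    def get_num_entities(self):
        # If an executor polls this waitable before ActionClient.__init__
        # has assigned self._lock, report zero entities instead of raising
        # AttributeError.
        if getattr(self, '_lock', None) is None:
            return NumberOfEntities()
        return super().get_num_entities()
```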
Any update on this?
@ahcorde I just rebased again