sofa-pbrpc icon indicating copy to clipboard operation
sofa-pbrpc copied to clipboard

Deadlock when stopping the RPC server

Open bigwatermelondurian opened this issue 8 years ago • 3 comments

发现一个dead lock场景:

stack 1:
#0 __lll_lock_wait () at ../sysdeps/unix/sysv/linux/x86_64/lowlevellock.S:135
#1 0x00007f845ed69dbd in __GI___pthread_mutex_lock (mutex=0x3df3c50) at ../nptl/pthread_mutex_lock.c:80
#2 0x00000000017d18a0 in sofa::pbrpc::RpcServerImpl::RestartListen() ()
#3 0x00000000017d275e in sofa::pbrpc::RpcServerImpl::TimerMaintain(boost::posix_time::ptime const&) ()
#4 0x00000000017baf3d in sofa::pbrpc::TimerWorker::on_timeout(boost::system::error_code const&) ()
#5 0x00000000017c0ca6 in void boost::asio::detail::strand_service::dispatch<boost::asio::detail::binder1<boost::_bi::bind_t<void, boost::_mfi::mf1<void, sofa::pbrpc::TimerWorker, boost::system::error_code const&>, boost::_bi::list2<boost::_bi::value<sofa::pbrpc::shared_ptr<sofa::pbrpc::TimerWorker> >, boost::arg<1> > >, boost::system::error_code> >(boost::asio::detail::strand_service::strand_impl*&, boost::asio::detail::binder1<boost::_bi::bind_t<void, boost::_mfi::mf1<void, sofa::pbrpc::TimerWorker, boost::system::error_code const&>, boost::_bi::list2<boost::_bi::value<sofa::pbrpc::shared_ptr<sofa::pbrpc::TimerWorker> >, boost::arg<1> > >, boost::system::error_code>&) ()
#6 0x00000000017c0f33 in void boost::asio::detail::wrapped_handler<boost::asio::io_service::strand, boost::_bi::bind_t<void, boost::_mfi::mf1<void, sofa::pbrpc::TimerWorker, boost::system::error_code const&>, boost::_bi::list2<boost::_bi::value<sofa::pbrpc::shared_ptr<sofa::pbrpc::TimerWorker> >, boost::arg<1> > >, boost::asio::detail::is_continuation_if_running>::operator()<boost::system::error_code>(boost::system::error_code const&) ()
#7 0x00000000017c118e in boost::asio::detail::completion_handler<boost::asio::detail::rewrapped_handler<boost::asio::detail::binder1<boost::asio::detail::wrapped_handler<boost::asio::io_service::strand, boost::_bi::bind_t<void, boost::_mfi::mf1<void, sofa::pbrpc::TimerWorker, boost::system::error_code const&>, boost::_bi::list2<boost::_bi::value<sofa::pbrpc::shared_ptr<sofa::pbrpc::TimerWorker> >, boost::arg<1> > >, boost::asio::detail::is_continuation_if_running>, boost::system::error_code>, boost::_bi::bind_t<void, boost::_mfi::mf1<void, sofa::pbrpc::TimerWorker, boost::system::error_code const&>, boost::_bi::list2<boost::_bi::value<sofa::pbrpc::shared_ptr<sofa::pbrpc::TimerWorker> >, boost::arg<1> > > > >::do_complete(boost::asio::detail::task_io_service*, boost::asio::detail::task_io_service_operation*, boost::system::error_code const&, unsigned long) ()
#8 0x00000000017c1484 in void boost::asio::detail::strand_service::dispatch<boost::asio::detail::rewrapped_handler<boost::asio::detail::binder1<boost::asio::detail::wrapped_handler<boost::asio::io_service::strand, boost::_bi::bind_t<void, boost::_mfi::mf1<void, sofa::pbrpc::TimerWorker, boost::system::error_code const&>, boost::_bi::list2<boost::_bi::value<sofa::pbrpc::shared_ptr<sofa::pbrpc::TimerWorker> >, boost::arg<1> > >, boost::asio::detail::is_continuation_if_running>, boost::system::error_code>, boost::_bi::bind_t<void, boost::_mfi::mf1<void, sofa::pbrpc::TimerWorker, boost::system::error_code const&>, boost::_bi::list2<boost::_bi::value<sofa::pbrpc::shared_ptr<sofa::pbrpc::TimerWorker> >, boost::arg<1> > > > >(boost::asio::detail::strand_service::strand_impl*&, boost::asio::detail::rewrapped_handler<boost::asio::detail::binder1<boost::asio::detail::wrapped_handler<boost::asio::io_service::strand, boost::_bi::bind_t<void, boost::_mfi::mf1<void, sofa::pbrpc::TimerWorker, boost::system::error_code const&>, boost::_bi::list2<boost::_bi::value<sofa::pbrpc::shared_ptr<sofa::pbrpc::TimerWorker> >, boost::arg<1> > >, boost::asio::detail::is_continuation_if_running>, boost::system::error_code>, boost::_bi::bind_t<void, boost::_mfi::mf1<void, sofa::pbrpc::TimerWorker, boost::system::error_code const&>, boost::_bi::list2<boost::_bi::value<sofa::pbrpc::shared_ptr<sofa::pbrpc::TimerWorker> >, boost::arg<1> > > >&) ()
#9 0x00000000017c1805 in boost::asio::detail::wait_handler<boost::asio::detail::wrapped_handler<boost::asio::io_service::strand, boost::_bi::bind_t<void, boost::_mfi::mf1<void, sofa::pbrpc::TimerWorker, boost::system::error_code const&>, boost::_bi::list2<boost::_bi::value<sofa::pbrpc::shared_ptr<sofa::pbrpc::TimerWorker> >, boost::arg<1> > >, boost::asio::detail::is_continuation_if_running> >::do_complete(boost::asio::detail::task_io_service*, boost::asio::detail::task_io_service_operation*, boost::system::error_code const&, unsigned long) ()
#10 0x00000000017b1d71 in sofa::pbrpc::ThreadGroupImpl::thread_run(void*) ()
#11 0x00007f845ed676ba in start_thread (arg=0x7f84458c3700) at pthread_create.c:333
#12 0x00007f845e1fc3dd in clone () at ../sysdeps/unix/sysv/linux/x86_64/clone.S:109

stack 2: #0 0x00007f845ed6898d in pthread_join (threadid=140206079227648, thread_return=0x0) at pthread_join.c:90 #1 0x00000000017afc1b in sofa::pbrpc::ThreadGroupImpl::stop() () #2 0x00000000017d169e in sofa::pbrpc::RpcServerImpl::Stop() ()

RpcServerImpl::Stop 的逻辑:先加 _start_stop_lock 锁,再调用 _maintain_thread_group->stop();而 _maintain_thread_group->stop() 需要 join 线程组里的线程。此时该线程正在执行 RestartListen(),并且 wait 在 _start_stop_lock 锁上:Stop 持有锁等待线程退出,线程又在等待这把锁,于是造成 dead lock。

bigwatermelondurian avatar Nov 16 '17 04:11 bigwatermelondurian

这个无所谓了吧

LazyPlanet avatar Nov 21 '17 06:11 LazyPlanet

问题不大,影响是会导致进程stop不掉

bigwatermelondurian avatar Dec 05 '17 04:12 bigwatermelondurian

是有问题的,RestartListen之前应该判断下server的_is_running状态

cyshi avatar Dec 05 '17 07:12 cyshi