grpc-rs
grpc-rs copied to clipboard
Server is destructed before every grpc-poll thread exits
https://github.com/pingcap/grpc-rs/blob/1c64b3d16ffcbfc37efcd060cfa03a8ad69f7f46/src/server.rs#L499
If the server spawns more than one grpc-poll thread, the server may be destructed before all the grpc-poll threads exit. @overvenus
Why doesn't the wait in Drop work?
I guess only one of the grpc-poll threads receives the shutdown message, quits, and triggers the promise.
We shut down a server by calling grpc_server_shutdown_and_notify. So all poll threads should be shut down as long as they are registered to the server. You can test whether we shut down the server correctly.
/** Begin shutting down a server.
After completion, no new calls or connections will be admitted.
Existing calls will be allowed to complete.
Send a GRPC_OP_COMPLETE event when there are no more calls being serviced.
Shutdown is idempotent, and all tags will be notified at once if multiple
grpc_server_shutdown_and_notify calls are made. 'cq' must have been
registered to this server via grpc_server_register_completion_queue. */
GRPCAPI void grpc_server_shutdown_and_notify(grpc_server* server,
grpc_completion_queue* cq,
void* tag);
@DorianZheng
please give a test to reproduce this bug, so @overvenus can help fix it.
@overvenus https://github.com/messense/crash-kv, run cargo test repeatedly and it will crash. From the strace output, there is still a grpc-poll-1 thread running after the main thread exits.
#0 0x00007fa9d7abe277 in raise () from /lib64/libc.so.6
#1 0x00007fa9d7abf968 in abort () from /lib64/libc.so.6
#2 0x00007fa9d8726575 in __gnu_cxx::__verbose_terminate_handler() () from /opt/scylladb/lib64/libstdc++.so.6
#3 0x00007fa9d8724166 in ?? () from /opt/scylladb/lib64/libstdc++.so.6
#4 0x00007fa9d87241b1 in std::terminate() () from /opt/scylladb/lib64/libstdc++.so.6
#5 0x00007fa9d8724f9f in __cxa_pure_virtual () from /opt/scylladb/lib64/libstdc++.so.6
#6 0x00007fa9d8dca02f in rocksdb::DBImpl::FindObsoleteFiles (this=0x7fa9d6a3c000, job_context=0x7fa9d4dfcaa0, force=true, no_full_scan=false) at /home/pingcap/.cargo/git/checkouts/rust-rocksdb-82ef6e5337b3fbe6/d2fe0a9/librocksdb_sys/rocksdb/db/db_impl_files.cc:201
#7 0x00007fa9d8d789d1 in rocksdb::DBImpl::~DBImpl (this=0x7fa9d6a3c000, __in_chrg=<optimized out>) at /home/pingcap/.cargo/git/checkouts/rust-rocksdb-82ef6e5337b3fbe6/d2fe0a9/librocksdb_sys/rocksdb/db/db_impl.cc:308
#8 0x00007fa9d8d79036 in rocksdb::DBImpl::~DBImpl (this=0x7fa9d6a3c000, __in_chrg=<optimized out>) at /home/pingcap/.cargo/git/checkouts/rust-rocksdb-82ef6e5337b3fbe6/d2fe0a9/librocksdb_sys/rocksdb/db/db_impl.cc:357
#9 0x00007fa9d8d3b09c in crocksdb_close (db=0x7fa9d6a0f0a8) at crocksdb/c.cc:625
#10 0x00007fa9d8d116a2 in _$LT$rocksdb..rocksdb..DB$u20$as$u20$core..ops..drop..Drop$GT$::drop::hd2c7a756268c8813 (self=0x7fa9d6a5ea98) at /home/pingcap/.cargo/git/checkouts/rust-rocksdb-82ef6e5337b3fbe6/d2fe0a9/src/rocksdb.rs:1583
#11 0x00007fa9d8ca7061 in core::ptr::drop_in_place::hf2a2c3fc2a5a02ec () at /checkout/src/libcore/ptr.rs:59
#12 0x00007fa9d8ca690e in core::ptr::drop_in_place::hc9f584cc0b0f6dcb () at /checkout/src/libcore/ptr.rs:59
#13 0x00007fa9d8ca6b72 in core::ptr::drop_in_place::hdb1d809efd738a22 () at /checkout/src/libcore/ptr.rs:59
#14 0x00007fa9d8cc1559 in _$LT$alloc..sync..Arc$LT$T$GT$$GT$::drop_slow::h14aec936067e8ad9 (self=0x7fa9d6a0d170) at /checkout/src/liballoc/sync.rs:523
#15 0x00007fa9d8cc1818 in _$LT$alloc..sync..Arc$LT$T$GT$$u20$as$u20$core..ops..drop..Drop$GT$::drop::hdff1224b3f1fe6f5 (self=0x7fa9d6a0d170) at /checkout/src/liballoc/sync.rs:976
#16 0x00007fa9d8ca688e in core::ptr::drop_in_place::hc6c271f39e74f266 () at /checkout/src/libcore/ptr.rs:59
#17 0x00007fa9d8ca467e in core::ptr::drop_in_place::h0a945c7aabecefbf () at /checkout/src/libcore/ptr.rs:59
#18 0x00007fa9d8ca637e in core::ptr::drop_in_place::hb14b2ce43ce6b626 () at /checkout/src/libcore/ptr.rs:59
#19 0x00007fa9d8ca5e52 in core::ptr::drop_in_place::h96b2689f939b992b () at /checkout/src/libcore/ptr.rs:59
#20 0x00007fa9d9171167 in core::ptr::drop_in_place::h471244251de39761 () at /checkout/src/libcore/ptr.rs:59
#21 0x00007fa9d9172cbe in core::ptr::drop_in_place::hfb58367d9737c5ab () at /checkout/src/libcore/ptr.rs:59
#22 0x00007fa9d9170ee2 in core::ptr::drop_in_place::h39cb27c4c6ab2451 () at /checkout/src/libcore/ptr.rs:59
#23 0x00007fa9d916babe in _$LT$std..collections..hash..table..RawTable$LT$K$C$$u20$V$GT$$GT$::rev_drop_buckets::haae6044a0544c6d1 (self=0x7fa9d6a6a130) at /checkout/src/libstd/collections/hash/table.rs:838
#24 0x00007fa9d916fe7f in _$LT$std..collections..hash..table..RawTable$LT$K$C$$u20$V$GT$$u20$as$u20$core..ops..drop..Drop$GT$::drop::h39603141b4bc4878 (self=0x7fa9d6a6a130) at /checkout/src/libstd/collections/hash/table.rs:1121
#25 0x00007fa9d9171abe in core::ptr::drop_in_place::h860c1af54175be58 () at /checkout/src/libcore/ptr.rs:59
#26 0x00007fa9d9172c32 in core::ptr::drop_in_place::hf54b63c7d9d504ba () at /checkout/src/libcore/ptr.rs:59
#27 0x00007fa9d9172556 in core::ptr::drop_in_place::hd36e51a2aefe9e03 () at /checkout/src/libcore/ptr.rs:59
#28 0x00007fa9d917bd49 in _$LT$alloc..sync..Arc$LT$T$GT$$GT$::drop_slow::h44fb76dc69c4009b (self=0x7fa9d4dfd1a0) at /checkout/src/liballoc/sync.rs:523
#29 0x00007fa9d917cd08 in _$LT$alloc..sync..Arc$LT$T$GT$$u20$as$u20$core..ops..drop..Drop$GT$::drop::ha42da6bfb8fb20b1 (self=0x7fa9d4dfd1a0) at /checkout/src/liballoc/sync.rs:976
#30 0x00007fa9d91722ce in core::ptr::drop_in_place::hc4cb2b49e54c7e13 () at /checkout/src/libcore/ptr.rs:59
#31 0x00007fa9d9196a25 in grpcio::server::request_call::he3018b8c840eb78b (inner=..., cq=0x7fa9d4dfd558) at /home/pingcap/.cargo/registry/src/github.com-1ecc6299db9ec823/grpcio-0.3.0/src/server.rs:389
#32 0x00007fa9d9175d3e in grpcio::async::callback::Request::resolve::hc4b3c871a48174e6 (self=..., cq=0x7fa9d4dfd558, success=false) at /home/pingcap/.cargo/registry/src/github.com-1ecc6299db9ec823/grpcio-0.3.0/src/async/callback.rs:38
#33 0x00007fa9d917524d in grpcio::async::CallTag::resolve::hc4ddf910dca377b3 (self=..., cq=0x7fa9d4dfd558, success=false) at /home/pingcap/.cargo/registry/src/github.com-1ecc6299db9ec823/grpcio-0.3.0/src/async/mod.rs:188
#34 0x00007fa9d9179fe5 in grpcio::env::poll_queue::hb0d34a71cdd6cfe4 (cq=...) at /home/pingcap/.cargo/registry/src/github.com-1ecc6299db9ec823/grpcio-0.3.0/src/env.rs:38
#35 0x00007fa9d917ab4e in grpcio::env::EnvBuilder::build::_$u7b$$u7b$closure$u7d$$u7d$::haf2492bc9a01b343 () at /home/pingcap/.cargo/registry/src/github.com-1ecc6299db9ec823/grpcio-0.3.0/src/env.rs:79
#36 0x00007fa9d918a703 in std::sys_common::backtrace::__rust_begin_short_backtrace::h5d08d0ab96272d38 (f=...) at /checkout/src/libstd/sys_common/backtrace.rs:136
#37 0x00007fa9d918b25e in std::thread::Builder::spawn::_$u7b$$u7b$closure$u7d$$u7d$::_$u7b$$u7b$closure$u7d$$u7d$::h4c29a7c87ec63054 () at /checkout/src/libstd/thread/mod.rs:409
#38 0x00007fa9d918a5b1 in _$LT$std..panic..AssertUnwindSafe$LT$F$GT$$u20$as$u20$core..ops..function..FnOnce$LT$$LP$$RP$$GT$$GT$::call_once::h3fa18d9c309d6b26 (self=..., _args=0) at /checkout/src/libstd/panic.rs:313
#39 0x00007fa9d918b365 in std::panicking::try::do_call::h9992f3688909cec0 (data=0x7fa9d4dfd6e8 "\300\325\240֩\177") at /checkout/src/libstd/panicking.rs:310
#40 0x00007fa9d9406f4a in __rust_maybe_catch_panic () at libpanic_unwind/lib.rs:105
#41 0x00007fa9d918b2a3 in std::panicking::try::h13ef73327f295d7c (f=...) at /checkout/src/libstd/panicking.rs:289
#42 0x00007fa9d918a943 in std::panic::catch_unwind::hd1c13d41c5a7c446 (f=...) at /checkout/src/libstd/panic.rs:392
#43 0x00007fa9d918b078 in std::thread::Builder::spawn::_$u7b$$u7b$closure$u7d$$u7d$::hfd0d1a2c523fbf44 () at /checkout/src/libstd/thread/mod.rs:408
#44 0x00007fa9d918c107 in _$LT$F$u20$as$u20$alloc..boxed..FnBox$LT$A$GT$$GT$::call_box::he41d6f73c6ef6096 (self=0x7fa9d6a0df20, args=0) at /checkout/src/liballoc/boxed.rs:640
#45 0x00007fa9d93e8e4b in call_once<(),()> () at /checkout/src/liballoc/boxed.rs:650
#46 std::sys_common::thread::start_thread::h03db540309e1b328 () at libstd/sys_common/thread.rs:24
#47 0x00007fa9d93e7ac6 in std::sys::unix::thread::Thread::new::thread_start::h0213a326a5a2e658 () at libstd/sys/unix/thread.rs:90
#48 0x00007fa9d8073e25 in start_thread () from /lib64/libpthread.so.0
#49 0x00007fa9d7b86bad in clone () from /lib64/libc.so.6
Looks like there is something wrong with the rocksdb.
What is your test output? @DorianZheng
Oh, on my macOS machine the test passes.
rustc -V
rustc 1.29.0-nightly (e5f6498d3 2018-07-10)
I have reproduced the problem: if we add drop(server) at the end of the function, the panic occurs every time.
The problem is that dropping the server only sends a quit message; it does not guarantee that all the gRPC threads have exited. So we may hit a case where the server has been dropped but a thread is still running.
We need to join all gRPC threads in the server's Drop so that they have exited. @overvenus
@overvenus ping
I've updated the code to grpcio 0.4.0, the bug still exists but seems harder to reproduce.
I will take a look later this week.
We need to join all gRPC threads in the server's Drop so that they have exited.
I don't think so; gRPC threads (Environment) may be shared by multiple servers or clients.
RocksDB panicked because there is a static local variable (env_) inside the rocksdb instance, and it is destructed when we call exit [1]. gRPC threads may destruct the rocksdb instance after env_ has been destructed, and a call to env_ then leads to __cxa_pure_virtual.
Here is a workaround for crash-kv
:
diff --git a/Cargo.toml b/Cargo.toml
index 7931695..e7b6ece 100644
--- a/Cargo.toml
+++ b/Cargo.toml
@@ -16,3 +16,5 @@ features = ["sse"]
[dev-dependencies]
tempfile = "3.0.3"
+lazy_static = "0.2"
+
diff --git a/src/main.rs b/src/main.rs
index 1f2955b..70c2968 100644
--- a/src/main.rs
+++ b/src/main.rs
@@ -5,6 +5,7 @@ extern crate grpcio_proto;
extern crate parking_lot;
extern crate rocksdb;
#[cfg(test)] extern crate tempfile;
+#[cfg(test)] #[macro_use] extern crate lazy_static;
mod kv;
mod kv_grpc;
@@ -70,11 +71,17 @@ mod test {
use super::KVDBService;
fn start_server() -> Server {
- let dir = tempdir().unwrap();
- let data_path = dir.path().to_str().unwrap();
- let db = DB::open_default(data_path).unwrap();
+ lazy_static! {
+ static ref SDB: Arc<RwLock<DB>> = {
+ let dir = tempdir().unwrap();
+ let data_path = dir.path().to_str().unwrap();
+ let db = DB::open_default(data_path).unwrap();
+ Arc::new(RwLock::new(db))
+ };
+ }
+
BTW, if we join gRPC threads when we drop Environment, it may lead to a deadlock, because env may be dropped in a gRPC thread => failed to join thread: Resource deadlock avoided (os error 35).
[1] https://en.cppreference.com/w/cpp/language/storage_duration#Static_local_variables