nebula
nebula copied to clipboard
Graphd server coredump when nebula-java time-out
Please check the FAQ documentation before raising an issue
I have hit the same problem described in the links below. The difference is that I am using the nebula-java client rather than the nebula-go client, yet the call stack of the graphd crash is identical.
Describe the bug (required) https://discuss.nebula-graph.com.cn/t/topic/10101/13 https://github.com/vesoft-inc/nebula/issues/4635
- nebula version v3.2.1
Thread 79 "graph-netio25" received signal SIGSEGV, Segmentation fault.
[Switching to LWP 393143]
0x000000000673b4c8 in apache::thrift::transport::THeader::getSequenceNumber() const ()
(gdb) bt
#0 0x000000000673b4c8 in apache::thrift::transport::THeader::getSequenceNumber() const ()
#1 0x000000000673ba84 in apache::thrift::HeaderServerChannel::HeaderRequest::isOneway() const ()
#2 0x000000000673c002 in apache::thrift::Cpp2Connection::Cpp2Request::isOneway() const ()
#3 0x0000000006735785 in apache::thrift::Cpp2Connection::stop() ()
#4 0x0000000006738dfb in ?? ()
#5 0x000000000673afc5 in ?? ()
#6 0x000000000673a91e in ?? ()
#7 0x0000000006738fb9 in apache::thrift::Cpp2Connection::channelClosed(folly::exception_wrapper&&) ()
#8 0x000000000675c3a7 in apache::thrift::HeaderServerChannel::messageChannelEOF() ()
#9 0x00000000066a4f9e in apache::thrift::Cpp2Channel::processReadEOF() ()
#10 0x00000000066a4966 in apache::thrift::Cpp2Channel::readEOF(wangle::HandlerContext<int, std::pair<std::unique_ptr<folly::IOBuf, std::default_delete<folly::IOBuf> >, apache::thrift::transport::THeader*> >*) ()
#11 0x00000000066b91b0 in wangle::ContextImpl<apache::thrift::Cpp2Channel>::readEOF() ()
#12 0x00000000066b78f6 in wangle::ContextImpl<apache::thrift::FramingHandler>::fireReadEOF() ()
#13 0x00000000066bb047 in wangle::Handler<folly::IOBufQueue&, std::pair<std::unique_ptr<folly::IOBuf, std::default_delete<folly::IOBuf> >, std::unique_ptr<apache::thrift::transport::THeader, std::default_delete<apache::thrift::transport::THeader> > >, std::pair<std::unique_ptr<folly::IOBuf, std::default_delete<folly::IOBuf> >, apache::thrift::transport::THeader*>, std::unique_ptr<folly::IOBuf, std::default_delete<folly::IOBuf> > >::readEOF(wangle::HandlerContext<std::pair<std::unique_ptr<folly::IOBuf, std::default_delete<folly::IOBuf> >, std::unique_ptr<apache::thrift::transport::THeader, std::default_delete<apache::thrift::transport::THeader> > >, std::unique_ptr<folly::IOBuf, std::default_delete<folly::IOBuf> > >*) ()
#14 0x00000000066b80bc in wangle::ContextImpl<apache::thrift::FramingHandler>::readEOF() ()
#15 0x00000000066b5e18 in wangle::ContextImpl<apache::thrift::TAsyncTransportHandler>::fireReadEOF() ()
#16 0x00000000066a7355 in apache::thrift::TAsyncTransportHandler::readEOF() ()
#17 0x00000000069d30a9 in folly::AsyncSocket::handleRead() ()
#18 0x00000000069c7e30 in folly::AsyncSocket::ioReady(unsigned short) ()
#19 0x0000000006a9f3e4 in ?? ()
#20 0x0000000006a9fc9f in event_base_loop ()
#21 0x00000000069e1d95 in folly::EventBase::loopBody(int, bool) ()
#22 0x00000000069e267e in folly::EventBase::loop() ()
#23 0x00000000069e4f08 in folly::EventBase::loopForever() ()
#24 0x000000000696c799 in folly::IOThreadPoolExecutor::threadRun(std::shared_ptr<folly::ThreadPoolExecutor::Thread>) ()
#25 0x000000000697b0c5 in void folly::detail::function::FunctionTraits<void ()>::callSmall<std::_Bind<void (folly::ThreadPoolExecutor::*(folly::ThreadPoolExecutor*, std::shared_ptr<folly::ThreadPoolExecutor::Thread>))(std::shared_ptr<folly::ThreadPoolExecutor::Thread>)> >(folly::detail::function::Data&) ()
#26 0x0000000004618188 in folly::detail::function::FunctionTraits<void ()>::operator()() (this=0x7f34168112c0)
at ../../../third_party_build/install/include/folly/Function.h:400
#27 0x00000000046a0446 in folly::NamedThreadFactory::newThread(folly::Function<void ()>&&)::{lambda()#1}::operator()() (
--Type <RET> for more, q to quit, c to continue without paging--
__closure=0x7f34168112c0) at ../../../../third_party_build/install/include/folly/executors/thread_factory/NamedThreadFactory.h:40
#28 0x00000000046dffc7 in std::__invoke_impl<void, folly::NamedThreadFactory::newThread(folly::Function<void ()>&&)::{lambda()#1}>(std::__invoke_other, folly::NamedThreadFactory::newThread(folly::Function<void ()>&&)::{lambda()#1}&&) (__f=...)
at /opt/buildtools/gcc-10.3.0/include/c++/10.3.0/bits/invoke.h:60
#29 0x00000000046dfa91 in std::__invoke<folly::NamedThreadFactory::newThread(folly::Function<void ()>&&)::{lambda()#1}>(std::__invoke_result&&, (folly::NamedThreadFactory::newThread(folly::Function<void ()>&&)::{lambda()#1}&&)...) (__fn=...)
at /opt/buildtools/gcc-10.3.0/include/c++/10.3.0/bits/invoke.h:95
#30 0x00000000046df78c in std::thread::_Invoker<std::tuple<folly::NamedThreadFactory::newThread(folly::Function<void ()>&&)::{lambda()#1}> >::_M_invoke<0ul>(std::_Index_tuple<0ul>) (this=0x7f34168112c0) at /opt/buildtools/gcc-10.3.0/include/c++/10.3.0/thread:264
#31 0x00000000046df3c8 in std::thread::_Invoker<std::tuple<folly::NamedThreadFactory::newThread(folly::Function<void ()>&&)::{lambda()#1}> >::operator()() (this=0x7f34168112c0) at /opt/buildtools/gcc-10.3.0/include/c++/10.3.0/thread:271
#32 0x00000000046ded0a in std::thread::_State_impl<std::thread::_Invoker<std::tuple<folly::NamedThreadFactory::newThread(folly::Function<void ()>&&)::{lambda()#1}> > >::_M_run() (this=0x7f34168112b0) at /opt/buildtools/gcc-10.3.0/include/c++/10.3.0/thread:215
#33 0x0000000006fd915e in std::execute_native_thread_routine (__p=0x7f34168112b0) at ../../../.././libstdc++-v3/src/c++11/thread.cc:78
#34 0x00007f3416fb9f3b in ?? () from /usr/lib64/libpthread.so.0
#35 0x00007f3416ef1840 in clone () from /usr/lib64/libc.so.6
Your Environments (required)
- OS:
uname -a
Linux ncn4a-wisemlopsdppservice-32-203-100 4.18.0-147.5.1.6.h934.eulerosv2r9.x86_64 #1 SMP Sat Feb 4 09:00:27 UTC 2023 x86_64 x86_64 x86_64 GNU/Linux
- Compiler:
g++ --version
or clang++ --version
g++ (Ubuntu 10.5.0-1ubuntu1~20.04) 10.5.0
- CPU:
lscpu
Architecture: x86_64
CPU op-mode(s): 32-bit, 64-bit
Byte Order: Little Endian
Address sizes: 42 bits physical, 48 bits virtual
CPU(s): 16
On-line CPU(s) list: 0-15
Thread(s) per core: 2
Core(s) per socket: 8
Socket(s): 1
NUMA node(s): 1
Vendor ID: GenuineIntel
CPU family: 6
Model: 85
Model name: Intel(R) Xeon(R) Gold 6266C CPU @ 3.00GHz
Stepping: 7
CPU MHz: 3000.000
BogoMIPS: 6000.00
Hypervisor vendor: KVM
Virtualization type: full
L1d cache: 256 KiB
L1i cache: 256 KiB
L2 cache: 8 MiB
L3 cache: 30.3 MiB
NUMA node0 CPU(s): 0-15
Vulnerability Itlb multihit: Processor vulnerable
Vulnerability L1tf: Not affected
Vulnerability Mds: Not affected
Vulnerability Meltdown: Not affected
Vulnerability Mmio stale data: Vulnerable: Clear CPU buffers attempted, no microcode; SMT Host state unknown
Vulnerability Spec store bypass: Mitigation; Speculative Store Bypass disabled via prctl and seccomp
Vulnerability Spectre v1: Mitigation; usercopy/swapgs barriers and __user pointer sanitization
Vulnerability Spectre v2: Vulnerable, IBPB: disabled, STIBP: disabled, PBRSB-eIBRS: Vulnerable
Vulnerability Srbds: Not affected
Vulnerability Tsx async abort: Vulnerable: Clear CPU buffers attempted, no microcode; SMT Host state unknown
Flags: fpu vme de pse tsc msr pae mce cx8 apic sep mtrr pge mca cmov pat pse36 clflush mmx fxsr sse sse2 ss ht syscall nx pdpe1gb rdtscp lm constant_tsc rep_good nopl xtopology nonstop_tsc cpuid tsc_known_freq pni pclmulqdq ssse3 fma cx16 pcid sse4_1 sse4_2 x2apic movbe popcnt tsc_deadline_timer aes xsave avx f16c rdrand hypervisor lahf_lm abm 3dnowprefetch invpcid_single ssbd ibrs ibpb stibp ibrs_enhanced fsgsbase tsc_adjust bmi1 hle avx2 smep bmi2 erms invpcid rtm mpx avx512f avx512dq rdseed adx smap clflushopt clwb avx512cd avx512bw avx512vl xsaveopt xsavec xgetbv1 arat avx512_vnni md_clear flush_l1d arch_capabilities
- Commit id (e.g. a3ffc7d8)
version v3.2.1
nebula-graphd version Git: bb2e684
How To Reproduce(required)
Steps to reproduce the behavior:
- Step 1
- Step 2
- Step 3
Expected behavior
Additional context
The root cause is that apache::thrift::HeaderServerChannel::HeaderRequest::isOneway()
in the thrift library dereferences header_ without first checking whether it is null:
- thrift\lib\cpp2\async\HeaderServerChannel.h
class HeaderRequest final : public ResponseChannelRequest {
public:
HeaderRequest(
HeaderServerChannel* channel,
std::unique_ptr<folly::IOBuf>&& buf,
std::unique_ptr<apache::thrift::transport::THeader>&& header,
const server::TServerObserver::SamplingStatus& samplingStatus);
bool isActive() const override {
DCHECK(false);
return true;
}
// Note: this function should perform a null-pointer check
bool isOneway() const override {
return header_->getSequenceNumber() == ONEWAY_REQUEST_ID;
}
bool includeEnvelope() const override { return true; }
void setInOrderRecvSequenceId(uint32_t seqId) { InOrderRecvSeqId_ = seqId; }
It needs to be changed to:
// Note: this function should perform a null-pointer check
bool isOneway() const override {
if (header_.get() == nullptr) {
return true;
}
return header_->getSequenceNumber() == ONEWAY_REQUEST_ID;
}
This is my patch:
diff -ur a/thrift/lib/cpp2/async/HeaderServerChannel.h b/thrift/lib/cpp2/async/HeaderServerChannel.h
--- a/thrift/lib/cpp2/async/HeaderServerChannel.h 2022-08-18 16:27:45.353299307 +0800
+++ b/thrift/lib/cpp2/async/HeaderServerChannel.h 2022-08-18 16:27:28.453299912 +0800
@@ -108,6 +108,10 @@
}
bool isOneway() const override {
+ if (header_.get() == nullptr) {
+ LOG(ERROR) << "header request is null";
+ return true;
+ }
return header_->getSequenceNumber() == ONEWAY_REQUEST_ID;
}
Amazing, @flymysql! Would you mind submitting a PR to https://github.com/vesoft-inc/nebula-third-party/tree/master/project/patches ?
@dutor could you please take a look at this?
THANKS!
Seems already addressed by https://github.com/vesoft-inc/nebula-third-party/blob/release-3.3/project/patches/fbthrift-2021-11-29.patch ?
Seems already addressed by https://github.com/vesoft-inc/nebula-third-party/blob/release-3.3/project/patches/fbthrift-2021-11-29.patch ?
Oh, that's great. Since I'm still using the v3.0 version, I hadn't noticed that it was already fixed.
This is a known issue, and we have fixed it with an ad hoc patch.