Search index is not flushed during SET command
I can't reproduce this bug with the minimal scenario.
This crash can be reproduced with traffic-replay using the -ignore-parse-errors option.
F20250602 09:40:36.681166 701 doc_index.cc:194] Check failed: ids_.count(key) == 0u (1 vs. 0)
*** Check failure stack trace: ***
@ 0x564f59cfecf3 google::LogMessage::Fail()
@ 0x564f59cfec39 google::LogMessage::SendToLog()
@ 0x564f59cfe3ec google::LogMessage::Flush()
@ 0x564f59d024e4 google::LogMessageFatal::~LogMessageFatal()
@ 0x564f59354c90 dfly::ShardDocIndex::DocKeyIndex::Add()
@ 0x564f59355ff8 dfly::ShardDocIndex::AddDoc()
@ 0x564f593582f4 dfly::ShardDocIndices::AddDoc()
@ 0x564f58e4a992 dfly::(anonymous namespace)::OpSet()
@ 0x564f58e4d914 _ZZN4dfly10HSetFamily4HSetEN4absl12lts_202407224SpanIKSt17basic_string_viewIcSt11char_traitsIcEEEERKNS_14CommandContextEENKUlPNS_11TransactionEPNS_11EngineShardEE_clESE_SG_
@ 0x564f58e52c41 _ZZN4dfly11Transaction18ScheduleSingleHopTIZNS_10HSetFamily4HSetEN4absl12lts_202407224SpanIKSt17basic_string_viewIcSt11char_traitsIcEEEERKNS_14CommandContextEEUlPS0_PNS_11EngineShardEE_EEDTclfp_fpTLDnEEEOT_ENKUlSF_SH_E_clESF_SH_
@ 0x564f58e55ddc _ZSt13__invoke_implIN6facade8OpStatusERKZN4dfly11Transaction18ScheduleSingleHopTIZNS2_10HSetFamily4HSetEN4absl12lts_202407224SpanIKSt17basic_string_viewIcSt11char_traitsIcEEEERKNS2_14CommandContextEEUlPS3_PNS2_11EngineShardEE_EEDTclfp_fpTLDnEEEOT_EUlSI_SK_E_JSI_SK_EESN_St14__invoke_otherOT0_DpOT1_
@ 0x564f58e55871 _ZSt8__invokeIRKZN4dfly11Transaction18ScheduleSingleHopTIZNS0_10HSetFamily4HSetEN4absl12lts_202407224SpanIKSt17basic_string_viewIcSt11char_traitsIcEEEERKNS0_14CommandContextEEUlPS1_PNS0_11EngineShardEE_EEDTclfp_fpTLDnEEEOT_EUlSG_SI_E_JSG_SI_EENSt15__invoke_resultISL_JDpT0_EE4typeESM_DpOSR_
@ 0x564f58e55090 _ZSt6invokeIRKZN4dfly11Transaction18ScheduleSingleHopTIZNS0_10HSetFamily4HSetEN4absl12lts_202407224SpanIKSt17basic_string_viewIcSt11char_traitsIcEEEERKNS0_14CommandContextEEUlPS1_PNS0_11EngineShardEE_EEDTclfp_fpTLDnEEEOT_EUlSG_SI_E_JSG_SI_EENSt13invoke_resultISL_JDpT0_EE4typeESM_DpOSR_
@ 0x564f58e54723 _ZN4absl12lts_2024072219functional_internal12InvokeObjectIZN4dfly11Transaction18ScheduleSingleHopTIZNS3_10HSetFamily4HSetENS0_4SpanIKSt17basic_string_viewIcSt11char_traitsIcEEEERKNS3_14CommandContextEEUlPS4_PNS3_11EngineShardEE_EEDTclfp_fpTLDnEEEOT_EUlSH_SJ_E_NS4_14RunnableResultEJSH_SJ_EEET0_NS1_7VoidPtrEDpNS1_8ForwardTIT1_E4typeE
@ 0x564f5953e2b6 absl::lts_20240722::FunctionRef<>::operator()()
@ 0x564f5952a9e7 dfly::Transaction::RunCallback()
@ 0x564f5952fdac dfly::Transaction::ScheduleInShard()
@ 0x564f5952b313 _ZZN4dfly11Transaction16ScheduleInternalEvENKUlvE_clEv
@ 0x564f5953ad3a _ZSt13__invoke_implIvRZN4dfly11Transaction16ScheduleInternalEvEUlvE_JEET_St14__invoke_otherOT0_DpOT1_
@ 0x564f5953a5b9 _ZSt10__invoke_rIvRZN4dfly11Transaction16ScheduleInternalEvEUlvE_JEENSt9enable_ifIX16is_invocable_r_vIT_T0_DpT1_EES5_E4typeEOS6_DpOS7_
@ 0x564f59539a03 _ZNSt17_Function_handlerIFvvEZN4dfly11Transaction16ScheduleInternalEvEUlvE_E9_M_invokeERKSt9_Any_data
@ 0x564f58d1b12e std::function<>::operator()()
@ 0x564f59c890a6 util::fb2::FiberQueue::Run()
@ 0x564f5961caab _ZZN4dfly9TaskQueue5StartESt17basic_string_viewIcSt11char_traitsIcEEENKUlvE_clEv
@ 0x564f5961dc87 _ZSt13__invoke_implIvZN4dfly9TaskQueue5StartESt17basic_string_viewIcSt11char_traitsIcEEEUlvE_JEET_St14__invoke_otherOT0_DpOT1_
@ 0x564f5961da0c _ZSt8__invokeIZN4dfly9TaskQueue5StartESt17basic_string_viewIcSt11char_traitsIcEEEUlvE_JEENSt15__invoke_resultIT_JDpT0_EE4typeEOS8_DpOS9_
@ 0x564f5961d81a _ZSt12__apply_implIZN4dfly9TaskQueue5StartESt17basic_string_viewIcSt11char_traitsIcEEEUlvE_St5tupleIJEEJEEDcOT_OT0_St16integer_sequenceImJXspT1_EEE
@ 0x564f5961d854 _ZSt5applyIZN4dfly9TaskQueue5StartESt17basic_string_viewIcSt11char_traitsIcEEEUlvE_St5tupleIJEEEDcOT_OT0_
@ 0x564f5961d8d1 _ZN4util3fb26detail15WorkerFiberImplIZN4dfly9TaskQueue5StartESt17basic_string_viewIcSt11char_traitsIcEEEUlvE_JEE4run_EON5boost7context5fiberE
@ 0x564f5961d44f _ZZN4util3fb26detail15WorkerFiberImplIZN4dfly9TaskQueue5StartESt17basic_string_viewIcSt11char_traitsIcEEEUlvE_JEEC4INS0_19FixedStackAllocatorEEES8_NS0_13FiberPriorityERKN5boost7context12preallocatedEOT_OS9_ENKUlONSF_5fiberEE_clESN_
@ 0x564f5961e5c8 _ZSt13__invoke_implIN5boost7context5fiberERZN4util3fb26detail15WorkerFiberImplIZN4dfly9TaskQueue5StartESt17basic_string_viewIcSt11char_traitsIcEEEUlvE_JEEC4INS4_19FixedStackAllocatorEEESC_NS4_13FiberPriorityERKNS1_12preallocatedEOT_OSD_EUlOS2_E_JS2_EESL_St14__invoke_otherOT0_DpOT1_
@ 0x564f5961e3f3 _ZSt8__invokeIRZN4util3fb26detail15WorkerFiberImplIZN4dfly9TaskQueue5StartESt17basic_string_viewIcSt11char_traitsIcEEEUlvE_JEEC4INS1_19FixedStackAllocatorEEES9_NS1_13FiberPriorityERKN5boost7context12preallocatedEOT_OSA_EUlONSG_5fiberEE_JSN_EENSt15__invoke_resultISK_JDpT0_EE4typeESL_DpOSS_
*** SIGABRT received at time=1748846436 on cpu 6 ***
PC: @ 0x7ebbc7178b2c (unknown) pthread_kill
@ 0x564f59d87070 64 absl::lts_20240722::WriteFailureInfo()
@ 0x564f59d872ec 96 absl::lts_20240722::AbslFailureSignalHandler()
@ 0x7ebbc711f330 1776 (unknown)
@ 0x7ebbc711f27e 32 raise
@ 0x7ebbc71028ff 192 abort
@ 0x564f59d09ad2 176 google::DumpStackTraceAndExit()
@ 0x564f59cfecf3 16 google::LogMessage::Fail()
@ 0x564f59cfec39 160 google::LogMessage::SendToLog()
@ 0x564f59cfe3ec 80 google::LogMessage::Flush()
@ 0x564f59d024e4 32 google::LogMessageFatal::~LogMessageFatal()
@ 0x564f59354c90 208 dfly::ShardDocIndex::DocKeyIndex::Add()
@ 0x564f59355ff8 112 dfly::ShardDocIndex::AddDoc()
@ 0x564f593582f4 160 dfly::ShardDocIndices::AddDoc()
@ 0x564f58e4a992 592 dfly::(anonymous namespace)::OpSet()
@ 0x564f58e4d914 128 dfly::HSetFamily::HSet()::{lambda()#1}::operator()()
@ 0x564f58e52c41 64 dfly::Transaction::ScheduleSingleHopT<>()::{lambda()#1}::operator()()
@ 0x564f58e55ddc 64 std::__invoke_impl<>()
@ 0x564f58e55871 64 std::__invoke<>()
@ 0x564f58e55090 64 std::invoke<>()
@ 0x564f58e54723 96 absl::lts_20240722::functional_internal::InvokeObject<>()
@ 0x564f5953e2b6 64 absl::lts_20240722::FunctionRef<>::operator()()
@ 0x564f5952a9e7 240 dfly::Transaction::RunCallback()
@ 0x564f5952fdac 448 dfly::Transaction::ScheduleInShard()
@ 0x564f5952b313 96 dfly::Transaction::ScheduleInternal()::{lambda()#1}::operator()()
@ 0x564f5953ad3a 32 std::__invoke_impl<>()
@ 0x564f5953a5b9 32 std::__invoke_r<>()
@ 0x564f59539a03 32 std::_Function_handler<>::_M_invoke()
@ 0x564f58d1b12e 32 std::function<>::operator()()
@ 0x564f59c890a6 240 util::fb2::FiberQueue::Run()
@ 0x564f5961caab 32 dfly::TaskQueue::Start()::{lambda()#1}::operator()()
@ 0x564f5961dc87 32 std::__invoke_impl<>()
@ 0x564f5961da0c 32 std::__invoke<>()
@ ... and at least 10 more frames
[1] 693 IOT instruction (core dumped) ./dragonfly --dbfilename= --logtostderr 2>&1
It seems we didn't remove the key in some cases
@vyavdoshenko Note that it crashes even when proactor_threads=1, so there is no race condition here
@BagritsevichStepan I could reproduce it using several proactors. You can rename this issue.
Yes, but reproducing the issue with multiple proactors doesn't necessarily mean it's a race condition. You can only confirm it with proactor_threads=1. If the crash still happens in that case, then multithreading is not the issue. On the other hand, if the crash only occurs with multiple proactors and not with a single one, then it could indeed be a race condition
The bug is an edge case involving a document (hset) with an empty key and a HASH index that has an empty prefix (includes all hash sets). I am sure that it is pretty easy to reproduce this edge case in the unit test - it should involve a document being added to the index with an empty key.
@romange I agree that this is a bug. And I can catch a crash with empty key, but I can not catch the same crash, when DCHECK_EQ(ids_.count(key), 0u); fails.
And I don't see a bug in the ShardDocIndex for empty key except this check DCHECK_GT(keys_[id].size(), 0u); . That can be easily fixed
This is the key for which ids_.count(key) returns not zero if you run the traffic - "\"return {KEYS[1],ARGV[1]}\" 1 key:123 \"value\\twith\\ttabs\""
so there are two bugs and I suggest starting with the one we understand.
https://github.com/dragonflydb/dragonfly/pull/5264
I found the second bug:
127.0.0.1:6379> ft.create index on hash schema field TEXT
OK
127.0.0.1:6379> hset k1 field value
(integer) 1
127.0.0.1:6379> set k1 anothervalue
OK
127.0.0.1:6379> rename k1 anotherkey
OK
127.0.0.1:6379> hset k1 field value
Error: Server closed the connection
(1.29s)
The problem is that some commands don't check whether the key already exists and is indexed. We should call RemoveDoc in such cases
I think there are a lot of commands like set k1 anothervalue
@vyavdoshenko I would like to fix this
We already have issue for this: 5014
Another bug that I found during investigation: #5265
Crash still exists with the latest changes.
The recorded traffic: traffic-search.tar.gz
The traffic replay tool can be used with -skip-time-sec 900 option to speed up the reproduction of the crash.
The crash log:
F20250616 20:56:54.521500 42718 doc_index.cc:194] Check failed: ids_.count(key) == 0u (1 vs. 0)
*** Check failure stack trace: ***
@ 0x62fa8ae78063 google::LogMessage::Fail()
@ 0x62fa8ae77fa9 google::LogMessage::SendToLog()
@ 0x62fa8ae7775c google::LogMessage::Flush()
@ 0x62fa8ae7b854 google::LogMessageFatal::~LogMessageFatal()
@ 0x62fa8a7f9ca4 dfly::ShardDocIndex::DocKeyIndex::Add()
@ 0x62fa8a7fb0c0 dfly::ShardDocIndex::AddDoc()
@ 0x62fa8a7fd3bc dfly::ShardDocIndices::AddDoc()
@ 0x62fa8a3060bc dfly::(anonymous namespace)::OpSet()
@ 0x62fa8a30903e _ZZN4dfly10HSetFamily4HSetEN4absl12lts_202407224SpanIKSt17basic_string_viewIcSt11char_traitsIcEEEERKNS_14CommandContextEENKUlPNS_11TransactionEPNS_11EngineShardEE_clESE_SG_
@ 0x62fa8a30e36b _ZZN4dfly11Transaction18ScheduleSingleHopTIZNS_10HSetFamily4HSetEN4absl12lts_202407224SpanIKSt17basic_string_viewIcSt11char_traitsIcEEEERKNS_14CommandContextEEUlPS0_PNS_11EngineShardEE_EEDTclfp_fpTLDnEEEOT_ENKUlSF_SH_E_clESF_SH_
@ 0x62fa8a311506 _ZSt13__invoke_implIN6facade8OpStatusERKZN4dfly11Transaction18ScheduleSingleHopTIZNS2_10HSetFamily4HSetEN4absl12lts_202407224SpanIKSt17basic_string_viewIcSt11char_traitsIcEEEERKNS2_14CommandContextEEUlPS3_PNS2_11EngineShardEE_EEDTclfp_fpTLDnEEEOT_EUlSI_SK_E_JSI_SK_EESN_St14__invoke_otherOT0_DpOT1_
@ 0x62fa8a310f9b _ZSt8__invokeIRKZN4dfly11Transaction18ScheduleSingleHopTIZNS0_10HSetFamily4HSetEN4absl12lts_202407224SpanIKSt17basic_string_viewIcSt11char_traitsIcEEEERKNS0_14CommandContextEEUlPS1_PNS0_11EngineShardEE_EEDTclfp_fpTLDnEEEOT_EUlSG_SI_E_JSG_SI_EENSt15__invoke_resultISL_JDpT0_EE4typeESM_DpOSR_
@ 0x62fa8a3107ba _ZSt6invokeIRKZN4dfly11Transaction18ScheduleSingleHopTIZNS0_10HSetFamily4HSetEN4absl12lts_202407224SpanIKSt17basic_string_viewIcSt11char_traitsIcEEEERKNS0_14CommandContextEEUlPS1_PNS0_11EngineShardEE_EEDTclfp_fpTLDnEEEOT_EUlSG_SI_E_JSG_SI_EENSt13invoke_resultISL_JDpT0_EE4typeESM_DpOSR_
@ 0x62fa8a30fe4d _ZN4absl12lts_2024072219functional_internal12InvokeObjectIZN4dfly11Transaction18ScheduleSingleHopTIZNS3_10HSetFamily4HSetENS0_4SpanIKSt17basic_string_viewIcSt11char_traitsIcEEEERKNS3_14CommandContextEEUlPS4_PNS3_11EngineShardEE_EEDTclfp_fpTLDnEEEOT_EUlSH_SJ_E_NS4_14RunnableResultEJSH_SJ_EEET0_NS1_7VoidPtrEDpNS1_8ForwardTIT1_E4typeE
@ 0x62fa8a9e31ce absl::lts_20240722::FunctionRef<>::operator()()
@ 0x62fa8a9cfd3b dfly::Transaction::RunCallback()
@ 0x62fa8a9d4d08 dfly::Transaction::ScheduleInShard()
@ 0x62fa8a9d0667 _ZZN4dfly11Transaction16ScheduleInternalEvENKUlvE_clEv
@ 0x62fa8a9dfcd4 _ZSt13__invoke_implIvRZN4dfly11Transaction16ScheduleInternalEvEUlvE_JEET_St14__invoke_otherOT0_DpOT1_
@ 0x62fa8a9df553 _ZSt10__invoke_rIvRZN4dfly11Transaction16ScheduleInternalEvEUlvE_JEENSt9enable_ifIX16is_invocable_r_vIT_T0_DpT1_EES5_E4typeEOS6_DpOS7_
@ 0x62fa8a9de99d _ZNSt17_Function_handlerIFvvEZN4dfly11Transaction16ScheduleInternalEvEUlvE_E9_M_invokeERKSt9_Any_data
@ 0x62fa8a1d4d2e std::function<>::operator()()
@ 0x62fa8ae01bd0 util::fb2::FiberQueue::Run()
@ 0x62fa8aac4149 _ZZN4dfly9TaskQueue5StartESt17basic_string_viewIcSt11char_traitsIcEEENKUlvE_clEv
@ 0x62fa8aac5325 _ZSt13__invoke_implIvZN4dfly9TaskQueue5StartESt17basic_string_viewIcSt11char_traitsIcEEEUlvE_JEET_St14__invoke_otherOT0_DpOT1_
@ 0x62fa8aac50aa _ZSt8__invokeIZN4dfly9TaskQueue5StartESt17basic_string_viewIcSt11char_traitsIcEEEUlvE_JEENSt15__invoke_resultIT_JDpT0_EE4typeEOS8_DpOS9_
@ 0x62fa8aac4eb8 _ZSt12__apply_implIZN4dfly9TaskQueue5StartESt17basic_string_viewIcSt11char_traitsIcEEEUlvE_St5tupleIJEEJEEDcOT_OT0_St16integer_sequenceImJXspT1_EEE
@ 0x62fa8aac4ef2 _ZSt5applyIZN4dfly9TaskQueue5StartESt17basic_string_viewIcSt11char_traitsIcEEEUlvE_St5tupleIJEEEDcOT_OT0_
@ 0x62fa8aac4f6f _ZN4util3fb26detail15WorkerFiberImplIZN4dfly9TaskQueue5StartESt17basic_string_viewIcSt11char_traitsIcEEEUlvE_JEE4run_EON5boost7context5fiberE
@ 0x62fa8aac4aed _ZZN4util3fb26detail15WorkerFiberImplIZN4dfly9TaskQueue5StartESt17basic_string_viewIcSt11char_traitsIcEEEUlvE_JEEC4INS0_19FixedStackAllocatorEEES8_NS0_13FiberPriorityERKN5boost7context12preallocatedEOT_OS9_ENKUlONSF_5fiberEE_clESN_
@ 0x62fa8aac5c66 _ZSt13__invoke_implIN5boost7context5fiberERZN4util3fb26detail15WorkerFiberImplIZN4dfly9TaskQueue5StartESt17basic_string_viewIcSt11char_traitsIcEEEUlvE_JEEC4INS4_19FixedStackAllocatorEEESC_NS4_13FiberPriorityERKNS1_12preallocatedEOT_OSD_EUlOS2_E_JS2_EESL_St14__invoke_otherOT0_DpOT1_
@ 0x62fa8aac5a91 _ZSt8__invokeIRZN4util3fb26detail15WorkerFiberImplIZN4dfly9TaskQueue5StartESt17basic_string_viewIcSt11char_traitsIcEEEUlvE_JEEC4INS1_19FixedStackAllocatorEEES9_NS1_13FiberPriorityERKN5boost7context12preallocatedEOT_OSA_EUlONSG_5fiberEE_JSN_EENSt15__invoke_resultISK_JDpT0_EE4typeESL_DpOSS_
*** SIGABRT received at time=1750096614 on cpu 3 ***
PC: @ 0x7e7f1147eb2c (unknown) pthread_kill
@ 0x62fa8af0019e 64 absl::lts_20240722::WriteFailureInfo()
@ 0x62fa8af0041a 96 absl::lts_20240722::AbslFailureSignalHandler()
@ 0x7e7f11425330 1776 (unknown)
@ 0x7e7f1142527e 32 raise
@ 0x7e7f114088ff 192 abort
@ 0x62fa8ae82e8e 176 google::DumpStackTraceAndExit()
@ 0x62fa8ae78063 16 google::LogMessage::Fail()
@ 0x62fa8ae77fa9 160 google::LogMessage::SendToLog()
@ 0x62fa8ae7775c 80 google::LogMessage::Flush()
@ 0x62fa8ae7b854 32 google::LogMessageFatal::~LogMessageFatal()
@ 0x62fa8a7f9ca4 208 dfly::ShardDocIndex::DocKeyIndex::Add()
@ 0x62fa8a7fb0c0 112 dfly::ShardDocIndex::AddDoc()
@ 0x62fa8a7fd3bc 160 dfly::ShardDocIndices::AddDoc()
@ 0x62fa8a3060bc 592 dfly::(anonymous namespace)::OpSet()
@ 0x62fa8a30903e 128 dfly::HSetFamily::HSet()::{lambda()#1}::operator()()
@ 0x62fa8a30e36b 64 dfly::Transaction::ScheduleSingleHopT<>()::{lambda()#1}::operator()()
@ 0x62fa8a311506 64 std::__invoke_impl<>()
@ 0x62fa8a310f9b 64 std::__invoke<>()
@ 0x62fa8a3107ba 64 std::invoke<>()
@ 0x62fa8a30fe4d 96 absl::lts_20240722::functional_internal::InvokeObject<>()
@ 0x62fa8a9e31ce 64 absl::lts_20240722::FunctionRef<>::operator()()
@ 0x62fa8a9cfd3b 240 dfly::Transaction::RunCallback()
@ 0x62fa8a9d4d08 448 dfly::Transaction::ScheduleInShard()
@ 0x62fa8a9d0667 96 dfly::Transaction::ScheduleInternal()::{lambda()#1}::operator()()
@ 0x62fa8a9dfcd4 32 std::__invoke_impl<>()
@ 0x62fa8a9df553 32 std::__invoke_r<>()
@ 0x62fa8a9de99d 32 std::_Function_handler<>::_M_invoke()
@ 0x62fa8a1d4d2e 32 std::function<>::operator()()
@ 0x62fa8ae01bd0 240 util::fb2::FiberQueue::Run()
@ 0x62fa8aac4149 32 dfly::TaskQueue::Start()::{lambda()#1}::operator()()
@ 0x62fa8aac5325 32 std::__invoke_impl<>()
@ 0x62fa8aac50aa 32 std::__invoke<>()
@ ... and at least 10 more frames
[1] 42713 IOT instruction (core dumped) ./dragonfly --dbfilename= --logtostderr 2>&1
It is interesting, I added the same snippet from the traffic to our search tests. Maybe the problem could be if we have multiple shards.
Lets lower priority of this bug right now and focus on higher priority issues