yugabyte-db
yugabyte-db copied to clipboard
[DocDB] Raft Replication progress stalled in certain scenarios.
Jira Link: DB-3257
Description
It is possible that a replica may be ahead of the committed id. However, it is so far ahead that the leader is only able to send a fraction of all the pending op-ids.
Assume that the committed op id is 100 and the replica has entries up to index 200. The leader sends op ids [101 -> 150] to the peer. However, since the peer already has all of them, it does not update its last_received_from_current_leader and responds to the leader with 0.0 as its last_received_from_current_leader. This causes the leader to keep resending [101 -> 150] to the peer, stalling progress.
We should update last_received_from_current_leader whenever we receive anything from the current leader. (Note that updating last_received may not be safe unless we have actually received something new.)
i.e.
- last_received_from_current_leader must be updated based on un-DeDuplicated entries
- last_received must only be updated based on DeDuplicated entries
a repro test to add to raft_consensus-test.cc
// Repro for DB-3257: when a replica's log is already ahead of the batch a new
// leader sends, the replica must still report a non-minimum
// last_received_current_leader; otherwise the leader keeps resending the same
// (already-held) ops and replication progress stalls.
TEST_F(RaftConsensusTest, TestLastLeaderReceivedNotMinimum) {
SetUpConsensus(kMinimumTerm, 3);
SetUpGeneralExpectations();
ConsensusBootstrapInfo info;
ASSERT_OK(consensus_->Start(info));
ConsensusRequestPB request;
ConsensusResponsePB response;
int64_t caller_term = 0;
int64_t log_index = 0;
caller_term = 1;
// Peer 0 acts as the term-1 leader.
string caller_uuid = config_.peers(0).permanent_uuid();
OpIdPB preceding_opid = MinimumOpId();
// Heartbeat. This will cause the term to increment on the follower.
request = MakeConsensusRequest(caller_term, caller_uuid, preceding_opid);
response.Clear();
ASSERT_OK(consensus_->Update(&request, &response, CoarseBigDeadline()));
ASSERT_FALSE(response.status().has_error()) << response.ShortDebugString();
ASSERT_EQ(caller_term, response.responder_term());
// Nothing replicated yet: both watermarks start at the minimum op id.
ASSERT_OPID_EQ(response.status().last_received(), MinimumOpId());
ASSERT_OPID_EQ(response.status().last_received_current_leader(), MinimumOpId());
// Replicate a no-op.
OpIdPB noop_opid;
// Append 2000 no-ops in term 1 so this replica's log runs well ahead of
// what a later leader will cover in a single request.
for (int i = 0; i < 2000; ++i) {
noop_opid = MakeOpId(caller_term, ++log_index);
AddNoOpToConsensusRequest(&request, noop_opid);
}
response.Clear();
ASSERT_OK(consensus_->Update(&request, &response, CoarseBigDeadline()));
ASSERT_FALSE(response.status().has_error()) << response.ShortDebugString();
// After the batch, both watermarks point at the last appended no-op.
ASSERT_OPID_EQ(response.status().last_received(), noop_opid);
ASSERT_OPID_EQ(response.status().last_received_current_leader(), noop_opid);
// Peer 1 becomes the term-2 leader and sends an op the replica already has
// (it will be entirely de-duplicated on the follower side).
caller_term = 2;
caller_uuid = config_.peers(1).permanent_uuid();
auto behind_op_id = MakeOpId(caller_term - 1, 1);
request = MakeConsensusRequest(caller_term, caller_uuid, behind_op_id);
AddNoOpToConsensusRequest(&request, behind_op_id);
response.Clear();
ASSERT_OK(consensus_->Update(&request, &response, CoarseBigDeadline()));
ASSERT_FALSE(response.status().has_error()) << response.ShortDebugString();
// The key assertions: even though every op in the request was de-duplicated,
// last_received_current_leader must not collapse back to the minimum op id —
// that regression is exactly the stall this test reproduces.
ASSERT_OPID_EQ(response.status().last_received(), noop_opid);
ASSERT_OPID_EQ(response.status().last_received_current_leader(), noop_opid);
}
commit cdfe5f9224bad61101aebbe3b19a01a46e766b71
Author: Amitanand Aiyer <[email protected]>
Date: Mon Aug 22 23:36:17 2022 -0700
fix
diff --git a/src/yb/consensus/raft_consensus.cc b/src/yb/consensus/raft_consensus.cc
index d53a596979..15ed3c7dc7 100644
--- a/src/yb/consensus/raft_consensus.cc
+++ b/src/yb/consensus/raft_consensus.cc
@@ -2214,6 +2214,8 @@ Status RaftConsensus::MarkOperationsAsCommittedUnlocked(const ConsensusRequestPB
"Bad preceding_opid: $0, last received: $1",
deduped_req.preceding_op_id,
state_->GetLastReceivedOpIdUnlocked());
+ } else if (request.ops_size() != 0) {
+ state_->UpdateLastReceivedOpIdFromCurrentLeaderUnlocked(request.ops().rbegin()->id());
}
VLOG_WITH_PREFIX(1) << "Marking committed up to " << apply_up_to;
diff --git a/src/yb/consensus/replica_state.cc b/src/yb/consensus/replica_state.cc
index 282a61c988..42171b9733 100644
--- a/src/yb/consensus/replica_state.cc
+++ b/src/yb/consensus/replica_state.cc
@@ -1040,7 +1040,11 @@ void ReplicaState::UpdateLastReceivedOpIdUnlocked(const OpIdPB& op_id) {
<< ", Trace:" << std::endl << (trace ? trace->DumpToString(true) : "No trace found");
last_received_op_id_ = yb::OpId::FromPB(op_id);
- last_received_op_id_current_leader_ = last_received_op_id_;
+ UpdateLastReceivedOpIdFromCurrentLeaderUnlocked(op_id);
+}
+
+void ReplicaState::UpdateLastReceivedOpIdFromCurrentLeaderUnlocked(const OpIdPB& op_id) {
+ last_received_op_id_current_leader_ = yb::OpId::FromPB(op_id);
next_index_ = op_id.index() + 1;
}
diff --git a/src/yb/consensus/replica_state.h b/src/yb/consensus/replica_state.h
index 379f858711..f4e4a8fd6a 100644
--- a/src/yb/consensus/replica_state.h
+++ b/src/yb/consensus/replica_state.h
@@ -318,6 +318,10 @@ class ReplicaState {
// This must be called under a lock.
void UpdateLastReceivedOpIdUnlocked(const OpIdPB& op_id);
+ // Updates the last received operation from current leader.
+ // This must be called under a lock.
+ void UpdateLastReceivedOpIdFromCurrentLeaderUnlocked(const OpIdPB& op_id);
+
// Returns the last received op id. This must be called under the lock.
const OpId& GetLastReceivedOpIdUnlocked() const;
@robertsami, let's backport the fix to stable releases so that customers can pick it up — does that sound right?
@amitanandaiyer, Can you take a stab at a more complete fix, given that the original approach lead to other failures in raft?
First attempt to the fix caused https://github.com/yugabyte/yugabyte-db/issues/14213