orchestrator
orchestrator copied to clipboard
"timed out enqueuing operation" when orchestrator-client -c forget -i "ip:port"
- your orchestrator.conf.json config file/contents
{
"Debug": true,
"EnableSyslog": false,
"ListenAddress": ":3000",
"MySQLTopologyUser": "orchestrator",
"MySQLTopologyPassword": "password",
"MySQLTopologyCredentialsConfigFile": "",
"MySQLTopologySSLPrivateKeyFile": "",
"MySQLTopologySSLCertFile": "",
"MySQLTopologySSLCAFile": "",
"MySQLTopologySSLSkipVerify": true,
"MySQLTopologyUseMutualTLS": false,
"BackendDB": "sqlite",
"SQLite3DataFile": "/usr/local/orchestrator/orchestrator.sqlite3",
"MySQLConnectTimeoutSeconds": 1,
"DefaultInstancePort": 3306,
"DiscoverByShowSlaveHosts": true,
"InstancePollSeconds": 5,
"DiscoveryIgnoreReplicaHostnameFilters": [
"a_host_i_want_to_ignore[.]example[.]com",
".*[.]ignore_all_hosts_from_this_domain[.]example[.]com",
"a_host_with_extra_port_i_want_to_ignore[.]example[.]com:3307",
"10.111.*.*",
"115.236.*.*"
],
"UnseenInstanceForgetHours": 240,
"SnapshotTopologiesIntervalHours": 0,
"InstanceBulkOperationsWaitTimeoutSeconds": 10,
"HostnameResolveMethod": "none",
"MySQLHostnameResolveMethod": "",
"SkipBinlogServerUnresolveCheck": true,
"ExpiryHostnameResolvesMinutes": 60,
"RejectHostnameResolvePattern": "",
"ReasonableReplicationLagSeconds": 10,
"ProblemIgnoreHostnameFilters": [],
"VerifyReplicationFilters": false,
"ReasonableMaintenanceReplicationLagSeconds": 20,
"CandidateInstanceExpireMinutes": 60,
"AuditLogFile": "",
"AuditToSyslog": false,
"RemoveTextFromHostnameDisplay": ".mydomain.com:3306",
"ReadOnly": false,
"AuthenticationMethod": "basic",
"HTTPAuthUser": "orchestrator",
"HTTPAuthPassword": "password",
"AuthUserHeader": "",
"PowerAuthUsers": [
"*"
],
"ClusterNameToAlias": {
"127.0.0.1": "test suite"
},
"ReplicationLagQuery": "",
"DetectInstanceAliasQuery": "",
"DetectPromotionRuleQuery": "",
"DataCenterPattern": "[.]([^.]+)[.][^.]+[.]mydomain[.]com",
"PhysicalEnvironmentPattern": "[.]([^.]+[.][^.]+)[.]mydomain[.]com",
"PromotionIgnoreHostnameFilters": [],
"DetectSemiSyncEnforcedQuery": "",
"ServeAgentsHttp": false,
"AgentsServerPort": ":3001",
"AgentsUseSSL": false,
"AgentsUseMutualTLS": false,
"AgentSSLSkipVerify": false,
"AgentSSLPrivateKeyFile": "",
"AgentSSLCertFile": "",
"AgentSSLCAFile": "",
"AgentSSLValidOUs": [],
"UseSSL": false,
"UseMutualTLS": false,
"SSLSkipVerify": false,
"SSLPrivateKeyFile": "",
"SSLCertFile": "",
"SSLCAFile": "",
"SSLValidOUs": [],
"URLPrefix": "",
"StatusEndpoint": "/api/status",
"StatusSimpleHealth": true,
"StatusOUVerify": false,
"AgentPollMinutes": 60,
"UnseenAgentForgetHours": 6,
"StaleSeedFailMinutes": 60,
"SeedAcceptableBytesDiff": 8192,
"PseudoGTIDPattern": "",
"PseudoGTIDPatternIsFixedSubstring": false,
"PseudoGTIDMonotonicHint": "asc:",
"DetectPseudoGTIDQuery": "",
"BinlogEventsChunkSize": 10000,
"SkipBinlogEventsContaining": [],
"ReduceReplicationAnalysisCount": true,
"FailureDetectionPeriodBlockMinutes": 60,
"FailMasterPromotionOnLagMinutes": 0,
"RecoveryPeriodBlockSeconds": 3600,
"RecoveryIgnoreHostnameFilters": [],
"RecoverMasterClusterFilters": ["none"],
"RecoverIntermediateMasterClusterFilters": [
"none"
],
"OnFailureDetectionProcesses": [
"echo 'Detected {failureType} on {failureCluster}. Affected replicas: {countSlaves}' >> /tmp/recovery.log"
],
"PreGracefulTakeoverProcesses": [
"echo 'Planned takeover about to take place on {failureCluster}. Master will switch to read_only' >> /tmp/recovery.log"
],
"PreFailoverProcesses": [
"echo 'Will recover from {failureType} on {failureCluster}' >> /tmp/recovery.log"
],
"PostFailoverProcesses": [
"echo '(for all types) Recovered from {failureType} on {failureCluster}. Failed: {failedHost}:{failedPort}; Successor: {successorHost}:{successorPort}' >> /tmp/recovery.log"
],
"PostUnsuccessfulFailoverProcesses": [],
"PostMasterFailoverProcesses": [
"echo 'Recovered from {failureType} on {failureCluster}. Failed: {failedHost}:{failedPort}; Promoted: {successorHost}:{successorPort}' >> /tmp/recovery.log"
],
"PostIntermediateMasterFailoverProcesses": [
"echo 'Recovered from {failureType} on {failureCluster}. Failed: {failedHost}:{failedPort}; Successor: {successorHost}:{successorPort}' >> /tmp/recovery.log"
],
"PostGracefulTakeoverProcesses": [
"echo 'Planned takeover complete' >> /tmp/recovery.log"
],
"CoMasterRecoveryMustPromoteOtherCoMaster": true,
"DetachLostSlavesAfterMasterFailover": true,
"ApplyMySQLPromotionAfterMasterFailover": true,
"PreventCrossDataCenterMasterFailover": false,
"PreventCrossRegionMasterFailover": false,
"MasterFailoverDetachReplicaMasterHost": false,
"MasterFailoverLostInstancesDowntimeMinutes": 0,
"PostponeReplicaRecoveryOnLagMinutes": 0,
"OSCIgnoreHostnameFilters": [],
"GraphiteAddr": "",
"GraphitePath": "",
"GraphiteConvertHostnameDotsToUnderscores": true,
"ConsulAddress": "",
"ConsulAclToken": "",
"RaftEnabled": true,
"RaftDataDir": "/var/lib/orchestrator",
"RaftBind": "10.21.17.114",
"DefaultRaftPort": 10008,
"RaftNodes": [
"10.21.17.112",
"10.21.17.113",
"10.21.17.114"
]
}
- your topology (e.g. run
orchestrator-client -c topology -alias my-cluster)
# orchestrator-client -c topology -alias 10.21.17.73:23306
10.21.17.73:23306 [0s,ok,5.7.25-28-log,rw,ROW,>>,GTID]
+ 10.21.17.74:23307 [0s,ok,5.7.25-28-log,ro,ROW,>>,GTID]
# orchestrator-client -c raft-leader
10.21.17.114:10008
orchestrator-client -c api -path status | jq .
{
"Code": "OK",
"Message": "Application node is healthy",
"Details": {
"Healthy": true,
"Hostname": "mysql-10-21-17-114",
"Token": "d79fc086292690e23a4c2bafbab8315b123f743a641b89622ae2cbd77a2bcabb",
"IsActiveNode": true,
"ActiveNode": {
"Hostname": "10.21.17.114:10008",
"Token": "",
"AppVersion": "",
"FirstSeenActive": "",
"LastSeenActive": "",
"ExtraInfo": "",
"Command": "",
"DBBackend": "",
"LastReported": "0001-01-01T00:00:00Z"
},
"Error": null,
"AvailableNodes": [
{
"Hostname": "mysql-10-21-17-114",
"Token": "d79fc086292690e23a4c2bafbab8315b123f743a641b89622ae2cbd77a2bcabb",
"AppVersion": "3.2.2",
"FirstSeenActive": "2020-12-24T00:47:29Z",
"LastSeenActive": "2021-01-07T03:54:14Z",
"ExtraInfo": "",
"Command": "",
"DBBackend": "/usr/local/orchestrator/orchestrator.sqlite3",
"LastReported": "0001-01-01T00:00:00Z"
}
],
"RaftLeader": "10.21.17.114:10008",
"IsRaftLeader": true,
"RaftLeaderURI": "http://10.21.17.114:3000",
"RaftAdvertise": "10.21.17.114",
"RaftHealthyMembers": null
}
}
- my issue
# orchestrator-client -c forget -i 10.21.17.74:23307
timed out enqueuing operation
- what did you expect to happen?
The corresponding instance can be forgotten.
- what happened?
timeout
- orchestrator logs; please use --debug --stack for maximum verbosity
No logs output for this command.
The error message is generated by raft, in vendor/github.com/hashicorp/raft/raft.go
I see you have "RaftHealthyMembers": null in your JSON output. Please check the logs (not for orchestrator-client, but for orchestrator) with --debug --stack to see why orchestrator is failing to detect the other raft nodes on .112 and .113.
I have added --debug --stack and restart orchestrator, now it works properly. It might take time to reproduce the issue.
Keep the issue open? I will take care of the issue. Thanks for your answer.
Good luck; please close it once you feel everything's working.
# ps -ef | grep orch
root 28777 1 15 16:45 ? 00:07:31 /usr/local/orchestrator/orchestrator http --debug --stack
- orchestrator-client -c api -path status | jq .
RaftHealthyMembers becomes null once it runs for a while.
{
"Code": "OK",
"Message": "Application node is healthy",
"Details": {
"Healthy": true,
"Hostname": "mysql-10-21-17-114",
"Token": "972209b5356130e2aff8eaae9df1e757c7181fb161cb95631a93280997ad4df6",
"IsActiveNode": true,
"ActiveNode": {
"Hostname": "10.21.17.114:10008",
"Token": "",
"AppVersion": "",
"FirstSeenActive": "",
"LastSeenActive": "",
"ExtraInfo": "",
"Command": "",
"DBBackend": "",
"LastReported": "0001-01-01T00:00:00Z"
},
"Error": null,
"AvailableNodes": [
{
"Hostname": "mysql-10-21-17-114",
"Token": "972209b5356130e2aff8eaae9df1e757c7181fb161cb95631a93280997ad4df6",
"AppVersion": "3.2.2",
"FirstSeenActive": "2021-01-07T08:45:07Z",
"LastSeenActive": "2021-01-07T09:32:52Z",
"ExtraInfo": "",
"Command": "",
"DBBackend": "/usr/local/orchestrator/orchestrator.sqlite3",
"LastReported": "0001-01-01T00:00:00Z"
}
],
"RaftLeader": "10.21.17.114:10008",
"IsRaftLeader": true,
"RaftLeaderURI": "http://10.21.17.114:3000",
"RaftAdvertise": "10.21.17.114",
"RaftHealthyMembers": null
}
}
I didn't find any useful information in the log. Could you give me a hand? 114 log
What does the log show on the two other orchestrators? Specifically look for raft (e.g. grep -i raft) in the logs.
I didn't find anything special, except the following.
2021-01-07 17:25:49 ERROR HttpGetLeader: got 500 status on http://10.21.17.114:3000/api/raft-follower-health-report/15097c82/10.21.17.113/10.21.17.113
The 112 and 113 logs you attached don't seem to be with --debug --stack
all the logs come from /var/log/messages
[root@mysql-10-21-17-112 orchestrator]# ps -ef | grep orch
root 57769 1 0 2020 ? 02:05:01 /usr/sbin/haproxy -f /etc/haproxy/haproxy-orchestrator.cfg -p /run/haproxy80.pid
root 65475 1 15 16:45 ? 00:19:36 /usr/local/orchestrator/orchestrator http --debug --stack
root 112397 63167 0 18:52 pts/0 00:00:00 grep --color=auto orch
[root@mysql-10-21-17-112 orchestrator]#
on 112, 113, please run orchestrator with --debug --stack and upload again please.
there's no DEBUG entries in those logs; only in 114
I have run orchestrator with --debug --stack as you can see the ps -ef run on 112.
And I grepped for "debug" in the uploaded log. I don't know whether the following lines are debug entries or not.
Many thanks
[root@mysql-10-21-17-112 ~]# grep -i debug raft.log | head -10
Jan 07 15:08:49 mysql-10-21-17-112 orchestrator[96526]: 2021-01-07 15:08:49 DEBUG raft leader is 10.21.17.114:10008; state: Follower
Jan 07 15:08:54 mysql-10-21-17-112 orchestrator[96526]: 2021-01-07 15:08:54 DEBUG raft leader is 10.21.17.114:10008; state: Follower
Jan 07 15:08:59 mysql-10-21-17-112 orchestrator[96526]: 2021-01-07 15:08:59 DEBUG raft leader is 10.21.17.114:10008; state: Follower
Jan 07 15:09:04 mysql-10-21-17-112 orchestrator[96526]: 2021-01-07 15:09:04 DEBUG raft leader is 10.21.17.114:10008; state: Follower
Jan 07 15:09:09 mysql-10-21-17-112 orchestrator[96526]: 2021-01-07 15:09:09 DEBUG raft leader is 10.21.17.114:10008; state: Follower
Jan 07 15:09:14 mysql-10-21-17-112 orchestrator[96526]: 2021-01-07 15:09:14 DEBUG raft leader is 10.21.17.114:10008; state: Follower
Jan 07 15:09:19 mysql-10-21-17-112 orchestrator[96526]: 2021-01-07 15:09:19 DEBUG raft leader is 10.21.17.114:10008; state: Follower
Jan 07 15:09:24 mysql-10-21-17-112 orchestrator[96526]: 2021-01-07 15:09:24 DEBUG raft leader is 10.21.17.114:10008; state: Follower
Jan 07 15:09:29 mysql-10-21-17-112 orchestrator[96526]: 2021-01-07 15:09:29 DEBUG raft leader is 10.21.17.114:10008; state: Follower
Jan 07 15:09:34 mysql-10-21-17-112 orchestrator[96526]: 2021-01-07 15:09:34 DEBUG raft leader is 10.21.17.114:10008; state: Follower
We got the following logs yesterday.
[root@mysql-10-21-17-113 ~]# grep -i orches raft-113.log | grep -i "ERR" | grep -i raft
Jan 07 16:43:46 mysql-10-21-17-113 orchestrator[26135]: 2021/01/07 16:43:46 [ERR] raft: Failed to make RequestVote RPC to 10.21.17.114:10008: dial tcp 10.21.17.114:10008: connect: connection refused
Jan 07 16:45:55 mysql-10-21-17-113 orchestrator[24342]: 2021-01-07 16:45:55 ERROR HttpGetLeader: got 500 status on http://10.21.17.112:3000/api/raft-follower-health-report/bcf4da4f/10.21.17.113/10.21.17.113
Jan 07 17:19:20 mysql-10-21-17-113 orchestrator[24342]: 2021/01/07 17:19:20 [ERR] raft-net: Failed to flush response: write tcp 10.21.17.113:10008->10.21.17.114:52874: write: broken pipe
Jan 07 17:19:20 mysql-10-21-17-113 orchestrator[24342]: 2021/01/07 17:19:20 [ERR] raft-net: Failed to flush response: write tcp 10.21.17.113:10008->10.21.17.114:52876: write: broken pipe
on 113 with : tcpdump -i any port 10008

orchestrator-raft-from113WithConditionPort10008.pcapng
The download link is available for 2 hours. I will upload it again if you miss it.
Thank you, let me take another look. Can you please post the config files for 112 and 113?
- 113
{
"Debug": true,
"EnableSyslog": false,
"ListenAddress": ":3000",
"MySQLTopologyUser": "orchestrator",
"MySQLTopologyPassword": "password",
"MySQLTopologyCredentialsConfigFile": "",
"MySQLTopologySSLPrivateKeyFile": "",
"MySQLTopologySSLCertFile": "",
"MySQLTopologySSLCAFile": "",
"MySQLTopologySSLSkipVerify": true,
"MySQLTopologyUseMutualTLS": false,
"BackendDB": "sqlite",
"SQLite3DataFile": "/usr/local/orchestrator/orchestrator.sqlite3",
"MySQLConnectTimeoutSeconds": 1,
"DefaultInstancePort": 3306,
"DiscoverByShowSlaveHosts": true,
"InstancePollSeconds": 5,
"DiscoveryIgnoreReplicaHostnameFilters": [
"a_host_i_want_to_ignore[.]example[.]com",
".*[.]ignore_all_hosts_from_this_domain[.]example[.]com",
"a_host_with_extra_port_i_want_to_ignore[.]example[.]com:3307",
"10.111.*.*",
"115.236.*.*"
],
"UnseenInstanceForgetHours": 240,
"SnapshotTopologiesIntervalHours": 0,
"InstanceBulkOperationsWaitTimeoutSeconds": 10,
"HostnameResolveMethod": "none",
"MySQLHostnameResolveMethod": "",
"SkipBinlogServerUnresolveCheck": true,
"ExpiryHostnameResolvesMinutes": 60,
"RejectHostnameResolvePattern": "",
"ReasonableReplicationLagSeconds": 10,
"ProblemIgnoreHostnameFilters": [],
"VerifyReplicationFilters": false,
"ReasonableMaintenanceReplicationLagSeconds": 20,
"CandidateInstanceExpireMinutes": 60,
"AuditLogFile": "",
"AuditToSyslog": false,
"RemoveTextFromHostnameDisplay": ".mydomain.com:3306",
"ReadOnly": false,
"AuthenticationMethod": "basic",
"HTTPAuthUser": "orchestrator",
"HTTPAuthPassword": "password",
"AuthUserHeader": "",
"PowerAuthUsers": [
"*"
],
"ClusterNameToAlias": {
"127.0.0.1": "test suite"
},
"ReplicationLagQuery": "",
"DetectInstanceAliasQuery": "",
"DetectPromotionRuleQuery": "",
"DataCenterPattern": "[.]([^.]+)[.][^.]+[.]mydomain[.]com",
"PhysicalEnvironmentPattern": "[.]([^.]+[.][^.]+)[.]mydomain[.]com",
"PromotionIgnoreHostnameFilters": [],
"DetectSemiSyncEnforcedQuery": "",
"ServeAgentsHttp": false,
"AgentsServerPort": ":3001",
"AgentsUseSSL": false,
"AgentsUseMutualTLS": false,
"AgentSSLSkipVerify": false,
"AgentSSLPrivateKeyFile": "",
"AgentSSLCertFile": "",
"AgentSSLCAFile": "",
"AgentSSLValidOUs": [],
"UseSSL": false,
"UseMutualTLS": false,
"SSLSkipVerify": false,
"SSLPrivateKeyFile": "",
"SSLCertFile": "",
"SSLCAFile": "",
"SSLValidOUs": [],
"URLPrefix": "",
"StatusEndpoint": "/api/status",
"StatusSimpleHealth": true,
"StatusOUVerify": false,
"AgentPollMinutes": 60,
"UnseenAgentForgetHours": 6,
"StaleSeedFailMinutes": 60,
"SeedAcceptableBytesDiff": 8192,
"PseudoGTIDPattern": "",
"PseudoGTIDPatternIsFixedSubstring": false,
"PseudoGTIDMonotonicHint": "asc:",
"DetectPseudoGTIDQuery": "",
"BinlogEventsChunkSize": 10000,
"SkipBinlogEventsContaining": [],
"ReduceReplicationAnalysisCount": true,
"FailureDetectionPeriodBlockMinutes": 60,
"FailMasterPromotionOnLagMinutes": 0,
"RecoveryPeriodBlockSeconds": 3600,
"RecoveryIgnoreHostnameFilters": [],
"RecoverMasterClusterFilters": ["none"],
"RecoverIntermediateMasterClusterFilters": [
"none"
],
"OnFailureDetectionProcesses": [
"echo 'Detected {failureType} on {failureCluster}. Affected replicas: {countSlaves}' >> /tmp/recovery.log"
],
"PreGracefulTakeoverProcesses": [
"echo 'Planned takeover about to take place on {failureCluster}. Master will switch to read_only' >> /tmp/recovery.log"
],
"PreFailoverProcesses": [
"echo 'Will recover from {failureType} on {failureCluster}' >> /tmp/recovery.log"
],
"PostFailoverProcesses": [
"echo '(for all types) Recovered from {failureType} on {failureCluster}. Failed: {failedHost}:{failedPort}; Successor: {successorHost}:{successorPort}' >> /tmp/recovery.log"
],
"PostUnsuccessfulFailoverProcesses": [],
"PostMasterFailoverProcesses": [
"echo 'Recovered from {failureType} on {failureCluster}. Failed: {failedHost}:{failedPort}; Promoted: {successorHost}:{successorPort}' >> /tmp/recovery.log"
],
"PostIntermediateMasterFailoverProcesses": [
"echo 'Recovered from {failureType} on {failureCluster}. Failed: {failedHost}:{failedPort}; Successor: {successorHost}:{successorPort}' >> /tmp/recovery.log"
],
"PostGracefulTakeoverProcesses": [
"echo 'Planned takeover complete' >> /tmp/recovery.log"
],
"CoMasterRecoveryMustPromoteOtherCoMaster": true,
"DetachLostSlavesAfterMasterFailover": true,
"ApplyMySQLPromotionAfterMasterFailover": true,
"PreventCrossDataCenterMasterFailover": false,
"PreventCrossRegionMasterFailover": false,
"MasterFailoverDetachReplicaMasterHost": false,
"MasterFailoverLostInstancesDowntimeMinutes": 0,
"PostponeReplicaRecoveryOnLagMinutes": 0,
"OSCIgnoreHostnameFilters": [],
"GraphiteAddr": "",
"GraphitePath": "",
"GraphiteConvertHostnameDotsToUnderscores": true,
"ConsulAddress": "",
"ConsulAclToken": "",
"RaftEnabled": true,
"RaftDataDir": "/var/lib/orchestrator",
"RaftBind": "10.21.17.113:10008",
"RaftAdvertise": "10.21.17.113:10008",
"DefaultRaftPort": 10008,
"RaftNodes": [
"10.21.17.112",
"10.21.17.113",
"10.21.17.114"
]
}
- 112
{
"Debug": true,
"EnableSyslog": false,
"ListenAddress": ":3000",
"MySQLTopologyUser": "orchestrator",
"MySQLTopologyPassword": "password",
"MySQLTopologyCredentialsConfigFile": "",
"MySQLTopologySSLPrivateKeyFile": "",
"MySQLTopologySSLCertFile": "",
"MySQLTopologySSLCAFile": "",
"MySQLTopologySSLSkipVerify": true,
"MySQLTopologyUseMutualTLS": false,
"BackendDB": "sqlite",
"SQLite3DataFile": "/usr/local/orchestrator/orchestrator.sqlite3",
"MySQLConnectTimeoutSeconds": 1,
"DefaultInstancePort": 3306,
"DiscoverByShowSlaveHosts": true,
"InstancePollSeconds": 5,
"DiscoveryIgnoreReplicaHostnameFilters": [
"a_host_i_want_to_ignore[.]example[.]com",
".*[.]ignore_all_hosts_from_this_domain[.]example[.]com",
"a_host_with_extra_port_i_want_to_ignore[.]example[.]com:3307",
"10.111.*.*",
"115.236.*.*"
],
"UnseenInstanceForgetHours": 240,
"SnapshotTopologiesIntervalHours": 0,
"InstanceBulkOperationsWaitTimeoutSeconds": 10,
"HostnameResolveMethod": "none",
"MySQLHostnameResolveMethod": "",
"SkipBinlogServerUnresolveCheck": true,
"ExpiryHostnameResolvesMinutes": 60,
"RejectHostnameResolvePattern": "",
"ReasonableReplicationLagSeconds": 10,
"ProblemIgnoreHostnameFilters": [],
"VerifyReplicationFilters": false,
"ReasonableMaintenanceReplicationLagSeconds": 20,
"CandidateInstanceExpireMinutes": 60,
"AuditLogFile": "",
"AuditToSyslog": false,
"RemoveTextFromHostnameDisplay": ".mydomain.com:3306",
"ReadOnly": false,
"AuthenticationMethod": "basic",
"HTTPAuthUser": "orchestrator",
"HTTPAuthPassword": "password",
"AuthUserHeader": "",
"PowerAuthUsers": [
"*"
],
"ClusterNameToAlias": {
"127.0.0.1": "test suite"
},
"ReplicationLagQuery": "",
"DetectInstanceAliasQuery": "",
"DetectPromotionRuleQuery": "",
"DataCenterPattern": "[.]([^.]+)[.][^.]+[.]mydomain[.]com",
"PhysicalEnvironmentPattern": "[.]([^.]+[.][^.]+)[.]mydomain[.]com",
"PromotionIgnoreHostnameFilters": [],
"DetectSemiSyncEnforcedQuery": "",
"ServeAgentsHttp": false,
"AgentsServerPort": ":3001",
"AgentsUseSSL": false,
"AgentsUseMutualTLS": false,
"AgentSSLSkipVerify": false,
"AgentSSLPrivateKeyFile": "",
"AgentSSLCertFile": "",
"AgentSSLCAFile": "",
"AgentSSLValidOUs": [],
"UseSSL": false,
"UseMutualTLS": false,
"SSLSkipVerify": false,
"SSLPrivateKeyFile": "",
"SSLCertFile": "",
"SSLCAFile": "",
"SSLValidOUs": [],
"URLPrefix": "",
"StatusEndpoint": "/api/status",
"StatusSimpleHealth": true,
"StatusOUVerify": false,
"AgentPollMinutes": 60,
"UnseenAgentForgetHours": 6,
"StaleSeedFailMinutes": 60,
"SeedAcceptableBytesDiff": 8192,
"PseudoGTIDPattern": "",
"PseudoGTIDPatternIsFixedSubstring": false,
"PseudoGTIDMonotonicHint": "asc:",
"DetectPseudoGTIDQuery": "",
"BinlogEventsChunkSize": 10000,
"SkipBinlogEventsContaining": [],
"ReduceReplicationAnalysisCount": true,
"FailureDetectionPeriodBlockMinutes": 60,
"FailMasterPromotionOnLagMinutes": 0,
"RecoveryPeriodBlockSeconds": 3600,
"RecoveryIgnoreHostnameFilters": [],
"RecoverMasterClusterFilters": ["none"],
"RecoverIntermediateMasterClusterFilters": [
"none"
],
"OnFailureDetectionProcesses": [
"echo 'Detected {failureType} on {failureCluster}. Affected replicas: {countSlaves}' >> /tmp/recovery.log"
],
"PreGracefulTakeoverProcesses": [
"echo 'Planned takeover about to take place on {failureCluster}. Master will switch to read_only' >> /tmp/recovery.log"
],
"PreFailoverProcesses": [
"echo 'Will recover from {failureType} on {failureCluster}' >> /tmp/recovery.log"
],
"PostFailoverProcesses": [
"echo '(for all types) Recovered from {failureType} on {failureCluster}. Failed: {failedHost}:{failedPort}; Successor: {successorHost}:{successorPort}' >> /tmp/recovery.log"
],
"PostUnsuccessfulFailoverProcesses": [],
"PostMasterFailoverProcesses": [
"echo 'Recovered from {failureType} on {failureCluster}. Failed: {failedHost}:{failedPort}; Promoted: {successorHost}:{successorPort}' >> /tmp/recovery.log"
],
"PostIntermediateMasterFailoverProcesses": [
"echo 'Recovered from {failureType} on {failureCluster}. Failed: {failedHost}:{failedPort}; Successor: {successorHost}:{successorPort}' >> /tmp/recovery.log"
],
"PostGracefulTakeoverProcesses": [
"echo 'Planned takeover complete' >> /tmp/recovery.log"
],
"CoMasterRecoveryMustPromoteOtherCoMaster": true,
"DetachLostSlavesAfterMasterFailover": true,
"ApplyMySQLPromotionAfterMasterFailover": true,
"PreventCrossDataCenterMasterFailover": false,
"PreventCrossRegionMasterFailover": false,
"MasterFailoverDetachReplicaMasterHost": false,
"MasterFailoverLostInstancesDowntimeMinutes": 0,
"PostponeReplicaRecoveryOnLagMinutes": 0,
"OSCIgnoreHostnameFilters": [],
"GraphiteAddr": "",
"GraphitePath": "",
"GraphiteConvertHostnameDotsToUnderscores": true,
"ConsulAddress": "",
"ConsulAclToken": "",
"RaftEnabled": true,
"RaftDataDir": "/var/lib/orchestrator",
"RaftBind": "10.21.17.112:10008",
"RaftAdvertise": "10.21.17.112:10008",
"DefaultRaftPort": 10008,
"RaftNodes": [
"10.21.17.112",
"10.21.17.113",
"10.21.17.114"
]
}
from orchestrator-client -c api -path status | jq .
"RaftLeaderURI": "http://10.21.17.112:10008:3000"
Where is the RaftLeaderURI coming from?
So far, it seems the raft bind server takes too long to deal with AppendEntriesRequest. How could it affect raft flushing "logs"?
[root@mysql-10-21-17-114 mongodb_data]# tail -n 1000000 /var/log/messages | grep "51836"
Jan 8 15:41:23 mysql-10-21-17-114 orchestrator: [martini] Started GET /api/leader-check for 10.21.17.113:51836
Jan 8 17:51:43 mysql-10-21-17-114 orchestrator: 2021/01/08 17:51:43 [DEBUG] raft-net: 10.21.17.114:10008 accepted connection from: 10.21.17.112:51836
Jan 8 18:26:05 mysql-10-21-17-114 orchestrator: 2021/01/08 18:26:05 [ERR] raft-net: Failed to flush response: write tcp 10.21.17.114:10008->10.21.17.112:51836: write: broken pipe
Hi @shlomi-noach, We use AWS EC2 to deploy Orchestrator, which has been running stably for nearly a year, and recently encountered this problem. The problem occurs randomly, the leader's RaftHealthyMembers is null, and the captured network packets contain RST packets. Restart all nodes and the problem is solved. But as mentioned earlier, the problem appeared randomly.