orchestrator icon indicating copy to clipboard operation
orchestrator copied to clipboard

"timed out enqueuing operation" when orchestrator-client -c forget -i "ip:port"

Open jianhaiqing opened this issue 4 years ago • 19 comments

  • your orchestrator.conf.json config file/contents
{
  "Debug": true,
  "EnableSyslog": false,
  "ListenAddress": ":3000",
  "MySQLTopologyUser": "orchestrator",
  "MySQLTopologyPassword": "password",
  "MySQLTopologyCredentialsConfigFile": "",
  "MySQLTopologySSLPrivateKeyFile": "",
  "MySQLTopologySSLCertFile": "",
  "MySQLTopologySSLCAFile": "",
  "MySQLTopologySSLSkipVerify": true,
  "MySQLTopologyUseMutualTLS": false,
  "BackendDB": "sqlite",
  "SQLite3DataFile": "/usr/local/orchestrator/orchestrator.sqlite3",
  "MySQLConnectTimeoutSeconds": 1,
  "DefaultInstancePort": 3306,
  "DiscoverByShowSlaveHosts": true,
  "InstancePollSeconds": 5,
  "DiscoveryIgnoreReplicaHostnameFilters": [
    "a_host_i_want_to_ignore[.]example[.]com",
    ".*[.]ignore_all_hosts_from_this_domain[.]example[.]com",
    "a_host_with_extra_port_i_want_to_ignore[.]example[.]com:3307",
    "10.111.*.*",
    "115.236.*.*"
  ],
  "UnseenInstanceForgetHours": 240,
  "SnapshotTopologiesIntervalHours": 0,
  "InstanceBulkOperationsWaitTimeoutSeconds": 10,
  "HostnameResolveMethod": "none",
  "MySQLHostnameResolveMethod": "",
  "SkipBinlogServerUnresolveCheck": true,
  "ExpiryHostnameResolvesMinutes": 60,
  "RejectHostnameResolvePattern": "",
  "ReasonableReplicationLagSeconds": 10,
  "ProblemIgnoreHostnameFilters": [],
  "VerifyReplicationFilters": false,
  "ReasonableMaintenanceReplicationLagSeconds": 20,
  "CandidateInstanceExpireMinutes": 60,
  "AuditLogFile": "",
  "AuditToSyslog": false,
  "RemoveTextFromHostnameDisplay": ".mydomain.com:3306",
  "ReadOnly": false,
  "AuthenticationMethod": "basic",
  "HTTPAuthUser": "orchestrator",
  "HTTPAuthPassword": "password",
  "AuthUserHeader": "",
  "PowerAuthUsers": [
    "*"
  ],
  "ClusterNameToAlias": {
    "127.0.0.1": "test suite"
  },
  "ReplicationLagQuery": "",
  "DetectInstanceAliasQuery": "",
  "DetectPromotionRuleQuery": "",
  "DataCenterPattern": "[.]([^.]+)[.][^.]+[.]mydomain[.]com",
  "PhysicalEnvironmentPattern": "[.]([^.]+[.][^.]+)[.]mydomain[.]com",
  "PromotionIgnoreHostnameFilters": [],
  "DetectSemiSyncEnforcedQuery": "",
  "ServeAgentsHttp": false,
  "AgentsServerPort": ":3001",
  "AgentsUseSSL": false,
  "AgentsUseMutualTLS": false,
  "AgentSSLSkipVerify": false,
  "AgentSSLPrivateKeyFile": "",
  "AgentSSLCertFile": "",
  "AgentSSLCAFile": "",
  "AgentSSLValidOUs": [],
  "UseSSL": false,
  "UseMutualTLS": false,
  "SSLSkipVerify": false,
  "SSLPrivateKeyFile": "",
  "SSLCertFile": "",
  "SSLCAFile": "",
  "SSLValidOUs": [],
  "URLPrefix": "",
  "StatusEndpoint": "/api/status",
  "StatusSimpleHealth": true,
  "StatusOUVerify": false,
  "AgentPollMinutes": 60,
  "UnseenAgentForgetHours": 6,
  "StaleSeedFailMinutes": 60,
  "SeedAcceptableBytesDiff": 8192,
  "PseudoGTIDPattern": "",
  "PseudoGTIDPatternIsFixedSubstring": false,
  "PseudoGTIDMonotonicHint": "asc:",
  "DetectPseudoGTIDQuery": "",
  "BinlogEventsChunkSize": 10000,
  "SkipBinlogEventsContaining": [],
  "ReduceReplicationAnalysisCount": true,
  "FailureDetectionPeriodBlockMinutes": 60,
  "FailMasterPromotionOnLagMinutes": 0,
  "RecoveryPeriodBlockSeconds": 3600,
  "RecoveryIgnoreHostnameFilters": [],
  "RecoverMasterClusterFilters": ["none"],
  "RecoverIntermediateMasterClusterFilters": [
    "none"
  ],
  "OnFailureDetectionProcesses": [
    "echo 'Detected {failureType} on {failureCluster}. Affected replicas: {countSlaves}' >> /tmp/recovery.log"
  ],
  "PreGracefulTakeoverProcesses": [
    "echo 'Planned takeover about to take place on {failureCluster}. Master will switch to read_only' >> /tmp/recovery.log"
  ],
  "PreFailoverProcesses": [
    "echo 'Will recover from {failureType} on {failureCluster}' >> /tmp/recovery.log"
  ],
  "PostFailoverProcesses": [
    "echo '(for all types) Recovered from {failureType} on {failureCluster}. Failed: {failedHost}:{failedPort}; Successor: {successorHost}:{successorPort}' >> /tmp/recovery.log"
  ],
  "PostUnsuccessfulFailoverProcesses": [],
  "PostMasterFailoverProcesses": [
    "echo 'Recovered from {failureType} on {failureCluster}. Failed: {failedHost}:{failedPort}; Promoted: {successorHost}:{successorPort}' >> /tmp/recovery.log"
  ],
  "PostIntermediateMasterFailoverProcesses": [
    "echo 'Recovered from {failureType} on {failureCluster}. Failed: {failedHost}:{failedPort}; Successor: {successorHost}:{successorPort}' >> /tmp/recovery.log"
  ],
  "PostGracefulTakeoverProcesses": [
    "echo 'Planned takeover complete' >> /tmp/recovery.log"
  ],
  "CoMasterRecoveryMustPromoteOtherCoMaster": true,
  "DetachLostSlavesAfterMasterFailover": true,
  "ApplyMySQLPromotionAfterMasterFailover": true,
  "PreventCrossDataCenterMasterFailover": false,
  "PreventCrossRegionMasterFailover": false,
  "MasterFailoverDetachReplicaMasterHost": false,
  "MasterFailoverLostInstancesDowntimeMinutes": 0,
  "PostponeReplicaRecoveryOnLagMinutes": 0,
  "OSCIgnoreHostnameFilters": [],
  "GraphiteAddr": "",
  "GraphitePath": "",
  "GraphiteConvertHostnameDotsToUnderscores": true,
  "ConsulAddress": "",
  "ConsulAclToken": "",
"RaftEnabled": true,
"RaftDataDir": "/var/lib/orchestrator",
  "RaftBind": "10.21.17.114",
  "DefaultRaftPort": 10008,
  "RaftNodes": [
    "10.21.17.112",
    "10.21.17.113",
    "10.21.17.114"
  ]
}
  • your topology (e.g. run orchestrator-client -c topology -alias my-cluster)
# orchestrator-client  -c topology -alias 10.21.17.73:23306
10.21.17.73:23306   [0s,ok,5.7.25-28-log,rw,ROW,>>,GTID]
+ 10.21.17.74:23307 [0s,ok,5.7.25-28-log,ro,ROW,>>,GTID]

# orchestrator-client  -c raft-leader
10.21.17.114:10008
  • orchestrator-client -c api -path status | jq .
{
  "Code": "OK",
  "Message": "Application node is healthy",
  "Details": {
    "Healthy": true,
    "Hostname": "mysql-10-21-17-114",
    "Token": "d79fc086292690e23a4c2bafbab8315b123f743a641b89622ae2cbd77a2bcabb",
    "IsActiveNode": true,
    "ActiveNode": {
      "Hostname": "10.21.17.114:10008",
      "Token": "",
      "AppVersion": "",
      "FirstSeenActive": "",
      "LastSeenActive": "",
      "ExtraInfo": "",
      "Command": "",
      "DBBackend": "",
      "LastReported": "0001-01-01T00:00:00Z"
    },
    "Error": null,
    "AvailableNodes": [
      {
        "Hostname": "mysql-10-21-17-114",
        "Token": "d79fc086292690e23a4c2bafbab8315b123f743a641b89622ae2cbd77a2bcabb",
        "AppVersion": "3.2.2",
        "FirstSeenActive": "2020-12-24T00:47:29Z",
        "LastSeenActive": "2021-01-07T03:54:14Z",
        "ExtraInfo": "",
        "Command": "",
        "DBBackend": "/usr/local/orchestrator/orchestrator.sqlite3",
        "LastReported": "0001-01-01T00:00:00Z"
      }
    ],
    "RaftLeader": "10.21.17.114:10008",
    "IsRaftLeader": true,
    "RaftLeaderURI": "http://10.21.17.114:3000",
    "RaftAdvertise": "10.21.17.114",
    "RaftHealthyMembers": null
  }
}
  • my issue
# orchestrator-client  -c forget -i 10.21.17.74:23307
timed out enqueuing operation
  • what did you expect to happen? corresponding instance can be forgotten
  • what happened? timeout
  • orchestrator logs; please use --debug --stack for maximum verbosity: no logs were output for this command

jianhaiqing avatar Jan 07 '21 03:01 jianhaiqing

The error message is generated by raft, in vendor/github.com/hashicorp/raft/raft.go

I see you have "RaftHealthyMembers": null in your JSON output. Please check the logs (not for orchestrator-client, but for orchestrator) with --debug --stack to see why orchestrator is failing to detect the other raft nodes on .112 and .113.

shlomi-noach avatar Jan 07 '21 06:01 shlomi-noach

I have added --debug --stack and restarted orchestrator; now it works properly. It might take time to reproduce the issue. Can we keep the issue open? I will take care of the issue. Thanks for your answer.

jianhaiqing avatar Jan 07 '21 08:01 jianhaiqing

Good luck; please close it once you feel everything's working.

shlomi-noach avatar Jan 07 '21 08:01 shlomi-noach

# ps -ef | grep orch
root      28777      1 15 16:45 ?        00:07:31 /usr/local/orchestrator/orchestrator http --debug --stack
  • orchestrator-client -c api -path status | jq . RaftHealthyMembers becomes null once it runs for a while.
{
  "Code": "OK",
  "Message": "Application node is healthy",
  "Details": {
    "Healthy": true,
    "Hostname": "mysql-10-21-17-114",
    "Token": "972209b5356130e2aff8eaae9df1e757c7181fb161cb95631a93280997ad4df6",
    "IsActiveNode": true,
    "ActiveNode": {
      "Hostname": "10.21.17.114:10008",
      "Token": "",
      "AppVersion": "",
      "FirstSeenActive": "",
      "LastSeenActive": "",
      "ExtraInfo": "",
      "Command": "",
      "DBBackend": "",
      "LastReported": "0001-01-01T00:00:00Z"
    },
    "Error": null,
    "AvailableNodes": [
      {
        "Hostname": "mysql-10-21-17-114",
        "Token": "972209b5356130e2aff8eaae9df1e757c7181fb161cb95631a93280997ad4df6",
        "AppVersion": "3.2.2",
        "FirstSeenActive": "2021-01-07T08:45:07Z",
        "LastSeenActive": "2021-01-07T09:32:52Z",
        "ExtraInfo": "",
        "Command": "",
        "DBBackend": "/usr/local/orchestrator/orchestrator.sqlite3",
        "LastReported": "0001-01-01T00:00:00Z"
      }
    ],
    "RaftLeader": "10.21.17.114:10008",
    "IsRaftLeader": true,
    "RaftLeaderURI": "http://10.21.17.114:3000",
    "RaftAdvertise": "10.21.17.114",
    "RaftHealthyMembers": null
  }
}

I didn't find any useful information in the log. Could you give me a hand? 114 log

jianhaiqing avatar Jan 07 '21 09:01 jianhaiqing

What does the log show on the two other orchestrators? Specifically look for raft (e.g. grep -i raft) in the logs.

shlomi-noach avatar Jan 07 '21 09:01 shlomi-noach

I didn't see anything special, except the following.

2021-01-07 17:25:49 ERROR HttpGetLeader: got 500 status on http://10.21.17.114:3000/api/raft-follower-health-report/15097c82/10.21.17.113/10.21.17.113

112 log 113 raft log

jianhaiqing avatar Jan 07 '21 09:01 jianhaiqing

The 112 and 113 logs you attached don't seem to be with --debug --stack

shlomi-noach avatar Jan 07 '21 10:01 shlomi-noach

all the logs come from /var/log/messages

[root@mysql-10-21-17-112 orchestrator]# ps -ef | grep orch
root      57769      1  0  2020 ?        02:05:01 /usr/sbin/haproxy -f /etc/haproxy/haproxy-orchestrator.cfg -p /run/haproxy80.pid
root      65475      1 15 16:45 ?        00:19:36 /usr/local/orchestrator/orchestrator http --debug --stack
root     112397  63167  0 18:52 pts/0    00:00:00 grep --color=auto orch
[root@mysql-10-21-17-112 orchestrator]#

jianhaiqing avatar Jan 07 '21 10:01 jianhaiqing

on 112, 113, please run orchestrator with --debug --stack and upload again please.

shlomi-noach avatar Jan 07 '21 10:01 shlomi-noach

there's no DEBUG entries in those logs; only in 114

shlomi-noach avatar Jan 07 '21 10:01 shlomi-noach

I have run orchestrator with --debug --stack, as you can see from the ps -ef output on 112. I also grepped for debug in the uploaded log. I don't know whether the following lines are debug entries or not. Many thanks.

[root@mysql-10-21-17-112 ~]# grep -i debug raft.log  | head -10
Jan 07 15:08:49 mysql-10-21-17-112 orchestrator[96526]: 2021-01-07 15:08:49 DEBUG raft leader is 10.21.17.114:10008; state: Follower
Jan 07 15:08:54 mysql-10-21-17-112 orchestrator[96526]: 2021-01-07 15:08:54 DEBUG raft leader is 10.21.17.114:10008; state: Follower
Jan 07 15:08:59 mysql-10-21-17-112 orchestrator[96526]: 2021-01-07 15:08:59 DEBUG raft leader is 10.21.17.114:10008; state: Follower
Jan 07 15:09:04 mysql-10-21-17-112 orchestrator[96526]: 2021-01-07 15:09:04 DEBUG raft leader is 10.21.17.114:10008; state: Follower
Jan 07 15:09:09 mysql-10-21-17-112 orchestrator[96526]: 2021-01-07 15:09:09 DEBUG raft leader is 10.21.17.114:10008; state: Follower
Jan 07 15:09:14 mysql-10-21-17-112 orchestrator[96526]: 2021-01-07 15:09:14 DEBUG raft leader is 10.21.17.114:10008; state: Follower
Jan 07 15:09:19 mysql-10-21-17-112 orchestrator[96526]: 2021-01-07 15:09:19 DEBUG raft leader is 10.21.17.114:10008; state: Follower
Jan 07 15:09:24 mysql-10-21-17-112 orchestrator[96526]: 2021-01-07 15:09:24 DEBUG raft leader is 10.21.17.114:10008; state: Follower
Jan 07 15:09:29 mysql-10-21-17-112 orchestrator[96526]: 2021-01-07 15:09:29 DEBUG raft leader is 10.21.17.114:10008; state: Follower
Jan 07 15:09:34 mysql-10-21-17-112 orchestrator[96526]: 2021-01-07 15:09:34 DEBUG raft leader is 10.21.17.114:10008; state: Follower

jianhaiqing avatar Jan 08 '21 00:01 jianhaiqing

We got the following logs yesterday.

[root@mysql-10-21-17-113 ~]#  grep -i orches raft-113.log | grep  -i "ERR" | grep -i raft
Jan 07 16:43:46 mysql-10-21-17-113 orchestrator[26135]: 2021/01/07 16:43:46 [ERR] raft: Failed to make RequestVote RPC to 10.21.17.114:10008: dial tcp 10.21.17.114:10008: connect: connection refused
Jan 07 16:45:55 mysql-10-21-17-113 orchestrator[24342]: 2021-01-07 16:45:55 ERROR HttpGetLeader: got 500 status on http://10.21.17.112:3000/api/raft-follower-health-report/bcf4da4f/10.21.17.113/10.21.17.113
Jan 07 17:19:20 mysql-10-21-17-113 orchestrator[24342]: 2021/01/07 17:19:20 [ERR] raft-net: Failed to flush response: write tcp 10.21.17.113:10008->10.21.17.114:52874: write: broken pipe
Jan 07 17:19:20 mysql-10-21-17-113 orchestrator[24342]: 2021/01/07 17:19:20 [ERR] raft-net: Failed to flush response: write tcp 10.21.17.113:10008->10.21.17.114:52876: write: broken pipe

jianhaiqing avatar Jan 08 '21 00:01 jianhaiqing

on 113 with : tcpdump -i any port 10008 113-114

orchestrator-raft-from113WithConditionPort10008.pcapng

The download link is available for 2 hours. I will upload it again if you miss it.

jianhaiqing avatar Jan 08 '21 01:01 jianhaiqing

Thank you, let me take another look. Can you please post the config files for 112 and 113?

shlomi-noach avatar Jan 08 '21 05:01 shlomi-noach

  • 113
{
  "Debug": true,
  "EnableSyslog": false,
  "ListenAddress": ":3000",
  "MySQLTopologyUser": "orchestrator",
  "MySQLTopologyPassword": "password",
  "MySQLTopologyCredentialsConfigFile": "",
  "MySQLTopologySSLPrivateKeyFile": "",
  "MySQLTopologySSLCertFile": "",
  "MySQLTopologySSLCAFile": "",
  "MySQLTopologySSLSkipVerify": true,
  "MySQLTopologyUseMutualTLS": false,
  "BackendDB": "sqlite",
  "SQLite3DataFile": "/usr/local/orchestrator/orchestrator.sqlite3",
  "MySQLConnectTimeoutSeconds": 1,
  "DefaultInstancePort": 3306,
  "DiscoverByShowSlaveHosts": true,
  "InstancePollSeconds": 5,
  "DiscoveryIgnoreReplicaHostnameFilters": [
    "a_host_i_want_to_ignore[.]example[.]com",
    ".*[.]ignore_all_hosts_from_this_domain[.]example[.]com",
    "a_host_with_extra_port_i_want_to_ignore[.]example[.]com:3307",
    "10.111.*.*",
    "115.236.*.*"
  ],
  "UnseenInstanceForgetHours": 240,
  "SnapshotTopologiesIntervalHours": 0,
  "InstanceBulkOperationsWaitTimeoutSeconds": 10,
  "HostnameResolveMethod": "none",
  "MySQLHostnameResolveMethod": "",
  "SkipBinlogServerUnresolveCheck": true,
  "ExpiryHostnameResolvesMinutes": 60,
  "RejectHostnameResolvePattern": "",
  "ReasonableReplicationLagSeconds": 10,
  "ProblemIgnoreHostnameFilters": [],
  "VerifyReplicationFilters": false,
  "ReasonableMaintenanceReplicationLagSeconds": 20,
  "CandidateInstanceExpireMinutes": 60,
  "AuditLogFile": "",
  "AuditToSyslog": false,
  "RemoveTextFromHostnameDisplay": ".mydomain.com:3306",
  "ReadOnly": false,
  "AuthenticationMethod": "basic",
  "HTTPAuthUser": "orchestrator",
  "HTTPAuthPassword": "password",
  "AuthUserHeader": "",
  "PowerAuthUsers": [
    "*"
  ],
  "ClusterNameToAlias": {
    "127.0.0.1": "test suite"
  },
  "ReplicationLagQuery": "",
  "DetectInstanceAliasQuery": "",
  "DetectPromotionRuleQuery": "",
  "DataCenterPattern": "[.]([^.]+)[.][^.]+[.]mydomain[.]com",
  "PhysicalEnvironmentPattern": "[.]([^.]+[.][^.]+)[.]mydomain[.]com",
  "PromotionIgnoreHostnameFilters": [],
  "DetectSemiSyncEnforcedQuery": "",
  "ServeAgentsHttp": false,
  "AgentsServerPort": ":3001",
  "AgentsUseSSL": false,
  "AgentsUseMutualTLS": false,
  "AgentSSLSkipVerify": false,
  "AgentSSLPrivateKeyFile": "",
  "AgentSSLCertFile": "",
  "AgentSSLCAFile": "",
  "AgentSSLValidOUs": [],
  "UseSSL": false,
  "UseMutualTLS": false,
  "SSLSkipVerify": false,
  "SSLPrivateKeyFile": "",
  "SSLCertFile": "",
  "SSLCAFile": "",
  "SSLValidOUs": [],
  "URLPrefix": "",
  "StatusEndpoint": "/api/status",
  "StatusSimpleHealth": true,
  "StatusOUVerify": false,
  "AgentPollMinutes": 60,
  "UnseenAgentForgetHours": 6,
  "StaleSeedFailMinutes": 60,
  "SeedAcceptableBytesDiff": 8192,
  "PseudoGTIDPattern": "",
  "PseudoGTIDPatternIsFixedSubstring": false,
  "PseudoGTIDMonotonicHint": "asc:",
  "DetectPseudoGTIDQuery": "",
  "BinlogEventsChunkSize": 10000,
  "SkipBinlogEventsContaining": [],
  "ReduceReplicationAnalysisCount": true,
  "FailureDetectionPeriodBlockMinutes": 60,
  "FailMasterPromotionOnLagMinutes": 0,
  "RecoveryPeriodBlockSeconds": 3600,
  "RecoveryIgnoreHostnameFilters": [],
  "RecoverMasterClusterFilters": ["none"],
  "RecoverIntermediateMasterClusterFilters": [
    "none"
  ],
  "OnFailureDetectionProcesses": [
    "echo 'Detected {failureType} on {failureCluster}. Affected replicas: {countSlaves}' >> /tmp/recovery.log"
  ],
  "PreGracefulTakeoverProcesses": [
    "echo 'Planned takeover about to take place on {failureCluster}. Master will switch to read_only' >> /tmp/recovery.log"
  ],
  "PreFailoverProcesses": [
    "echo 'Will recover from {failureType} on {failureCluster}' >> /tmp/recovery.log"
  ],
  "PostFailoverProcesses": [
    "echo '(for all types) Recovered from {failureType} on {failureCluster}. Failed: {failedHost}:{failedPort}; Successor: {successorHost}:{successorPort}' >> /tmp/recovery.log"
  ],
  "PostUnsuccessfulFailoverProcesses": [],
  "PostMasterFailoverProcesses": [
    "echo 'Recovered from {failureType} on {failureCluster}. Failed: {failedHost}:{failedPort}; Promoted: {successorHost}:{successorPort}' >> /tmp/recovery.log"
  ],
  "PostIntermediateMasterFailoverProcesses": [
    "echo 'Recovered from {failureType} on {failureCluster}. Failed: {failedHost}:{failedPort}; Successor: {successorHost}:{successorPort}' >> /tmp/recovery.log"
  ],
  "PostGracefulTakeoverProcesses": [
    "echo 'Planned takeover complete' >> /tmp/recovery.log"
  ],
  "CoMasterRecoveryMustPromoteOtherCoMaster": true,
  "DetachLostSlavesAfterMasterFailover": true,
  "ApplyMySQLPromotionAfterMasterFailover": true,
  "PreventCrossDataCenterMasterFailover": false,
  "PreventCrossRegionMasterFailover": false,
  "MasterFailoverDetachReplicaMasterHost": false,
  "MasterFailoverLostInstancesDowntimeMinutes": 0,
  "PostponeReplicaRecoveryOnLagMinutes": 0,
  "OSCIgnoreHostnameFilters": [],
  "GraphiteAddr": "",
  "GraphitePath": "",
  "GraphiteConvertHostnameDotsToUnderscores": true,
  "ConsulAddress": "",
  "ConsulAclToken": "",
  "RaftEnabled": true,
  "RaftDataDir": "/var/lib/orchestrator",
  "RaftBind": "10.21.17.113:10008",
  "RaftAdvertise": "10.21.17.113:10008",
  "DefaultRaftPort": 10008,
  "RaftNodes": [
    "10.21.17.112",
    "10.21.17.113",
    "10.21.17.114"
  ]
}
  • 112
{
  "Debug": true,
  "EnableSyslog": false,
  "ListenAddress": ":3000",
  "MySQLTopologyUser": "orchestrator",
  "MySQLTopologyPassword": "password",
  "MySQLTopologyCredentialsConfigFile": "",
  "MySQLTopologySSLPrivateKeyFile": "",
  "MySQLTopologySSLCertFile": "",
  "MySQLTopologySSLCAFile": "",
  "MySQLTopologySSLSkipVerify": true,
  "MySQLTopologyUseMutualTLS": false,
  "BackendDB": "sqlite",
  "SQLite3DataFile": "/usr/local/orchestrator/orchestrator.sqlite3",
  "MySQLConnectTimeoutSeconds": 1,
  "DefaultInstancePort": 3306,
  "DiscoverByShowSlaveHosts": true,
  "InstancePollSeconds": 5,
  "DiscoveryIgnoreReplicaHostnameFilters": [
    "a_host_i_want_to_ignore[.]example[.]com",
    ".*[.]ignore_all_hosts_from_this_domain[.]example[.]com",
    "a_host_with_extra_port_i_want_to_ignore[.]example[.]com:3307",
    "10.111.*.*",
    "115.236.*.*"
  ],
  "UnseenInstanceForgetHours": 240,
  "SnapshotTopologiesIntervalHours": 0,
  "InstanceBulkOperationsWaitTimeoutSeconds": 10,
  "HostnameResolveMethod": "none",
  "MySQLHostnameResolveMethod": "",
  "SkipBinlogServerUnresolveCheck": true,
  "ExpiryHostnameResolvesMinutes": 60,
  "RejectHostnameResolvePattern": "",
  "ReasonableReplicationLagSeconds": 10,
  "ProblemIgnoreHostnameFilters": [],
  "VerifyReplicationFilters": false,
  "ReasonableMaintenanceReplicationLagSeconds": 20,
  "CandidateInstanceExpireMinutes": 60,
  "AuditLogFile": "",
  "AuditToSyslog": false,
  "RemoveTextFromHostnameDisplay": ".mydomain.com:3306",
  "ReadOnly": false,
  "AuthenticationMethod": "basic",
  "HTTPAuthUser": "orchestrator",
  "HTTPAuthPassword": "password",
  "AuthUserHeader": "",
  "PowerAuthUsers": [
    "*"
  ],
  "ClusterNameToAlias": {
    "127.0.0.1": "test suite"
  },
  "ReplicationLagQuery": "",
  "DetectInstanceAliasQuery": "",
  "DetectPromotionRuleQuery": "",
  "DataCenterPattern": "[.]([^.]+)[.][^.]+[.]mydomain[.]com",
  "PhysicalEnvironmentPattern": "[.]([^.]+[.][^.]+)[.]mydomain[.]com",
  "PromotionIgnoreHostnameFilters": [],
  "DetectSemiSyncEnforcedQuery": "",
  "ServeAgentsHttp": false,
  "AgentsServerPort": ":3001",
  "AgentsUseSSL": false,
  "AgentsUseMutualTLS": false,
  "AgentSSLSkipVerify": false,
  "AgentSSLPrivateKeyFile": "",
  "AgentSSLCertFile": "",
  "AgentSSLCAFile": "",
  "AgentSSLValidOUs": [],
  "UseSSL": false,
  "UseMutualTLS": false,
  "SSLSkipVerify": false,
  "SSLPrivateKeyFile": "",
  "SSLCertFile": "",
  "SSLCAFile": "",
  "SSLValidOUs": [],
  "URLPrefix": "",
  "StatusEndpoint": "/api/status",
  "StatusSimpleHealth": true,
  "StatusOUVerify": false,
  "AgentPollMinutes": 60,
  "UnseenAgentForgetHours": 6,
  "StaleSeedFailMinutes": 60,
  "SeedAcceptableBytesDiff": 8192,
  "PseudoGTIDPattern": "",
  "PseudoGTIDPatternIsFixedSubstring": false,
  "PseudoGTIDMonotonicHint": "asc:",
  "DetectPseudoGTIDQuery": "",
  "BinlogEventsChunkSize": 10000,
  "SkipBinlogEventsContaining": [],
  "ReduceReplicationAnalysisCount": true,
  "FailureDetectionPeriodBlockMinutes": 60,
  "FailMasterPromotionOnLagMinutes": 0,
  "RecoveryPeriodBlockSeconds": 3600,
  "RecoveryIgnoreHostnameFilters": [],
  "RecoverMasterClusterFilters": ["none"],
  "RecoverIntermediateMasterClusterFilters": [
    "none"
  ],
  "OnFailureDetectionProcesses": [
    "echo 'Detected {failureType} on {failureCluster}. Affected replicas: {countSlaves}' >> /tmp/recovery.log"
  ],
  "PreGracefulTakeoverProcesses": [
    "echo 'Planned takeover about to take place on {failureCluster}. Master will switch to read_only' >> /tmp/recovery.log"
  ],
  "PreFailoverProcesses": [
    "echo 'Will recover from {failureType} on {failureCluster}' >> /tmp/recovery.log"
  ],
  "PostFailoverProcesses": [
    "echo '(for all types) Recovered from {failureType} on {failureCluster}. Failed: {failedHost}:{failedPort}; Successor: {successorHost}:{successorPort}' >> /tmp/recovery.log"
  ],
  "PostUnsuccessfulFailoverProcesses": [],
  "PostMasterFailoverProcesses": [
    "echo 'Recovered from {failureType} on {failureCluster}. Failed: {failedHost}:{failedPort}; Promoted: {successorHost}:{successorPort}' >> /tmp/recovery.log"
  ],
  "PostIntermediateMasterFailoverProcesses": [
    "echo 'Recovered from {failureType} on {failureCluster}. Failed: {failedHost}:{failedPort}; Successor: {successorHost}:{successorPort}' >> /tmp/recovery.log"
  ],
  "PostGracefulTakeoverProcesses": [
    "echo 'Planned takeover complete' >> /tmp/recovery.log"
  ],
  "CoMasterRecoveryMustPromoteOtherCoMaster": true,
  "DetachLostSlavesAfterMasterFailover": true,
  "ApplyMySQLPromotionAfterMasterFailover": true,
  "PreventCrossDataCenterMasterFailover": false,
  "PreventCrossRegionMasterFailover": false,
  "MasterFailoverDetachReplicaMasterHost": false,
  "MasterFailoverLostInstancesDowntimeMinutes": 0,
  "PostponeReplicaRecoveryOnLagMinutes": 0,
  "OSCIgnoreHostnameFilters": [],
  "GraphiteAddr": "",
  "GraphitePath": "",
  "GraphiteConvertHostnameDotsToUnderscores": true,
  "ConsulAddress": "",
  "ConsulAclToken": "",
  "RaftEnabled": true,
  "RaftDataDir": "/var/lib/orchestrator",
  "RaftBind": "10.21.17.112:10008",
  "RaftAdvertise": "10.21.17.112:10008",
  "DefaultRaftPort": 10008,
  "RaftNodes": [
    "10.21.17.112",
    "10.21.17.113",
    "10.21.17.114"
  ]
}

jianhaiqing avatar Jan 08 '21 06:01 jianhaiqing

From orchestrator-client -c api -path status | jq . I see "RaftLeaderURI": "http://10.21.17.112:10008:3000". Where does the RaftLeaderURI come from?

jianhaiqing avatar Jan 08 '21 07:01 jianhaiqing

So far, it seems the raftbind server takes too long to deal with AppendEntriesRequest. How could it affect raft flush "logs"?

[root@mysql-10-21-17-114 mongodb_data]# tail -n 1000000 /var/log/messages | grep "51836"
Jan  8 15:41:23 mysql-10-21-17-114 orchestrator: [martini] Started GET /api/leader-check for 10.21.17.113:51836
Jan  8 17:51:43 mysql-10-21-17-114 orchestrator: 2021/01/08 17:51:43 [DEBUG] raft-net: 10.21.17.114:10008 accepted connection from: 10.21.17.112:51836
Jan  8 18:26:05 mysql-10-21-17-114 orchestrator: 2021/01/08 18:26:05 [ERR] raft-net: Failed to flush response: write tcp 10.21.17.114:10008->10.21.17.112:51836: write: broken pipe

jianhaiqing avatar Jan 08 '21 13:01 jianhaiqing

Hi @shlomi-noach, We use AWS EC2 to deploy Orchestrator, which has been running stably for nearly a year, and recently encountered this problem. The problem occurs randomly, the leader's RaftHealthyMembers is null, and the captured network packets contain RST packets. Restart all nodes and the problem is solved. But as mentioned earlier, the problem appeared randomly.

huangzhiyong avatar Sep 08 '21 04:09 huangzhiyong