consul
consul copied to clipboard
Invalid memory address or nil pointer dereference when syncing a wan cluster
Overview of the Issue
I'm trying to federate two Consul clusters: a primary one with three server nodes and a secondary one with three server nodes. When replication starts, the secondary cluster crashes with "panic: runtime error: invalid memory address or nil pointer dereference".
Reproduction Steps
Steps to reproduce this issue:
- Create a cluster with 3 server nodes and a second one with 1 server node. The clusters run in Docker with host networking and are set up with Ansible; I use the initial management tokens, which are different for the two clusters
- Using the Terraform Consul provider, create a replication token, an agent token and a DNS token with their respective roles (I use the built-in one for DNS), plus a JWT auth method and the corresponding bindings to certain roles
- Set the agent and replication tokens on all server nodes of both clusters, in a separate config file
Consul info for both Client and Server
Server first cluster info
agent:
check_monitors = 0
check_ttls = 0
checks = 0
services = 0
build:
prerelease =
revision = 9f62fb41
version = 1.19.1
version_metadata =
consul:
acl = enabled
bootstrap = false
known_datacenters = 2
leader = true
leader_addr = 10.100.4.223:8300
server = true
raft:
applied_index = 9295
commit_index = 9295
fsm_pending = 0
last_contact = 0
last_log_index = 9295
last_log_term = 10
last_snapshot_index = 0
last_snapshot_term = 0
latest_configuration = [{Suffrage:Voter ID:491d58d5-44a2-76f1-8a7c-8f86a4d8957f Address:10.100.4.223:8300} {Suffrage:Voter ID:865a3564-372b-e96a-0e3a-241b08e3398c Address:10.100.4.225:8300} {Suffrage:Voter ID:5da003ae-fbb2-e06c-984f-7c4c3ced489d Address:10.100.4.224:8300}]
latest_configuration_index = 0
num_peers = 2
protocol_version = 3
protocol_version_max = 3
protocol_version_min = 0
snapshot_version_max = 1
snapshot_version_min = 0
state = Leader
term = 10
runtime:
arch = amd64
cpu_count = 2
goroutines = 300
max_procs = 2
os = linux
version = go1.22.5
serf_lan:
coordinate_resets = 0
encrypted = true
event_queue = 0
event_time = 19
failed = 0
health_score = 0
intent_queue = 0
left = 0
member_time = 91
members = 4
query_queue = 0
query_time = 1
serf_wan:
coordinate_resets = 0
encrypted = true
event_queue = 0
event_time = 1
failed = 0
health_score = 0
intent_queue = 0
left = 0
member_time = 1345
members = 6
query_queue = 0
query_time = 1
{
"node_name": "server01",
"server": true,
"ui_config": {
"enabled" : true
},
"data_dir": "/consul/data",
"client_addr": "0.0.0.0",
"bind_addr": "10.100.4.223",
"ports": {
"https": 8501
},
"datacenter": "poc",
"primary_datacenter": "poc",
"recursors": ["rec_ip01", "rec_ip02", "rec_ip03","rec_ip04"],
"bootstrap_expect": 3,
"retry_join":[
"server02" ,
"server03"
],
"retry_join_wan":[
"server04",
"server05",
"server06"
],
"encrypt": "key",
"tls": {
"defaults": {
"ca_file": "/consul/config/certs/consul-agent-ca.pem",
"cert_file": "/consul/config/certs/consul-agent-cert.pem",
"key_file": "/consul/config/certs/consul-agent-key.pem",
"verify_incoming": true,
"verify_outgoing": true,
"verify_server_hostname": true
}
},
"acl": {
"enabled": true,
"default_policy": "deny",
"down_policy": "async-cache",
"enable_token_persistence": true,
"enable_token_replication": true,
"tokens": {
"initial_management": "INITIAL_TOKEN_01"
},
"policy_ttl": "60s",
"role_ttl" : "60s",
"token_ttl" : "60s"
}
}
Server second cluster info
agent:
check_monitors = 0
check_ttls = 0
checks = 0
services = 0
build:
prerelease =
revision = 9f62fb41
version = 1.19.1
version_metadata =
consul:
acl = enabled
bootstrap = false
known_datacenters = 2
leader = true
leader_addr = 10.100.4.223:8300
server = true
raft:
applied_index = 9295
commit_index = 9295
fsm_pending = 0
last_contact = 0
last_log_index = 9295
last_log_term = 10
last_snapshot_index = 0
last_snapshot_term = 0
latest_configuration = [{Suffrage:Voter ID:491d58d5-44a2-76f1-8a7c-8f86a4d8957f Address:10.100.4.223:8300} {Suffrage:Voter ID:865a3564-372b-e96a-0e3a-241b08e3398c Address:10.100.4.225:8300} {Suffrage:Voter ID:5da003ae-fbb2-e06c-984f-7c4c3ced489d Address:10.100.4.224:8300}]
latest_configuration_index = 0
num_peers = 2
protocol_version = 3
protocol_version_max = 3
protocol_version_min = 0
snapshot_version_max = 1
snapshot_version_min = 0
state = Leader
term = 10
runtime:
arch = amd64
cpu_count = 2
goroutines = 300
max_procs = 2
os = linux
version = go1.22.5
serf_lan:
coordinate_resets = 0
encrypted = true
event_queue = 0
event_time = 19
failed = 0
health_score = 0
intent_queue = 0
left = 0
member_time = 91
members = 4
query_queue = 0
query_time = 1
serf_wan:
coordinate_resets = 0
encrypted = true
event_queue = 0
event_time = 1
failed = 0
health_score = 0
intent_queue = 0
left = 0
member_time = 1345
members = 6
query_queue = 0
query_time = 1
{
"node_name": "server04",
"server": true,
"ui_config": {
"enabled" : true
},
"data_dir": "/consul/data",
"client_addr": "0.0.0.0",
"bind_addr": "10.100.4.35",
"ports": {
"https": 8501
},
"datacenter": "poc-secondary",
"primary_datacenter": "poc",
"recursors": ["rec_ip01", "rec_ip02", "rec_ip03","rec_ip04"],
"bootstrap_expect": 3,
"retry_join":[
"server05" ,
"server06"
],
"retry_join_wan":[
"server01" ,
"server02" ,
"server03"
],
"encrypt": "key",
"tls": {
"defaults": {
"ca_file": "/consul/config/certs/consul-agent-ca.pem",
"cert_file": "/consul/config/certs/consul-agent-cert.pem",
"key_file": "/consul/config/certs/consul-agent-key.pem",
"verify_incoming": true,
"verify_outgoing": true,
"verify_server_hostname": true
}
},
"acl": {
"enabled": true,
"default_policy": "deny",
"down_policy": "async-cache",
"enable_token_persistence": true,
"enable_token_replication": true,
"tokens": {
"initial_management": "INITIAL_TOKEN_02"
},
"policy_ttl": "60s",
"role_ttl" : "60s",
"token_ttl" : "60s"
}
}
Common token config file
{
"acl": {
"tokens": {
"agent": "b4c0c9bc-3804-a625-7e02-2eb74c2a3895",
"replication": "bf5a3137-cc74-0261-7dd5-5f9062a883b9"
}
}
}
Flatcar Linux on a VMware VM, running the official Consul container image, version 1.19.1
Log Fragments
2024-09-03T10:22:23.213Z [INFO] agent: Joining cluster...: cluster=WAN 2024-09-03T10:22:23.213Z [INFO] agent: (WAN) joining: wan_addresses=["10.100.4.225", "10.100.4.224", "10.100.4.223"] 2024-09-03T10:22:23.221Z [INFO] agent: (WAN) joined: number_of_nodes=3 2024-09-03T10:22:23.223Z [INFO] agent: Join cluster completed. Synced with initial agents: cluster=WAN num_agents=3 2024-09-03T10:22:31.217Z [WARN] agent.server.raft: heartbeat timeout reached, starting election: last-leader-addr= last-leader-id= 2024-09-03T10:22:31.217Z [INFO] agent.server.raft: entering candidate state: node="Node at 10.100.4.227:8300 [Candidate]" term=11 2024-09-03T10:22:31.222Z [INFO] agent.server.raft: election won: term=11 tally=1 2024-09-03T10:22:31.223Z [INFO] agent.server.raft: entering leader state: leader="Node at 10.100.4.227:8300 [Leader]" 2024-09-03T10:22:31.223Z [INFO] agent.server: cluster leadership acquired 2024-09-03T10:22:31.223Z [INFO] agent.server: New leader elected: payload=GSVCSRTST04 2024-09-03T10:22:31.229Z [INFO] agent.leader: started routine: routine="ACL policy replication" 2024-09-03T10:22:31.229Z [INFO] agent.leader: started routine: routine="ACL role replication" 2024-09-03T10:22:31.229Z [INFO] agent.leader: started routine: routine="ACL token replication" 2024-09-03T10:22:31.229Z [INFO] agent.server.replication.acl.policy: started ACL Policy replication 2024-09-03T10:22:31.229Z [INFO] agent.server.replication.acl.role: started ACL Role replication 2024-09-03T10:22:31.229Z [INFO] agent.server.replication.acl.token: started ACL Token replication panic: runtime error: invalid memory address or nil pointer dereference [signal SIGSEGV: segmentation violation code=0x1 addr=0x8 pc=0x102bd6a]
goroutine 154 [running]: github.com/hashicorp/consul/agent/structs.(*ACLTemplatedPolicyVariables).EstimateSize(...) github.com/hashicorp/consul/agent/structs/acl_templated_policy.go:229 github.com/hashicorp/consul/agent/structs.(*ACLTemplatedPolicy).EstimateSize(...) github.com/hashicorp/consul/agent/structs/acl_templated_policy.go:220 github.com/hashicorp/consul/agent/structs.(*ACLToken).EstimateSize(0xc001485b80?) github.com/hashicorp/consul/agent/structs/acl.go:569 +0xea github.com/hashicorp/consul/agent/consul.(*aclTokenReplicator).PendingUpdateEstimatedSize(0x989680?, 0x7f6108b8ad08?) github.com/hashicorp/consul/agent/consul/acl_replication_types.go:103 +0x25 github.com/hashicorp/consul/agent/consul.(*Server).updateLocalACLType(0xc000cd9808, {0x54c3b98, 0xc0005c8280}, {0x54e4ca8, 0xc000b99d10}, {0x54dc5e0, 0xc001484fa0}) github.com/hashicorp/consul/agent/consul/acl_replication.go:290 +0x388 github.com/hashicorp/consul/agent/consul.(*Server).replicateACLType(0xc000cd9808, {0x54c3b98, 0xc0005c8280}, {0x54e4ca8, 0xc000b99d10}, {0x54dc5e0, 0xc001484fa0}, 0x0) github.com/hashicorp/consul/agent/consul/acl_replication.go:468 +0xd5c github.com/hashicorp/consul/agent/consul.(*Server).replicateACLTokens(0xc000cd9808, {0x54c3b98, 0xc0005c8280}, {0x54e4ca8, 0xc000b99d10}, 0x0) github.com/hashicorp/consul/agent/consul/acl_replication.go:361 +0x65 github.com/hashicorp/consul/agent/consul.(*Server).runACLReplicator(0xc000cd9808, {0x54c3b98, 0xc0005c8280}, {0x54e4ca8, 0xc000b99d10}, {0x43f5f79, 0x6}, 0xc000b93ed0, {0x43ff33d, 0xa}) github.com/hashicorp/consul/agent/consul/leader.go:772 +0x144 github.com/hashicorp/consul/agent/consul.(*Server).runACLTokenReplicator(0xc000cd9808, {0x54c3b98, 0xc0005c8280}) github.com/hashicorp/consul/agent/consul/leader.go:747 +0xe6 github.com/hashicorp/consul/lib/routine.(*Manager).execute(0xc00112dec0, {0x54c3b98?, 0xc0005c8280?}, {0x442df2a, 0x15}, 0x0?, 0x0?) 
github.com/hashicorp/consul/lib/routine/routine.go:104 +0x7b created by github.com/hashicorp/consul/lib/routine.(*Manager).Start in goroutine 214 github.com/hashicorp/consul/lib/routine/routine.go:91 +0x2bf