consul icon indicating copy to clipboard operation
consul copied to clipboard

Invalid memory address or nil pointer dereference when syncing a wan cluster

Open SilverXXX opened this issue 5 months ago • 1 comments

Overview of the Issue

I'm trying to federate two Consul clusters: a primary one with three server nodes and a secondary one with three server nodes. When replication starts, I get "panic: runtime error: invalid memory address or nil pointer dereference" on the secondary cluster.


Reproduction Steps

Steps to reproduce this issue:

  1. Create a cluster with 3 server nodes and a second one with 1 server node. The clusters are running on Docker with host networking and are set up with Ansible; I use the initial management tokens, which are different for the two clusters.
  2. Using the Terraform Consul provider, create a replication token, an agent token and a DNS token with their respective roles (I use the builtin one for DNS), plus a JWT auth method and the corresponding bindings to certain roles.
  3. Set the agent and replication tokens on all server nodes of both clusters, in a separate config file.

Consul info for both Client and Server

Server first cluster info
agent:
        check_monitors = 0
        check_ttls = 0
        checks = 0
        services = 0
build:
        prerelease =
        revision = 9f62fb41
        version = 1.19.1
        version_metadata =
consul:
        acl = enabled
        bootstrap = false
        known_datacenters = 2
        leader = true
        leader_addr = 10.100.4.223:8300
        server = true
raft:
        applied_index = 9295
        commit_index = 9295
        fsm_pending = 0
        last_contact = 0
        last_log_index = 9295
        last_log_term = 10
        last_snapshot_index = 0
        last_snapshot_term = 0
        latest_configuration = [{Suffrage:Voter ID:491d58d5-44a2-76f1-8a7c-8f86a4d8957f Address:10.100.4.223:8300} {Suffrage:Voter ID:865a3564-372b-e96a-0e3a-241b08e3398c Address:10.100.4.225:8300} {Suffrage:Voter ID:5da003ae-fbb2-e06c-984f-7c4c3ced489d Address:10.100.4.224:8300}]
        latest_configuration_index = 0
        num_peers = 2
        protocol_version = 3
        protocol_version_max = 3
        protocol_version_min = 0
        snapshot_version_max = 1
        snapshot_version_min = 0
        state = Leader
        term = 10
runtime:
        arch = amd64
        cpu_count = 2
        goroutines = 300
        max_procs = 2
        os = linux
        version = go1.22.5
serf_lan:
        coordinate_resets = 0
        encrypted = true
        event_queue = 0
        event_time = 19
        failed = 0
        health_score = 0
        intent_queue = 0
        left = 0
        member_time = 91
        members = 4
        query_queue = 0
        query_time = 1
serf_wan:
        coordinate_resets = 0
        encrypted = true
        event_queue = 0
        event_time = 1
        failed = 0
        health_score = 0
        intent_queue = 0
        left = 0
        member_time = 1345
        members = 6
        query_queue = 0
        query_time = 1
{
    "node_name": "server01",
    "server": true,
    "ui_config": {
        "enabled" : true
    },
    "data_dir": "/consul/data",
    "client_addr": "0.0.0.0",
    "bind_addr": "10.100.4.223",
    "ports": {
        "https": 8501
    },
    "datacenter": "poc",
    "primary_datacenter": "poc",
    "recursors": ["rec_ip01", "rec_ip02", "rec_ip03","rec_ip04"],
    "bootstrap_expect": 3,
    "retry_join":[
        "server02" ,
        "server03"
    ],
    "retry_join_wan":[
        "server04",
        "server05",
        "server06"
    ],
    "encrypt": "key",
    "tls": {
        "defaults": {
            "ca_file": "/consul/config/certs/consul-agent-ca.pem",
            "cert_file": "/consul/config/certs/consul-agent-cert.pem",
            "key_file": "/consul/config/certs/consul-agent-key.pem",
            "verify_incoming": true,
            "verify_outgoing": true,
            "verify_server_hostname": true
        }
    },
    "acl": {
        "enabled": true,
        "default_policy": "deny",
        "down_policy": "async-cache",
        "enable_token_persistence": true,
        "enable_token_replication": true,
        "tokens": {
            "initial_management": "INITIAL_TOKEN_01"
        },
        "policy_ttl": "60s",
        "role_ttl" : "60s",
        "token_ttl" : "60s"
    }
}
Server second cluster info
agent:
        check_monitors = 0
        check_ttls = 0
        checks = 0
        services = 0
build:
        prerelease =
        revision = 9f62fb41
        version = 1.19.1
        version_metadata =
consul:
        acl = enabled
        bootstrap = false
        known_datacenters = 2
        leader = true
        leader_addr = 10.100.4.223:8300
        server = true
raft:
        applied_index = 9295
        commit_index = 9295
        fsm_pending = 0
        last_contact = 0
        last_log_index = 9295
        last_log_term = 10
        last_snapshot_index = 0
        last_snapshot_term = 0
        latest_configuration = [{Suffrage:Voter ID:491d58d5-44a2-76f1-8a7c-8f86a4d8957f Address:10.100.4.223:8300} {Suffrage:Voter ID:865a3564-372b-e96a-0e3a-241b08e3398c Address:10.100.4.225:8300} {Suffrage:Voter ID:5da003ae-fbb2-e06c-984f-7c4c3ced489d Address:10.100.4.224:8300}]
        latest_configuration_index = 0
        num_peers = 2
        protocol_version = 3
        protocol_version_max = 3
        protocol_version_min = 0
        snapshot_version_max = 1
        snapshot_version_min = 0
        state = Leader
        term = 10
runtime:
        arch = amd64
        cpu_count = 2
        goroutines = 300
        max_procs = 2
        os = linux
        version = go1.22.5
serf_lan:
        coordinate_resets = 0
        encrypted = true
        event_queue = 0
        event_time = 19
        failed = 0
        health_score = 0
        intent_queue = 0
        left = 0
        member_time = 91
        members = 4
        query_queue = 0
        query_time = 1
serf_wan:
        coordinate_resets = 0
        encrypted = true
        event_queue = 0
        event_time = 1
        failed = 0
        health_score = 0
        intent_queue = 0
        left = 0
        member_time = 1345
        members = 6
        query_queue = 0
        query_time = 1
{
    "node_name": "server04",
    "server": true,
    "ui_config": {
        "enabled" : true
    },
    "data_dir": "/consul/data",
    "client_addr": "0.0.0.0",
    "bind_addr": "10.100.4.35",
    "ports": {
        "https": 8501
    },
    "datacenter": "poc-secondary",
    "primary_datacenter": "poc",
    "recursors": ["rec_ip01", "rec_ip02", "rec_ip03","rec_ip04"],
    "bootstrap_expect": 3,
    "retry_join":[
        "server05" ,
        "server06"
    ],
    "retry_join_wan":[
        "server01" ,
        "server02" ,
        "server03"
    ],
    "encrypt": "key",
    "tls": {
        "defaults": {
            "ca_file": "/consul/config/certs/consul-agent-ca.pem",
            "cert_file": "/consul/config/certs/consul-agent-cert.pem",
            "key_file": "/consul/config/certs/consul-agent-key.pem",
            "verify_incoming": true,
            "verify_outgoing": true,
            "verify_server_hostname": true
        }
    },
    "acl": {
        "enabled": true,
        "default_policy": "deny",
        "down_policy": "async-cache",
        "enable_token_persistence": true,
        "enable_token_replication": true,
        "tokens": {
            "initial_management": "INITIAL_TOKEN_02"
        },
        "policy_ttl": "60s",
        "role_ttl" : "60s",
        "token_ttl" : "60s"
    }
}
Common token config file
{
    "acl": {
        "tokens": {
            "agent": "b4c0c9bc-3804-a625-7e02-2eb74c2a3895",
            "replication": "bf5a3137-cc74-0261-7dd5-5f9062a883b9"
        }
    }
}
Operating system and Environment details

Flatcar Linux on a VMware VM, running the official Consul container images, version 1.19.1.

Log Fragments

2024-09-03T10:22:23.213Z [INFO] agent: Joining cluster...: cluster=WAN
2024-09-03T10:22:23.213Z [INFO] agent: (WAN) joining: wan_addresses=["10.100.4.225", "10.100.4.224", "10.100.4.223"]
2024-09-03T10:22:23.221Z [INFO] agent: (WAN) joined: number_of_nodes=3
2024-09-03T10:22:23.223Z [INFO] agent: Join cluster completed. Synced with initial agents: cluster=WAN num_agents=3
2024-09-03T10:22:31.217Z [WARN] agent.server.raft: heartbeat timeout reached, starting election: last-leader-addr= last-leader-id=
2024-09-03T10:22:31.217Z [INFO] agent.server.raft: entering candidate state: node="Node at 10.100.4.227:8300 [Candidate]" term=11
2024-09-03T10:22:31.222Z [INFO] agent.server.raft: election won: term=11 tally=1
2024-09-03T10:22:31.223Z [INFO] agent.server.raft: entering leader state: leader="Node at 10.100.4.227:8300 [Leader]"
2024-09-03T10:22:31.223Z [INFO] agent.server: cluster leadership acquired
2024-09-03T10:22:31.223Z [INFO] agent.server: New leader elected: payload=GSVCSRTST04
2024-09-03T10:22:31.229Z [INFO] agent.leader: started routine: routine="ACL policy replication"
2024-09-03T10:22:31.229Z [INFO] agent.leader: started routine: routine="ACL role replication"
2024-09-03T10:22:31.229Z [INFO] agent.leader: started routine: routine="ACL token replication"
2024-09-03T10:22:31.229Z [INFO] agent.server.replication.acl.policy: started ACL Policy replication
2024-09-03T10:22:31.229Z [INFO] agent.server.replication.acl.role: started ACL Role replication
2024-09-03T10:22:31.229Z [INFO] agent.server.replication.acl.token: started ACL Token replication
panic: runtime error: invalid memory address or nil pointer dereference
[signal SIGSEGV: segmentation violation code=0x1 addr=0x8 pc=0x102bd6a]

goroutine 154 [running]:
github.com/hashicorp/consul/agent/structs.(*ACLTemplatedPolicyVariables).EstimateSize(...)
    github.com/hashicorp/consul/agent/structs/acl_templated_policy.go:229
github.com/hashicorp/consul/agent/structs.(*ACLTemplatedPolicy).EstimateSize(...)
    github.com/hashicorp/consul/agent/structs/acl_templated_policy.go:220
github.com/hashicorp/consul/agent/structs.(*ACLToken).EstimateSize(0xc001485b80?)
    github.com/hashicorp/consul/agent/structs/acl.go:569 +0xea
github.com/hashicorp/consul/agent/consul.(*aclTokenReplicator).PendingUpdateEstimatedSize(0x989680?, 0x7f6108b8ad08?)
    github.com/hashicorp/consul/agent/consul/acl_replication_types.go:103 +0x25
github.com/hashicorp/consul/agent/consul.(*Server).updateLocalACLType(0xc000cd9808, {0x54c3b98, 0xc0005c8280}, {0x54e4ca8, 0xc000b99d10}, {0x54dc5e0, 0xc001484fa0})
    github.com/hashicorp/consul/agent/consul/acl_replication.go:290 +0x388
github.com/hashicorp/consul/agent/consul.(*Server).replicateACLType(0xc000cd9808, {0x54c3b98, 0xc0005c8280}, {0x54e4ca8, 0xc000b99d10}, {0x54dc5e0, 0xc001484fa0}, 0x0)
    github.com/hashicorp/consul/agent/consul/acl_replication.go:468 +0xd5c
github.com/hashicorp/consul/agent/consul.(*Server).replicateACLTokens(0xc000cd9808, {0x54c3b98, 0xc0005c8280}, {0x54e4ca8, 0xc000b99d10}, 0x0)
    github.com/hashicorp/consul/agent/consul/acl_replication.go:361 +0x65
github.com/hashicorp/consul/agent/consul.(*Server).runACLReplicator(0xc000cd9808, {0x54c3b98, 0xc0005c8280}, {0x54e4ca8, 0xc000b99d10}, {0x43f5f79, 0x6}, 0xc000b93ed0, {0x43ff33d, 0xa})
    github.com/hashicorp/consul/agent/consul/leader.go:772 +0x144
github.com/hashicorp/consul/agent/consul.(*Server).runACLTokenReplicator(0xc000cd9808, {0x54c3b98, 0xc0005c8280})
    github.com/hashicorp/consul/agent/consul/leader.go:747 +0xe6
github.com/hashicorp/consul/lib/routine.(*Manager).execute(0xc00112dec0, {0x54c3b98?, 0xc0005c8280?}, {0x442df2a, 0x15}, 0x0?, 0x0?)
    github.com/hashicorp/consul/lib/routine/routine.go:104 +0x7b
created by github.com/hashicorp/consul/lib/routine.(*Manager).Start in goroutine 214
    github.com/hashicorp/consul/lib/routine/routine.go:91 +0x2bf

SilverXXX avatar Sep 03 '24 13:09 SilverXXX