Envoy proxy “Permission denied: anonymous token lacks permission” when using Consul agent-configured token
Nomad version
Nomad v1.7.2
BuildDate 2023-12-13T19:59:42Z
Revision 64e3dca9274b493e38a49fda3a70fd31d0485b91
Operating system and Environment details
Linux 6.1.66-91.160.amzn2023.x86_64
Issue
The Envoy proxy doesn't seem to use the service identity token when connecting to Consul's gRPC interface, which causes the proxy to never bootstrap.
Service identity is properly set up and a token has been issued by Consul. No task identity is used.
Reproduction steps
Expected Result
Envoy should bootstrap correctly and the Consul health checks should pass.
Actual Result
Envoy isn't correctly bootstrapped and the service is not marked as healthy
Job file (if appropriate)
job "test-server" {
datacenters = ["eu-west-3a", "eu-west-3b", "eu-west-3c"]
type = "service"
update {
stagger = "30s"
max_parallel = 2
}
group "app" {
count = 1
network {
mode = "bridge"
port "http" {
to = 8080
}
}
service {
port = 8080
name = "test-server"
task = "app"
tags = [
"http",
"traefik.enable=true",
"traefik.http.routers.test.rule=PathPrefix(`/test`)",
"traefik.http.routers.test.middlewares=test_strip_prefix",
"traefik.http.middlewares.test_strip_prefix.stripprefix.prefixes=/test",
"traefik.consulcatalog.connect=true"
]
connect {
sidecar_service {}
sidecar_task {
resources {
cpu = 64
memory = 320
}
}
}
check {
type = "http"
port = "http"
path = "/ping"
interval = "10s"
timeout = "2s"
}
}
task "app" {
driver = "docker"
config {
image = "XXXXXXXXX.dkr.ecr.eu-west-3.amazonaws.com/test-server:latest"
ports = ["http"]
command = "/main"
}
resources {
cpu = 500 # MHz
memory = 128 # MB
}
}
}
}
Nomad Server Config
data_dir = "/opt/nomad/data"
region = "ops"
datacenter = "eu-west-3a"
name = "i-06d5eeb006c024855"
bind_addr = "0.0.0.0"
leave_on_terminate = true
advertise {
http = "{{ GetInterfaceIP \"ens5\" }}"
rpc = "{{ GetInterfaceIP \"ens5\" }}"
serf = "{{ GetInterfaceIP \"ens5\" }}"
}
consul {
address = "127.0.0.1:8501"
auto_advertise = true
ssl = true
verify_ssl = true
ca_file = "/opt/consul/tls/consul-ca.pem"
key_file = "/opt/consul/tls/consul-key.pem"
cert_file = "/opt/consul/tls/consul-cert.pem"
service_identity {
aud = ["nomad.<redacted>"]
ttl = "1h"
}
task_identity {
env = true
ttl = "1h"
aud = ["nomad.<redacted>"]
file = true
}
}
server {
enabled = true
bootstrap_expect = 3
node_gc_threshold = "30m"
rejoin_after_leave = true
}
acl {
enabled = true
token_ttl = "30s"
policy_ttl = "60s"
}
tls {
http = true
rpc = true
verify_server_hostname = true
verify_https_client = false
ca_file = "/opt/nomad/tls/nomad-ca.pem"
cert_file = "/opt/nomad/tls/nomad-cert.pem"
key_file = "/opt/nomad/tls/nomad-key.pem"
}
telemetry {
collection_interval = "1s"
disable_hostname = true
prometheus_metrics = true
publish_allocation_metrics = true
publish_node_metrics = true
}
Nomad Client config
data_dir = "/opt/nomad/data"
plugin_dir = "/opt/nomad/plugins"
region = "ops"
datacenter = "eu-west-3b"
name = "i-040bc7525ff44b848"
bind_addr = "0.0.0.0"
leave_on_terminate = true
advertise {
http = "{{ GetInterfaceIP \"ens5\" }}"
rpc = "{{ GetInterfaceIP \"ens5\" }}"
serf = "{{ GetInterfaceIP \"ens5\" }}"
}
consul {
address = "127.0.0.1:8501"
auto_advertise = true
grpc_address = "127.0.0.1:8503"
grpc_ca_file = "/opt/consul/tls/consul-ca.pem"
share_ssl = true
ssl = true
verify_ssl = true
ca_file = "/opt/consul/tls/consul-ca.pem"
key_file = "/opt/consul/tls/consul-key.pem"
cert_file = "/opt/consul/tls/consul-cert.pem"
}
client {
enabled = true
node_class = "default"
host_volume "docker-sock-ro" {
path = "/var/run/docker.sock"
read_only = true
policy = "read"
}
chroot_env {
"/bin" = "/bin"
"/lib" = "/lib"
"/lib64" = "/lib64"
"/etc/ld.so.cache" = "/etc/ld.so.cache"
"/etc/ld.so.conf" = "/etc/ld.so.conf"
"/etc/ld.so.conf.d" = "/etc/ld.so.conf.d"
"/etc/passwd" = "/etc/passwd"
}
options = {
"driver.denylist" = "raw_exec"
}
}
tls {
http = true
rpc = true
verify_server_hostname = true
verify_https_client = false
ca_file = "/opt/nomad/tls/nomad-ca.pem"
cert_file = "/opt/nomad/tls/nomad-cert.pem"
key_file = "/opt/nomad/tls/nomad-key.pem"
}
telemetry {
collection_interval = "1s"
disable_hostname = true
prometheus_metrics = true
publish_allocation_metrics = true
publish_node_metrics = true
}
plugin "docker" {
config {
allow_privileged = true
volumes {
enabled = true
}
extra_labels = ["job_name", "task_group_name", "task_name", "namespace", "node_name"]
auth {
config = "/opt/nomad/docker.config.json"
}
}
}
Consul agent config
server = false
node_name = "i-040bc7525ff44b848"
bind_addr = "{{ GetInterfaceIP \"ens5\" }}"
advertise_addr = "{{ GetInterfaceIP \"ens5\" }}"
client_addr = "0.0.0.0"
data_dir = "/opt/consul"
datacenter = "ops"
primary_datacenter = "ops"
leave_on_terminate = true
enable_agent_tls_for_checks = true
encrypt = "<redacted>"
encrypt_verify_incoming = true
encrypt_verify_outgoing = true
retry_join = [
  "provider=aws region=eu-west-3 tag_key=ConsulClusterID tag_value=<redacted> addr_type=private_v4 service=ec2"
]
acl {
  enabled = true
  default_policy = "deny"
  down_policy = "extend-cache"
  enable_token_persistence = true
}
connect {
  enabled = true
  ca_provider = "consul"
}
ports {
  http = 8500 # TCP only
  https = 8501 # TCP only
  grpc = 8502 # TCP only
  grpc_tls = 8503 # TCP only
  dns = 8600 # TCP and UDP
  server = 8300 # TCP only
  serf_lan = 8301 # TCP and UDP
  serf_wan = 8302 # TCP and UDP
}
node_meta {
  server_type = "nomad-client"
  instance_type = "t3.medium"
  availability_zone = "eu-west-3b"
  ami_id = "ami-<redacted>"
}
autopilot {
  cleanup_dead_servers = true
  last_contact_threshold = "200ms"
  max_trailing_logs = 250
  server_stabilization_time = "10s"
}
telemetry {
  prometheus_retention_time = "60s"
  disable_hostname = true
}
ui_config {
  enabled = false
}
auto_encrypt {
  tls = true
}
peering {
  enabled = true
}
tls {
  defaults {
    verify_incoming = true
    verify_outgoing = true
    verify_server_hostname = true
    ca_file = "/opt/consul/tls/consul-ca.pem"
    key_file = "/opt/consul/tls/consul-key.pem"
    cert_file = "/opt/consul/tls/consul-cert.pem"
  }
  grpc {
    verify_incoming = false
  }
  https {
    verify_incoming = false
  }
}
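Note: the agent's default ACL token is not included in the config above; as described later in this thread, the default Consul token is set at the Consul agent level rather than in Nomad's consul block. A minimal sketch of what that typically looks like, with placeholder values assumed for illustration rather than taken from this setup:
acl {
  enabled = true
  default_policy = "deny"
  # Illustrative only: agent-level tokens configured on the Consul agent
  # itself, so nothing is set in the Nomad agent's consul block.
  tokens {
    agent = "<redacted-agent-token>"
    default = "<redacted-default-token>"
  }
}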
Nomad Server logs (if appropriate)
systemd[1]: nomad.service: Unit cannot be reloaded because it is inactive.
systemd[1]: Started nomad.service - "HashiCorp Nomad".
nomad[25113]: ==> Loaded configuration from /etc/nomad.d/nomad.hcl
nomad[25113]: ==> Starting Nomad agent...
nomad[25113]: ==> Nomad agent configuration:
nomad[25113]: Advertise Addrs: HTTP: 10.0.75.167:4646; RPC: 10.0.75.167:4647; Serf: 10.0.75.167:4648
nomad[25113]: Bind Addrs: HTTP: [0.0.0.0:4646]; RPC: 0.0.0.0:4647; Serf: 0.0.0.0:4648
nomad[25113]: Client: false
nomad[25113]: Log Level: INFO
nomad[25113]: Node Id: 816240fb-942e-6df4-fbd8-fba4de751891
nomad[25113]: Region: ops (DC: eu-west-3b)
nomad[25113]: Server: true
nomad[25113]: Version: 1.7.2
nomad[25113]: ==> Nomad agent started! Log data will stream in below:
nomad[25113]: 2024-01-08T15:54:05.011Z [INFO] nomad: setting up raft bolt store: no_freelist_sync=false
nomad[25113]: 2024-01-08T15:54:05.015Z [INFO] nomad.raft: initial configuration: index=0 servers=[]
nomad[25113]: 2024-01-08T15:54:05.015Z [INFO] nomad.raft: entering follower state: follower="Node at 10.0.75.167:4647 [Follower]" leader-address= leader-id=
nomad[25113]: 2024-01-08T15:54:05.020Z [INFO] nomad: serf: EventMemberJoin: i-0a01ae7fbba893d72.ops 10.0.75.167
nomad[25113]: 2024-01-08T15:54:05.020Z [INFO] nomad: starting scheduling worker(s): num_workers=2 schedulers=["sysbatch", "service", "batch", "system", "_core"]
nomad[25113]: 2024-01-08T15:54:05.021Z [INFO] nomad: started scheduling worker(s): num_workers=2 schedulers=["sysbatch", "service", "batch", "system", "_core"]
nomad[25113]: 2024-01-08T15:54:05.024Z [INFO] nomad: adding server: server="i-0a01ae7fbba893d72.ops (Addr: 10.0.75.167:4647) (DC: eu-west-3b)"
nomad[25113]: 2024-01-08T15:54:05.056Z [INFO] nomad: serf: EventMemberJoin: i-0d81ccccc7821108b.ops 10.0.83.110
nomad[25113]: 2024-01-08T15:54:05.056Z [INFO] nomad: adding server: server="i-0d81ccccc7821108b.ops (Addr: 10.0.83.110:4647) (DC: eu-west-3c)"
nomad[25113]: 2024-01-08T15:54:05.056Z [INFO] nomad: successfully contacted Nomad servers: num_servers=1
nomad[25113]: 2024-01-08T15:54:06.621Z [WARN] nomad.raft: no known peers, aborting election
nomad[25113]: 2024-01-08T15:54:08.066Z [INFO] nomad: successfully contacted Nomad servers: num_servers=1
nomad[25113]: 2024-01-08T15:54:08.290Z [INFO] nomad: serf: EventMemberJoin: i-06d5eeb006c024855.ops 10.0.58.98
nomad[25113]: 2024-01-08T15:54:08.290Z [INFO] nomad: adding server: server="i-06d5eeb006c024855.ops (Addr: 10.0.58.98:4647) (DC: eu-west-3a)"
nomad[25113]: 2024-01-08T15:54:08.309Z [INFO] nomad: found expected number of peers, attempting to bootstrap cluster...: peers="10.0.75.167:4647,10.0.83.110:4647,10.0.58.98:4647"
nomad[25113]: 2024-01-08T15:54:08.517Z [WARN] nomad.raft: heartbeat timeout reached, starting election: last-leader-addr= last-leader-id=
nomad[25113]: 2024-01-08T15:54:08.517Z [INFO] nomad.raft: entering candidate state: node="Node at 10.0.75.167:4647 [Candidate]" term=2
nomad[25113]: 2024-01-08T15:54:08.533Z [INFO] nomad.raft: election won: term=2 tally=2
nomad[25113]: 2024-01-08T15:54:08.533Z [INFO] nomad.raft: entering leader state: leader="Node at 10.0.75.167:4647 [Leader]"
nomad[25113]: 2024-01-08T15:54:08.533Z [INFO] nomad.raft: added peer, starting replication: peer=02ccf3a1-df23-d6d3-634f-70372c5ad320
nomad[25113]: 2024-01-08T15:54:08.533Z [INFO] nomad.raft: added peer, starting replication: peer=4112cabe-dde9-ce73-1476-2d25fd38335b
nomad[25113]: 2024-01-08T15:54:08.533Z [INFO] nomad: cluster leadership acquired
nomad[25113]: 2024-01-08T15:54:08.535Z [INFO] nomad.raft: pipelining replication: peer="{Voter 02ccf3a1-df23-d6d3-634f-70372c5ad320 10.0.83.110:4647}"
nomad[25113]: 2024-01-08T15:54:08.539Z [WARN] nomad.raft: appendEntries rejected, sending older logs: peer="{Voter 4112cabe-dde9-ce73-1476-2d25fd38335b 10.0.58.98:4647}" next=1
nomad[25113]: 2024-01-08T15:54:08.542Z [INFO] nomad.raft: pipelining replication: peer="{Voter 4112cabe-dde9-ce73-1476-2d25fd38335b 10.0.58.98:4647}"
nomad[25113]: 2024-01-08T15:54:08.568Z [INFO] nomad.core: established cluster id: cluster_id=009f440f-47df-0c24-a2ce-700a92ba697f create_time=1704729248559995130
nomad[25113]: 2024-01-08T15:54:08.568Z [INFO] nomad: eval broker status modified: paused=false
nomad[25113]: 2024-01-08T15:54:08.568Z [INFO] nomad: blocked evals status modified: paused=false
nomad[25113]: 2024-01-08T15:54:09.158Z [INFO] nomad.keyring: initialized keyring: id=82fb7a22-5815-0e4b-a811-3e100f7efbcc
Nomad Client logs (if appropriate)
systemd[1]: nomad.service: Unit cannot be reloaded because it is inactive.
systemd[1]: Started nomad.service - "HashiCorp Nomad".
nomad[16049]: ==> Loaded configuration from /etc/nomad.d/nomad.hcl
nomad[16049]: ==> Starting Nomad agent...
nomad[16049]: ==> Nomad agent configuration:
nomad[16049]: Advertise Addrs: HTTP: 10.0.82.238:4646
nomad[16049]: Bind Addrs: HTTP: [0.0.0.0:4646]
nomad[16049]: Client: true
nomad[16049]: Log Level: INFO
nomad[16049]: Region: ops (DC: eu-west-3c)
nomad[16049]: Server: false
nomad[16049]: Version: 1.7.2
nomad[16049]: ==> Nomad agent started! Log data will stream in below:
nomad[16049]: 2024-01-08T16:04:08.044Z [INFO] agent: detected plugin: name=qemu type=driver plugin_version=0.1.0
nomad[16049]: 2024-01-08T16:04:08.044Z [INFO] agent: detected plugin: name=java type=driver plugin_version=0.1.0
nomad[16049]: 2024-01-08T16:04:08.044Z [INFO] agent: detected plugin: name=docker type=driver plugin_version=0.1.0
nomad[16049]: 2024-01-08T16:04:08.044Z [INFO] agent: detected plugin: name=raw_exec type=driver plugin_version=0.1.0
nomad[16049]: 2024-01-08T16:04:08.044Z [INFO] agent: detected plugin: name=exec type=driver plugin_version=0.1.0
nomad[16049]: 2024-01-08T16:04:08.047Z [INFO] client: using state directory: state_dir=/opt/nomad/data/client
nomad[16049]: 2024-01-08T16:04:08.058Z [INFO] client: using alloc directory: alloc_dir=/opt/nomad/data/alloc
nomad[16049]: 2024-01-08T16:04:08.058Z [INFO] client: using dynamic ports: min=20000 max=32000 reserved=""
nomad[16049]: 2024-01-08T16:04:08.090Z [INFO] client.fingerprint_mgr.consul: consul agent is available: cluster=default
nomad[16049]: 2024-01-08T16:04:08.095Z [WARN] client.fingerprint_mgr.landlock: failed to fingerprint kernel landlock feature: error="function not implemented"
nomad[16049]: 2024-01-08T16:04:08.111Z [WARN] client.fingerprint_mgr.network: unable to parse speed: path=/usr/sbin/ethtool device=ens5
nomad[16049]: 2024-01-08T16:04:08.114Z [WARN] client.fingerprint_mgr.network: unable to parse speed: path=/usr/sbin/ethtool device=lo
nomad[16049]: 2024-01-08T16:04:08.121Z [WARN] client.fingerprint_mgr.network: unable to parse speed: path=/usr/sbin/ethtool device=ens5
nomad[16049]: 2024-01-08T16:04:08.130Z [WARN] client.fingerprint_mgr.network: unable to parse speed: path=/usr/sbin/ethtool device=docker0
nomad[16049]: 2024-01-08T16:04:08.687Z [INFO] client.proclib.cg2: initializing nomad cgroups: cores=0-1
nomad[16049]: 2024-01-08T16:04:08.688Z [INFO] client.plugin: starting plugin manager: plugin-type=csi
nomad[16049]: 2024-01-08T16:04:08.690Z [INFO] client.plugin: starting plugin manager: plugin-type=driver
nomad[16049]: 2024-01-08T16:04:08.690Z [INFO] client.plugin: starting plugin manager: plugin-type=device
nomad[16049]: 2024-01-08T16:04:08.710Z [INFO] client.consul: discovered following servers: servers=[10.0.58.98:4647, 10.0.75.167:4647, 10.0.83.110:4647]
nomad[16049]: 2024-01-08T16:04:08.754Z [INFO] client: started client: node_id=4418ce87-1dbd-0aa1-e412-1acdd3093d79
nomad[16049]: 2024-01-08T16:04:08.788Z [INFO] client: node registration complete
nomad[16049]: 2024-01-08T16:04:15.914Z [INFO] client: node registration complete
nomad[16049]: 2024-01-08T16:05:37.530Z [INFO] client.alloc_runner.task_runner: Task event: alloc_id=ea309cfe-11bb-25c6-eaec-89b8873de883 task=app type=Received msg="Task received by client" failed=false
nomad[16049]: 2024-01-08T16:05:37.531Z [INFO] client.alloc_runner.task_runner: Task event: alloc_id=ea309cfe-11bb-25c6-eaec-89b8873de883 task=connect-proxy-test-server type=Received msg="Task received by client" failed=false
nomad[16049]: 2024-01-08T16:05:39.236Z [INFO] client.alloc_runner.task_runner: Task event: alloc_id=ea309cfe-11bb-25c6-eaec-89b8873de883 task=connect-proxy-test-server type="Task Setup" msg="Building Task Directory" failed=false
nomad[16049]: 2024-01-08T16:05:39.329Z [INFO] client.alloc_runner.task_runner.task_hook.envoy_bootstrap: bootstrapping envoy: alloc_id=ea309cfe-11bb-25c6-eaec-89b8873de883 task=connect-proxy-test-server namespace="" proxy_id=_nomad-task-ea309cfe-11bb-25c6-eaec-89b8873de883-group-app-test-server-8080-sidecar-proxy service=test-server gateway="" bootstrap_file=/opt/nomad/data/alloc/ea309cfe-11bb-25c6-eaec-89b8873de883/connect-proxy-test-server/secrets/envoy_bootstrap.json grpc_addr=unix://alloc/tmp/consul_grpc.sock admin_bind=127.0.0.2:19001 ready_bind=127.0.0.1:19101
nomad[16049]: 2024-01-08T16:05:39.425Z [INFO] client.alloc_runner.task_runner: Task event: alloc_id=ea309cfe-11bb-25c6-eaec-89b8873de883 task=connect-proxy-test-server type=Driver msg="Downloading image" failed=false
nomad[16049]: 2024-01-08T16:05:45.173Z [INFO] client.driver_mgr.docker: created container: driver=docker container_id=a3426d2f35e09e40d6e3c68d207c1df8e03bab17d5d892cfb1fca4ab3eb31c0f
nomad[16049]: 2024-01-08T16:05:45.345Z [INFO] client.driver_mgr.docker: started container: driver=docker container_id=a3426d2f35e09e40d6e3c68d207c1df8e03bab17d5d892cfb1fca4ab3eb31c0f
nomad[16049]: 2024-01-08T16:05:45.422Z [INFO] client.alloc_runner.task_runner: Task event: alloc_id=ea309cfe-11bb-25c6-eaec-89b8873de883 task=connect-proxy-test-server type=Started msg="Task started by client" failed=false
nomad[16049]: 2024-01-08T16:05:45.430Z [INFO] client.alloc_runner.task_runner: Task event: alloc_id=ea309cfe-11bb-25c6-eaec-89b8873de883 task=app type="Task Setup" msg="Building Task Directory" failed=false
nomad[16049]: 2024-01-08T16:05:45.626Z [INFO] client.alloc_runner.task_runner: Task event: alloc_id=ea309cfe-11bb-25c6-eaec-89b8873de883 task=app type=Driver msg="Downloading image" failed=false
nomad[16049]: 2024-01-08T16:05:46.701Z [INFO] client.driver_mgr.docker: created container: driver=docker container_id=5d3814ae0bd49a378b785b437fc21d705acf584997548ef1b1f3cd2a61b04e2b
nomad[16049]: 2024-01-08T16:05:46.863Z [INFO] client.driver_mgr.docker: started container: driver=docker container_id=5d3814ae0bd49a378b785b437fc21d705acf584997548ef1b1f3cd2a61b04e2b
nomad[16049]: 2024-01-08T16:05:46.908Z [INFO] client.alloc_runner.task_runner: Task event: alloc_id=ea309cfe-11bb-25c6-eaec-89b8873de883 task=app type=Started msg="Task started by client" failed=false
nomad[16049]: 2024-01-08T16:10:37.577Z [INFO] client.alloc_runner.task_runner: Task event: alloc_id=ea309cfe-11bb-25c6-eaec-89b8873de883 task=connect-proxy-test-server type="Alloc Unhealthy" msg="Task not running for min_healthy_time of 10s by healthy_deadline of 5m0s" failed=false
nomad[16049]: 2024-01-08T16:10:37.579Z [INFO] client.alloc_runner.task_runner: Task event: alloc_id=ea309cfe-11bb-25c6-eaec-89b8873de883 task=app type="Alloc Unhealthy" msg="Task not running for min_healthy_time of 10s by healthy_deadline of 5m0s" failed=false
Job Log
[GIN-debug] [WARNING] Creating an Engine instance with the Logger and Recovery middleware already attached.
[GIN-debug] [WARNING] Running in "debug" mode. Switch to "release" mode in production.
- using env: export GIN_MODE=release
- using code: gin.SetMode(gin.ReleaseMode)
[GIN-debug] GET /ping --> main.main.func1 (3 handlers)
[GIN-debug] [WARNING] You trusted all proxies, this is NOT safe. We recommend you to set a value.
Please check https://pkg.go.dev/github.com/gin-gonic/gin#readme-don-t-trust-all-proxies for details.
[GIN-debug] Environment variable PORT is undefined. Using port :8080 by default
[GIN-debug] Listening and serving HTTP on :8080
[GIN] 2024/01/08 - 16:05:47 | 200 | 46.581µs | 10.0.82.238 | GET "/ping"
[GIN] 2024/01/08 - 16:05:57 | 200 | 41.591µs | 10.0.82.238 | GET "/ping"
[GIN] 2024/01/08 - 16:06:07 | 200 | 57.801µs | 10.0.82.238 | GET "/ping"
[GIN] 2024/01/08 - 16:06:17 | 200 | 36.22µs | 10.0.82.238 | GET "/ping"
[GIN] 2024/01/08 - 16:06:27 | 200 | 48.901µs | 10.0.82.238 | GET "/ping"
[GIN] 2024/01/08 - 16:06:37 | 200 | 39.33µs | 10.0.82.238 | GET "/ping"
[GIN] 2024/01/08 - 16:06:47 | 200 | 38.36µs | 10.0.82.238 | GET "/ping"
[GIN] 2024/01/08 - 16:06:57 | 200 | 66.88µs | 10.0.82.238 | GET "/ping"
[GIN] 2024/01/08 - 16:07:07 | 200 | 37.8µs | 10.0.82.238 | GET "/ping"
[GIN] 2024/01/08 - 16:07:17 | 200 | 39.23µs | 10.0.82.238 | GET "/ping"
[GIN] 2024/01/08 - 16:07:27 | 200 | 39.541µs | 10.0.82.238 | GET "/ping"
[GIN] 2024/01/08 - 16:07:37 | 200 | 45.691µs | 10.0.82.238 | GET "/ping"
Connect Proxy log
[2024-01-08 16:57:07.685][1][warning][config] [./source/extensions/config_subscription/grpc/grpc_stream.h:152] DeltaAggregatedResources gRPC config stream to local_agent closed: 7, Permission denied: anonymous token lacks permission 'service:write' on "test-server". The anonymous token is used implicitly when a request does not specify a token.
[2024-01-08 16:57:17.719][1][warning][config] [./source/extensions/config_subscription/grpc/grpc_stream.h:152] DeltaAggregatedResources gRPC config stream to local_agent closed: 7, Permission denied: anonymous token lacks permission 'service:write' on "test-server". The anonymous token is used implicitly when a request does not specify a token.
[2024-01-08 16:57:21.036][1][warning][config] [./source/extensions/config_subscription/grpc/grpc_stream.h:152] DeltaAggregatedResources gRPC config stream to local_agent closed: 7, Permission denied: anonymous token lacks permission 'service:write' on "test-server". The anonymous token is used implicitly when a request does not specify a token.
...
Small update, as I've come to understand the issue a bit more after looking at the source code:
The SI token should be sent to Envoy through the envoy_bootstrap_hook, and should be notified here.
However, it doesn't seem like any SI token has been delivered (despite one having been created on the Consul cluster), so it would appear that no tokens could have been received by this function.
As the documentation led me to believe the token should be available as soon as the sids_hook has completed, I've taken a look at the code initializing the hook and I've come to understand that the issue is that, without a token specified in the consul block of the Nomad agent configuration, Nomad believes Consul ACLs are not enabled and therefore skips the sids_hook.
This is an issue in setups like mine, where the default Consul token is set at the Consul agent level and not in the Nomad config.
I'm sure there is a more reliable way to identify whether Consul ACLs are enabled, because as far as I can see the specified Consul token is used nowhere in the SI token derivation and could be avoided entirely.
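In the meantime, a minimal workaround sketch based on the above (an assumption drawn from reading the code, not a confirmed fix; the token value is a placeholder): putting any Consul token in the Nomad agent's consul block should convince Nomad that ACLs are enabled, so the sids_hook runs and the SI token reaches the Envoy bootstrap.
# Nomad client agent configuration (workaround sketch only)
consul {
  address = "127.0.0.1:8501"
  ssl = true
  verify_ssl = true
  ca_file = "/opt/consul/tls/consul-ca.pem"
  # Placeholder token; per the analysis above it is not used for SI token
  # derivation itself, its presence only flips Nomad's "ACLs enabled" check.
  token = "<any-valid-consul-token>"
}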
Hi @nwmqpa! Just for my clarification, did you have the same problem prior to Nomad 1.7 and/or without Workload Identity?
I've taken a look at the code initializing the hook and I've come to understand that the issue is that, without a token specified in the consul block of the Nomad agent configuration, Nomad believes Consul ACLs are not enabled and therefore skips the sids_hook.
This is an issue in setups like mine, where the default Consul token is set at the Consul agent level and not in the Nomad config.
That all makes sense to me, but if I look at the same code in 1.6.5 (ref task_runner_hooks.go#L142) it looks like we had the same behavior, so that's why I'm asking about whether you saw this pre-1.7. But either way it seems like a bug; we should be checking whether Consul was successfully fingerprinted rather than looking at the token. Or better yet, we should run the sids_hook unconditionally for the Connect task and let it fail gracefully in cases where Consul ACLs aren't configured.
Hello @tgross, I hadn't noticed this before migrating to workload identity, but we are in the process of migrating our clusters to the newer workload identities for both Consul and Vault, and are therefore doing quite a lot of new things at once.
Beforehand, our default was to store the Consul token in the Nomad agent configuration, as we weren't using mTLS and thought it would be more secure.
So I assume this behaviour has always been here and is only now coming to light.
I do believe your idea of running the sids_hook unconditionally and letting it fail gracefully would be better suited, given the default is already to fail gracefully when attempting to get the si_token.
Thanks for the context @nwmqpa. I've been a little swamped the last couple weeks, but I'll try to see if I can build out a minimal repro and get some motion into fixing this.
We have the same problem here. Is there any update regarding a fix?
Maybe related, maybe a separate issue: with our setup we can initially bootstrap and get services up and running, but once the service identity token renews, it appears that Envoy is unaware of this and continues trying to use an expired token.
Manually restarting the sidecar task fixes this.
[./source/extensions/config_subscription/grpc/grpc_stream.h:152] DeltaAggregatedResources gRPC config stream to local_agent closed: 16, unauthenticated: ACL not found
Edit: turns out this was because we were renewing Nomad's Consul token periodically, and this is what Envoy uses to bootstrap? See NOMAD-579. Switching to a static Consul token for Nomad resolved this.
I haven't had time to get back to this, so I'm surfacing it for roadmapping so we can get some eyes on it.