emissary
emissary copied to clipboard
Routes disappear and are replaced with a repeated route
Describe the bug
After Emissary is triggered to reconfigure, some routes seem to disappear. gRPC clients connected via mTLS are returned `Unimplemented`.
Here's an example Unimplemented log event:
{
"protocol": "HTTP/2",
"upstream_cluster": "-",
"path": "/our.grpc.api.someservice.v1/Foo",
"duration": "0",
"response_code": "200",
"upstream_host": "-",
"upstream_service_time": "-",
"bytes_received": "0",
"grpc_code": "Unimplemented",
"upstream_transport_failure_reason": "-",
"user_agent": "grpc-go/1.42.0",
"method": "POST",
"bytes_sent": "0",
"requested_server_name": "our.service.com",
"response_flags": "NR"
}
Further investigation indicates that the routes are missing from the resulting Envoy configuration. This has been challenging to reproduce reliably. However, we were able to reproduce it by triggering reconfiguration through repeatedly updating a field on a Mapping:
The result was that some expected routes were replaced with the same route multiple times!
[
{
"match": {
"case_sensitive": true,
"headers": [
{
"exact_match": "some-tenant.my-service.com",
"name": ":authority"
},
{
"exact_match": "https",
"name": "x-forwarded-proto"
}
],
"prefix": "/grpc.health.v1.Health/",
"runtime_fraction": {
"default_value": {
"denominator": "HUNDRED",
"numerator": 100
},
"runtime_key": "routing.traffic_shift.cluster_ingress_foo_1234_o-1"
}
},
"request_headers_to_add": [
{
"append": false,
"header": {
"key": "x-tenant",
"value": "some-tenant"
}
},
{
"append": false,
"header": {
"key": "x-subject",
"value": "%DOWNSTREAM_PEER_SUBJECT%"
}
}
],
"route": {
"cluster": "cluster_ingress_foo_1234_o-1",
"prefix_rewrite": "/grpc.health.v1.Health/",
"priority": null,
"timeout": "300.000s"
}
},
{
"match": {
"case_sensitive": true,
"headers": [
{
"exact_match": "some-tenant.my-service.com",
"name": ":authority"
}
],
"prefix": "/grpc.health.v1.Health/",
"runtime_fraction": {
"default_value": {
"denominator": "HUNDRED",
"numerator": 100
},
"runtime_key": "routing.traffic_shift.cluster_ingress_foo_1234_o-1"
}
},
"redirect": {
"https_redirect": true
},
"request_headers_to_add": [
{
"append": false,
"header": {
"key": "x-tenant",
"value": "some-tenant"
}
},
{
"append": false,
"header": {
"key": "x-subject",
"value": "%DOWNSTREAM_PEER_SUBJECT%"
}
}
]
},
{
"match": {
"case_sensitive": true,
"headers": [
{
"exact_match": "some-tenant.my-service.com",
"name": ":authority"
},
{
"exact_match": "https",
"name": "x-forwarded-proto"
}
],
"prefix": "/grpc.health.v1.Health/",
"runtime_fraction": {
"default_value": {
"denominator": "HUNDRED",
"numerator": 100
},
"runtime_key": "routing.traffic_shift.cluster_ingress_foo_1234_o-1"
}
},
"request_headers_to_add": [
{
"append": false,
"header": {
"key": "x-tenant",
"value": "some-tenant"
}
},
{
"append": false,
"header": {
"key": "x-subject",
"value": "%DOWNSTREAM_PEER_SUBJECT%"
}
}
],
"route": {
"cluster": "cluster_ingress_foo_1234_o-1",
"prefix_rewrite": "/grpc.health.v1.Health/",
"priority": null,
"timeout": "300.000s"
}
},
{
"match": {
"case_sensitive": true,
"headers": [
{
"exact_match": "some-tenant.my-service.com",
"name": ":authority"
}
],
"prefix": "/grpc.health.v1.Health/",
"runtime_fraction": {
"default_value": {
"denominator": "HUNDRED",
"numerator": 100
},
"runtime_key": "routing.traffic_shift.cluster_ingress_foo_1234_o-1"
}
},
"redirect": {
"https_redirect": true
},
"request_headers_to_add": [
{
"append": false,
"header": {
"key": "x-tenant",
"value": "some-tenant"
}
},
{
"append": false,
"header": {
"key": "x-subject",
"value": "%DOWNSTREAM_PEER_SUBJECT%"
}
}
]
},
{
"match": {
"case_sensitive": true,
"headers": [
{
"exact_match": "some-tenant.my-service.com",
"name": ":authority"
},
{
"exact_match": "https",
"name": "x-forwarded-proto"
}
],
"prefix": "/grpc.health.v1.Health/",
"runtime_fraction": {
"default_value": {
"denominator": "HUNDRED",
"numerator": 100
},
"runtime_key": "routing.traffic_shift.cluster_ingress_foo_1234_o-1"
}
},
"request_headers_to_add": [
{
"append": false,
"header": {
"key": "x-tenant",
"value": "some-tenant"
}
},
{
"append": false,
"header": {
"key": "x-subject",
"value": "%DOWNSTREAM_PEER_SUBJECT%"
}
}
],
"route": {
"cluster": "cluster_ingress_foo_1234_o-1",
"prefix_rewrite": "/grpc.health.v1.Health/",
"priority": null,
"timeout": "300.000s"
}
},
{
"match": {
"case_sensitive": true,
"headers": [
{
"exact_match": "some-tenant.my-service.com",
"name": ":authority"
}
],
"prefix": "/grpc.health.v1.Health/",
"runtime_fraction": {
"default_value": {
"denominator": "HUNDRED",
"numerator": 100
},
"runtime_key": "routing.traffic_shift.cluster_ingress_foo_1234_o-1"
}
},
"redirect": {
"https_redirect": true
},
"request_headers_to_add": [
{
"append": false,
"header": {
"key": "x-tenant",
"value": "some-tenant"
}
},
{
"append": false,
"header": {
"key": "x-subject",
"value": "%DOWNSTREAM_PEER_SUBJECT%"
}
}
]
},
{
"match": {
"case_sensitive": true,
"headers": [
{
"exact_match": "some-tenant.my-service.com",
"name": ":authority"
},
{
"exact_match": "https",
"name": "x-forwarded-proto"
}
],
"prefix": "/grpc.health.v1.Health/",
"runtime_fraction": {
"default_value": {
"denominator": "HUNDRED",
"numerator": 100
},
"runtime_key": "routing.traffic_shift.cluster_ingress_foo_1234_o-1"
}
},
"request_headers_to_add": [
{
"append": false,
"header": {
"key": "x-tenant",
"value": "some-tenant"
}
},
{
"append": false,
"header": {
"key": "x-subject",
"value": "%DOWNSTREAM_PEER_SUBJECT%"
}
}
],
"route": {
"cluster": "cluster_ingress_foo_1234_o-1",
"prefix_rewrite": "/grpc.health.v1.Health/",
"priority": null,
"timeout": "300.000s"
}
},
{
"match": {
"case_sensitive": true,
"headers": [
{
"exact_match": "some-tenant.my-service.com",
"name": ":authority"
}
],
"prefix": "/grpc.health.v1.Health/",
"runtime_fraction": {
"default_value": {
"denominator": "HUNDRED",
"numerator": 100
},
"runtime_key": "routing.traffic_shift.cluster_ingress_foo_1234_o-1"
}
},
"redirect": {
"https_redirect": true
},
"request_headers_to_add": [
{
"append": false,
"header": {
"key": "x-tenant",
"value": "some-tenant"
}
},
{
"append": false,
"header": {
"key": "x-subject",
"value": "%DOWNSTREAM_PEER_SUBJECT%"
}
}
]
},
{
"match": {
"case_sensitive": true,
"headers": [
{
"exact_match": "some-tenant.my-service.com",
"name": ":authority"
},
{
"exact_match": "https",
"name": "x-forwarded-proto"
}
],
"prefix": "/grpc.health.v1.Health/",
"runtime_fraction": {
"default_value": {
"denominator": "HUNDRED",
"numerator": 100
},
"runtime_key": "routing.traffic_shift.cluster_ingress_foo_1234_o-1"
}
},
"request_headers_to_add": [
{
"append": false,
"header": {
"key": "x-tenant",
"value": "some-tenant"
}
},
{
"append": false,
"header": {
"key": "x-subject",
"value": "%DOWNSTREAM_PEER_SUBJECT%"
}
}
],
"route": {
"cluster": "cluster_ingress_foo_1234_o-1",
"prefix_rewrite": "/grpc.health.v1.Health/",
"priority": null,
"timeout": "300.000s"
}
},
{
"match": {
"case_sensitive": true,
"headers": [
{
"exact_match": "some-tenant.my-service.com",
"name": ":authority"
}
],
"prefix": "/grpc.health.v1.Health/",
"runtime_fraction": {
"default_value": {
"denominator": "HUNDRED",
"numerator": 100
},
"runtime_key": "routing.traffic_shift.cluster_ingress_foo_1234_o-1"
}
},
"redirect": {
"https_redirect": true
},
"request_headers_to_add": [
{
"append": false,
"header": {
"key": "x-tenant",
"value": "some-tenant"
}
},
{
"append": false,
"header": {
"key": "x-subject",
"value": "%DOWNSTREAM_PEER_SUBJECT%"
}
}
]
},
{
"match": {
"case_sensitive": true,
"headers": [
{
"exact_match": "some-tenant.my-service.com",
"name": ":authority"
},
{
"exact_match": "https",
"name": "x-forwarded-proto"
}
],
"prefix": "/grpc.health.v1.Health/",
"runtime_fraction": {
"default_value": {
"denominator": "HUNDRED",
"numerator": 100
},
"runtime_key": "routing.traffic_shift.cluster_ingress_foo_1234_o-1"
}
},
"request_headers_to_add": [
{
"append": false,
"header": {
"key": "x-tenant",
"value": "some-tenant"
}
},
{
"append": false,
"header": {
"key": "x-subject",
"value": "%DOWNSTREAM_PEER_SUBJECT%"
}
}
],
"route": {
"cluster": "cluster_ingress_foo_1234_o-1",
"prefix_rewrite": "/grpc.health.v1.Health/",
"priority": null,
"timeout": "300.000s"
}
},
{
"match": {
"case_sensitive": true,
"headers": [
{
"exact_match": "some-tenant.my-service.com",
"name": ":authority"
}
],
"prefix": "/grpc.health.v1.Health/",
"runtime_fraction": {
"default_value": {
"denominator": "HUNDRED",
"numerator": 100
},
"runtime_key": "routing.traffic_shift.cluster_ingress_foo_1234_o-1"
}
},
"redirect": {
"https_redirect": true
},
"request_headers_to_add": [
{
"append": false,
"header": {
"key": "x-tenant",
"value": "some-tenant"
}
},
{
"append": false,
"header": {
"key": "x-subject",
"value": "%DOWNSTREAM_PEER_SUBJECT%"
}
}
]
},
{
"match": {
"case_sensitive": true,
"headers": [
{
"exact_match": "some-tenant.my-service.com",
"name": ":authority"
},
{
"exact_match": "https",
"name": "x-forwarded-proto"
}
],
"prefix": "/grpc.health.v1.Health/",
"runtime_fraction": {
"default_value": {
"denominator": "HUNDRED",
"numerator": 100
},
"runtime_key": "routing.traffic_shift.cluster_ingress_foo_1234_o-1"
}
},
"request_headers_to_add": [
{
"append": false,
"header": {
"key": "x-tenant",
"value": "some-tenant"
}
},
{
"append": false,
"header": {
"key": "x-subject",
"value": "%DOWNSTREAM_PEER_SUBJECT%"
}
}
],
"route": {
"cluster": "cluster_ingress_foo_1234_o-1",
"prefix_rewrite": "/grpc.health.v1.Health/",
"priority": null,
"timeout": "300.000s"
}
},
{
"match": {
"case_sensitive": true,
"headers": [
{
"exact_match": "some-tenant.my-service.com",
"name": ":authority"
}
],
"prefix": "/grpc.health.v1.Health/",
"runtime_fraction": {
"default_value": {
"denominator": "HUNDRED",
"numerator": 100
},
"runtime_key": "routing.traffic_shift.cluster_ingress_foo_1234_o-1"
}
},
"redirect": {
"https_redirect": true
},
"request_headers_to_add": [
{
"append": false,
"header": {
"key": "x-tenant",
"value": "some-tenant"
}
},
{
"append": false,
"header": {
"key": "x-subject",
"value": "%DOWNSTREAM_PEER_SUBJECT%"
}
}
]
}
]
Expected behavior
This is the expected route config:
[
{
"match": {
"case_sensitive": true,
"headers": [
{
"exact_match": "some-tenant.my-service.com",
"name": ":authority"
},
{
"exact_match": "https",
"name": "x-forwarded-proto"
}
],
"prefix": "/our.grpc.api.someservice.v1/Foo/",
"runtime_fraction": {
"default_value": {
"denominator": "HUNDRED",
"numerator": 100
},
"runtime_key": "routing.traffic_shift.cluster_ingress_foo_1234_o-1"
}
},
"request_headers_to_add": [
{
"append": false,
"header": {
"key": "x-tenant",
"value": "some-tenant"
}
},
{
"append": false,
"header": {
"key": "x-subject",
"value": "%DOWNSTREAM_PEER_SUBJECT%"
}
}
],
"route": {
"cluster": "cluster_ingress_foo_1234_o-1",
"prefix_rewrite": "/our.grpc.api.someservice.v1/Foo/",
"priority": null,
"timeout": "300.000s"
}
},
{
"match": {
"case_sensitive": true,
"headers": [
{
"exact_match": "some-tenant.my-service.com",
"name": ":authority"
}
],
"prefix": "/our.grpc.api.someservice.v1/Foo/",
"runtime_fraction": {
"default_value": {
"denominator": "HUNDRED",
"numerator": 100
},
"runtime_key": "routing.traffic_shift.cluster_ingress_foo_1234_o-1"
}
},
"redirect": {
"https_redirect": true
},
"request_headers_to_add": [
{
"append": false,
"header": {
"key": "x-tenant",
"value": "some-tenant"
}
},
{
"append": false,
"header": {
"key": "x-subject",
"value": "%DOWNSTREAM_PEER_SUBJECT%"
}
}
]
},
{
"match": {
"case_sensitive": true,
"headers": [
{
"exact_match": "some-tenant.my-service.com",
"name": ":authority"
},
{
"exact_match": "https",
"name": "x-forwarded-proto"
}
],
"prefix": "/grpc.health.v1.Health/",
"runtime_fraction": {
"default_value": {
"denominator": "HUNDRED",
"numerator": 100
},
"runtime_key": "routing.traffic_shift.cluster_ingress_foo_1234_o-1"
}
},
"request_headers_to_add": [
{
"append": false,
"header": {
"key": "x-tenant",
"value": "some-tenant"
}
},
{
"append": false,
"header": {
"key": "x-subject",
"value": "%DOWNSTREAM_PEER_SUBJECT%"
}
}
],
"route": {
"cluster": "cluster_ingress_foo_1234_o-1",
"prefix_rewrite": "/grpc.health.v1.Health/",
"priority": null,
"timeout": "300.000s"
}
},
{
"match": {
"case_sensitive": true,
"headers": [
{
"exact_match": "some-tenant.my-service.com",
"name": ":authority"
}
],
"prefix": "/grpc.health.v1.Health/",
"runtime_fraction": {
"default_value": {
"denominator": "HUNDRED",
"numerator": 100
},
"runtime_key": "routing.traffic_shift.cluster_ingress_foo_1234_o-1"
}
},
"redirect": {
"https_redirect": true
},
"request_headers_to_add": [
{
"append": false,
"header": {
"key": "x-tenant",
"value": "some-tenant"
}
},
{
"append": false,
"header": {
"key": "x-subject",
"value": "%DOWNSTREAM_PEER_SUBJECT%"
}
}
]
}
]
Versions (please complete the following information):
- Emissary Helm chart: 8.1.0 (fast reconfigure default)
- Kubernetes environment: 1.22
Additional context
We have a lot of mappings and hosts because each endpoint uses a different mTLS certificate authority:
- Mappings: ~1500
- Hosts: ~1000
- Listeners: 2
debug stats:
{
"timers": {
"check_alive": "263, 366.629µs/25.187206ms/1.000849721s",
"check_ready": "64, 371.627µs/38.623674ms/985.04253ms",
"katesUpdate": "6226, 1.35µs/413.625µs/1.547234439s",
"notifyWebhook:diagd": "4, 56.462850273s/1m14.404906809s/1m44.460728719s",
"notifyWebhooks": "4, 56.462893744s/1m14.40494906s/1m44.46076506s",
"parseAnnotations": "15, 311.637µs/347.553µs/406.819µs",
"reconcileAuthServices": "15, 11.98µs/14.159µs/16.451µs",
"reconcileConsul": "15, 133.283µs/165.674µs/277.566µs",
"reconcileRateLimitServices": "15, 2.29µs/2.583µs/3.131µs",
"reconcileSecrets": "15, 44.506476ms/49.058517ms/63.156767ms"
},
"values": {
"envoyReconfigs": {
"times": [
"2022-08-30T16:31:36.490086696Z",
"2022-08-30T16:32:38.538463024Z",
"2022-08-30T16:36:44.632338864Z",
"2022-08-30T16:39:42.332444531Z",
"2022-08-30T16:41:27.434778702Z"
],
"staleCount": 4,
"staleMax": 0,
"synced": true,
"disableRatelimiter": false
},
"memory": "1.80Gi of 2.34Gi (76%)"
}
}
It has come to my attention that there was a bugfix related to this cache in 2.3.0. Since the behavior you saw was different enough between 2.x and 3.x, it would be useful for you to verify whether it happens with 2.3.2; that is, check whether the 2.x behavior you saw was fixed in 2.3, which would make the 3.x behavior a separate regression.
I've seen this behavior on 3.x one time. I tried 2.3.2 and have not been able to reproduce the issue of the duplicate route.
I was experiencing similar behavior in 2.3.1, 2.3.2, and 2.4.0. The issue appears to have been caused by how I was naming my Mapping resources. I had two naming conventions:
- my-mapping-resource
- my.mapping.resource
Over time, I noticed that the routes defined by Mapping resources with a `.` in the name would disappear and seemingly be consolidated/repeated into a single route. I have since renamed all Mapping resources to use `-` instead of `.` and have yet to see the issue resurface in any of the versions above.
Closing as a workaround is available. If issues persist, please let us know.