etcd3 icon indicating copy to clipboard operation
etcd3 copied to clipboard

Intermittent "13 INTERNAL: Received RST_STREAM with code 5" received from ElectionObserver

Open FredrikAugust opened this issue 5 months ago • 0 comments

Hi, we're running 3 instances of etcd in our cluster 3.5.15-debian-12-r5 and are using etcd3@^1.1.2. When running two instances of our application in our kubernetes cluster with [email protected] we're seeing the 13 INTERNAL: Received RST_STREAM with code 5 error quite frequently, followed by the elected leader losing their leadership.

2024-08-27T11:58:30.849685861Z {"level":"info","time":1724759910849,"pid":1,"msg":"Application Manager: Running leader campaign."}
2024-08-27T11:58:30.922776224Z {"level":"info","time":1724759910922,"pid":1,"data":{"previousLeader":"fYnTDpOjW7EEvCRPEyUR5"},"msg":"Application Manager: I am now the leader!"}
2024-08-27T11:59:25.699950965Z {"level":"info","time":1724759965699,"pid":1,"data":{"newLeader":"fYnTDpOjW7EEvCRPEyUR5"},"msg":"Application Manager: I am no longer the leader!"}
2024-08-27T12:00:25.296245667Z {"level":"error","time":1724760025295,"pid":1,"error":{"type":"Error","message":"Election observer failed.: 13 INTERNAL: Received RST_STREAM with c
ode 5","stack":"Error: Election observer failed.\n    at ElectionObserver.<anonymous> (/app/libs/kernel/dist/Infrastructure/Management/ApplicationManager.js:311:54)\n    at Elect
ionObserver.emit (node:events:531:35)\n    at /app/node_modules/.pnpm/[email protected]/node_modules/etcd3/lib/election.js:30:18\ncaused by: GRPCInternalError: 13 INTERNAL: Received RS
T_STREAM with code 5\n    at callErrorFromStatus (/app/node_modules/.pnpm/@[email protected]/node_modules/@grpc/grpc-js/build/src/call.js:31:19)\n    at Object.onReceiveStatus
(/app/node_modules/.pnpm/@[email protected]/node_modules/@grpc/grpc-js/build/src/client.js:193:76)\n    at Object.onReceiveStatus (/app/node_modules/.pnpm/@[email protected]/
node_modules/@grpc/grpc-js/build/src/client-interceptors.js:360:141)\n    at Object.onReceiveStatus (/app/node_modules/.pnpm/@[email protected]/node_modules/@grpc/grpc-js/build
/src/client-interceptors.js:323:181)\n    at /app/node_modules/.pnpm/@[email protected]/node_modules/@grpc/grpc-js/build/src/resolving-call.js:129:78\n    at process.processTic
ksAndRejections (node:internal/process/task_queues:77:11)\nfor call at\n    at ServiceClientImpl.makeUnaryRequest (/app/node_modules/.pnpm/@[email protected]/node_modules/@grpc
/grpc-js/build/src/client.js:161:32)\n    at ServiceClientImpl.<anonymous> (/app/node_modules/.pnpm/@[email protected]/node_modules/@grpc/grpc-js/build/src/make-client.js:105:1
9)\n    at /otel-auto-instrumentation-nodejs/node_modules/@opentelemetry/instrumentation-grpc/build/src/clientUtils.js:131:31\n    at /otel-auto-instrumentation-nodejs/node_modul
es/@opentelemetry/instrumentation-grpc/build/src/instrumentation.js:237:209\n    at AsyncLocalStorage.run (node:async_hooks:346:14)\n    at AsyncLocalStorageContextManager.with (
/otel-auto-instrumentation-nodejs/node_modules/@opentelemetry/context-async-hooks/build/src/AsyncLocalStorageContextManager.js:33:40)\n    at ContextAPI.with (/otel-auto-instrume
ntation-nodejs/node_modules/@opentelemetry/api/build/src/api/context.js:60:46)\n    at ServiceClientImpl.clientMethodTrace [as range] (/otel-auto-instrumentation-nodejs/node_modu
les/@opentelemetry/instrumentation-grpc/build/src/instrumentation.js:237:42)\n    at /app/node_modules/.pnpm/[email protected]/node_modules/etcd3/lib/connection-pool.js:35:23\n    at n
ew Promise (<anonymous>)"},"data":{"component":"ApplicationManager","data":{}},"msg":"Error occurred"}
2024-08-27T12:00:30.297121888Z {"level":"info","time":1724760030296,"pid":1,"msg":"Application Manager: Running leader campaign."}
2024-08-27T12:00:30.335084888Z {"level":"info","time":1724760030334,"pid":1,"data":{"previousLeader":"fYnTDpOjW7EEvCRPEyUR5"},"msg":"Application Manager: I am now the leader!"}
2024-08-27T12:01:11.335468351Z {"level":"info","time":1724760071335,"pid":1,"data":{"newLeader":"fYnTDpOjW7EEvCRPEyUR5"},"msg":"Application Manager: I am no longer the leader!"}
2024-08-27T12:02:10.969006650Z {"level":"error","time":1724760130968,"pid":1,"error":{"type":"Error","message":"Election observer failed.: 13 INTERNAL: Received RST_STREAM with c
ode 5","stack":"Error: Election observer failed.\n    at ElectionObserver.<anonymous> (/app/libs/kernel/dist/Infrastructure/Management/ApplicationManager.js:311:54)\n    at Elect
ionObserver.emit (node:events:531:35)\n    at /app/node_modules/.pnpm/[email protected]/node_modules/etcd3/lib/election.js:30:18\ncaused by: GRPCInternalError: 13 INTERNAL: Received RS
T_STREAM with code 5\n    at callErrorFromStatus (/app/node_modules/.pnpm/@[email protected]/node_modules/@grpc/grpc-js/build/src/call.js:31:19)\n    at Object.onReceiveStatus
(/app/node_modules/.pnpm/@[email protected]/node_modules/@grpc/grpc-js/build/src/client.js:193:76)\n    at Object.onReceiveStatus (/app/node_modules/.pnpm/@[email protected]/
node_modules/@grpc/grpc-js/build/src/client-interceptors.js:360:141)\n    at Object.onReceiveStatus (/app/node_modules/.pnpm/@[email protected]/node_modules/@grpc/grpc-js/build
/src/client-interceptors.js:323:181)\n    at /app/node_modules/.pnpm/@[email protected]/node_modules/@grpc/grpc-js/build/src/resolving-call.js:129:78\n    at process.processTic
ksAndRejections (node:internal/process/task_queues:77:11)\nfor call at\n    at ServiceClientImpl.makeUnaryRequest (/app/node_modules/.pnpm/@[email protected]/node_modules/@grpc
/grpc-js/build/src/client.js:161:32)\n    at ServiceClientImpl.<anonymous> (/app/node_modules/.pnpm/@[email protected]/node_modules/@grpc/grpc-js/build/src/make-client.js:105:1
9)\n    at /otel-auto-instrumentation-nodejs/node_modules/@opentelemetry/instrumentation-grpc/build/src/clientUtils.js:131:31\n    at /otel-auto-instrumentation-nodejs/node_modul
es/@opentelemetry/instrumentation-grpc/build/src/instrumentation.js:237:209\n    at AsyncLocalStorage.run (node:async_hooks:346:14)\n    at AsyncLocalStorageContextManager.with (
/otel-auto-instrumentation-nodejs/node_modules/@opentelemetry/context-async-hooks/build/src/AsyncLocalStorageContextManager.js:33:40)\n    at ContextAPI.with (/otel-auto-instrume
ntation-nodejs/node_modules/@opentelemetry/api/build/src/api/context.js:60:46)\n    at ServiceClientImpl.clientMethodTrace [as range] (/otel-auto-instrumentation-nodejs/node_modu
les/@opentelemetry/instrumentation-grpc/build/src/instrumentation.js:237:42)\n    at /app/node_modules/.pnpm/[email protected]/node_modules/etcd3/lib/connection-pool.js:35:23\n    at n
ew Promise (<anonymous>)"},"data":{"component":"ApplicationManager","data":{}},"msg":"Error occurred"}
2024-08-27T12:02:15.966999285Z {"level":"info","time":1724760135966,"pid":1,"msg":"Application Manager: Running leader campaign."}
2024-08-27T12:02:15.999759748Z {"level":"info","time":1724760135999,"pid":1,"data":{"previousLeader":"fYnTDpOjW7EEvCRPEyUR5"},"msg":"Application Manager: I am now the leader!"}
2024-08-27T12:03:10.585657111Z {"level":"info","time":1724760190585,"pid":1,"data":{"newLeader":"fYnTDpOjW7EEvCRPEyUR5"},"msg":"Application Manager: I am no longer the leader!"}
2024-08-27T12:03:56.660135211Z {"level":"error","time":1724760236659,"pid":1,"error":{"type":"Error","message":"Election observer failed.: 13 INTERNAL: Received RST_STREAM with c
ode 5","stack":"Error: Election observer failed.\n    at ElectionObserver.<anonymous> (/app/libs/kernel/dist/Infrastructure/Management/ApplicationManager.js:311:54)\n    at Elect
ionObserver.emit (node:events:531:35)\n    at /app/node_modules/.pnpm/[email protected]/node_modules/etcd3/lib/election.js:30:18\n    at runNextTicks (node:internal/process/task_queues
:60:5)\n    at listOnTimeout (node:internal/timers:545:9)\n    at process.processTimers (node:internal/timers:519:7)\ncaused by: GRPCInternalError: 13 INTERNAL: Received RST_STRE
AM with code 5\n    at callErrorFromStatus (/app/node_modules/.pnpm/@[email protected]/node_modules/@grpc/grpc-js/build/src/call.js:31:19)\n    at Object.onReceiveStatus (/app/
node_modules/.pnpm/@[email protected]/node_modules/@grpc/grpc-js/build/src/client.js:193:76)\n    at Object.onReceiveStatus (/app/node_modules/.pnpm/@[email protected]/node_m
odules/@grpc/grpc-js/build/src/client-interceptors.js:360:141)\n    at Object.onReceiveStatus (/app/node_modules/.pnpm/@[email protected]/node_modules/@grpc/grpc-js/build/src/c
lient-interceptors.js:323:181)\n    at /app/node_modules/.pnpm/@[email protected]/node_modules/@grpc/grpc-js/build/src/resolving-call.js:129:78\n    at process.processTicksAndR
ejections (node:internal/process/task_queues:77:11)\nfor call at\n    at ServiceClientImpl.makeUnaryRequest (/app/node_modules/.pnpm/@[email protected]/node_modules/@grpc/grpc-
js/build/src/client.js:161:32)\n    at ServiceClientImpl.<anonymous> (/app/node_modules/.pnpm/@[email protected]/node_modules/@grpc/grpc-js/build/src/make-client.js:105:19)\n
  at /otel-auto-instrumentation-nodejs/node_modules/@opentelemetry/instrumentation-grpc/build/src/clientUtils.js:131:31\n    at /otel-auto-instrumentation-nodejs/node_modules/@op
entelemetry/instrumentation-grpc/build/src/instrumentation.js:237:209\n    at AsyncLocalStorage.run (node:async_hooks:346:14)\n    at AsyncLocalStorageContextManager.with (/otel-
auto-instrumentation-nodejs/node_modules/@opentelemetry/context-async-hooks/build/src/AsyncLocalStorageContextManager.js:33:40)\n    at ContextAPI.with (/otel-auto-instrumentatio
n-nodejs/node_modules/@opentelemetry/api/build/src/api/context.js:60:46)\n    at ServiceClientImpl.clientMethodTrace [as range] (/otel-auto-instrumentation-nodejs/node_modules/@o
pentelemetry/instrumentation-grpc/build/src/instrumentation.js:237:42)\n    at /app/node_modules/.pnpm/[email protected]/node_modules/etcd3/lib/connection-pool.js:35:23\n    at new Pro
mise (<anonymous>)"},"data":{"component":"ApplicationManager","data":{}},"msg":"Error occurred"}
2024-08-27T12:04:01.664572952Z {"level":"info","time":1724760241664,"pid":1,"msg":"Application Manager: Running leader campaign."}
2024-08-27T12:04:01.690372108Z {"level":"info","time":1724760241690,"pid":1,"data":{"previousLeader":"fYnTDpOjW7EEvCRPEyUR5"},"msg":"Application Manager: I am now the leader!"}
2024-08-27T12:04:56.206549039Z {"level":"info","time":1724760296206,"pid":1,"data":{"newLeader":"fYnTDpOjW7EEvCRPEyUR5"},"msg":"Application Manager: I am no longer the leader!"}

As you can see here there appears to be approximately 1 minute between the loss of leadership and the gRPC error. This would correspond quite nicely to the default TTL if I'm not mistaken.

Our code is largely based on the examples given in this repository, and any help debugging this would be greatly appreciated.

FredrikAugust avatar Aug 27 '24 12:08 FredrikAugust