agent
agent copied to clipboard
Fix SIQGUIT signal handling
We started seeing this crash with our buildkite agents on Linux:
SIGQUIT: quit
PC=0x468c21 m=0 sigcode=0
goroutine 0 [idle]:
runtime.futex(0x1b35a28, 0x80, 0x0, 0x0, 0x0, 0x7fffe24203e8, 0x43a7b3, 0xc000280148, 0x7fffe2420408, 0x40ae9f, ...)
/usr/local/go/src/runtime/sys_linux_amd64.s:567 +0x21
runtime.futexsleep(0x1b35a28, 0x7fff00000000, 0xffffffffffffffff)
/usr/local/go/src/runtime/os_linux.go:44 +0x46
runtime.notesleep(0x1b35a28)
/usr/local/go/src/runtime/lock_futex.go:151 +0x9f
runtime.stoplockedm()
/usr/local/go/src/runtime/proc.go:1971 +0x88
runtime.schedule()
/usr/local/go/src/runtime/proc.go:2454 +0x4a6
runtime.park_m(0xc000082900)
/usr/local/go/src/runtime/proc.go:2690 +0x9d
runtime.mcall(0x0)
/usr/local/go/src/runtime/asm_amd64.s:318 +0x5b
goroutine 1 [syscall]:
syscall.Syscall6(0xf7, 0x1, 0x63ee, 0xc00021e920, 0x1000004, 0x0, 0x0, 0xc00021e9a0, 0x0, 0xc00021ea00)
/usr/local/go/src/syscall/asm_linux_amd64.s:41 +0x5
os.(*Process).blockUntilWaitable(0xc000383bf0, 0xc0000b0780, 0x3, 0xc00021ea00)
/usr/local/go/src/os/wait_waitid.go:31 +0x98
os.(*Process).wait(0xc000383bf0, 0x2c, 0xc000383c20, 0xc000380000)
/usr/local/go/src/os/exec_unix.go:22 +0x39
os.(*Process).Wait(...)
/usr/local/go/src/os/exec.go:125
os/exec.(*Cmd).Wait(0xc0003b22c0, 0x120da40, 0x29)
/usr/local/go/src/os/exec/exec.go:502 +0x60
github.com/buildkite/agent/v3/process.(*Process).Run(0xc0003b01c0, 0x0, 0x0)
/work/process/process.go:233 +0x472
github.com/buildkite/agent/v3/bootstrap/shell.(*Shell).executeCommand(0xc0002aa000, 0xc0003ae280, 0x13adfa0, 0xc0000ba008, 0x10101, 0x0, 0x0)
/work/bootstrap/shell/shell.go:403 +0x29b
github.com/buildkite/agent/v3/bootstrap/shell.(*Shell).RunWithoutPrompt(0xc0002aa000, 0xc0003944d0, 0x9, 0xc0003be490, 0x3, 0x5, 0x3, 0x6)
/work/bootstrap/shell/shell.go:209 +0x186
github.com/buildkite/agent/v3/bootstrap.(*Bootstrap).defaultCommandPhase(0xc0001e3900, 0x0, 0x0)
/work/bootstrap/bootstrap.go:1481 +0x8b9
github.com/buildkite/agent/v3/bootstrap.(*Bootstrap).CommandPhase(0xc0001e3900, 0x7, 0x1)
/work/bootstrap/bootstrap.go:1310 +0x67d
github.com/buildkite/agent/v3/bootstrap.(*Bootstrap).Run(0xc0001e3900, 0x13c1960, 0xc0000bfb00, 0x0)
/work/bootstrap/bootstrap.go:144 +0x4aa
github.com/buildkite/agent/v3/clicommand.glob..func6(0xc0000e1080)
/work/clicommand/bootstrap.go:418 +0x97c
github.com/urfave/cli.HandleAction(0xf28260, 0x122bdb8, 0xc0000e1080, 0x0, 0xc0000bd440)
/go/pkg/mod/github.com/urfave/[email protected]/app.go:503 +0x7c
github.com/urfave/cli.Command.Run(0x11e8944, 0x9, 0x0, 0x0, 0x0, 0x0, 0x0, 0x11fcbae, 0x1b, 0x0, ...)
/go/pkg/mod/github.com/urfave/[email protected]/command.go:165 +0x4da
github.com/urfave/cli.(*App).Run(0xc0001c61c0, 0xc0000ac020, 0x2, 0x2, 0x0, 0x0)
/go/pkg/mod/github.com/urfave/[email protected]/app.go:259 +0x775
main.main()
/work/main.go:115 +0x55d
goroutine 18 [syscall]:
os/signal.signal_recv(0x13ba360)
/usr/local/go/src/runtime/sigqueue.go:147 +0x9c
os/signal.loop()
/usr/local/go/src/os/signal/signal_unix.go:23 +0x22
created by os/signal.Notify.func1
/usr/local/go/src/os/signal/signal.go:127 +0x44
goroutine 56 [syscall]:
syscall.Syscall(0x0, 0x3, 0xc000272000, 0x8000, 0x4cb2f5, 0x7f52ff9beee8, 0x72)
/usr/local/go/src/syscall/asm_linux_amd64.s:18 +0x5
syscall.read(0x3, 0xc000272000, 0x8000, 0x8000, 0x0, 0x0, 0x0)
/usr/local/go/src/syscall/zsyscall_linux_amd64.go:686 +0x5a
syscall.Read(...)
/usr/local/go/src/syscall/syscall_unix.go:189
internal/poll.(*FD).Read(0xc0003be4e0, 0xc000272000, 0x8000, 0x8000, 0x0, 0x0, 0x0)
/usr/local/go/src/internal/poll/fd_unix.go:165 +0x133
os.(*File).read(...)
/usr/local/go/src/os/file_unix.go:263
os.(*File).Read(0xc0003bc080, 0xc000272000, 0x8000, 0x8000, 0x57, 0x0, 0x0)
/usr/local/go/src/os/file.go:116 +0x71
io.copyBuffer(0x13adfa0, 0xc0000ba008, 0x13adf80, 0xc0003bc080, 0xc000272000, 0x8000, 0x8000, 0x1372428, 0x1, 0x43d526)
/usr/local/go/src/io/io.go:405 +0x122
io.Copy(...)
/usr/local/go/src/io/io.go:364
github.com/buildkite/agent/v3/process.(*Process).Run.func2(0xc0003b01c0, 0xc0003bc080, 0xc0003867c0, 0xc000394590)
/work/process/process.go:178 +0xa9
created by github.com/buildkite/agent/v3/process.(*Process).Run
/work/process/process.go:173 +0x39d
goroutine 57 [select]:
github.com/buildkite/agent/v3/process.(*Process).Run.func3(0xc0003b01c0)
/work/process/process.go:214 +0xd8
created by github.com/buildkite/agent/v3/process.(*Process).Run
/work/process/process.go:213 +0x9a1
rax 0xca
rbx 0x1b358e0
rcx 0x468c23
rdx 0x0
rdi 0x1b35a28
rsi 0x80
rbp 0x7fffe24203d0
rsp 0x7fffe2420388
r8 0x0
r9 0x0
r10 0x0
r11 0x286
r12 0x0
r13 0x1
r14 0xc000094c00
r15 0x0
rip 0x468c21
rflags 0x286
cs 0x33
fs 0x0
gs 0x0
This is intermittent with our agents running on k8s, let me know what other info I can provide to debug this.
Looking at the trace, I see SIGQUIT: quit
at the top. It's possible that this was actually the buildkite-agent
terminating the bootstrap with an unhandled SIGQUIT
. Are you seeing this manifest as errors in your builds?
When we see this the build fails and it terminates with an actual failure code (not one of the agent-disconnect codes)
There was a known crash that could occur with 3.21.0 on some linux kernels (#1213). The error output here suggests a different issue, but just to be sure: are you able to upgrade to 3.21.1 (or 3.22.0) to confirm the issue still happens on those versions?
I can update and see if we see it again yep!
Ok definitely still seeing this with 3.22.0:
SIGQUIT: quit
PC=0x469201 m=0 sigcode=0
goroutine 0 [idle]:
runtime.futex(0x1b36a68, 0x80, 0x0, 0x0, 0x0, 0x7ffd2fc39d68, 0x43ab93, 0xc0002a6148, 0x7ffd2fc39d88, 0x40b0df, ...)
/usr/local/go/src/runtime/sys_linux_amd64.s:567 +0x21
runtime.futexsleep(0x1b36a68, 0x7ffd00000000, 0xffffffffffffffff)
/usr/local/go/src/runtime/os_linux.go:45 +0x46
runtime.notesleep(0x1b36a68)
/usr/local/go/src/runtime/lock_futex.go:151 +0x9f
runtime.stoplockedm()
/usr/local/go/src/runtime/proc.go:1971 +0x88
runtime.schedule()
/usr/local/go/src/runtime/proc.go:2454 +0x4a6
runtime.park_m(0xc000001680)
/usr/local/go/src/runtime/proc.go:2690 +0x9d
runtime.mcall(0x0)
/usr/local/go/src/runtime/asm_amd64.s:318 +0x5b
goroutine 1 [syscall]:
syscall.Syscall6(0xf7, 0x1, 0x5c, 0xc00029e858, 0x1000004, 0x0, 0x0, 0xc00029e8d8, 0x0, 0xc00029e938)
/usr/local/go/src/syscall/asm_linux_amd64.s:41 +0x5
os.(*Process).blockUntilWaitable(0xc000349320, 0xc00001ec00, 0x0, 0xc00029e938)
/usr/local/go/src/os/wait_waitid.go:31 +0x98
os.(*Process).wait(0xc000349320, 0x29, 0xc000349350, 0xc0001d7520)
/usr/local/go/src/os/exec_unix.go:22 +0x39
os.(*Process).Wait(...)
/usr/local/go/src/os/exec.go:125
os/exec.(*Cmd).Wait(0xc000135340, 0x120ea6f, 0x29)
/usr/local/go/src/os/exec/exec.go:507 +0x60
github.com/buildkite/agent/v3/process.(*Process).Run(0xc0003022a0, 0x0, 0x0)
/work/process/process.go:233 +0x472
github.com/buildkite/agent/v3/bootstrap/shell.(*Shell).executeCommand(0xc000263f10, 0xc000266f00, 0x13aedc0, 0xc00000e018, 0x10101, 0x0, 0x0)
/work/bootstrap/shell/shell.go:403 +0x29b
github.com/buildkite/agent/v3/bootstrap/shell.(*Shell).RunWithoutPrompt(0xc000263f10, 0x11e3650, 0x3, 0xc00007b0e0, 0x5, 0x5, 0x32, 0xc00007b0e0)
/work/bootstrap/shell/shell.go:209 +0x186
github.com/buildkite/agent/v3/bootstrap/shell.(*Shell).Run(0xc000263f10, 0x11e3650, 0x3, 0xc00007b0e0, 0x5, 0x5, 0x2, 0x5)
/work/bootstrap/shell/shell.go:197 +0x136
github.com/buildkite/agent/v3/bootstrap.gitClone(0xc000263f10, 0xc0000263da, 0x2, 0xc00002042f, 0x20, 0x11e2f74, 0x1, 0x30, 0x30)
/work/bootstrap/git.go:60 +0x1fc
github.com/buildkite/agent/v3/bootstrap.(*Bootstrap).defaultCheckoutPhase(0xc00022f900, 0x4d3ab5e0725, 0xae78847)
/work/bootstrap/bootstrap.go:1127 +0x1fa
github.com/buildkite/agent/v3/bootstrap.(*Bootstrap).CheckoutPhase.func1(0xc0003289c0, 0x13b9c00, 0xc000321500)
/work/bootstrap/bootstrap.go:902 +0x33
github.com/buildkite/agent/v3/retry.Do(0xc00029f178, 0xc000344020, 0x8, 0x0)
/work/retry/retry.go:80 +0x1d9
github.com/buildkite/agent/v3/bootstrap.(*Bootstrap).CheckoutPhase(0xc00022f900, 0x8, 0x1)
/work/bootstrap/bootstrap.go:901 +0x60a
github.com/buildkite/agent/v3/bootstrap.(*Bootstrap).Run(0xc00022f900, 0x13c27c0, 0xc000115b00, 0x0)
/work/bootstrap/bootstrap.go:131 +0x2a3
github.com/buildkite/agent/v3/clicommand.glob..func6(0xc000134dc0)
/work/clicommand/bootstrap.go:418 +0x97c
github.com/urfave/cli.HandleAction(0xf29260, 0x122cea0, 0xc000134dc0, 0x0, 0xc00002d500)
/go/pkg/mod/github.com/urfave/[email protected]/app.go:503 +0x7c
github.com/urfave/cli.Command.Run(0x11e9966, 0x9, 0x0, 0x0, 0x0, 0x0, 0x0, 0x11fdbde, 0x1b, 0x0, ...)
/go/pkg/mod/github.com/urfave/[email protected]/command.go:165 +0x4da
github.com/urfave/cli.(*App).Run(0xc0002141c0, 0xc00000c080, 0x2, 0x2, 0x0, 0x0)
/go/pkg/mod/github.com/urfave/[email protected]/app.go:259 +0x775
main.main()
/work/main.go:115 +0x55d
goroutine 6 [syscall]:
os/signal.signal_recv(0x13bb1c0)
/usr/local/go/src/runtime/sigqueue.go:147 +0x9c
os/signal.loop()
/usr/local/go/src/os/signal/signal_unix.go:23 +0x22
created by os/signal.Notify.func1
/usr/local/go/src/os/signal/signal.go:127 +0x44
goroutine 19 [runnable]:
internal/poll.(*FD).readUnlock(0xc000366240)
/usr/local/go/src/internal/poll/fd_mutex.go:230 +0x58
internal/poll.(*FD).Read(0xc000366240, 0xc0000aa000, 0x8000, 0x8000, 0x0, 0x13b0100, 0xc000400000)
/usr/local/go/src/internal/poll/fd_unix.go:181 +0x1e5
os.(*File).read(...)
/usr/local/go/src/os/file_unix.go:263
os.(*File).Read(0xc00000e280, 0xc0000aa000, 0x8000, 0x8000, 0x9e, 0x0, 0x0)
/usr/local/go/src/os/file.go:116 +0x71
io.copyBuffer(0x13aedc0, 0xc00000e018, 0x13aeda0, 0xc00000e280, 0xc0000aa000, 0x8000, 0x8000, 0x1373508, 0x1, 0x43d906)
/usr/local/go/src/io/io.go:405 +0x122
io.Copy(...)
/usr/local/go/src/io/io.go:364
github.com/buildkite/agent/v3/process.(*Process).Run.func2(0xc0003022a0, 0xc00000e280, 0xc0003463e0, 0xc00027cc80)
/work/process/process.go:178 +0xa9
created by github.com/buildkite/agent/v3/process.(*Process).Run
/work/process/process.go:173 +0x39d
goroutine 20 [select]:
github.com/buildkite/agent/v3/process.(*Process).Run.func3(0xc0003022a0)
/work/process/process.go:214 +0xd8
created by github.com/buildkite/agent/v3/process.(*Process).Run
/work/process/process.go:213 +0x9a1
rax 0xca
rbx 0x1b36920
rcx 0x469203
rdx 0x0
rdi 0x1b36a68
rsi 0x80
rbp 0x7ffd2fc39d50
rsp 0x7ffd2fc39d08
r8 0x0
r9 0x0
r10 0x0
r11 0x286
r12 0x0
r13 0x1
r14 0xc000024c00
r15 0x0
rip 0x469201
rflags 0x286
cs 0x33
fs 0x0
gs 0x0
This could be happening when our agents are being spun down and it's just an unfortunate time, although if that's the case I'd be surprised we see it so frequently
FYI one thing we realized is the SIGQUIT is coming from our k8s config:
lifecycle:
preStop:
exec:
command: ["pkill", "-QUIT", "buildkite-agent"]
where we added this to get a clean disconnect if the k8s pod is killed while a job is running, but I wouldn't expect this to lead to this crash?