agent icon indicating copy to clipboard operation
agent copied to clipboard

Fix SIQGUIT signal handling

Open keith opened this issue 4 years ago • 7 comments

We started seeing this crash with our buildkite agents on Linux:

SIGQUIT: quit
PC=0x468c21 m=0 sigcode=0
 
goroutine 0 [idle]:
runtime.futex(0x1b35a28, 0x80, 0x0, 0x0, 0x0, 0x7fffe24203e8, 0x43a7b3, 0xc000280148, 0x7fffe2420408, 0x40ae9f, ...)
	/usr/local/go/src/runtime/sys_linux_amd64.s:567 +0x21
runtime.futexsleep(0x1b35a28, 0x7fff00000000, 0xffffffffffffffff)
	/usr/local/go/src/runtime/os_linux.go:44 +0x46
runtime.notesleep(0x1b35a28)
	/usr/local/go/src/runtime/lock_futex.go:151 +0x9f
runtime.stoplockedm()
	/usr/local/go/src/runtime/proc.go:1971 +0x88
runtime.schedule()
	/usr/local/go/src/runtime/proc.go:2454 +0x4a6
runtime.park_m(0xc000082900)
	/usr/local/go/src/runtime/proc.go:2690 +0x9d
runtime.mcall(0x0)
	/usr/local/go/src/runtime/asm_amd64.s:318 +0x5b
 
goroutine 1 [syscall]:
syscall.Syscall6(0xf7, 0x1, 0x63ee, 0xc00021e920, 0x1000004, 0x0, 0x0, 0xc00021e9a0, 0x0, 0xc00021ea00)
	/usr/local/go/src/syscall/asm_linux_amd64.s:41 +0x5
os.(*Process).blockUntilWaitable(0xc000383bf0, 0xc0000b0780, 0x3, 0xc00021ea00)
	/usr/local/go/src/os/wait_waitid.go:31 +0x98
os.(*Process).wait(0xc000383bf0, 0x2c, 0xc000383c20, 0xc000380000)
	/usr/local/go/src/os/exec_unix.go:22 +0x39
os.(*Process).Wait(...)
	/usr/local/go/src/os/exec.go:125
os/exec.(*Cmd).Wait(0xc0003b22c0, 0x120da40, 0x29)
	/usr/local/go/src/os/exec/exec.go:502 +0x60
github.com/buildkite/agent/v3/process.(*Process).Run(0xc0003b01c0, 0x0, 0x0)
	/work/process/process.go:233 +0x472
github.com/buildkite/agent/v3/bootstrap/shell.(*Shell).executeCommand(0xc0002aa000, 0xc0003ae280, 0x13adfa0, 0xc0000ba008, 0x10101, 0x0, 0x0)
	/work/bootstrap/shell/shell.go:403 +0x29b
github.com/buildkite/agent/v3/bootstrap/shell.(*Shell).RunWithoutPrompt(0xc0002aa000, 0xc0003944d0, 0x9, 0xc0003be490, 0x3, 0x5, 0x3, 0x6)
	/work/bootstrap/shell/shell.go:209 +0x186
github.com/buildkite/agent/v3/bootstrap.(*Bootstrap).defaultCommandPhase(0xc0001e3900, 0x0, 0x0)
	/work/bootstrap/bootstrap.go:1481 +0x8b9
github.com/buildkite/agent/v3/bootstrap.(*Bootstrap).CommandPhase(0xc0001e3900, 0x7, 0x1)
	/work/bootstrap/bootstrap.go:1310 +0x67d
github.com/buildkite/agent/v3/bootstrap.(*Bootstrap).Run(0xc0001e3900, 0x13c1960, 0xc0000bfb00, 0x0)
	/work/bootstrap/bootstrap.go:144 +0x4aa
github.com/buildkite/agent/v3/clicommand.glob..func6(0xc0000e1080)
	/work/clicommand/bootstrap.go:418 +0x97c
github.com/urfave/cli.HandleAction(0xf28260, 0x122bdb8, 0xc0000e1080, 0x0, 0xc0000bd440)
	/go/pkg/mod/github.com/urfave/[email protected]/app.go:503 +0x7c
github.com/urfave/cli.Command.Run(0x11e8944, 0x9, 0x0, 0x0, 0x0, 0x0, 0x0, 0x11fcbae, 0x1b, 0x0, ...)
	/go/pkg/mod/github.com/urfave/[email protected]/command.go:165 +0x4da
github.com/urfave/cli.(*App).Run(0xc0001c61c0, 0xc0000ac020, 0x2, 0x2, 0x0, 0x0)
	/go/pkg/mod/github.com/urfave/[email protected]/app.go:259 +0x775
main.main()
	/work/main.go:115 +0x55d
 
goroutine 18 [syscall]:
os/signal.signal_recv(0x13ba360)
	/usr/local/go/src/runtime/sigqueue.go:147 +0x9c
os/signal.loop()
	/usr/local/go/src/os/signal/signal_unix.go:23 +0x22
created by os/signal.Notify.func1
	/usr/local/go/src/os/signal/signal.go:127 +0x44
 
goroutine 56 [syscall]:
syscall.Syscall(0x0, 0x3, 0xc000272000, 0x8000, 0x4cb2f5, 0x7f52ff9beee8, 0x72)
	/usr/local/go/src/syscall/asm_linux_amd64.s:18 +0x5
syscall.read(0x3, 0xc000272000, 0x8000, 0x8000, 0x0, 0x0, 0x0)
	/usr/local/go/src/syscall/zsyscall_linux_amd64.go:686 +0x5a
syscall.Read(...)
	/usr/local/go/src/syscall/syscall_unix.go:189
internal/poll.(*FD).Read(0xc0003be4e0, 0xc000272000, 0x8000, 0x8000, 0x0, 0x0, 0x0)
	/usr/local/go/src/internal/poll/fd_unix.go:165 +0x133
os.(*File).read(...)
	/usr/local/go/src/os/file_unix.go:263
os.(*File).Read(0xc0003bc080, 0xc000272000, 0x8000, 0x8000, 0x57, 0x0, 0x0)
	/usr/local/go/src/os/file.go:116 +0x71
io.copyBuffer(0x13adfa0, 0xc0000ba008, 0x13adf80, 0xc0003bc080, 0xc000272000, 0x8000, 0x8000, 0x1372428, 0x1, 0x43d526)
	/usr/local/go/src/io/io.go:405 +0x122
io.Copy(...)
	/usr/local/go/src/io/io.go:364
github.com/buildkite/agent/v3/process.(*Process).Run.func2(0xc0003b01c0, 0xc0003bc080, 0xc0003867c0, 0xc000394590)
	/work/process/process.go:178 +0xa9
created by github.com/buildkite/agent/v3/process.(*Process).Run
	/work/process/process.go:173 +0x39d
 
goroutine 57 [select]:
github.com/buildkite/agent/v3/process.(*Process).Run.func3(0xc0003b01c0)
	/work/process/process.go:214 +0xd8
created by github.com/buildkite/agent/v3/process.(*Process).Run
	/work/process/process.go:213 +0x9a1
 
rax    0xca
rbx    0x1b358e0
rcx    0x468c23
rdx    0x0
rdi    0x1b35a28
rsi    0x80
rbp    0x7fffe24203d0
rsp    0x7fffe2420388
r8     0x0
r9     0x0
r10    0x0
r11    0x286
r12    0x0
r13    0x1
r14    0xc000094c00
r15    0x0
rip    0x468c21
rflags 0x286
cs     0x33
fs     0x0
gs     0x0

This is intermittent with our agents running on k8s, let me know what other info I can provide to debug this.

keith avatar Jun 08 '20 18:06 keith

Looking at the trace, I see SIGQUIT: quit at the top. It's possible that this was actually the buildkite-agent terminating the bootstrap with an unhandled SIGQUIT. Are you seeing this manifest as errors in your builds?

lox avatar Jun 08 '20 22:06 lox

When we see this the build fails and it terminates with an actual failure code (not one of the agent-disconnect codes)

keith avatar Jun 08 '20 23:06 keith

There was a known crash that could occur with 3.21.0 on some linux kernels (#1213). The error output here suggests a different issue, but just to be sure: are you able to upgrade to 3.21.1 (or 3.22.0) to confirm the issue still happens on those versions?

yob avatar Jun 08 '20 23:06 yob

I can update and see if we see it again yep!

keith avatar Jun 08 '20 23:06 keith

Ok definitely still seeing this with 3.22.0:

SIGQUIT: quit
PC=0x469201 m=0 sigcode=0
 
goroutine 0 [idle]:
runtime.futex(0x1b36a68, 0x80, 0x0, 0x0, 0x0, 0x7ffd2fc39d68, 0x43ab93, 0xc0002a6148, 0x7ffd2fc39d88, 0x40b0df, ...)
	/usr/local/go/src/runtime/sys_linux_amd64.s:567 +0x21
runtime.futexsleep(0x1b36a68, 0x7ffd00000000, 0xffffffffffffffff)
	/usr/local/go/src/runtime/os_linux.go:45 +0x46
runtime.notesleep(0x1b36a68)
	/usr/local/go/src/runtime/lock_futex.go:151 +0x9f
runtime.stoplockedm()
	/usr/local/go/src/runtime/proc.go:1971 +0x88
runtime.schedule()
	/usr/local/go/src/runtime/proc.go:2454 +0x4a6
runtime.park_m(0xc000001680)
	/usr/local/go/src/runtime/proc.go:2690 +0x9d
runtime.mcall(0x0)
	/usr/local/go/src/runtime/asm_amd64.s:318 +0x5b
 
goroutine 1 [syscall]:
syscall.Syscall6(0xf7, 0x1, 0x5c, 0xc00029e858, 0x1000004, 0x0, 0x0, 0xc00029e8d8, 0x0, 0xc00029e938)
	/usr/local/go/src/syscall/asm_linux_amd64.s:41 +0x5
os.(*Process).blockUntilWaitable(0xc000349320, 0xc00001ec00, 0x0, 0xc00029e938)
	/usr/local/go/src/os/wait_waitid.go:31 +0x98
os.(*Process).wait(0xc000349320, 0x29, 0xc000349350, 0xc0001d7520)
	/usr/local/go/src/os/exec_unix.go:22 +0x39
os.(*Process).Wait(...)
	/usr/local/go/src/os/exec.go:125
os/exec.(*Cmd).Wait(0xc000135340, 0x120ea6f, 0x29)
	/usr/local/go/src/os/exec/exec.go:507 +0x60
github.com/buildkite/agent/v3/process.(*Process).Run(0xc0003022a0, 0x0, 0x0)
	/work/process/process.go:233 +0x472
github.com/buildkite/agent/v3/bootstrap/shell.(*Shell).executeCommand(0xc000263f10, 0xc000266f00, 0x13aedc0, 0xc00000e018, 0x10101, 0x0, 0x0)
	/work/bootstrap/shell/shell.go:403 +0x29b
github.com/buildkite/agent/v3/bootstrap/shell.(*Shell).RunWithoutPrompt(0xc000263f10, 0x11e3650, 0x3, 0xc00007b0e0, 0x5, 0x5, 0x32, 0xc00007b0e0)
	/work/bootstrap/shell/shell.go:209 +0x186
github.com/buildkite/agent/v3/bootstrap/shell.(*Shell).Run(0xc000263f10, 0x11e3650, 0x3, 0xc00007b0e0, 0x5, 0x5, 0x2, 0x5)
	/work/bootstrap/shell/shell.go:197 +0x136
github.com/buildkite/agent/v3/bootstrap.gitClone(0xc000263f10, 0xc0000263da, 0x2, 0xc00002042f, 0x20, 0x11e2f74, 0x1, 0x30, 0x30)
	/work/bootstrap/git.go:60 +0x1fc
github.com/buildkite/agent/v3/bootstrap.(*Bootstrap).defaultCheckoutPhase(0xc00022f900, 0x4d3ab5e0725, 0xae78847)
	/work/bootstrap/bootstrap.go:1127 +0x1fa
github.com/buildkite/agent/v3/bootstrap.(*Bootstrap).CheckoutPhase.func1(0xc0003289c0, 0x13b9c00, 0xc000321500)
	/work/bootstrap/bootstrap.go:902 +0x33
github.com/buildkite/agent/v3/retry.Do(0xc00029f178, 0xc000344020, 0x8, 0x0)
	/work/retry/retry.go:80 +0x1d9
github.com/buildkite/agent/v3/bootstrap.(*Bootstrap).CheckoutPhase(0xc00022f900, 0x8, 0x1)
	/work/bootstrap/bootstrap.go:901 +0x60a
github.com/buildkite/agent/v3/bootstrap.(*Bootstrap).Run(0xc00022f900, 0x13c27c0, 0xc000115b00, 0x0)
	/work/bootstrap/bootstrap.go:131 +0x2a3
github.com/buildkite/agent/v3/clicommand.glob..func6(0xc000134dc0)
	/work/clicommand/bootstrap.go:418 +0x97c
github.com/urfave/cli.HandleAction(0xf29260, 0x122cea0, 0xc000134dc0, 0x0, 0xc00002d500)
	/go/pkg/mod/github.com/urfave/[email protected]/app.go:503 +0x7c
github.com/urfave/cli.Command.Run(0x11e9966, 0x9, 0x0, 0x0, 0x0, 0x0, 0x0, 0x11fdbde, 0x1b, 0x0, ...)
	/go/pkg/mod/github.com/urfave/[email protected]/command.go:165 +0x4da
github.com/urfave/cli.(*App).Run(0xc0002141c0, 0xc00000c080, 0x2, 0x2, 0x0, 0x0)
	/go/pkg/mod/github.com/urfave/[email protected]/app.go:259 +0x775
main.main()
	/work/main.go:115 +0x55d
 
goroutine 6 [syscall]:
os/signal.signal_recv(0x13bb1c0)
	/usr/local/go/src/runtime/sigqueue.go:147 +0x9c
os/signal.loop()
	/usr/local/go/src/os/signal/signal_unix.go:23 +0x22
created by os/signal.Notify.func1
	/usr/local/go/src/os/signal/signal.go:127 +0x44
 
goroutine 19 [runnable]:
internal/poll.(*FD).readUnlock(0xc000366240)
	/usr/local/go/src/internal/poll/fd_mutex.go:230 +0x58
internal/poll.(*FD).Read(0xc000366240, 0xc0000aa000, 0x8000, 0x8000, 0x0, 0x13b0100, 0xc000400000)
	/usr/local/go/src/internal/poll/fd_unix.go:181 +0x1e5
os.(*File).read(...)
	/usr/local/go/src/os/file_unix.go:263
os.(*File).Read(0xc00000e280, 0xc0000aa000, 0x8000, 0x8000, 0x9e, 0x0, 0x0)
	/usr/local/go/src/os/file.go:116 +0x71
io.copyBuffer(0x13aedc0, 0xc00000e018, 0x13aeda0, 0xc00000e280, 0xc0000aa000, 0x8000, 0x8000, 0x1373508, 0x1, 0x43d906)
	/usr/local/go/src/io/io.go:405 +0x122
io.Copy(...)
	/usr/local/go/src/io/io.go:364
github.com/buildkite/agent/v3/process.(*Process).Run.func2(0xc0003022a0, 0xc00000e280, 0xc0003463e0, 0xc00027cc80)
	/work/process/process.go:178 +0xa9
created by github.com/buildkite/agent/v3/process.(*Process).Run
	/work/process/process.go:173 +0x39d
 
goroutine 20 [select]:
github.com/buildkite/agent/v3/process.(*Process).Run.func3(0xc0003022a0)
	/work/process/process.go:214 +0xd8
created by github.com/buildkite/agent/v3/process.(*Process).Run
	/work/process/process.go:213 +0x9a1
 
rax    0xca
rbx    0x1b36920
rcx    0x469203
rdx    0x0
rdi    0x1b36a68
rsi    0x80
rbp    0x7ffd2fc39d50
rsp    0x7ffd2fc39d08
r8     0x0
r9     0x0
r10    0x0
r11    0x286
r12    0x0
r13    0x1
r14    0xc000024c00
r15    0x0
rip    0x469201
rflags 0x286
cs     0x33
fs     0x0
gs     0x0

keith avatar Jun 10 '20 15:06 keith

This could be happening when our agents are being spun down and it's just an unfortunate time, although if that's the case I'd be surprised we see it so frequently

keith avatar Jun 10 '20 15:06 keith

FYI one thing we realized is the SIGQUIT is coming from our k8s config:

          lifecycle:
            preStop:
              exec:
                command: ["pkill", "-QUIT", "buildkite-agent"]

where we added this to get a clean disconnect if the k8s pod is killed while a job is running, but I wouldn't expect this to lead to this crash?

keith avatar Jun 12 '20 17:06 keith