node_exporter icon indicating copy to clipboard operation
node_exporter copied to clipboard

runit collector reports invalid (outdated) service state if runsv (supervisor process) is not running

Open powerman opened this issue 1 year ago • 3 comments

What did you do that produced an error?

Check runit collector sources:

https://github.com/prometheus-community/go-runit/blob/master/runit/runit.go#L82-L99

What did you expect to see?

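It should successfully open the FIFO file `ok` for writing before trying to read the `status` file. This implements the required check "is the runsv supervisor running": with O_NONBLOCK, opening a FIFO for writing fails (ENXIO) when no process has it open for reading, which is the case when runsv is not running. If runsv is not running, the contents of its status file are outdated and do not reflect the actual service state.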

This can be fixed by adding this code:

 func (s *service) Status() (*SvStatus, error) {
+	file, err := os.OpenFile(s.file("ok"), os.O_WRONLY|syscall.O_NONBLOCK, 0)
+ 	if err != nil {
+ 		return nil, err
+ 	}
+ 	_ = file.Close()
 	status, err := s.status()
 	if err != nil {
 		return nil, err
 	}

runit's own sv tool does the same:

# strace sv status .
execve("/bin/sv", ["sv", "status", "."], 0x7ffcbff55670 /* 61 vars */) = 0
brk(NULL)                               = 0x5639352a8000
openat(AT_FDCWD, "/usr/lib64/libwcwidth-icons.so", O_RDONLY|O_CLOEXEC) = 4
read(4, "\177ELF\2\1\1\0\0\0\0\0\0\0\0\0\3\0>\0\1\0\0\0\0\0\0\0\0\0\0\0"..., 832) = 832
fstat(4, {st_mode=S_IFREG|0755, st_size=14192, ...}) = 0
mmap(NULL, 8192, PROT_READ|PROT_WRITE, MAP_PRIVATE|MAP_ANONYMOUS, -1, 0) = 0x7f1c31876000
mmap(NULL, 16408, PROT_READ, MAP_PRIVATE|MAP_DENYWRITE, 4, 0) = 0x7f1c31871000
mmap(0x7f1c31872000, 4096, PROT_READ|PROT_EXEC, MAP_PRIVATE|MAP_FIXED|MAP_DENYWRITE, 4, 0x1000) = 0x7f1c31872000
mmap(0x7f1c31873000, 4096, PROT_READ, MAP_PRIVATE|MAP_FIXED|MAP_DENYWRITE, 4, 0x2000) = 0x7f1c31873000
mmap(0x7f1c31874000, 8192, PROT_READ|PROT_WRITE, MAP_PRIVATE|MAP_FIXED|MAP_DENYWRITE, 4, 0x2000) = 0x7f1c31874000
close(4)                                = 0
access("/etc/ld.so.preload", R_OK)      = -1 ENOENT (No such file or directory)
openat(AT_FDCWD, "/etc/ld.so.cache", O_RDONLY|O_CLOEXEC) = 4
fstat(4, {st_mode=S_IFREG|0644, st_size=164390, ...}) = 0
mmap(NULL, 164390, PROT_READ, MAP_PRIVATE, 4, 0) = 0x7f1c31848000
close(4)                                = 0
openat(AT_FDCWD, "/lib64/libc.so.6", O_RDONLY|O_CLOEXEC) = 4
read(4, "\177ELF\2\1\1\3\0\0\0\0\0\0\0\0\3\0>\0\1\0\0\0\240d\2\0\0\0\0\0"..., 832) = 832
pread64(4, "\6\0\0\0\4\0\0\0@\0\0\0\0\0\0\0@\0\0\0\0\0\0\0@\0\0\0\0\0\0\0"..., 784, 64) = 784
fstat(4, {st_mode=S_IFREG|0755, st_size=1855744, ...}) = 0
pread64(4, "\6\0\0\0\4\0\0\0@\0\0\0\0\0\0\0@\0\0\0\0\0\0\0@\0\0\0\0\0\0\0"..., 784, 64) = 784
mmap(NULL, 1887088, PROT_READ, MAP_PRIVATE|MAP_DENYWRITE, 4, 0) = 0x7f1c3167b000
mmap(0x7f1c3169f000, 1343488, PROT_READ|PROT_EXEC, MAP_PRIVATE|MAP_FIXED|MAP_DENYWRITE, 4, 0x24000) = 0x7f1c3169f000
mmap(0x7f1c317e7000, 339968, PROT_READ, MAP_PRIVATE|MAP_FIXED|MAP_DENYWRITE, 4, 0x16c000) = 0x7f1c317e7000
mmap(0x7f1c3183a000, 24576, PROT_READ|PROT_WRITE, MAP_PRIVATE|MAP_FIXED|MAP_DENYWRITE, 4, 0x1be000) = 0x7f1c3183a000
mmap(0x7f1c31840000, 31600, PROT_READ|PROT_WRITE, MAP_PRIVATE|MAP_FIXED|MAP_ANONYMOUS, -1, 0) = 0x7f1c31840000
close(4)                                = 0
mmap(NULL, 12288, PROT_READ|PROT_WRITE, MAP_PRIVATE|MAP_ANONYMOUS, -1, 0) = 0x7f1c31678000
arch_prctl(ARCH_SET_FS, 0x7f1c31678740) = 0
set_tid_address(0x7f1c31678a10)         = 455
set_robust_list(0x7f1c31678a20, 24)     = 0
rseq(0x7f1c31679060, 0x20, 0, 0x53053053) = 0
mprotect(0x7f1c3183a000, 16384, PROT_READ) = 0
mprotect(0x7f1c31874000, 4096, PROT_READ) = 0
mprotect(0x56393432d000, 4096, PROT_READ) = 0
mprotect(0x7f1c318a9000, 8192, PROT_READ) = 0
prlimit64(0, RLIMIT_STACK, NULL, {rlim_cur=8192*1024, rlim_max=RLIM64_INFINITY}) = 0
munmap(0x7f1c31848000, 164390)          = 0
openat(AT_FDCWD, ".", O_RDONLY|O_NONBLOCK) = 4
chdir(".")                              = 0
openat(AT_FDCWD, "supervise/ok", O_WRONLY|O_NONBLOCK) = 5
close(5)                                = 0
openat(AT_FDCWD, "supervise/status", O_RDONLY|O_NONBLOCK) = 5
read(5, "@\0\0\0f|\273\262\30N\370\24\307\3\0\0\0u\0\1", 20) = 20
close(5)                                = 0
newfstatat(AT_FDCWD, "down", 0x7ffd0dedaba0, 0) = -1 ENOENT (No such file or directory)
chdir("log")                            = 0
openat(AT_FDCWD, "supervise/ok", O_WRONLY|O_NONBLOCK) = 5
close(5)                                = 0
openat(AT_FDCWD, "supervise/status", O_RDONLY|O_NONBLOCK) = 5
read(5, "@\0\0\0f|\273\262\30J\\d\306\3\0\0\0u\0\1", 20) = 20
close(5)                                = 0
newfstatat(AT_FDCWD, "down", 0x7ffd0dedaba0, 0) = -1 ENOENT (No such file or directory)
write(1, "run: .: (pid 967) 551089s; run: "..., 55run: .: (pid 967) 551089s; run: log: (pid 966) 551089s
) = 55
fchdir(4)                               = 0
exit_group(0)                           = ?
+++ exited with 0 +++
# 

What did you see instead?

The `ok` file is not used by this collector at all.

powerman avatar Jul 03 '24 11:07 powerman