resource-agents icon indicating copy to clipboard operation
resource-agents copied to clipboard

storage_mon: child processes remain when stuck in read() etc

Open inouekazu opened this issue 3 years ago • 1 comments

When an I/O unresponsive failure occurs, the child process of storage_mon remains.

# ./storage_mon --device /dev/mapper/mpathc --score 1 --verbose
Testing device /dev/mapper/mpathc
/dev/mapper/mpathc: opened with O_DIRECT, size=536870912000
/dev/mapper/mpathc: reading from pos 727385088
Thread for device /dev/mapper/mpathc did not complete in time
Final score is 1
# ps -ef | grep storage_mon
root      710237  697460  0 13:17 pts/1    00:00:00 ./storage_mon --device /dev/mapper/mpathc --score 1 --verbose
root      710238  710237  0 13:17 pts/1    00:00:00 ./storage_mon --device /dev/mapper/mpathc --score 1 --verbose
# ps -ef | grep storage_mon
root      710238       1  0 13:17 pts/1    00:00:00 ./storage_mon --device /dev/mapper/mpathc --score 1 --verbose

I captured a trace by running strace -ff -tt -T -o storage_mon ./storage_mon --device /dev/mapper/mpathc --score 1.

  • parent process exits,
    # cat storage_mon.629992
    14:45:20.173791 execve("./storage_mon", ["./storage_mon", "--device", "/dev/mapper/mpathc", "--score", "1", "--verbose"], 0x7fff94c9e6a0 /* 26 vars */) = 0 <0.000605>
    (snip)
    14:45:20.193591 clone(child_stack=NULL, flags=CLONE_CHILD_CLEARTID|CLONE_CHILD_SETTID|SIGCHLD, child_tidptr=0x7fba97913250) = 629993 <0.000246>
    14:45:20.194002 wait4(629993, 0x7fff320ea9a8, WNOHANG|WSTOPPED|WCONTINUED, NULL) = 0 <0.000023>
    14:45:20.194104 nanosleep({tv_sec=0, tv_nsec=100000000}, NULL) = 0 <0.100166>
    (snip)
    14:45:29.962911 wait4(629993, 0x7fff320ea9a8, WNOHANG|WSTOPPED|WCONTINUED, NULL) = 0 <0.000127>
    14:45:29.963242 nanosleep({tv_sec=0, tv_nsec=100000000}, NULL) = 0 <0.100199>
    14:45:30.063761 openat(AT_FDCWD, "/etc/localtime", O_RDONLY|O_CLOEXEC) = 3 <0.000131>
    14:45:30.064143 fstat(3, {st_mode=S_IFREG|0644, st_size=318, ...}) = 0 <0.000112>
    14:45:30.064481 fstat(3, {st_mode=S_IFREG|0644, st_size=318, ...}) = 0 <0.000116>
    14:45:30.064734 read(3, "TZif2\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\4\0\0\0\4\0\0\0\0"..., 4096) = 318 <0.000023>
    14:45:30.064838 lseek(3, -185, SEEK_CUR) = 133 <0.000015>
    14:45:30.064904 read(3, "TZif2\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\4\0\0\0\4\0\0\0\0"..., 4096) = 185 <0.000079>
    14:45:30.065075 close(3)                = 0 <0.000023>
    14:45:30.065195 socket(AF_UNIX, SOCK_DGRAM|SOCK_CLOEXEC, 0) = 3 <0.000030>
    14:45:30.065276 connect(3, {sa_family=AF_UNIX, sun_path="/dev/log"}, 110) = 0 <0.000032>
    14:45:30.065366 sendto(3, "<27>Sep 21 14:45:30 storage_mon:"..., 110, MSG_NOSIGNAL, NULL, 0) = 110 <0.000080>
    14:45:30.065705 write(2, "Thread for device /dev/mapper/mp"..., 62) = 62 <0.000039>
    14:45:30.065860 fstat(1, {st_mode=S_IFCHR|0620, st_rdev=makedev(0x88, 0x1), ...}) = 0 <0.000018>
    14:45:30.065974 write(1, "Final score is 1\n", 17) = 17 <0.000071>
    14:45:30.066644 exit_group(1)           = ?
    14:45:30.067288 +++ exited with 1 +++
    
  • but child process is waiting for read() to return.
    # tail -F storage_mon.629993
    14:45:20.194158 set_robust_list(0x7fba97913260, 24) = 0 <0.000063>
    14:45:20.194448 fstat(1, {st_mode=S_IFCHR|0620, st_rdev=makedev(0x88, 0x1), ...}) = 0 <0.000065>
    14:45:20.194634 write(1, "Testing device /dev/mapper/mpath"..., 34) = 34 <0.000207>
    14:45:20.194958 openat(AT_FDCWD, "/dev/mapper/mpathc", O_RDONLY|O_DIRECT) = 3 <0.000152>
    14:45:20.195227 ioctl(3, BLKGETSIZE64, [536870912000]) = 0 <0.000018>
    14:45:20.195314 write(1, "/dev/mapper/mpathc: opened with "..., 60) = 60 <0.000026>
    14:45:20.195423 getpid()                = 629993 <0.000016>
    14:45:20.195511 lseek(3, 277049344, SEEK_SET) = 277049344 <0.000016>
    14:45:20.195578 write(1, "/dev/mapper/mpathc: reading from"..., 47) = 47 <0.000022>
    14:45:20.195654 ioctl(3, BLKSSZGET, [512]) = 0 <0.000017>
    14:45:20.195754 read(3, 
    

As a result, if allow-unhealthy-nodes=true is set,

pcs property set node-health-strategy=only-green

pcs resource create storage-mon ocf:heartbeat:storage-mon \
    drives="/dev/mapper/mpathc" \
    monitor timeout=60s interval=10s on-fail=restart

pcs resource clone storage-mon storage-mon-clone
pcs resource meta storage-mon-clone allow-unhealthy-nodes=true

each time the monitor is run, there will be more child processes that do not exit.

# ps -ef | grep storage_mon
UID          PID    PPID  C STIME TTY          TIME CMD
root      627631       1  0 14:20 ?        00:00:00 /usr/libexec/heartbeat/storage_mon --device /dev/mapper/mpathc --score 1 --timeout 10
root      627757       1  0 14:20 ?        00:00:00 /usr/libexec/heartbeat/storage_mon --device /dev/mapper/mpathc --score 1 --timeout 10
root      627892       1  0 14:21 ?        00:00:00 /usr/libexec/heartbeat/storage_mon --device /dev/mapper/mpathc --score 1 --timeout 10
root      628005       1  0 14:21 ?        00:00:00 /usr/libexec/heartbeat/storage_mon --device /dev/mapper/mpathc --score 1 --timeout 10
root      628030       1  0 14:21 ?        00:00:00 /usr/libexec/heartbeat/storage_mon --device /dev/mapper/mpathc --score 1 --timeout 10
(snip)

inouekazu avatar Sep 22 '22 07:09 inouekazu

The kernel doesn't allow us to kill stuck processes, so this is not something we can fix in storage_mon.

oalbrigt avatar Sep 23 '22 09:09 oalbrigt