goconserver
goconserver copied to clipboard
`rcons` makes stdin, stdout, and stderr O_ASYNC | O_NONBLOCK
@neo954 commented on Tue May 21 2019
Here is the bug recreation steps.
Run rcons
against a compute node, and then, press Ctrl-E C ?
. After that, run tftp client, tftp
. The tftp client will print out command line prompt repeatedly and endlessly.
root@f6u13k13:~# tftp
tftp>
tftp>
root@f6u13k13:~# rcons f6u13k15
[Enter `^Ec?' for help]
goconserver(2019-05-21T04:12:05-04:00): Hello 10.6.13.13:33358, welcome to the session of f6u13k15
[Disconnected]
root@f6u13k13:~# tftp
tftp> tftp> tftp> tftp> tftp> tftp> tftp> tftp> tftp> tftp> tftp> tftp> tftp> tftp> tftp> tftp> tftp> tftp> tftp> tftp> tftp> tftp> tftp> tftp> tftp> tftp> tftp> tftp> tftp> tftp> tftp> tftp> tftp> tftp> tftp> tftp> tftp> tftp> tftp> tftp> tftp> tftp> tftp> tftp> tftp> tftp> tftp> tftp> tftp> tftp> tftp> tftp> tftp> tftp> tftp> tftp> tftp> tftp> tftp> tftp> tftp> tftp> tftp> tftp> tftp> tftp> tftp> tftp> tftp> tftp> tftp> tftp> tftp> tftp> tftp> tftp> tftp> tftp> tftp> tftp> tftp> tftp> tftp> tftp> tftp> tftp> tftp> tftp> tftp> tftp> tftp> tftp> tftp> tftp> tftp> tftp> tftp> tftp> tftp> tftp> tftp> tftp> tftp> tftp> tftp> tftp> tftp> tftp> tftp> tftp> tftp> tftp> tftp> tftp> tftp> tftp> tftp> tftp> tftp> tftp> tftp> tftp> tftp> tftp> tftp> tftp> tftp> tftp> tftp> tftp> tftp> tftp> tftp> tftp> tftp> tftp> tftp> tftp> tftp> tftp> tftp> tftp> tftp> tftp> tftp> tftp> tftp> tftp> tftp> tftp> tftp> tftp> tftp> tftp> tftp> tftp> tftp> tftp> tftp> tftp> tftp> tftp> tftp> tftp> tftp> tftp> tftp> tftp> tftp> tftp> tftp> tftp> tftp> tftp> tftp> tftp> tftp> tftp> tftp> tftp> tftp> tftp> tftp> tftp> tftp> tftp> tftp> tftp> tftp> tftp> tftp> tftp> tftp> tftp> tftp> tftp> tftp> tftp> tftp> tftp> tftp> tftp> tftp> tftp> tftp> tftp> tftp> tftp> tftp> tftp> tftp> tftp> tftp> tftp> tftp> tftp> tftp> tftp> tftp> tftp> tftp> tftp> tftp> tftp> tftp> tftp> tftp> tftp> tftp> tftp> tftp> tftp> tftp> tftp> tftp> tftp> tftp> tftp> tftp> tftp> tftp> tftp> tftp> tftp> tftp> tftp> tftp> tftp> tftp> tftp> tftp> tftp> tftp> tftp> tftp> tftp> tftp> tftp> tftp> tftp> tftp> tftp> tftp> tftp> tftp> tftp> tftp> tftp> tftp> tftp> tftp> tftp>
... <<< omit thousands of lines >>> ...
Additional information
The xCAT management node runs Ubuntu 18.04.2 on a ppc64el node. It has xCAT 2.15-snap201905170621
installed. The xCAT compute node f6u13k15
is a regular KVM guest.
root@f6u13k13:~# cat /etc/os-release
NAME="Ubuntu"
VERSION="18.04.2 LTS (Bionic Beaver)"
ID=ubuntu
ID_LIKE=debian
PRETTY_NAME="Ubuntu 18.04.2 LTS"
VERSION_ID="18.04"
HOME_URL="https://www.ubuntu.com/"
SUPPORT_URL="https://help.ubuntu.com/"
BUG_REPORT_URL="https://bugs.launchpad.net/ubuntu/"
PRIVACY_POLICY_URL="https://www.ubuntu.com/legal/terms-and-policies/privacy-policy"
VERSION_CODENAME=bionic
UBUNTU_CODENAME=bionic
root@f6u13k13:~# go-xcat check
Operating system: linux
Architecture: ppc64le
Linux Distribution: ubuntu
Version: 18.04
go-xcat Version: 1.0.38
Reading repositories ...... done
xCAT Core Packages
==================
Package Name Installed In Repository
------------ --------- -------------
perl-xcat 2.15-snap201905170621 2.15-snap201905170621
xcat 2.15-snap201905170621 2.15-snap201905170621
xcat-buildkit 2.15-snap201905170621 2.15-snap201905170621
xcat-client 2.15-snap201905170621 2.15-snap201905170621
xcat-confluent (not installed) 2.15-snap201905170621
xcat-genesis-scripts-amd64 2.15-snap201905170621 2.15-snap201905170621
xcat-genesis-scripts-ppc64 2.15-snap201905170621 2.15-snap201905170621
xcat-probe 2.15-snap201905170621 2.15-snap201905170621
xcat-server 2.15-snap201905170621 2.15-snap201905170621
xcat-test 2.15-snap201905170621 2.15-snap201905170621
xcat-vlan (not installed) 2.15-snap201905170621
xcatsn (not installed) 2.15-snap201905170621
xCAT Dependency Packages
========================
Package Name Installed In Repository
------------ --------- -------------
elilo-xcat 3.14-4 3.14-4
goconserver 0.3.2-snap201811080419 0.3.2-snap201811080419
grub2-xcat 2.02-0.76.el7.1.snap2019051602 2.02-0.76.el7.1.snap2019051602
ipmitool-xcat 1.8.18 1.8.18
syslinux-xcat 3.86-2 3.86-2
xcat-genesis-base-amd64 2.14.5-snap201811190037 2.14.5-snap201811190037
xcat-genesis-base-ppc64 2.14.5-snap201811160710 2.14.5-snap201811160710
xnba-undi 1.0.3-7 1.0.3-7
root@f6u13k13:~# lsdef f6u13k15 -z
# <xCAT data object stanza file>
f6u13k15:
objtype=node
addkcmdline=console=tty0 console=hvc0,115200
arch=ppc64el
cons=kvm
consoleenabled=1
currchain=boot
currstate=netboot ubuntu18.04.2-ppc64el-compute
groups=all
ip=10.6.13.15
mac=42:11:0a:06:0d:0f|42:02:0a:06:0d:0f!*NOIP*|42:a2:0a:06:0d:0f!*NOIP*
mgt=kvm
monserver=f6u13k13
netboot=grub2
nfsserver=f6u13k13
os=ubuntu18.04.2
profile=compute
provmethod=ubuntu18.04.2-ppc64el-netboot-compute
serialport=0
serialspeed=115200
status=powering-on
statustime=05-21-2019 03:44:56
tftpserver=f6u13k13
updatestatus=failed
updatestatustime=05-20-2019 14:18:04
vmcpus=2
vmhost=f6u13
vmmemory=4096
vmnicnicmodel=virtio-net-pci
vmnics=br0,private_br0,private_br1
vmstorage=phy:/dev/mapper/vdiskvg00-vdisk00n15
xcatmaster=f6u13k13
@neo954 commented on Tue May 21 2019
I tried to get the tty state with stty -a
before and after I run rcons
. But the two of outputs looked exactly same.
root@f6u13k13:~# stty -a >stty.out.good
root@f6u13k13:~# rcons f6u13k15
[Enter `^Ec?' for help]
goconserver(2019-05-21T04:31:18-04:00): Hello 10.6.13.13:33358, welcome to the session of f6u13k15
[Disconnected]
root@f6u13k13:~# stty -a >stty.out.1
root@f6u13k13:~# diff -u stty.out.good stty.out.1
root@f6u13k13:~# echo $?
0
root@f6u13k13:~# cat stty.out.good
speed 38400 baud; rows 62; columns 135; line = 0;
intr = ^C; quit = ^\; erase = ^?; kill = ^U; eof = ^D; eol = M-^?; eol2 = M-^?;
swtch = <undef>; start = ^Q; stop = ^S; susp = ^Z; rprnt = ^R; werase = ^W;
lnext = ^V; discard = ^O; min = 1; time = 0;
-parenb -parodd -cmspar cs8 -hupcl -cstopb cread -clocal -crtscts
-ignbrk -brkint -ignpar -parmrk -inpck -istrip -inlcr -igncr icrnl ixon -ixoff
-iuclc ixany imaxbel iutf8
opost -olcuc -ocrnl onlcr -onocr -onlret -ofill -ofdel nl0 cr0 tab0 bs0 vt0 ff0
isig icanon iexten echo echoe echok -echonl -noflsh -xcase -tostop -echoprt
echoctl echoke -flusho -extproc
root@f6u13k13:~# cat stty.out.1
speed 38400 baud; rows 62; columns 135; line = 0;
intr = ^C; quit = ^\; erase = ^?; kill = ^U; eof = ^D; eol = M-^?; eol2 = M-^?;
swtch = <undef>; start = ^Q; stop = ^S; susp = ^Z; rprnt = ^R; werase = ^W;
lnext = ^V; discard = ^O; min = 1; time = 0;
-parenb -parodd -cmspar cs8 -hupcl -cstopb cread -clocal -crtscts
-ignbrk -brkint -ignpar -parmrk -inpck -istrip -inlcr -igncr icrnl ixon -ixoff
-iuclc ixany imaxbel iutf8
opost -olcuc -ocrnl onlcr -onocr -onlret -ofill -ofdel nl0 cr0 tab0 bs0 vt0 ff0
isig icanon iexten echo echoe echok -echonl -noflsh -xcase -tostop -echoprt
echoctl echoke -flusho -extproc
@neo954 commented on Tue May 21 2019
I tried to run strace tftp
when the tty is in the broken state. It seems the read()
system calls of tftp client process failed continuously and the errno
was set to EAGAIN
.
... <<< omit thousands of lines >>> ...
read(0, 0x397cdea1410, 1024) = -1 EAGAIN (Resource temporarily unavailable)
write(1, "tftp> ", 6tftp> ) = 6
read(0, 0x397cdea1410, 1024) = -1 EAGAIN (Resource temporarily unavailable)
write(1, "tftp> ", 6tftp> ) = 6
read(0, 0x397cdea1410, 1024) = -1 EAGAIN (Resource temporarily unavailable)
write(1, "tftp> ", 6tftp> ) = 6
read(0, 0x397cdea1410, 1024) = -1 EAGAIN (Resource temporarily unavailable)
write(1, "tftp> ", 6tftp> ) = 6
read(0, 0x397cdea1410, 1024) = -1 EAGAIN (Resource temporarily unavailable)
write(1, "tftp> ", 6tftp> ) = 6
read(0, 0x397cdea1410, 1024) = -1 EAGAIN (Resource temporarily unavailable)
write(1, "tftp> ", 6tftp> ) = 6
read(0, 0x397cdea1410, 1024) = -1 EAGAIN (Resource temporarily unavailable)
write(1, "tftp> ", 6tftp> ) = 6
read(0, 0x397cdea1410, 1024) = -1 EAGAIN (Resource temporarily unavailable)
write(1, "tftp> ", 6tftp> ) = 6
read(0, 0x397cdea1410, 1024) = -1 EAGAIN (Resource temporarily unavailable)
write(1, "tftp> ", 6tftp> ) = 6
read(0, 0x397cdea1410, 1024) = -1 EAGAIN (Resource temporarily unavailable)
write(1, "tftp> ", 6tftp> ) = 6
... <<< omit thousands of lines >>> ...
@neo954 commented on Tue May 21 2019
Enclosed please find the strace
outputs.
strace.tftp.out.good.txt
strace.tftp.out.problem.txt
@neo954 commented on Tue May 21 2019
This problem can be recreated with cat
.
root@f6u13k13:~# rcons f6u13k14
[Enter `^Ec?' for help]
goconserver(2019-05-21T04:59:08-04:00): Hello 10.6.13.13:38404, welcome to the session of f6u13k14
Could not receive message, error: EOF.
[Enter `^Ec.' to exit]
Session is teminated unexpectedly, retrying....
[Disconnected]
root@f6u13k13:~# cat
cat: -: Resource temporarily unavailable
@neo954 commented on Tue May 21 2019
root@f6u13k13:~# strace cat
execve("/bin/cat", ["cat"], 0x7fffc7e70f20 /* 28 vars */) = 0
brk(NULL) = 0xd9e3c350000
access("/etc/ld.so.nohwcap", F_OK) = -1 ENOENT (No such file or directory)
access("/etc/ld.so.preload", R_OK) = -1 ENOENT (No such file or directory)
openat(AT_FDCWD, "/etc/ld.so.cache", O_RDONLY|O_CLOEXEC) = 3
fstat(3, {st_mode=S_IFREG|0644, st_size=28770, ...}) = 0
mmap(NULL, 28770, PROT_READ, MAP_PRIVATE, 3, 0) = 0x794af1900000
close(3) = 0
access("/etc/ld.so.nohwcap", F_OK) = -1 ENOENT (No such file or directory)
openat(AT_FDCWD, "/lib/powerpc64le-linux-gnu/libc.so.6", O_RDONLY|O_CLOEXEC) = 3
read(3, "\177ELF\2\1\1\3\0\0\0\0\0\0\0\0\3\0\25\0\1\0\0\0\20G\2\0\0\0\0\0"..., 832) = 832
fstat(3, {st_mode=S_IFREG|0755, st_size=2181704, ...}) = 0
mmap(NULL, 2250384, PROT_READ|PROT_EXEC, MAP_PRIVATE|MAP_DENYWRITE, 3, 0) = 0x794af16d0000
mmap(0x794af18e0000, 131072, PROT_READ|PROT_WRITE, MAP_PRIVATE|MAP_FIXED|MAP_DENYWRITE, 3, 0x200000) = 0x794af18e0000
close(3) = 0
mprotect(0x794af18e0000, 65536, PROT_READ) = 0
mprotect(0xd9e01270000, 65536, PROT_READ) = 0
mprotect(0x794af1970000, 65536, PROT_READ) = 0
munmap(0x794af1900000, 28770) = 0
openat(AT_FDCWD, "/usr/lib/locale/locale-archive", O_RDONLY|O_CLOEXEC) = 3
fstat(3, {st_mode=S_IFREG|0644, st_size=1991136, ...}) = 0
mmap(NULL, 1991136, PROT_READ, MAP_PRIVATE, 3, 0) = 0x794af14e0000
close(3) = 0
brk(NULL) = 0xd9e3c350000
brk(0xd9e3c380000) = 0xd9e3c380000
fstat(1, {st_mode=S_IFCHR|0600, st_rdev=makedev(136, 5), ...}) = 0
fstat(0, {st_mode=S_IFCHR|0600, st_rdev=makedev(136, 5), ...}) = 0
fadvise64(0, 0, 0, POSIX_FADV_SEQUENTIAL) = 0
mmap(NULL, 262144, PROT_READ|PROT_WRITE, MAP_PRIVATE|MAP_ANONYMOUS, -1, 0) = 0x794af14a0000
read(0, 0x794af14b0000, 131072) = -1 EAGAIN (Resource temporarily unavailable)
write(2, "cat: ", 5cat: ) = 5
write(2, "-", 1-) = 1
openat(AT_FDCWD, "/usr/share/locale/locale.alias", O_RDONLY|O_CLOEXEC) = 3
fstat(3, {st_mode=S_IFREG|0644, st_size=2995, ...}) = 0
read(3, "# Locale name alias data base.\n#"..., 4096) = 2995
read(3, "", 4096) = 0
close(3) = 0
openat(AT_FDCWD, "/usr/share/locale/en_US/LC_MESSAGES/libc.mo", O_RDONLY) = -1 ENOENT (No such file or directory)
openat(AT_FDCWD, "/usr/share/locale/en/LC_MESSAGES/libc.mo", O_RDONLY) = -1 ENOENT (No such file or directory)
openat(AT_FDCWD, "/usr/share/locale-langpack/en_US/LC_MESSAGES/libc.mo", O_RDONLY) = -1 ENOENT (No such file or directory)
openat(AT_FDCWD, "/usr/share/locale-langpack/en/LC_MESSAGES/libc.mo", O_RDONLY) = -1 ENOENT (No such file or directory)
write(2, ": Resource temporarily unavailab"..., 34: Resource temporarily unavailable) = 34
write(2, "\n", 1
) = 1
munmap(0x794af14a0000, 262144) = 0
close(0) = 0
close(1) = 0
close(2) = 0
exit_group(1) = ?
+++ exited with 1 +++
@neo954 commented on Tue May 21 2019
Okay, here is the problem.
root@f6u13k13:~# cat /proc/self/fdinfo/0
pos: 0
flags: 02
mnt_id: 26
root@f6u13k13:~# rcons f6u13k14
[Enter `^Ec?' for help]
goconserver(2019-05-21T05:21:47-04:00): Hello 10.6.13.13:41330, welcome to the session of f6u13k14
Could not receive message, error: EOF.
[Enter `^Ec.' to exit]
Session is teminated unexpectedly, retrying....
[Disconnected]
root@f6u13k13:~# cat /proc/self/fdinfo/0
pos: 0
flags: 024002
mnt_id: 26
@neo954 commented on Tue May 21 2019
It seems the problem affected all three file descriptors 0, 1, and 2.
root@f6u13k13:~# head -n 99 /proc/self/fdinfo/{0,1,2}
==> /proc/self/fdinfo/0 <==
pos: 0
flags: 02
mnt_id: 26
==> /proc/self/fdinfo/1 <==
pos: 0
flags: 02
mnt_id: 26
==> /proc/self/fdinfo/2 <==
pos: 0
flags: 02
mnt_id: 26
root@f6u13k13:~# rcons f6u13k14
[Enter `^Ec?' for help]
goconserver(2019-05-21T05:23:16-04:00): Hello 10.6.13.13:41500, welcome to the session of f6u13k14
Could not receive message, error: EOF.
[Enter `^Ec.' to exit]
Session is teminated unexpectedly, retrying....
[Disconnected]
root@f6u13k13:~# head -n 99 /proc/self/fdinfo/{0,1,2}
==> /proc/self/fdinfo/0 <==
pos: 0
flags: 024002
mnt_id: 26
==> /proc/self/fdinfo/1 <==
pos: 0
flags: 024002
mnt_id: 26
==> /proc/self/fdinfo/2 <==
pos: 0
flags: 024002
mnt_id: 26
@neo954 commented on Tue May 21 2019
See http://man7.org/linux/man-pages/man5/proc.5.html for details of the fdinfo
subdirectory.
flags This is an octal number that displays the file access mode and file status flags (see open(2)). If the close-on-exec file descriptor flag is set, then flags will also include the value
O_CLOEXEC
. Before Linux 3.1, this field incorrectly displayed the setting ofO_CLOEXEC
at the time the file was opened, rather than the current setting of the close-on-exec flag.
@neo954 commented on Tue May 21 2019
In file /usr/src/linux-headers-4.15.0-47/include/uapi/asm-generic/fcntl.h
#define O_RDWR 00000002
#ifndef O_NONBLOCK
#define O_NONBLOCK 00004000
#endif
#ifndef FASYNC
#define FASYNC 00020000 /* fcntl, for BSD compatibility */
#endif
@neo954 commented on Tue May 21 2019
root@f6u13k13:~# uname -a
Linux f6u13k13 4.15.0-47-generic #50-Ubuntu SMP Wed Mar 13 10:40:40 UTC 2019 ppc64le ppc64le ppc64le GNU/Linux
@neo954 commented on Tue May 21 2019
This is what happened on a RHEL 8 testing environment.
[root@c910f03c01p19 ~]# head -n 99 /proc/self/fdinfo/{0,1,2}
==> /proc/self/fdinfo/0 <==
pos: 0
flags: 02
mnt_id: 25
==> /proc/self/fdinfo/1 <==
pos: 0
flags: 02
mnt_id: 25
==> /proc/self/fdinfo/2 <==
pos: 0
flags: 02
mnt_id: 25
[root@c910f03c01p19 ~]# rcons c910f03c01p10
[Enter `^Ec?' for help]
goconserver(2019-05-21T05:47:27-04:00): Hello 10.3.1.19:59050, welcome to the session of c910f03c01p10
done
[Disconnected]
[root@c910f03c01p19 ~]# head -n 99 /proc/self/fdinfo/{0,1,2}
==> /proc/self/fdinfo/0 <==
pos: 0
flags: 020002
mnt_id: 25
==> /proc/self/fdinfo/1 <==
pos: 0
flags: 020002
mnt_id: 25
==> /proc/self/fdinfo/2 <==
pos: 0
flags: 020002
mnt_id: 25
[root@c910f03c01p19 ~]# uname -a
Linux c910f03c01p19 4.18.0-80.el8.ppc64le #1 SMP Wed Mar 13 11:26:21 UTC 2019 ppc64le ppc64le ppc64le GNU/Linux
@neo954 commented on Tue May 21 2019
In the latest goconserver
source, v0.3.2
.
$ grep -n -r NONBLOCK .
./console/client.go:132: err := common.Fcntl(in, syscall.F_SETFL, syscall.O_ASYNC|syscall.O_NONBLOCK)
./console/client.go:173: err := common.Fcntl(in, syscall.F_SETFL, syscall.O_ASYNC|syscall.O_NONBLOCK)
./console/cli.go:320: err = common.Fcntl(in, syscall.F_SETFL, syscall.O_ASYNC|syscall.O_NONBLOCK)
@neo954 commented on Wed May 22 2019
@chenglch, Do you have any idea about this issue? :-/
@neo954 , is it possible to save original flag before F_SETFL system call and restore it in a defer function?
hi, @chenglch , it is great to get your quickly response. I will take a try next few days.