Add support for IPVS inside Sysbox containers
System containers deployed with Sysbox do not currently support the Linux IP Virtual Server (IPVS).
Even though I confirmed that IPVS is namespaced in the Linux kernel via the network namespace, it appears that processes inside the system container don't have the required permission to configure IPVS via the kernel's netlink interface:
root@manager:/# ipvsadm --help
Can't initialize ipvs: No space left on device
Are you sure that IP Virtual Server is built in the kernel or as module?
The strace for ipvsadm shows that this error is caused by an EPERM in the netlink IPVS message exchange:
[pid 1146] bind(3, {sa_family=AF_NETLINK, nl_pid=188745358, nl_groups=00000000}, 12) = 0
[pid 1146] getsockname(3, {sa_family=AF_NETLINK, nl_pid=188745358, nl_groups=00000000}, [12]) = 0
[pid 1146] sendmsg(3, {msg_name={sa_family=AF_NETLINK, nl_pid=0, nl_groups=00000000}, msg_namelen=12, msg_iov=[{iov_base={{len=32, type=0x10 /* NLMSG_??? */, flags=NLM_F_REQUEST|NLM_F_ACK, seq=1602367408, pid=188745358}, "\x03\x01\x00\x00\x09\x00\x02\x00\x49\x50\x56\x53\x00\x00\x00\x00"}, iov_len=32}], msg_iovlen=1$
msg_controllen=0, msg_flags=0}, 0) = 32
[pid 1146] recvmsg(3, {msg_name={sa_family=AF_NETLINK, nl_pid=0, nl_groups=00000000}, msg_namelen=12, msg_iov=[{iov_base={{len=388, type=0x10 /* NLMSG_??? */, flags=0, seq=1602367408, pid=188745358}, "\x01\x02\x00\x00\x09\x00\x02\x00\x49\x50\x56\x53\x00\x00\x00\x00\x06\x00\x01\x00\x1e\x00\x00\x00\x08\x00\x03\x00\x0$
\x00\x00\x00"...}, iov_len=16384}], msg_iovlen=1, msg_controllen=0, msg_flags=0}, MSG_PEEK|MSG_TRUNC) = 388
[pid 1146] recvmsg(3, {msg_name={sa_family=AF_NETLINK, nl_pid=0, nl_groups=00000000}, msg_namelen=12, msg_iov=[{iov_base={{len=388, type=0x10 /* NLMSG_??? */, flags=0, seq=1602367408, pid=188745358}, "\x01\x02\x00\x00\x09\x00\x02\x00\x49\x50\x56\x53\x00\x00\x00\x00\x06\x00\x01\x00\x1e\x00\x00\x00\x08\x00\x03\x00\x0$
\x00\x00\x00"...}, iov_len=16384}], msg_iovlen=1, msg_controllen=0, msg_flags=0}, 0) = 388
[pid 1146] recvmsg(3, {msg_name={sa_family=AF_NETLINK, nl_pid=0, nl_groups=00000000}, msg_namelen=12, msg_iov=[{iov_base={{len=36, type=NLMSG_ERROR, flags=NLM_F_CAPPED, seq=1602367408, pid=188745358}, {error=0, msg={len=32, type=0x10 /* NLMSG_??? */, flags=NLM_F_REQUEST|NLM_F_ACK, seq=1602367408, pid=188745358}}}, $
ov_len=16384}], msg_iovlen=1, msg_controllen=0, msg_flags=0}, MSG_PEEK|MSG_TRUNC) = 36
[pid 1146] recvmsg(3, {msg_name={sa_family=AF_NETLINK, nl_pid=0, nl_groups=00000000}, msg_namelen=12, msg_iov=[{iov_base={{len=36, type=NLMSG_ERROR, flags=NLM_F_CAPPED, seq=1602367408, pid=188745358}, {error=0, msg={len=32, type=0x10 /* NLMSG_??? */, flags=NLM_F_REQUEST|NLM_F_ACK, seq=1602367408, pid=188745358}}}, $
ov_len=16384}], msg_iovlen=1, msg_controllen=0, msg_flags=0}, 0) = 36
[pid 1146] sendmsg(3, {msg_name={sa_family=AF_NETLINK, nl_pid=0, nl_groups=00000000}, msg_namelen=12, msg_iov=[{iov_base={{len=20, type=0x1e /* NLMSG_??? */, flags=NLM_F_REQUEST|NLM_F_ACK, seq=1602367409, pid=188745358}, "\x0f\x01\x00\x00"}, iov_len=20}], msg_iovlen=1, msg_controllen=0, msg_flags=0}, 0) = 20
>>> HERE: recvmsg() is returning an EPERM ...
[pid 1146] recvmsg(3, {msg_name={sa_family=AF_NETLINK, nl_pid=0, nl_groups=00000000}, msg_namelen=12, msg_iov=[{iov_base={{len=40, type=NLMSG_ERROR, flags=0, seq=1602367409, pid=188745358}, {error=-EPERM, msg={{len=20, type=0x1e /* NLMSG_??? */, flags=NLM_F_REQUEST|NLM_F_ACK, seq=1602367409, pid=188745358}, "\x0f\x$
1\x00\x00"}}}, iov_len=16384}], msg_iovlen=1, msg_controllen=0, msg_flags=0}, MSG_PEEK|MSG_TRUNC) = 40
[pid 1146] recvmsg(3, {msg_name={sa_family=AF_NETLINK, nl_pid=0, nl_groups=00000000}, msg_namelen=12, msg_iov=[{iov_base={{len=40, type=NLMSG_ERROR, flags=0, seq=1602367409, pid=188745358}, {error=-EPERM, msg={{len=20, type=0x1e /* NLMSG_??? */, flags=NLM_F_REQUEST|NLM_F_ACK, seq=1602367409, pid=188745358}, "\x0f\x$
1\x00\x00"}}}, iov_len=16384}], msg_iovlen=1, msg_controllen=0, msg_flags=0}, 0) = 40
[pid 1146] close(3) = 0
[pid 1146] clone(strace: Process 1147 attached
In addition, far fewer IPVS sysctls are exposed inside the system container than on the host:
root@manager:/# ls -l /proc/sys/net/ipv4/vs
total 0
-rw-r--r-- 1 root root 0 Oct 10 20:04 conn_reuse_mode
-rw-r--r-- 1 root root 0 Oct 10 19:23 conntrack
-rw-r--r-- 1 root root 0 Oct 10 19:22 expire_nodest_conn
-rw-r--r-- 1 root root 0 Oct 10 20:04 expire_quiescent_template
- Host:
cesar@eoan:~$ ls -l /proc/sys/net/ipv4/vs
total 0
-rw-r--r-- 1 root root 0 Oct 10 13:04 am_droprate
-rw-r--r-- 1 root root 0 Oct 10 13:04 amemthresh
-rw-r--r-- 1 root root 0 Oct 10 13:04 backup_only
-rw-r--r-- 1 root root 0 Oct 10 13:04 cache_bypass
-rw-r--r-- 1 root root 0 Oct 10 13:04 conn_reuse_mode
-rw-r--r-- 1 root root 0 Oct 10 12:23 conntrack
-rw-r--r-- 1 root root 0 Oct 10 13:04 drop_entry
-rw-r--r-- 1 root root 0 Oct 10 13:04 drop_packet
-rw-r--r-- 1 root root 0 Oct 10 12:22 expire_nodest_conn
-rw-r--r-- 1 root root 0 Oct 10 13:04 expire_quiescent_template
-rw-r--r-- 1 root root 0 Oct 10 13:04 ignore_tunneled
-rw-r--r-- 1 root root 0 Oct 10 13:04 nat_icmp_send
-rw-r--r-- 1 root root 0 Oct 10 13:04 pmtu_disc
-rw-r--r-- 1 root root 0 Oct 10 13:04 schedule_icmp
-rw-r--r-- 1 root root 0 Oct 10 13:04 secure_tcp
-rw-r--r-- 1 root root 0 Oct 10 13:04 sloppy_sctp
-rw-r--r-- 1 root root 0 Oct 10 13:04 sloppy_tcp
-rw-r--r-- 1 root root 0 Oct 10 13:04 snat_reroute
-rw-r--r-- 1 root root 0 Oct 10 13:04 sync_persist_mode
-rw-r--r-- 1 root root 0 Oct 10 13:04 sync_ports
-rw-r--r-- 1 root root 0 Oct 10 13:04 sync_qlen_max
-rw-r--r-- 1 root root 0 Oct 10 13:04 sync_refresh_period
-rw-r--r-- 1 root root 0 Oct 10 13:04 sync_retries
-rw-r--r-- 1 root root 0 Oct 10 13:04 sync_sock_size
-rw-r--r-- 1 root root 0 Oct 10 13:04 sync_threshold
-rw-r--r-- 1 root root 0 Oct 10 13:04 sync_version
This needs further investigation, and a fix appears to require a significant amount of work: because the netlink interface uses sockets for communication, Sysbox would need to intercept some of those accesses, and it must do so without affecting the performance of other socket-related traffic.
See issue #250 for another problem related to the lack of IPVS support inside Sysbox containers.