exanic-software icon indicating copy to clipboard operation
exanic-software copied to clipboard

TCP sockets stuck in FIN-WAIT-1 state in exasock module

Open vient opened this issue 3 years ago • 1 comment

Hi. Imagine a situation where you close a connection but the remote host does not respond at all. exasock_tcp_close_worker will set the socket state to FIN-WAIT-1, after which exasock will send a FIN. Since the remote host provides no response, exasock_tcp_update_state will never be called, and the socket will resend the FIN every ~1 second forever. What's worse, the socket will be lost in the kernel: consuming resources and using the network, but not accessible from userspace in any way (I guess?).

Is it possible to introduce some kind of retransmission timeout, at least for this exact situation?

vient avatar Oct 06 '21 18:10 vient

I ended up hard-limiting retransmissions (and increasing the time between retransmissions); maybe this will give you ideas.

diff --git a/modules/exasock/exasock-tcp.c b/modules/exasock/exasock-tcp.c
index 69bb947..03251dd 100644
--- a/modules/exasock/exasock-tcp.c
+++ b/modules/exasock/exasock-tcp.c
@@ -174,6 +174,8 @@ struct exasock_tcp
     int                             retransmit_countdown;
     /* Exit timewait state when count reaches 0 */
     int                             timewait_countdown;
+    /* Connection is forcibly closed when count reaches RETRANSMIT_ATTEMPTS */
+    int                             retransmit_attempts;

     /* Received duplicate acks state. Used for entering fast retransmit. */
     struct
@@ -322,6 +324,9 @@ static struct work_struct       ate_skb_proc_work;
 /* Number of timer firings until retransmit */
 #define RETRANSMIT_TIMEOUT      TCP_TIMER_PER_SEC

+/* Number of times to try to retransmit one packet */
+#define RETRANSMIT_ATTEMPTS     5
+
 /* Number of syn-ack retransmissions we will attempt */
 #define SYNACK_ATTEMPTS_MAX     5

@@ -1213,6 +1218,7 @@ struct exasock_tcp *exasock_tcp_alloc(struct socket *sock, int fd)
     tcp->sock = sock;
     tcp->retransmit_countdown = -1;
     tcp->timewait_countdown = -1;
+    tcp->retransmit_attempts = 0;
     tcp->dead_node = false;
     tcp->ate.id = -1;
     tcp->reset_on_free = false;
@@ -1741,6 +1747,9 @@ static void exasock_tcp_update_state(volatile struct exa_tcp_state *tcp_st,
 {
     bool fw1_fin = false, fw1_ack = false;

+    /* New packet incoming, reset retransmit counter */
+    tcp->retransmit_attempts = 0;
+
     if (before(seq, tcp_st->recv_seq))
     {
         tcp_st->ack_pending = true;
@@ -2902,7 +2911,8 @@ static void exasock_tcp_conn_worker(struct work_struct *work)
         /* ACKs are pending from the remote host */
         if (tcp->retransmit_countdown == -1)
         {
-            tcp->retransmit_countdown = RETRANSMIT_TIMEOUT;
+            tcp->retransmit_attempts++;
+            tcp->retransmit_countdown = RETRANSMIT_TIMEOUT * tcp->retransmit_attempts;
         }
     }
     /*
@@ -2918,7 +2928,8 @@ static void exasock_tcp_conn_worker(struct work_struct *work)
         if (tcp->retransmit_countdown == -1 ||
             tcp->last_send_ack != send_ack)
         {
-            tcp->retransmit_countdown = RETRANSMIT_TIMEOUT;
+            tcp->retransmit_attempts++;
+            tcp->retransmit_countdown = RETRANSMIT_TIMEOUT * tcp->retransmit_attempts;
         }

         /* By getting into this if clause, we have outstanding un-ACKed
@@ -2941,7 +2952,7 @@ static void exasock_tcp_conn_worker(struct work_struct *work)
         tcp->last_ack_counter = 0;
     }

-    if (tcp->timewait_countdown == 0)
+    if (tcp->timewait_countdown == 0 || tcp->retransmit_attempts > RETRANSMIT_ATTEMPTS)
     {
         state->p.tcp.state = EXA_TCP_CLOSED;
     }

vient avatar Oct 07 '21 23:10 vient