Fix TCP loss recovery, VPP-745

Allows pure loss recovery retransmits only on timeout.

Change-Id: I563cdbf9e7b890a6569350bdbda4f746ace0544e
Signed-off-by: Florin Coras <fcoras@cisco.com>
diff --git a/src/vnet/tcp/tcp.c b/src/vnet/tcp/tcp.c
index de4edfa..e80e2ec 100644
--- a/src/vnet/tcp/tcp.c
+++ b/src/vnet/tcp/tcp.c
@@ -567,30 +567,49 @@
   return tc->snd_mss;
 }
 
+always_inline u32
+tcp_round_snd_space (tcp_connection_t * tc, u32 snd_space)
+{
+  if (tc->snd_wnd < tc->snd_mss)
+    {
+      return tc->snd_wnd < snd_space ? tc->snd_wnd : 0;
+    }
+
+  /* If we can't write at least a segment, don't try at all */
+  if (snd_space < tc->snd_mss)
+    return 0;
+
+  /* round down to mss multiple */
+  return snd_space - (snd_space % tc->snd_mss);
+}
+
 /**
  * Compute tx window session is allowed to fill.
  */
 u32
 tcp_session_send_space (transport_connection_t * trans_conn)
 {
-  u32 snd_space, chunk;
+  int snd_space;
   tcp_connection_t *tc = (tcp_connection_t *) trans_conn;
 
   /* If we haven't gotten dupacks or if we did and have gotten sacked bytes
    * then we can still send */
-  if (PREDICT_TRUE (tcp_in_fastrecovery (tc) == 0
+  if (PREDICT_TRUE (tcp_in_cong_recovery (tc) == 0
 		    && (tc->rcv_dupacks == 0
 			|| tc->sack_sb.last_sacked_bytes)))
     {
-      chunk = tc->snd_wnd > tc->snd_mss ? tc->snd_mss : tc->snd_wnd;
       snd_space = tcp_available_snd_space (tc);
+      return tcp_round_snd_space (tc, snd_space);
+    }
 
-      /* If we can't write at least a segment, don't try at all */
-      if (chunk == 0 || snd_space < chunk)
+  if (tcp_in_recovery (tc))
+    {
+      tc->snd_nxt = tc->snd_una_max;
+      snd_space = tcp_available_wnd (tc) - tc->rtx_bytes
+	- (tc->snd_una_max - tc->snd_congestion);
+      if (snd_space <= 0)
 	return 0;
-
-      /* round down to mss multiple */
-      return snd_space - (snd_space % chunk);
+      return tcp_round_snd_space (tc, snd_space);
     }
 
   /* If in fast recovery, send 1 SMSS if wnd allows */
diff --git a/src/vnet/tcp/tcp.h b/src/vnet/tcp/tcp.h
index f61a1b5..c75479d 100644
--- a/src/vnet/tcp/tcp.h
+++ b/src/vnet/tcp/tcp.h
@@ -24,8 +24,8 @@
 #include <vnet/session/session.h>
 #include <vnet/tcp/tcp_debug.h>
 
-#define TCP_TICK 10e-3			/**< TCP tick period (s) */
-#define THZ 1/TCP_TICK			/**< TCP tick frequency */
+#define TCP_TICK 0.001			/**< TCP tick period (s) */
+#define THZ (u32) (1/TCP_TICK)		/**< TCP tick frequency */
 #define TCP_TSTAMP_RESOLUTION TCP_TICK	/**< Time stamp resolution */
 #define TCP_PAWS_IDLE 24 * 24 * 60 * 60 * THZ /**< 24 days */
 #define TCP_MAX_OPTION_SPACE 40
diff --git a/src/vnet/tcp/tcp_input.c b/src/vnet/tcp/tcp_input.c
index 0030cfe..e9c52c5 100644
--- a/src/vnet/tcp/tcp_input.c
+++ b/src/vnet/tcp/tcp_input.c
@@ -392,11 +392,10 @@
 
   /* Karn's rule, part 1. Don't use retransmitted segments to estimate
    * RTT because they're ambiguous. */
-  if (tc->rtt_seq && seq_gt (ack, tc->rtt_seq) && !tc->rto_boff)
+  if (tc->rtt_ts && seq_geq (ack, tc->rtt_seq) && !tc->rto_boff)
     {
       mrtt = tcp_time_now () - tc->rtt_ts;
     }
-
   /* As per RFC7323 TSecr can be used for RTTM only if the segment advances
    * snd_una, i.e., the left side of the send window:
    * seq_lt (tc->snd_una, ack). Note: last condition could be dropped, we don't
@@ -406,19 +405,22 @@
       mrtt = tcp_time_now () - tc->opt.tsecr;
     }
 
+  /* Allow measuring of a new RTT */
+  tc->rtt_ts = 0;
+
+  /* If ACK moves left side of the wnd make sure boff is 0, even if mrtt is
+   * not valid */
+  if (tc->bytes_acked)
+    tc->rto_boff = 0;
+
   /* Ignore dubious measurements */
   if (mrtt == 0 || mrtt > TCP_RTT_MAX)
     return 0;
 
   tcp_estimate_rtt (tc, mrtt);
-
   tc->rto = clib_min (tc->srtt + (tc->rttvar << 2), TCP_RTO_MAX);
 
-  /* Allow measuring of RTT and make sure boff is 0 */
-  tc->rtt_seq = 0;
-  tc->rto_boff = 0;
-
-  return 1;
+  return 0;
 }
 
 /**
@@ -735,7 +737,7 @@
 {
   u8 partial_ack;
 
-  if (tcp_in_cong_recovery (tc))
+  if (tcp_in_fastrecovery (tc))
     {
       partial_ack = seq_lt (tc->snd_una, tc->snd_congestion);
       if (!partial_ack)
@@ -749,6 +751,7 @@
 
 	  /* Clear retransmitted bytes. XXX should we clear all? */
 	  tc->rtx_bytes = 0;
+
 	  tc->cc_algo->rcv_cong_ack (tc, TCP_CC_PARTIALACK);
 
 	  /* In case snd_nxt is still in the past and output tries to
@@ -772,6 +775,13 @@
       tc->cc_algo->rcv_ack (tc);
       tc->tsecr_last_ack = tc->opt.tsecr;
       tc->rcv_dupacks = 0;
+      if (tcp_in_recovery (tc))
+	{
+	  tc->rtx_bytes -= clib_min (tc->bytes_acked, tc->rtx_bytes);
+	  tc->rto = clib_min (tc->srtt + (tc->rttvar << 2), TCP_RTO_MAX);
+	  if (seq_geq (tc->snd_una, tc->snd_congestion))
+	    tcp_recovery_off (tc);
+	}
     }
 }
 
@@ -897,7 +907,7 @@
       tcp_cc_rcv_ack (tc, b);
 
       /* If everything has been acked, stop retransmit timer
-       * otherwise update */
+       * otherwise update. */
       if (tc->snd_una == tc->snd_una_max)
 	tcp_retransmit_timer_reset (tc);
       else
@@ -1778,6 +1788,11 @@
 		  tcp_send_reset (b0, is_ip4);
 		  goto drop;
 		}
+
+	      /* Update rtt and rto */
+	      tc0->bytes_acked = 1;
+	      tcp_update_rtt (tc0, vnet_buffer (b0)->tcp.ack_number);
+
 	      /* Switch state to ESTABLISHED */
 	      tc0->state = TCP_STATE_ESTABLISHED;
 
diff --git a/src/vnet/tcp/tcp_newreno.c b/src/vnet/tcp/tcp_newreno.c
index 856dffe..3525f4e 100644
--- a/src/vnet/tcp/tcp_newreno.c
+++ b/src/vnet/tcp/tcp_newreno.c
@@ -38,7 +38,7 @@
   else
     {
       /* Round up to 1 if needed */
-      tc->cwnd += clib_max (tc->snd_mss * tc->snd_mss / tc->cwnd, 1);
+      tc->cwnd += clib_max ((tc->snd_mss * tc->snd_mss) / tc->cwnd, 1);
     }
 }
 
diff --git a/src/vnet/tcp/tcp_output.c b/src/vnet/tcp/tcp_output.c
index a85d30d..7ee930c 100644
--- a/src/vnet/tcp/tcp_output.c
+++ b/src/vnet/tcp/tcp_output.c
@@ -73,7 +73,7 @@
   snd_mss = dummy_mtu;
 
   /* TODO cache mss and consider PMTU discovery */
-  snd_mss = tc->opt.mss < snd_mss ? tc->opt.mss : snd_mss;
+  snd_mss = clib_min (tc->opt.mss, snd_mss);
 
   tc->snd_mss = snd_mss;
 
@@ -923,6 +923,12 @@
   /* TODO this is updated in output as well ... */
   if (tc->snd_nxt > tc->snd_una_max)
     tc->snd_una_max = tc->snd_nxt;
+
+  if (tc->rtt_ts == 0)
+    {
+      tc->rtt_ts = tcp_time_now ();
+      tc->rtt_seq = tc->snd_nxt;
+    }
   TCP_EVT_DBG (TCP_EVT_PKTIZE, tc);
 }
 
@@ -1019,9 +1025,10 @@
     }
 
   /* Start again from the beginning */
-  tcp_recovery_on (tc);
+
   tc->cwnd = tcp_loss_wnd (tc);
   tc->snd_congestion = tc->snd_una_max;
+  tcp_recovery_on (tc);
 }
 
 static void
@@ -1032,7 +1039,7 @@
   u32 thread_index = vlib_get_thread_index ();
   tcp_connection_t *tc;
   vlib_buffer_t *b;
-  u32 bi, snd_space, n_bytes;
+  u32 bi, n_bytes;
 
   if (is_syn)
     {
@@ -1065,34 +1072,17 @@
       /* Exponential backoff */
       tc->rto = clib_min (tc->rto << 1, TCP_RTO_MAX);
 
-      /* Figure out what and how many bytes we can send */
-      snd_space = tcp_available_snd_space (tc);
-
       TCP_EVT_DBG (TCP_EVT_CC_EVT, tc, 1);
 
-      if (snd_space == 0)
+      /* Send one segment. No fancy recovery for now! */
+      n_bytes = tcp_prepare_retransmit_segment (tc, b, 0, tc->snd_mss);
+      scoreboard_clear (&tc->sack_sb);
+
+      if (n_bytes == 0)
 	{
-	  clib_warning ("no wnd to retransmit");
-	  tcp_return_buffer (tm);
-
-	  /* Force one segment */
-	  tcp_retransmit_first_unacked (tc);
-
-	  /* Re-enable retransmit timer. Output may be unwilling
-	   * to do it for us */
-	  tcp_retransmit_timer_set (tc);
-
+	  clib_warning ("could not retransmit");
 	  return;
 	}
-      else
-	{
-	  /* No fancy recovery for now! */
-	  n_bytes = tcp_prepare_retransmit_segment (tc, b, 0, snd_space);
-	  scoreboard_clear (&tc->sack_sb);
-
-	  if (n_bytes == 0)
-	    return;
-	}
     }
   else
     {
@@ -1400,7 +1390,7 @@
 	    }
 
 	  /* If not retransmitting
-	   * 1) update snd_una_max (SYN, SYNACK, new data, FIN)
+	   * 1) update snd_una_max (SYN, SYNACK, FIN)
 	   * 2) If we're not tracking an ACK, start tracking */
 	  if (seq_lt (tc0->snd_una_max, tc0->snd_nxt))
 	    {