tcp: improve lost rxt heuristic

Type: feature

- retransmit the first unacked segment if newer retransmitted packets
  are acked
- avoid spurious retransmits if recovery ends with sacked bytes

Change-Id: Ic1b56d22e025822edb7609afb136e47440ea6032
Signed-off-by: Florin Coras <fcoras@cisco.com>
diff --git a/src/vnet/tcp/tcp.h b/src/vnet/tcp/tcp.h
index f403887..7dd88bf 100644
--- a/src/vnet/tcp/tcp.h
+++ b/src/vnet/tcp/tcp.h
@@ -1172,8 +1172,14 @@
 always_inline void
 tcp_persist_timer_update (tcp_connection_t * tc)
 {
-  tcp_timer_update (tc, TCP_TIMER_PERSIST,
-		    clib_max (tc->rto * TCP_TO_TIMER_TICK, 1));
+  u32 interval;
+
+  if (seq_leq (tc->snd_una, tc->snd_congestion + tc->burst_acked))
+    interval = 1;
+  else
+    interval = clib_max (tc->rto * TCP_TO_TIMER_TICK, 1);
+
+  tcp_timer_update (tc, TCP_TIMER_PERSIST, interval);
 }
 
 always_inline void
diff --git a/src/vnet/tcp/tcp_input.c b/src/vnet/tcp/tcp_input.c
index bc78b39..172dcd2 100755
--- a/src/vnet/tcp/tcp_input.c
+++ b/src/vnet/tcp/tcp_input.c
@@ -578,10 +578,16 @@
 always_inline u8
 tcp_recovery_no_snd_space (tcp_connection_t * tc)
 {
-  return (tcp_in_fastrecovery (tc)
-	  && tcp_fastrecovery_prr_snd_space (tc) < tc->snd_mss)
-    || (tcp_in_recovery (tc)
-	&& tcp_available_output_snd_space (tc) < tc->snd_mss);
+  u32 space;
+
+  ASSERT (tcp_in_cong_recovery (tc));
+
+  if (tcp_in_recovery (tc))
+    space = tcp_available_output_snd_space (tc);
+  else
+    space = tcp_fastrecovery_prr_snd_space (tc);
+
+  return (space < tc->snd_mss + tc->burst_acked);
 }
 
 /**
@@ -608,7 +614,6 @@
 	{
 	  /* Dequeue the newly ACKed bytes */
 	  session_tx_fifo_dequeue_drop (&tc->connection, tc->burst_acked);
-	  tc->burst_acked = 0;
 	  tcp_validate_txf_size (tc, tc->snd_una_max - tc->snd_una);
 
 	  if (PREDICT_FALSE (tc->flags & TCP_CONN_PSH_PENDING))
@@ -628,9 +633,11 @@
       /* Reset the pacer if we've been idle, i.e., no data sent or if
        * we're in recovery and snd space constrained */
       if (tc->data_segs_out == tc->prev_dsegs_out
-	  || tcp_recovery_no_snd_space (tc))
+	  || (tcp_in_cong_recovery (tc) && tcp_recovery_no_snd_space (tc)))
 	transport_connection_tx_pacer_reset_bucket (&tc->connection);
+
       tc->prev_dsegs_out = tc->data_segs_out;
+      tc->burst_acked = 0;
     }
   _vec_len (wrk->pending_deq_acked) = 0;
 }
@@ -1348,28 +1355,25 @@
       is_spurious = 1;
     }
 
-  tc->rcv_dupacks = 0;
-  tc->prr_delivered = 0;
-  tc->rxt_delivered = 0;
-  tc->snd_rxt_bytes = 0;
-  tc->snd_rxt_ts = 0;
-  tc->rtt_ts = 0;
-  tc->flags &= ~TCP_CONN_RXT_PENDING;
-
   tcp_connection_tx_pacer_reset (tc, tc->cwnd, 0 /* start bucket */ );
+  tc->rcv_dupacks = 0;
 
   /* Previous recovery left us congested. Continue sending as part
    * of the current recovery event with an updated snd_congestion */
   if (tc->sack_sb.sacked_bytes)
     {
       tc->snd_congestion = tc->snd_nxt;
-      tc->snd_rxt_ts = tcp_tstamp (tc);
-      tc->prr_start = tc->snd_una;
-      scoreboard_init_rxt (&tc->sack_sb, tc->snd_una);
       tcp_program_retransmit (tc);
       return is_spurious;
     }
 
+  tc->rxt_delivered = 0;
+  tc->snd_rxt_bytes = 0;
+  tc->snd_rxt_ts = 0;
+  tc->prr_delivered = 0;
+  tc->rtt_ts = 0;
+  tc->flags &= ~TCP_CONN_RXT_PENDING;
+
   hole = scoreboard_first_hole (&tc->sack_sb);
   if (hole && hole->start == tc->snd_una && hole->end == tc->snd_nxt)
     scoreboard_clear (&tc->sack_sb);
@@ -1444,29 +1448,18 @@
     }
 
   /*
-   * Already in recovery. See if we can exit and stop retransmitting
+   * Already in recovery
    */
 
-  if (seq_geq (tc->snd_una, tc->snd_congestion))
-    {
-      /* If spurious return, we've already updated everything */
-      if (tcp_cc_recover (tc))
-	{
-	  tc->tsecr_last_ack = tc->rcv_opts.tsecr;
-	  return;
-	}
-
-      /* Treat as congestion avoidance ack */
-      tcp_cc_rcv_ack (tc, rs);
-      return;
-    }
-
   /*
    * Process (re)transmit feedback. Output path uses this to decide how much
    * more data to release into the network
    */
   if (has_sack)
     {
+      if (!tc->bytes_acked && tc->sack_sb.rxt_sacked)
+	tcp_fastrecovery_first_on (tc);
+
       tc->rxt_delivered += tc->sack_sb.rxt_sacked;
       tc->prr_delivered += tc->bytes_acked + tc->sack_sb.last_sacked_bytes
 	- tc->sack_sb.last_bytes_delivered;
@@ -1498,6 +1491,23 @@
     }
 
   /*
+   * See if we can exit and stop retransmitting
+   */
+  if (seq_geq (tc->snd_una, tc->snd_congestion))
+    {
+      /* If spurious return, we've already updated everything */
+      if (tcp_cc_recover (tc))
+	{
+	  tc->tsecr_last_ack = tc->rcv_opts.tsecr;
+	  return;
+	}
+
+      /* Treat as congestion avoidance ack */
+      tcp_cc_rcv_ack (tc, rs);
+      return;
+    }
+
+  /*
    * Notify cc of the event
    */
 
diff --git a/src/vnet/tcp/tcp_output.c b/src/vnet/tcp/tcp_output.c
index 4298611..7be3de8 100644
--- a/src/vnet/tcp/tcp_output.c
+++ b/src/vnet/tcp/tcp_output.c
@@ -1850,6 +1850,9 @@
   u32 tx_adv_sack = sb->high_sacked - tc->snd_congestion;
   f64 rr = (f64) tc->ssthresh / tc->prev_cwnd;
 
+  if (tcp_fastrecovery_first (tc))
+    return 1;
+
   return (tx_adv_sack > (tc->snd_una - tc->prr_start) * rr);
 }
 
@@ -1928,6 +1931,8 @@
       ASSERT (tc->rxt_delivered <= tc->snd_rxt_bytes);
     }
 
+  tcp_fastrecovery_first_off (tc);
+
   TCP_EVT (TCP_EVT_CC_EVT, tc, 0);
   hole = scoreboard_get_hole (sb, sb->cur_rxt_hole);